Diffstat (limited to 'hypervideo_dl/extractor')
345 files changed, 24114 insertions, 6762 deletions
diff --git a/hypervideo_dl/extractor/_extractors.py b/hypervideo_dl/extractor/_extractors.py
index 2fe15f6..f11554b 100644
--- a/hypervideo_dl/extractor/_extractors.py
+++ b/hypervideo_dl/extractor/_extractors.py
@@ -15,13 +15,13 @@ from .youtube import (  # Youtube is moved to the top to improve performance
     YoutubeSearchURLIE,
     YoutubeMusicSearchURLIE,
     YoutubeSubscriptionsIE,
-    YoutubeStoriesIE,
     YoutubeTruncatedIDIE,
     YoutubeTruncatedURLIE,
     YoutubeYtBeIE,
     YoutubeYtUserIE,
     YoutubeWatchLaterIE,
-    YoutubeShortsAudioPivotIE
+    YoutubeShortsAudioPivotIE,
+    YoutubeConsentRedirectIE,
 )

 from .abc import (
@@ -78,6 +78,8 @@ from .agora import (
     WyborczaVideoIE,
 )
 from .airmozilla import AirMozillaIE
+from .airtv import AirTVIE
+from .aitube import AitubeKZVideoIE
 from .aljazeera import AlJazeeraIE
 from .alphaporno import AlphaPornoIE
 from .amara import AmaraIE
@@ -86,7 +88,10 @@ from .alura import (
     AluraCourseIE
 )
 from .amcnetworks import AMCNetworksIE
-from .amazon import AmazonStoreIE
+from .amazon import (
+    AmazonStoreIE,
+    AmazonReviewsIE,
+)
 from .amazonminitv import (
     AmazonMiniTVIE,
     AmazonMiniTVSeasonIE,
@@ -96,6 +101,7 @@ from .americastestkitchen import (
     AmericasTestKitchenIE,
     AmericasTestKitchenSeasonIE,
 )
+from .anchorfm import AnchorFMEpisodeIE
 from .angel import AngelIE
 from .anvato import AnvatoIE
 from .aol import AolIE
@@ -116,6 +122,7 @@ from .applepodcasts import ApplePodcastsIE
 from .archiveorg import (
     ArchiveOrgIE,
     YoutubeWebArchiveIE,
+    VLiveWebArchiveIE,
 )
 from .arcpublishing import ArcPublishingIE
 from .arkena import ArkenaIE
@@ -183,6 +190,10 @@ from .bbc import (
 from .beeg import BeegIE
 from .behindkink import BehindKinkIE
 from .bellmedia import BellMediaIE
+from .beatbump import (
+    BeatBumpVideoIE,
+    BeatBumpPlaylistIE,
+)
 from .beatport import BeatportIE
 from .berufetv import BerufeTVIE
 from .bet import BetIE
@@ -192,13 +203,18 @@ from .bfmtv import (
     BFMTVLiveIE,
     BFMTVArticleIE,
 )
-from .bibeltv import BibelTVIE
+from .bibeltv import (
+    BibelTVLiveIE,
+    BibelTVSeriesIE,
+    BibelTVVideoIE,
+)
 from .bigflix import BigflixIE
 from .bigo import BigoIE
 from .bild import BildIE
 from .bilibili import (
     BiliBiliIE,
     BiliBiliBangumiIE,
+    BiliBiliBangumiSeasonIE,
     BiliBiliBangumiMediaIE,
     BiliBiliSearchIE,
     BilibiliCategoryIE,
@@ -227,19 +243,28 @@ from .bleacherreport import (
     BleacherReportIE,
     BleacherReportCMSIE,
 )
+from .blerp import BlerpIE
 from .blogger import BloggerIE
 from .bloomberg import BloombergIE
 from .bokecc import BokeCCIE
 from .bongacams import BongaCamsIE
 from .bostonglobe import BostonGlobeIE
 from .box import BoxIE
-from .booyah import BooyahClipsIE
+from .boxcast import BoxCastVideoIE
 from .bpb import BpbIE
 from .br import (
     BRIE,
     BRMediathekIE,
 )
 from .bravotv import BravoTVIE
+from .brainpop import (
+    BrainPOPIE,
+    BrainPOPJrIE,
+    BrainPOPELLIE,
+    BrainPOPEspIE,
+    BrainPOPFrIE,
+    BrainPOPIlIE,
+)
 from .breakcom import BreakIE
 from .breitbart import BreitBartIE
 from .brightcove import (
@@ -259,6 +284,10 @@ from .camdemy import (
     CamdemyIE,
     CamdemyFolderIE
 )
+from .camfm import (
+    CamFMEpisodeIE,
+    CamFMShowIE
+)
 from .cammodels import CamModelsIE
 from .camsoda import CamsodaIE
 from .camtasia import CamtasiaEmbedIE
@@ -266,12 +295,6 @@ from .camwithher import CamWithHerIE
 from .canalalpha import CanalAlphaIE
 from .canalplus import CanalplusIE
 from .canalc2 import Canalc2IE
-from .canvas import (
-    CanvasIE,
-    CanvasEenIE,
-    VrtNUIE,
-    DagelijkseKostIE,
-)
 from .carambatv import (
     CarambaTVIE,
     CarambaTVPageIE,
@@ -280,19 +303,23 @@ from .cartoonnetwork import CartoonNetworkIE
 from .cbc import (
     CBCIE,
     CBCPlayerIE,
+    CBCPlayerPlaylistIE,
     CBCGemIE,
     CBCGemPlaylistIE,
     CBCGemLiveIE,
 )
-from .cbs import CBSIE
-from .cbslocal import (
-    CBSLocalIE,
-    CBSLocalArticleIE,
+from .cbs import (
+    CBSIE,
+    ParamountPressExpressIE,
 )
 from .cbsinteractive import CBSInteractiveIE
 from .cbsnews import (
     CBSNewsEmbedIE,
     CBSNewsIE,
+    CBSLocalIE,
+    CBSLocalArticleIE,
+    CBSLocalLiveIE,
+    CBSNewsLiveIE,
     CBSNewsLiveVideoIE,
 )
 from .cbssports import (
@@ -331,6 +358,7 @@ from .ciscolive import (
 )
 from .ciscowebex import CiscoWebexIE
 from .cjsw import CJSWIE
+from .clipchamp import ClipchampIE
 from .cliphunter import CliphunterIE
 from .clippit import ClippitIE
 from .cliprs import ClipRsIE
@@ -378,9 +406,12 @@ from .crowdbunker import (
     CrowdBunkerIE,
     CrowdBunkerChannelIE,
 )
+from .crtvg import CrtvgIE
 from .crunchyroll import (
     CrunchyrollBetaIE,
     CrunchyrollBetaShowIE,
+    CrunchyrollMusicIE,
+    CrunchyrollArtistIE,
 )
 from .cspan import CSpanIE, CSpanCongressIE
 from .ctsnews import CtsNewsIE
@@ -397,6 +428,10 @@ from .cybrary import (
     CybraryIE,
     CybraryCourseIE
 )
+from .dacast import (
+    DacastVODIE,
+    DacastPlaylistIE,
+)
 from .daftsex import DaftsexIE
 from .dailymail import DailyMailIE
 from .dailymotion import (
@@ -427,6 +462,10 @@ from .deezer import (
 )
 from .democracynow import DemocracynowIE
 from .detik import DetikEmbedIE
+from .dlf import (
+    DLFIE,
+    DLFCorpusIE,
+)
 from .dfb import DFBIE
 from .dhm import DHMIE
 from .digg import DiggIE
@@ -459,6 +498,7 @@ from .dplay import (
     DiscoveryPlusItalyIE,
     DiscoveryPlusItalyShowIE,
     DiscoveryPlusIndiaShowIE,
+    GlobalCyclingNetworkPlusIE,
 )
 from .dreisat import DreiSatIE
 from .drbonanza import DRBonanzaIE
@@ -466,6 +506,8 @@ from .drtuber import DrTuberIE
 from .drtv import (
     DRTVIE,
     DRTVLiveIE,
+    DRTVSeasonIE,
+    DRTVSeriesIE,
 )
 from .dtube import DTubeIE
 from .dvtv import DVTVIE
@@ -480,6 +522,7 @@ from .deuxm import (
     DeuxMNewsIE
 )
 from .digitalconcerthall import DigitalConcertHallIE
+from .discogs import DiscogsReleasePlaylistIE
 from .discovery import DiscoveryIE
 from .disney import DisneyIE
 from .dispeak import DigitallySpeakingIE
@@ -494,6 +537,7 @@ from .dw import (
 )
 from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE
 from .ebaumsworld import EbaumsWorldIE
+from .ebay import EbayIE
 from .echomsk import EchoMskIE
 from .egghead import (
     EggheadCourseIE,
@@ -503,6 +547,7 @@ from .ehow import EHowIE
 from .eighttracks import EightTracksIE
 from .einthusan import EinthusanIE
 from .eitb import EitbIE
+from .elevensports import ElevenSportsIE
 from .ellentube import (
     EllenTubeIE,
     EllenTubeVideoIE,
@@ -536,7 +581,8 @@ from .espn import (
     ESPNCricInfoIE,
 )
 from .esri import EsriVideoIE
-from .europa import EuropaIE
+from .ettutv import EttuTvIE
+from .europa import EuropaIE, EuroParlWebstreamIE
 from .europeantour import EuropeanTourIE
 from .eurosport import EurosportIE
 from .euscreen import EUScreenIE
@@ -622,6 +668,7 @@ from .funimation import (
     FunimationShowIE,
 )
 from .funk import FunkIE
+from .funker530 import Funker530IE
 from .fusion import FusionIE
 from .fuyintv import FuyinTVIE
 from .gab import (
@@ -657,10 +704,18 @@ from .gfycat import GfycatIE
 from .giantbomb import GiantBombIE
 from .giga import GigaIE
 from .glide import GlideIE
+from .globalplayer import (
+    GlobalPlayerLiveIE,
+    GlobalPlayerLivePlaylistIE,
+    GlobalPlayerAudioIE,
+    GlobalPlayerAudioEpisodeIE,
+    GlobalPlayerVideoIE
+)
 from .globo import (
     GloboIE,
     GloboArticleIE,
 )
+from .gmanetwork import GMANetworkVideoIE
 from .go import GoIE
 from .godtube import GodTubeIE
 from .gofile import GofileIE
@@ -692,13 +747,16 @@ from .hearthisat import HearThisAtIE
 from .heise import HeiseIE
 from .hellporno import HellPornoIE
 from .helsinki import HelsinkiIE
-from .hentaistigma import HentaiStigmaIE
 from .hgtv import HGTVComShowIE
 from .hketv import HKETVIE
 from .hidive import HiDiveIE
 from .historicfilms import HistoricFilmsIE
 from .hitbox import HitboxIE, HitboxLiveIE
 from .hitrecord import HitRecordIE
+from .hollywoodreporter import (
+    HollywoodReporterIE,
+    HollywoodReporterPlaylistIE,
+)
 from .holodex import HolodexIE
 from .hotnewhiphop import HotNewHipHopIE
 from .hotstar import (
@@ -710,6 +768,7 @@ from .hotstar import (
 )
 from .howcast import HowcastIE
 from .howstuffworks import HowStuffWorksIE
+from .hrefli import HrefLiRedirectIE
 from .hrfensehen import HRFernsehenIE
 from .hrti import (
     HRTiIE,
@@ -732,12 +791,14 @@ from .hungama import (
     HungamaAlbumPlaylistIE,
 )
 from .hypem import HypemIE
+from .hypergryph import MonsterSirenHypergryphMusicIE
 from .hytale import HytaleIE
 from .icareus import IcareusIE
 from .ichinanalive import (
     IchinanaLiveIE,
     IchinanaLiveClipIE,
 )
+from .idolplus import IdolPlusIE
 from .ign import (
     IGNIE,
     IGNVideoIE,
@@ -822,23 +883,29 @@ from .japandiet import (
 from .jeuxvideo import JeuxVideoIE
 from .jove import JoveIE
 from .joj import JojIE
+from .jstream import JStreamIE
 from .jwplatform import JWPlatformIE
 from .kakao import KakaoIE
 from .kaltura import KalturaIE
 from .kanal2 import Kanal2IE
+from .kankanews import KankaNewsIE
 from .karaoketv import KaraoketvIE
 from .karrierevideos import KarriereVideosIE
 from .keezmovies import KeezMoviesIE
 from .kelbyone import KelbyOneIE
-from .ketnet import KetnetIE
 from .khanacademy import (
     KhanAcademyIE,
     KhanAcademyUnitIE,
 )
+from .kick import (
+    KickIE,
+    KickVODIE,
+)
 from .kicker import KickerIE
 from .kickstarter import KickStarterIE
 from .kinja import KinjaEmbedIE
 from .kinopoisk import KinoPoiskIE
+from .kommunetv import KommunetvIE
 from .kompas import KompasVideoIE
 from .konserthusetplay import KonserthusetPlayIE
 from .koo import KooIE
@@ -890,6 +957,10 @@ from .leeco import (
     LePlaylistIE,
     LetvCloudIE,
 )
+from .lefigaro import (
+    LeFigaroVideoEmbedIE,
+    LeFigaroVideoSectionIE,
+)
 from .lego import LEGOIE
 from .lemonde import LemondeIE
 from .lenta import LentaIE
@@ -908,10 +979,6 @@ from .limelight import (
     LimelightChannelIE,
     LimelightChannelListIE,
 )
-from .line import (
-    LineLiveIE,
-    LineLiveChannelIE,
-)
 from .linkedin import (
     LinkedInIE,
     LinkedInLearningIE,
@@ -938,11 +1005,15 @@ from .lrt import (
     LRTVODIE,
     LRTStreamIE
 )
+from .lumni import (
+    LumniIE
+)
 from .lynda import (
     LyndaIE,
     LyndaCourseIE
 )
 from .m6 import M6IE
+from .magellantv import MagellanTVIE
 from .magentamusik360 import MagentaMusik360IE
 from .mailru import (
     MailRuIE,
@@ -982,6 +1053,10 @@ from .mediasite import (
     MediasiteCatalogIE,
     MediasiteNamedCatalogIE,
 )
+from .mediastream import (
+    MediaStreamIE,
+    WinSportsVideoIE,
+)
 from .mediaworksnz import MediaWorksNZVODIE
 from .medici import MediciIE
 from .megaphone import MegaphoneIE
@@ -1047,7 +1122,8 @@ from .mojvideo import MojvideoIE
 from .morningstar import MorningstarIE
 from .motherless import (
     MotherlessIE,
-    MotherlessGroupIE
+    MotherlessGroupIE,
+    MotherlessGalleryIE,
 )
 from .motorsport import MotorsportIE
 from .movieclips import MovieClipsIE
@@ -1067,6 +1143,7 @@ from .mtv import (
 )
 from .muenchentv import MuenchenTVIE
 from .murrtube import MurrtubeIE, MurrtubeUserIE
+from .museai import MuseAIIE
 from .musescore import MuseScoreIE
 from .musicdex import (
     MusicdexSongIE,
@@ -1088,6 +1165,7 @@ from .myvi import (
 )
 from .myvideoge import MyVideoGeIE
 from .myvidster import MyVidsterIE
+from .mzaalo import MzaaloIE
 from .n1 import (
     N1InfoAssetIE,
     N1InfoIIE,
@@ -1136,6 +1214,7 @@ from .nebula import (
     NebulaSubscriptionsIE,
     NebulaChannelIE,
 )
+from .nekohacker import NekoHackerIE
 from .nerdcubed import NerdCubedFeedIE
 from .netzkino import NetzkinoIE
 from .neteasemusic import (
@@ -1150,6 +1229,7 @@ from .neteasemusic import (
 from .netverse import (
     NetverseIE,
     NetversePlaylistIE,
+    NetverseSearchIE,
 )
 from .newgrounds import (
     NewgroundsIE,
@@ -1174,6 +1254,8 @@ from .nfhsnetwork import NFHSNetworkIE
 from .nfl import (
     NFLIE,
     NFLArticleIE,
+    NFLPlusEpisodeIE,
+    NFLPlusReplayIE,
 )
 from .nhk import (
     NhkVodIE,
@@ -1181,6 +1263,9 @@ from .nhk import (
     NhkForSchoolBangumiIE,
     NhkForSchoolSubjectIE,
     NhkForSchoolProgramListIE,
+    NhkRadioNewsPageIE,
+    NhkRadiruIE,
+    NhkRadiruLiveIE,
 )
 from .nhl import NHLIE
 from .nick import (
@@ -1200,6 +1285,7 @@ from .niconico import (
     NicovideoSearchIE,
     NicovideoSearchURLIE,
     NicovideoTagURLIE,
+    NiconicoLiveIE,
 )
 from .ninecninemedia import (
     NineCNineMediaIE,
@@ -1211,6 +1297,7 @@ from .nintendo import NintendoIE
 from .nitter import NitterIE
 from .njpwworld import NJPWWorldIE
 from .nobelprize import NobelPrizeIE
+from .noice import NoicePodcastIE
 from .nonktube import NonkTubeIE
 from .noodlemagazine import NoodleMagazineIE
 from .noovo import NoovoIE
@@ -1256,6 +1343,7 @@ from .nrl import NRLTVIE
 from .ntvcojp import NTVCoJpCUIE
 from .ntvde import NTVDeIE
 from .ntvru import NTVRuIE
+from .nubilesporn import NubilesPornIE
 from .nytimes import (
     NYTimesIE,
     NYTimesArticleIE,
@@ -1263,8 +1351,10 @@ from .nytimes import (
 )
 from .nuvid import NuvidIE
 from .nzherald import NZHeraldIE
+from .nzonscreen import NZOnScreenIE
 from .nzz import NZZIE
 from .odatv import OdaTVIE
+from .odkmedia import OnDemandChinaEpisodeIE
 from .odnoklassniki import OdnoklassnikiIE
 from .oftv import (
     OfTVIE,
@@ -1276,6 +1366,7 @@ from .on24 import On24IE
 from .ondemandkorea import OnDemandKoreaIE
 from .onefootball import OneFootballIE
 from .onenewsnz import OneNewsNZIE
+from .oneplace import OnePlacePodcastIE
 from .onet import (
     OnetIE,
     OnetChannelIE,
@@ -1304,6 +1395,7 @@ from .orf import (
     ORFIPTVIE,
 )
 from .outsidetv import OutsideTVIE
+from .owncloud import OwnCloudIE
 from .packtpub import (
     PacktPubIE,
     PacktPubCourseIE,
@@ -1329,7 +1421,7 @@ from .patreon import (
     PatreonIE,
     PatreonCampaignIE
 )
-from .pbs import PBSIE
+from .pbs import PBSIE, PBSKidsIE
 from .pearvideo import PearVideoIE
 from .peekvids import PeekVidsIE, PlayVidsIE
 from .peertube import (
@@ -1347,6 +1439,7 @@ from .periscope import (
     PeriscopeIE,
     PeriscopeUserIE,
 )
+from .pgatour import PGATourIE
 from .philharmoniedeparis import PhilharmonieDeParisIE
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE
@@ -1398,11 +1491,12 @@ from .pokergo import (
 from .polsatgo import PolsatGoIE
 from .polskieradio import (
     PolskieRadioIE,
+    PolskieRadioLegacyIE,
+    PolskieRadioAuditionIE,
     PolskieRadioCategoryIE,
     PolskieRadioPlayerIE,
     PolskieRadioPodcastIE,
     PolskieRadioPodcastListIE,
-    PolskieRadioRadioKierowcowIE,
 )
 from .popcorntimes import PopcorntimesIE
 from .popcorntv import PopcornTVIE
@@ -1425,6 +1519,7 @@ from .puhutv import (
     PuhuTVIE,
     PuhuTVSerieIE,
 )
+from .pr0gramm import Pr0grammStaticIE, Pr0grammIE
 from .prankcast import PrankCastIE
 from .premiershiprugby import PremiershipRugbyIE
 from .presstv import PressTVIE
@@ -1439,6 +1534,7 @@ from .prx import (
 )
 from .puls4 import Puls4IE
 from .pyvideo import PyvideoIE
+from .qdance import QDanceIE
 from .qingting import QingTingIE
 from .qqmusic import (
     QQMusicIE,
@@ -1471,6 +1567,8 @@ from .radlive import (
     RadLiveSeasonIE,
 )
 from .rai import (
+    RaiIE,
+    RaiCulturaIE,
     RaiPlayIE,
     RaiPlayLiveIE,
     RaiPlayPlaylistIE,
@@ -1479,13 +1577,16 @@ from .rai import (
     RaiPlaySoundPlaylistIE,
     RaiNewsIE,
     RaiSudtirolIE,
-    RaiIE,
 )
 from .raywenderlich import (
     RayWenderlichIE,
     RayWenderlichCourseIE,
 )
 from .rbmaradio import RBMARadioIE
+from .rbgtum import (
+    RbgTumIE,
+    RbgTumCourseIE,
+)
 from .rcs import (
     RCSIE,
     RCSEmbedsIE,
@@ -1497,6 +1598,7 @@ from .rcti import (
     RCTIPlusTVIE,
 )
 from .rds import RDSIE
+from .recurbate import RecurbateIE
 from .redbee import ParliamentLiveUKIE, RTBFIE
 from .redbulltv import (
     RedBullTVIE,
@@ -1519,6 +1621,7 @@ from .rentv import (
 from .restudy import RestudyIE
 from .reuters import ReutersIE
 from .reverbnation import ReverbNationIE
+from .rheinmaintv import RheinMainTVIE
 from .rice import RICEIE
 from .rmcdecouverte import RMCDecouverteIE
 from .rockstargames import RockstarGamesIE
@@ -1530,7 +1633,11 @@ from .rokfin import (
 )
 from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE
 from .rottentomatoes import RottenTomatoesIE
-from .rozhlas import RozhlasIE
+from .rozhlas import (
+    RozhlasIE,
+    RozhlasVltavaIE,
+    MujRozhlasIE,
+)
 from .rte import RteIE, RteRadioIE
 from .rtlnl import (
     RtlNlIE,
@@ -1553,6 +1660,11 @@ from .rtnews import (
 from .rtp import RTPIE
 from .rtrfm import RTRFMIE
 from .rts import RTSIE
+from .rtvcplay import (
+    RTVCPlayIE,
+    RTVCPlayEmbedIE,
+    RTVCKalturaIE,
+)
 from .rtve import (
     RTVEALaCartaIE,
     RTVEAudioIE,
@@ -1567,6 +1679,7 @@ from .ruhd import RUHDIE
 from .rule34video import Rule34VideoIE
 from .rumble import (
     RumbleEmbedIE,
+    RumbleIE,
     RumbleChannelIE,
 )
 from .rutube import (
@@ -1586,8 +1699,8 @@ from .megatvcom import (
     MegaTVComIE,
     MegaTVComEmbedIE,
 )
-from .ant1newsgr import (
-    Ant1NewsGrWatchIE,
+from .antenna import (
+    AntennaGrWatchIE,
     Ant1NewsGrArticleIE,
     Ant1NewsGrEmbedIE,
 )
@@ -1597,6 +1710,7 @@ from .ruv import (
     RuvIE,
     RuvSpilaIE
 )
+from .s4c import S4CIE
 from .safari import (
     SafariIE,
     SafariApiIE,
@@ -1621,6 +1735,7 @@ from .scte import (
 )
 from .scrolller import ScrolllerIE
 from .seeker import SeekerIE
+from .senalcolombia import SenalColombiaLiveIE
 from .senategov import SenateISVPIE, SenateGovIE
 from .sendtonews import SendtoNewsIE
 from .servus import ServusIE
@@ -1639,6 +1754,7 @@ from .shared import (
     VivoIE,
 )
 from .sharevideos import ShareVideosEmbedIE
+from .sibnet import SibnetEmbedIE
 from .shemaroome import ShemarooMeIE
 from .showroomlive import ShowRoomLiveIE
 from .simplecast import (
@@ -1686,6 +1802,7 @@ from .soundcloud import (
     SoundcloudSetIE,
     SoundcloudRelatedIE,
     SoundcloudUserIE,
+    SoundcloudUserPermalinkIE,
     SoundcloudTrackStationIE,
     SoundcloudPlaylistIE,
     SoundcloudSearchIE,
@@ -1716,6 +1833,7 @@ from .spike import (
     BellatorIE,
     ParamountNetworkIE,
 )
+from .stageplus import StagePlusVODConcertIE
 from .startrek import StarTrekIE
 from .stitcher import (
     StitcherIE,
@@ -1741,6 +1859,10 @@ from .srgssr import (
     SRGSSRPlayIE,
 )
 from .srmediathek import SRMediathekIE
+from .stacommu import (
+    StacommuLiveIE,
+    StacommuVODIE,
+)
 from .stanfordoc import StanfordOpenClassroomIE
 from .startv import StarTVIE
 from .steam import (
@@ -1753,7 +1875,6 @@ from .storyfire import (
     StoryFireSeriesIE,
 )
 from .streamable import StreamableIE
-from .streamanity import StreamanityIE
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
 from .streamff import StreamFFIE
@@ -1781,6 +1902,11 @@ from .sztvhu import SztvHuIE
 from .tagesschau import TagesschauIE
 from .tass import TassIE
 from .tbs import TBSIE
+from .tbsjp import (
+    TBSJPEpisodeIE,
+    TBSJPProgramIE,
+    TBSJPPlaylistIE,
+)
 from .tdslifeway import TDSLifewayIE
 from .teachable import (
     TeachableIE,
@@ -1791,7 +1917,10 @@ from .teachertube import (
     TeacherTubeUserIE,
 )
 from .teachingchannel import TeachingChannelIE
-from .teamcoco import TeamcocoIE
+from .teamcoco import (
+    TeamcocoIE,
+    ConanClassicIE,
+)
 from .teamtreehouse import TeamTreeHouseIE
 from .techtalks import TechTalksIE
 from .ted import (
@@ -1803,6 +1932,7 @@ from .ted import (
 from .tele5 import Tele5IE
 from .tele13 import Tele13IE
 from .telebruxelles import TeleBruxellesIE
+from .telecaribe import TelecaribePlayIE
 from .telecinco import TelecincoIE
 from .telegraaf import TelegraafIE
 from .telegram import TelegramEmbedIE
@@ -1817,7 +1947,7 @@ from .telequebec import (
 )
 from .teletask import TeleTaskIE
 from .telewebion import TelewebionIE
-from .tempo import TempoIE
+from .tempo import TempoIE, IVXPlayerIE
 from .tencent import (
     IflixEpisodeIE,
     IflixSeriesIE,
@@ -1847,6 +1977,11 @@ from .theweatherchannel import TheWeatherChannelIE
 from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
+from .thisvid import (
+    ThisVidIE,
+    ThisVidMemberIE,
+    ThisVidPlaylistIE,
+)
 from .threespeak import (
     ThreeSpeakIE,
     ThreeSpeakUserIE,
@@ -1859,6 +1994,7 @@ from .tiktok import (
     TikTokEffectIE,
     TikTokTagIE,
     TikTokVMIE,
+    TikTokLiveIE,
     DouyinIE,
 )
 from .tinypic import TinyPicIE
@@ -1888,6 +2024,7 @@ from .traileraddict import TrailerAddictIE
 from .triller import (
     TrillerIE,
     TrillerUserIE,
+    TrillerShortIE,
 )
 from .trilulilu import TriluliluIE
 from .trovo import (
@@ -1896,6 +2033,7 @@ from .trovo import (
     TrovoChannelVodIE,
     TrovoChannelClipIE,
 )
+from .trtcocuk import TrtCocukVideoIE
 from .trueid import TrueIDIE
 from .trunews import TruNewsIE
 from .truth import TruthIE
@@ -1908,10 +2046,9 @@ from .tubitv import (
 )
 from .tumblr import TumblrIE
 from .tunein import (
-    TuneInClipIE,
     TuneInStationIE,
-    TuneInProgramIE,
-    TuneInTopicIE,
+    TuneInPodcastIE,
+    TuneInPodcastEpisodeIE,
     TuneInShortenerIE,
 )
 from .tunepk import TunePkIE
@@ -1979,7 +2116,6 @@ from .tvp import (
 )
 from .tvplay import (
     TVPlayIE,
-    ViafreeIE,
     TVPlayHomeIE,
 )
 from .tvplayer import TVPlayerIE
@@ -2009,6 +2145,10 @@ from .twitter import (
     TwitterSpacesIE,
     TwitterShortenerIE,
 )
+from .txxx import (
+    TxxxIE,
+    PornTopIE,
+)
 from .udemy import (
     UdemyIE,
     UdemyCourseIE
@@ -2079,6 +2219,13 @@ from .videocampus_sachsen import (
 )
 from .videodetective import VideoDetectiveIE
 from .videofyme import VideofyMeIE
+from .videoken import (
+    VideoKenIE,
+    VideoKenPlayerIE,
+    VideoKenPlaylistIE,
+    VideoKenCategoryIE,
+    VideoKenTopicIE,
+)
 from .videomore import (
     VideomoreIE,
     VideomoreVideoIE,
@@ -2127,17 +2274,16 @@ from .viu import (
     ViuIE,
     ViuPlaylistIE,
     ViuOTTIE,
+    ViuOTTIndonesiaIE,
 )
 from .vk import (
     VKIE,
     VKUserVideosIE,
     VKWallPostIE,
+    VKPlayIE,
+    VKPlayLiveIE,
 )
-from .vlive import (
-    VLiveIE,
-    VLivePostIE,
-    VLiveChannelIE,
-)
+from .vocaroo import VocarooIE
 from .vodlocker import VodlockerIE
 from .vodpl import VODPlIE
 from .vodplatform import VODPlatformIE
@@ -2146,6 +2292,7 @@ from .voicy import (
     VoicyIE,
     VoicyChannelIE,
 )
+from .volejtv import VolejTVIE
 from .voot import (
     VootIE,
     VootSeriesIE,
@@ -2154,7 +2301,12 @@ from .voxmedia import (
     VoxMediaVolumeIE,
     VoxMediaIE,
 )
-from .vrt import VRTIE
+from .vrt import (
+    VRTIE,
+    VrtNUIE,
+    KetnetIE,
+    DagelijkseKostIE,
+)
 from .vrak import VrakIE
 from .vrv import (
     VRVIE,
@@ -2191,6 +2343,7 @@ from .wdr import (
     WDRElefantIE,
     WDRMobileIE,
 )
+from .webcamerapl import WebcameraplIE
 from .webcaster import (
     WebcasterIE,
     WebcasterFeedIE,
@@ -2204,8 +2357,20 @@ from .weibo import (
     WeiboMobileIE
 )
 from .weiqitv import WeiqiTVIE
+from .weverse import (
+    WeverseIE,
+    WeverseMediaIE,
+    WeverseMomentIE,
+    WeverseLiveTabIE,
+    WeverseMediaTabIE,
+    WeverseLiveIE,
+)
+from .wevidi import WeVidiIE
+from .weyyak import WeyyakIE
+from .whyp import WhypIE
 from .wikimedia import WikimediaIE
 from .willow import WillowIE
+from .wimbledon import WimbledonIE
 from .wimtv import WimTVIE
 from .whowatch import WhoWatchIE
 from .wistia import (
@@ -2222,11 +2387,22 @@ from .wppilot import (
     WPPilotIE,
     WPPilotChannelsIE,
 )
+from .wrestleuniverse import (
+    WrestleUniverseVODIE,
+    WrestleUniversePPVIE,
+)
 from .wsj import (
     WSJIE,
     WSJArticleIE,
 )
 from .wwe import WWEIE
+from .wykop import (
+    WykopDigIE,
+    WykopDigCommentIE,
+    WykopPostIE,
+    WykopPostCommentIE,
+)
+from .xanimu import XanimuIE
 from .xbef import XBefIE
 from .xboxclips import XboxClipsIE
 from .xfileshare import XFileShareIE
@@ -2235,12 +2411,6 @@ from .xhamster import (
     XHamsterEmbedIE,
     XHamsterUserIE,
 )
-from .xiami import (
-    XiamiSongIE,
-    XiamiAlbumIE,
-    XiamiArtistIE,
-    XiamiCollectionIE
-)
 from .ximalaya import (
     XimalayaIE,
     XimalayaAlbumIE
@@ -2251,13 +2421,14 @@ from .xnxx import XNXXIE
 from .xstream import XstreamIE
 from .xtube import XTubeUserIE, XTubeIE
 from .xuite import XuiteIE
-from .xvideos import XVideosIE
+from .xvideos import (
+    XVideosIE,
+    XVideosQuickiesIE
+)
 from .xxxymovies import XXXYMoviesIE
 from .yahoo import (
     YahooIE,
     YahooSearchIE,
-    YahooGyaOPlayerIE,
-    YahooGyaOIE,
     YahooJapanNewsIE,
 )
 from .yandexdisk import YandexDiskIE
@@ -2275,6 +2446,10 @@ from .yandexvideo import (
     ZenYandexChannelIE,
 )
 from .yapfiles import YapFilesIE
+from .yappy import (
+    YappyIE,
+    YappyProfileIE,
+)
 from .yesjapan import YesJapanIE
 from .yinyuetai import YinYueTaiIE
 from .yle_areena import YleAreenaIE
@@ -2292,6 +2467,10 @@ from .younow import (
 from .youporn import YouPornIE
 from .yourporn import YourPornIE
 from .yourupload import YourUploadIE
+from .zaiko import (
+    ZaikoIE,
+    ZaikoETicketIE,
+)
 from .zapiks import ZapiksIE
 from .zattoo import (
     BBVTVIE,
@@ -2349,6 +2528,7 @@ from .zingmp3 import (
     ZingMp3WeekChartIE,
     ZingMp3ChartMusicVideoIE,
     ZingMp3UserIE,
+    ZingMp3HubIE,
 )
 from .zoom import ZoomIE
 from .zype import ZypeIE
diff --git a/hypervideo_dl/extractor/abc.py b/hypervideo_dl/extractor/abc.py
index 0ca76b8..f56133e 100644
--- a/hypervideo_dl/extractor/abc.py
+++ b/hypervideo_dl/extractor/abc.py
@@ -12,6 +12,7 @@ from ..utils import (
     int_or_none,
     parse_iso8601,
     str_or_none,
+    traverse_obj,
     try_get,
     unescapeHTML,
     update_url_query,
@@ -85,6 +86,15 @@ class ABCIE(InfoExtractor):
             'uploader': 'Behind the News',
             'uploader_id': 'behindthenews',
         }
+    }, {
+        'url': 'https://www.abc.net.au/news/2023-06-25/wagner-boss-orders-troops-back-to-bases-to-avoid-bloodshed/102520540',
+        'info_dict': {
+            'id': '102520540',
+            'title': 'Wagner Group retreating from Russia, leader Prigozhin to move to Belarus',
+            'ext': 'mp4',
+            'description': 'Wagner troops leave Rostov-on-Don and\xa0Yevgeny Prigozhin will move to Belarus under a deal brokered by Belarusian President Alexander Lukashenko to end the mutiny.',
+            'thumbnail': 'https://live-production.wcms.abc-cdn.net.au/0c170f5b57f0105c432f366c0e8e267b?impolicy=wcms_crop_resize&cropH=2813&cropW=5000&xPos=0&yPos=249&width=862&height=485',
+        }
     }]

     def _real_extract(self, url):
@@ -107,7 +117,7 @@ class ABCIE(InfoExtractor):
             video = True

         if mobj is None:
-            mobj = re.search(r'(?P<type>)"sources": (?P<json_data>\[[^\]]+\]),', webpage)
+            mobj = re.search(r'(?P<type>)"(?:sources|files|renditions)":\s*(?P<json_data>\[[^\]]+\])', webpage)
         if mobj is None:
             mobj = re.search(
                 r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
@@ -121,7 +131,8 @@ class ABCIE(InfoExtractor):
         urls_info = self._parse_json(
             mobj.group('json_data'), video_id, transform_source=js_to_json)
         youtube = mobj.group('type') == 'YouTube'
-        video = mobj.group('type') == 'Video' or urls_info[0]['contentType'] == 'video/mp4'
+        video = mobj.group('type') == 'Video' or traverse_obj(
+            urls_info, (0, ('contentType', 'MIMEType')), get_all=False) == 'video/mp4'

         if not isinstance(urls_info, list):
             urls_info = [urls_info]
diff --git a/hypervideo_dl/extractor/abematv.py b/hypervideo_dl/extractor/abematv.py
index 80046af..8f962ba 100644
--- a/hypervideo_dl/extractor/abematv.py
+++ b/hypervideo_dl/extractor/abematv.py
@@ -22,80 +22,23 @@ from ..utils import (
     int_or_none,
     intlist_to_bytes,
     OnDemandPagedList,
-    request_to_url,
     time_seconds,
     traverse_obj,
     update_url_query,
 )

-# NOTE: network handler related code is temporary thing until network stack overhaul PRs are merged (#2861/#2862)
-
-def add_opener(ydl, handler):
-    ''' Add a handler for opening URLs, like _download_webpage '''
-    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
-    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
-    assert isinstance(ydl._opener, urllib.request.OpenerDirector)
-    ydl._opener.add_handler(handler)
-
-
-def remove_opener(ydl, handler):
-    '''
-    Remove handler(s) for opening URLs
-    @param handler Either handler object itself or handler type.
-    Specifying handler type will remove all handler which isinstance returns True.
-    '''
+def add_opener(ydl, handler):  # FIXME: Create proper API in .networking
+    """Add a handler for opening URLs, like _download_webpage"""
     # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
     # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
-    opener = ydl._opener
-    assert isinstance(ydl._opener, urllib.request.OpenerDirector)
-    if isinstance(handler, (type, tuple)):
-        find_cp = lambda x: isinstance(x, handler)
-    else:
-        find_cp = lambda x: x is handler
-
-    removed = []
-    for meth in dir(handler):
-        if meth in ["redirect_request", "do_open", "proxy_open"]:
-            # oops, coincidental match
-            continue
-
-        i = meth.find("_")
-        protocol = meth[:i]
-        condition = meth[i + 1:]
-
-        if condition.startswith("error"):
-            j = condition.find("_") + i + 1
-            kind = meth[j + 1:]
-            try:
-                kind = int(kind)
-            except ValueError:
-                pass
-            lookup = opener.handle_error.get(protocol, {})
-            opener.handle_error[protocol] = lookup
-        elif condition == "open":
-            kind = protocol
-            lookup = opener.handle_open
-        elif condition == "response":
-            kind = protocol
-            lookup = opener.process_response
-        elif condition == "request":
-            kind = protocol
-            lookup = opener.process_request
-        else:
-            continue
-
-        handlers = lookup.setdefault(kind, [])
-        if handlers:
-            handlers[:] = [x for x in handlers if not find_cp(x)]
-
-        removed.append(x for x in handlers if find_cp(x))
-
-    if removed:
-        for x in opener.handlers:
-            if find_cp(x):
-                x.add_parent(None)
-        opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)]
+    rh = ydl._request_director.handlers['Urllib']
+    if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES:
+        return
+    opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=ydl.proxies)
+    assert isinstance(opener, urllib.request.OpenerDirector)
+    opener.add_handler(handler)
+    rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license')


 class AbemaLicenseHandler(urllib.request.BaseHandler):
@@ -137,11 +80,11 @@ class AbemaLicenseHandler(urllib.request.BaseHandler):
         return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))

     def abematv_license_open(self, url):
-        url = request_to_url(url)
+        url = url.get_full_url() if isinstance(url, urllib.request.Request) else url
         ticket = urllib.parse.urlparse(url).netloc
         response_data = self._get_videokey_from_ticket(ticket)
         return urllib.response.addinfourl(io.BytesIO(response_data), headers={
-            'Content-Length': len(response_data),
+            'Content-Length': str(len(response_data)),
         }, url=url, code=200)

@@ -156,7 +99,7 @@ class AbemaTVBaseIE(InfoExtractor):
     def _generate_aks(cls, deviceid):
         deviceid = deviceid.encode('utf-8')
         # add 1 hour and then drop minute and secs
-        ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600)
+        ts_1hour = int((time_seconds() // 3600 + 1) * 3600)
         time_struct = time.gmtime(ts_1hour)
         ts_1hour_str = str(ts_1hour).encode('utf-8')

@@ -190,6 +133,16 @@ class AbemaTVBaseIE(InfoExtractor):
         if self._USERTOKEN:
             return self._USERTOKEN

+        username, _ = self._get_login_info()
+        AbemaTVBaseIE._USERTOKEN = username and self.cache.load(self._NETRC_MACHINE, username)
+        if AbemaTVBaseIE._USERTOKEN:
+            # try authentication with locally stored token
+            try:
+                self._get_media_token(True)
+                return
+            except ExtractorError as e:
+                self.report_warning(f'Failed to login with cached user token; obtaining a fresh one ({e})')
+
         AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4())
         aks = self._generate_aks(self._DEVICE_ID)
         user_data = self._download_json(
@@ -203,10 +156,7 @@ class AbemaTVBaseIE(InfoExtractor):
             })
         AbemaTVBaseIE._USERTOKEN = user_data['token']

-        # don't allow adding it 2 times or more, though it's guarded
-        remove_opener(self._downloader, AbemaLicenseHandler)
         add_opener(self._downloader, AbemaLicenseHandler(self))
-
         return self._USERTOKEN

     def _get_media_token(self, invalidate=False, to_show=True):
@@ -300,6 +250,11 @@ class AbemaTVIE(AbemaTVBaseIE):
     _TIMETABLE = None

     def _perform_login(self, username, password):
+        self._get_device_token()
+        if self.cache.load(self._NETRC_MACHINE, username) and self._get_media_token():
+            self.write_debug('Skipping logging in')
+            return
+
         if '@' in username:  # don't strictly check if it's email address or not
             ep, method = 'user/email', 'email'
         else:
@@ -319,6 +274,7 @@ class AbemaTVIE(AbemaTVBaseIE):

         AbemaTVBaseIE._USERTOKEN = login_response['token']
         self._get_media_token(True)
+        self.cache.store(self._NETRC_MACHINE, username, AbemaTVBaseIE._USERTOKEN)

     def _real_extract(self, url):
         # starting download using infojson from this extractor is undefined behavior,
@@ -416,10 +372,20 @@ class AbemaTVIE(AbemaTVBaseIE):
                 f'https://api.abema.io/v1/video/programs/{video_id}', video_id,
                 note='Checking playability', headers=headers)
-            ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'), default=[])
+            ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType'))
             if 3 not in ondemand_types:
                 # cannot acquire decryption key for these streams
                 self.report_warning('This is a premium-only stream')
+            info.update(traverse_obj(api_response, {
+                'series': ('series', 'title'),
+                'season': ('season', 'title'),
+                'season_number': ('season', 'sequence'),
+                'episode_number': ('episode', 'number'),
+            }))
+            if not title:
+                title = traverse_obj(api_response, ('episode', 'title'))
+            if not description:
+                description = traverse_obj(api_response, ('episode', 'content'))

             m3u8_url = f'https://vod-abematv.akamaized.net/program/{video_id}/playlist.m3u8'
         elif video_type == 'slots':
@@ -489,7 +455,7 @@ class AbemaTVTitleIE(AbemaTVBaseIE):
             })
         yield from (
             self.url_result(f'https://abema.tv/video/episode/{x}')
-            for x in traverse_obj(programs, ('programs', ..., 'id'), default=[]))
+            for x in traverse_obj(programs, ('programs', ..., 'id')))

     def _entries(self, playlist_id, series_version):
         return OnDemandPagedList(
diff --git a/hypervideo_dl/extractor/acast.py b/hypervideo_dl/extractor/acast.py
index f2f828f..427d04c 100644
--- a/hypervideo_dl/extractor/acast.py
+++ b/hypervideo_dl/extractor/acast.py
@@ -40,28 +40,33 @@ class ACastBaseIE(InfoExtractor):

 class ACastIE(ACastBaseIE):
     IE_NAME = 'acast'
-    _VALID_URL = r'''(?x)
+    _VALID_URL = r'''(?x:
                     https?://
                         (?:
                             (?:(?:embed|www)\.)?acast\.com/|
                             play\.acast\.com/s/
                         )
-                    (?P<channel>[^/]+)/(?P<id>[^/#?]+)
-                    '''
+                    (?P<channel>[^/]+)/(?P<id>[^/#?"]+)
+                    )'''
+    _EMBED_REGEX = [rf'(?x)<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})']
     _TESTS = [{
         'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna',
-        'md5': 'f5598f3ad1e4776fed12ec1407153e4b',
         'info_dict': {
             'id': '2a92b283-1a75-4ad8-8396-499c641de0d9',
             'ext': 'mp3',
             'title': '2. Raggarmordet - Röster ur det förflutna',
-            'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67',
+            'description': 'md5:013959207e05011ad14a222cf22278cc',
             'timestamp': 1477346700,
             'upload_date': '20161024',
             'duration': 2766,
-            'creator': 'Anton Berg & Martin Johnson',
+            'creator': 'Third Ear Studio',
             'series': 'Spår',
             'episode': '2. Raggarmordet - Röster ur det förflutna',
+            'thumbnail': 'https://assets.pippa.io/shows/616ebe1886d7b1398620b943/616ebe33c7e6e70013cae7da.jpg',
+            'episode_number': 2,
+            'display_id': '2.raggarmordet-rosterurdetforflutna',
+            'season_number': 4,
+            'season': 'Season 4',
         }
     }, {
         'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015',
@@ -73,6 +78,23 @@ class ACastIE(ACastBaseIE):
         'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9',
         'only_matching': True,
     }]
+    _WEBPAGE_TESTS = [{
+        'url': 'https://ausi.anu.edu.au/news/democracy-sausage-episode-can-labor-be-long-form-government',
+        'info_dict': {
+            'id': '646c68fb21fbf20011e9c651',
+            'ext': 'mp3',
+            'creator': 'The Australian National University',
+            'display_id': 'can-labor-be-a-long-form-government',
+            'duration': 2618,
+            'thumbnail': 'https://assets.pippa.io/shows/6113e8578b4903809f16f7e5/1684821529295-515b9520db9ce53275b995eb302f941c.jpeg',
+            'title': 'Can Labor be a long-form government?',
+            'episode': 'Can Labor be a long-form government?',
+            'upload_date': '20230523',
+            'series': 'Democracy Sausage with Mark Kenny',
+            'timestamp': 1684826362,
+            'description': 'md5:feabe1fc5004c78ee59c84a46bf4ba16',
+        }
+    }]

     def _real_extract(self, url):
         channel, display_id = self._match_valid_url(url).groups()
diff --git a/hypervideo_dl/extractor/adn.py b/hypervideo_dl/extractor/adn.py
index e0c18c8..b59dbc8 100644
--- a/hypervideo_dl/extractor/adn.py
+++ b/hypervideo_dl/extractor/adn.py
@@ -6,10 +6,8 @@ import random

 from .common import InfoExtractor
 from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7
-from ..compat import (
-    compat_HTTPError,
-    compat_b64decode,
-)
+from ..compat import compat_b64decode
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ass_subtitles_timecode,
     bytes_to_intlist,
@@ -142,9 +140,9 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
             self._HEADERS = {'authorization': 'Bearer ' + access_token}
         except ExtractorError as e:
             message = None
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
                 resp = self._parse_json(
-                    e.cause.read().decode(), None, fatal=False) or {}
+                    e.cause.response.read().decode(), None, fatal=False) or {}
                 message = resp.get('message') or resp.get('code')
             self.report_warning(message or self._LOGIN_ERR_MESSAGE)

@@ -168,7 +166,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
             }, data=b'')['token']

         links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link')
-        self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
+        self._K = ''.join(random.choices('0123456789abcdef', k=16))
         message = bytes_to_intlist(json.dumps({
             'k': self._K,
             't': token,
@@ -195,14 +193,14 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
                 })
                 break
             except ExtractorError as e:
-                if not isinstance(e.cause, compat_HTTPError):
+                if not isinstance(e.cause, HTTPError):
                     raise e
-                if e.cause.code == 401:
+                if e.cause.status == 401:
                     # This usually goes away with a different random pkcs1pad, so retry
                     continue
-                error = self._parse_json(e.cause.read(), video_id)
+                error = self._parse_json(e.cause.response.read(), video_id)
                 message = error.get('message')
                 if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country':
                     self.raise_geo_restricted(msg=message)
diff --git a/hypervideo_dl/extractor/adobepass.py b/hypervideo_dl/extractor/adobepass.py
index e5944f7..5eed0ca 100644
--- a/hypervideo_dl/extractor/adobepass.py
+++ b/hypervideo_dl/extractor/adobepass.py
@@ -2,11 +2,11 @@ import getpass
 import json
 import re
 import time
-import urllib.error
 import xml.etree.ElementTree as etree

 from .common import InfoExtractor
 from ..compat import compat_urlparse
+from ..networking.exceptions import HTTPError
 from ..utils import (
     NO_DEFAULT,
     ExtractorError,
@@ -1394,7 +1394,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
             form_page, urlh = form_page_res
             post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url')
             if not re.match(r'https?://', post_url):
-                post_url = compat_urlparse.urljoin(urlh.geturl(), post_url)
+                post_url = compat_urlparse.urljoin(urlh.url, post_url)
             form_data = self._hidden_inputs(form_page)
             form_data.update(data)
             return self._download_webpage_handle(
@@ -1473,7 +1473,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
                 elif 'automatically signed in with' in provider_redirect_page:
                     # Seems like comcast is rolling up new way of automatically signing customers
                     oauth_redirect_url = self._html_search_regex(
-                        r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page,
+                        r'continue:\s*"(https://oauth\.xfinity\.com/oauth/authorize\?.+)"', provider_redirect_page,
                         'oauth redirect (signed)')
                     # Just need to process the request. No useful data comes back
                     self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login')
@@ -1573,7 +1573,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
                     }), headers={
                         'Content-Type': 'application/x-www-form-urlencoded'
                     })
-            elif mso_id == 'Spectrum':
+            elif mso_id in ('Spectrum', 'Charter_Direct'):
                 # Spectrum's login for is dynamically loaded via JS so we need to hardcode the flow
                 # as a one-off implementation.
                 provider_redirect_page, urlh = provider_redirect_page_res
@@ -1619,7 +1619,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
                     hidden_data['history'] = 1

                 provider_login_page_res = self._download_webpage_handle(
-                    urlh.geturl(), video_id, 'Sending first bookend',
+                    urlh.url, video_id, 'Sending first bookend',
                     query=hidden_data)

                 provider_association_redirect, urlh = post_form(
@@ -1629,7 +1629,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
                     })

                 provider_refresh_redirect_url = extract_redirect_url(
-                    provider_association_redirect, url=urlh.geturl())
+                    provider_association_redirect, url=urlh.url)

                 last_bookend_page, urlh = self._download_webpage_handle(
                     provider_refresh_redirect_url, video_id,
@@ -1638,7 +1638,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
                 hidden_data['history'] = 3

                 mvpd_confirm_page_res = self._download_webpage_handle(
-                    urlh.geturl(), video_id, 'Sending final bookend',
+                    urlh.url, video_id, 'Sending final bookend',
                     query=hidden_data)

                 post_form(mvpd_confirm_page_res, 'Confirming Login')
@@ -1652,7 +1652,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
                 hidden_data['history_val'] = 1

                 provider_login_redirect_page_res = self._download_webpage_handle(
-                    urlh.geturl(), video_id, 'Sending First Bookend',
+                    urlh.url, video_id, 'Sending First Bookend',
                     query=hidden_data)

                 provider_login_redirect_page, urlh = provider_login_redirect_page_res
@@ -1680,7 +1680,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
                     })

                 provider_refresh_redirect_url = extract_redirect_url(
-                    provider_association_redirect, url=urlh.geturl())
+                    provider_association_redirect, url=urlh.url)

                 last_bookend_page, urlh = self._download_webpage_handle(
                     provider_refresh_redirect_url, video_id,
@@ -1690,7 +1690,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
                 hidden_data['history_val'] = 3

                 mvpd_confirm_page_res = self._download_webpage_handle(
-                    urlh.geturl(), video_id, 'Sending Final Bookend',
+                    urlh.url, video_id, 'Sending Final Bookend',
                     query=hidden_data)

                 post_form(mvpd_confirm_page_res, 'Confirming Login')
@@ -1699,7 +1699,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
                 # based redirect that should be followed.
                 provider_redirect_page, urlh = provider_redirect_page_res
                 provider_refresh_redirect_url = extract_redirect_url(
-                    provider_redirect_page, url=urlh.geturl())
+                    provider_redirect_page, url=urlh.url)
                 if provider_refresh_redirect_url:
                     provider_redirect_page_res = self._download_webpage_handle(
                         provider_refresh_redirect_url, video_id,
@@ -1724,7 +1724,7 @@ class AdobePassIE(InfoExtractor):  # XXX: Conventionally, base classes should en
                         'requestor_id': requestor_id,
                     }), headers=mvpd_headers)
             except ExtractorError as e:
-                if not mso_id and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401:
+                if not mso_id and isinstance(e.cause, HTTPError) and e.cause.status == 401:
                     raise_mvpd_required()
                 raise
             if '<pendingLogout' in session:
diff --git a/hypervideo_dl/extractor/adultswim.py b/hypervideo_dl/extractor/adultswim.py
index bd29eb4..daaedde 100644
--- a/hypervideo_dl/extractor/adultswim.py
+++ b/hypervideo_dl/extractor/adultswim.py
@@ -170,8 +170,10 @@ class AdultSwimIE(TurnerBaseIE):
                         continue
                     ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type')))
                     if ext == 'm3u8':
-                        info['formats'].extend(self._extract_m3u8_formats(
-                            asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                        fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                            asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+                        info['formats'].extend(fmts)
+                        self._merge_subtitles(subs, target=info['subtitles'])
                     elif ext == 'f4m':
                         continue
                         # info['formats'].extend(self._extract_f4m_formats(
diff --git a/hypervideo_dl/extractor/aenetworks.py b/hypervideo_dl/extractor/aenetworks.py
index d7c4010..f049a0f 100644
--- a/hypervideo_dl/extractor/aenetworks.py
+++ b/hypervideo_dl/extractor/aenetworks.py
@@ -3,6 +3,8 @@ from ..utils import (
     ExtractorError,
     GeoRestrictedError,
     int_or_none,
+    remove_start,
+    traverse_obj,
     update_url_query,
     urlencode_postdata,
 )
@@ -72,7 +74,14 @@ class AENetworksBaseIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
         requestor_id, brand = self._DOMAIN_MAP[domain]
         result = self._download_json(
             'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand,
-            filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0]
+            filter_value, query={'filter[%s]' % filter_key: filter_value})
+        result = traverse_obj(
+            result, ('results',
+                     lambda k, v: k == 0 and v[filter_key] == filter_value),
+            get_all=False)
+        if not result:
+            raise ExtractorError('Show not found in A&E feed (too new?)', expected=True,
+                                 video_id=remove_start(filter_value, '/'))
         title = result['title']
         video_id = result['id']
         media_url = result['publicUrl']
@@ -123,7 +132,7 @@ class AENetworksIE(AENetworksBaseIE):
             'skip_download': True,
         },
         'add_ie': ['ThePlatform'],
-        'skip': 'This video is only available for users of participating TV providers.',
+        'skip': 'Geo-restricted - This content is not available in your location.'
     }, {
         'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
         'info_dict': {
@@ -140,6 +149,7 @@ class AENetworksIE(AENetworksBaseIE):
             'skip_download': True,
         },
         'add_ie': ['ThePlatform'],
+        'skip': 'This video is only available for users of participating TV providers.',
     }, {
         'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
         'only_matching': True
@@ -303,6 +313,7 @@ class HistoryTopicIE(AENetworksBaseIE):
 class HistoryPlayerIE(AENetworksBaseIE):
     IE_NAME = 'history:player'
     _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)'
+    _TESTS = []

     def _real_extract(self, url):
         domain, video_id = self._match_valid_url(url).groups()
diff --git a/hypervideo_dl/extractor/aeonco.py b/hypervideo_dl/extractor/aeonco.py
index 4655862..390eae3 100644
--- a/hypervideo_dl/extractor/aeonco.py
+++ b/hypervideo_dl/extractor/aeonco.py
@@ -1,5 +1,6 @@
 from .common import InfoExtractor
 from .vimeo import VimeoIE
+from ..utils import ExtractorError, traverse_obj, url_or_none


 class AeonCoIE(InfoExtractor):
@@ -19,22 +20,55 @@ class AeonCoIE(InfoExtractor):
         }
     }, {
         'url': 'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it',
-        'md5': '4e5f3dad9dbda0dbfa2da41a851e631e',
+        'md5': '03582d795382e49f2fd0b427b55de409',
         'info_dict': {
-            'id': '728595228',
+            'id': '759576926',
             'ext': 'mp4',
             'title': 'Wrought',
-            'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280',
-            'uploader': 'Biofilm Productions',
-            'uploader_id': 'user140352216',
-            'uploader_url': 'https://vimeo.com/user140352216',
+            'thumbnail': 'https://i.vimeocdn.com/video/1525599692-84614af88e446612f49ca966cf8f80eab2c73376bedd80555741c521c26f9a3e-d_1280',
+            'uploader': 'Aeon Video',
+            'uploader_id': 'aeonvideo',
+            'uploader_url': 'https://vimeo.com/aeonvideo',
             'duration': 1344
         }
+    }, {
+        'url': 'https://aeon.co/videos/chew-over-the-prisoners-dilemma-and-see-if-you-can-find-the-rational-path-out',
+        'md5': '1cfda0bf3ae24df17d00f2c0cb6cc21b',
+        'info_dict': {
+            'id': 'emyi4z-O0ls',
+            'ext': 'mp4',
+            'title': 'How to outsmart the Prisoner’s Dilemma - Lucas Husted',
+            'thumbnail': 'https://i.ytimg.com/vi_webp/emyi4z-O0ls/maxresdefault.webp',
+            'uploader': 'TED-Ed',
+            'uploader_id': '@TEDEd',
+            'uploader_url': 'https://www.youtube.com/@TEDEd',
+            'duration': 344,
+            'upload_date': '20200827',
+            'channel_id': 'UCsooa4yRKGN_zEE8iknghZA',
+            'playable_in_embed': True,
+            'description': 'md5:c0959524f08cb60f96fd010f3dfb17f3',
+            'categories': ['Education'],
+            'like_count': int,
+            'channel': 'TED-Ed',
+            'chapters': 'count:7',
+            'channel_url': 'https://www.youtube.com/channel/UCsooa4yRKGN_zEE8iknghZA',
+            'tags': 'count:26',
+            'availability': 'public',
+            'channel_follower_count': int,
+            'view_count': int,
+            'age_limit': 0,
+            'live_status': 'not_live',
+            'comment_count': int,
+        },
     }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        vimeo_id = self._search_regex(r'hosterId":\s*"(?P<id>[0-9]+)', webpage, 'vimeo id')
-        vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co')
-        return self.url_result(vimeo_url, VimeoIE)
+        embed_url = traverse_obj(self._yield_json_ld(webpage, video_id), (
+            lambda _, v: v['@type'] == 'VideoObject', 'embedUrl', {url_or_none}), get_all=False)
+        if not embed_url:
+            raise ExtractorError('No embed URL found in webpage')
+        if 'player.vimeo.com' in embed_url:
+            embed_url = VimeoIE._smuggle_referrer(embed_url, 'https://aeon.co/')
+        return self.url_result(embed_url)
diff --git a/hypervideo_dl/extractor/afreecatv.py b/hypervideo_dl/extractor/afreecatv.py
index 9276fe7..3d26d9c 100644
--- a/hypervideo_dl/extractor/afreecatv.py
+++ b/hypervideo_dl/extractor/afreecatv.py
@@ -77,59 +77,6 @@ class AfreecaTVIE(InfoExtractor):
         }],
         'skip': 'Video is gone',
     }, {
-        'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793',
-        'info_dict': {
-            'id': '18650793',
-            'ext': 'mp4',
-            'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'uploader': '윈아디',
-            'uploader_id': 'badkids',
-            'duration': 107,
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652',
-        'info_dict': {
-            'id': '10481652',
-            'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
-            'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
-            'uploader': 'dailyapril',
-            'uploader_id': 'dailyapril',
-            'duration': 6492,
-        },
-        'playlist_count': 2,
-        'playlist': [{
-            'md5': 'd8b7c174568da61d774ef0203159bf97',
-            'info_dict': {
-                'id': '20160502_c4c62b9d_174361386_1',
-                'ext': 'mp4',
-                'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)",
-                'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
-                'uploader': 'dailyapril',
-                'uploader_id': 'dailyapril',
-                'upload_date': '20160502',
-                'duration': 3601,
-            },
-        }, {
-            'md5': '58f2ce7f6044e34439ab2d50612ab02b',
-            'info_dict': {
-                'id': '20160502_39e739bb_174361386_2',
-                'ext': 'mp4',
-                'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 2)",
-                'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
-                'uploader': 'dailyapril',
-                'uploader_id': 'dailyapril',
-                'upload_date': '20160502',
-                'duration': 2891,
-            },
-        }],
-        'params': {
-            'skip_download': True,
-        },
-    }, {
         # non standard key
         'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605',
         'info_dict': {
@@ -146,8 +93,8 @@ class AfreecaTVIE(InfoExtractor):
             'skip_download': True,
         },
     }, {
-        # PARTIAL_ADULT
-        'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439',
+        # adult content
+        'url': 'https://vod.afreecatv.com/player/97267690',
         'info_dict': {
             'id': '20180327_27901457_202289533_1',
             'ext': 'mp4',
@@ -161,16 +108,25 @@ class AfreecaTVIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
-        'expected_warnings': ['adult content'],
+        'skip': 'The VOD does not exist',
     }, {
         'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
         'only_matching': True,
     }, {
-        'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030',
-        'only_matching': True,
-    }, {
-        'url': 'http://vod.afreecatv.com/player/15055030',
-        'only_matching': True,
+        'url': 'https://vod.afreecatv.com/player/96753363',
+        'info_dict': {
+            'id': '20230108_9FF5BEE1_244432674_1',
+            'ext': 'mp4',
+            'uploader_id': 'rlantnghks',
+            'uploader': '페이즈으',
+            'duration': 10840,
+            'thumbnail': 'http://videoimg.afreecatv.com/php/SnapshotLoad.php?rowKey=20230108_9FF5BEE1_244432674_1_r',
+            'upload_date': '20230108',
+            'title': '젠지 페이즈',
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]

     @staticmethod
@@ -223,26 +179,21 @@ class AfreecaTVIE(InfoExtractor):

     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        if re.search(r'alert\(["\']This video has been deleted', webpage):
-            raise ExtractorError(
-                'Video %s has been deleted' % video_id, expected=True)
-
-        station_id = self._search_regex(
-            r'nStationNo\s*=\s*(\d+)', webpage, 'station')
-        bbs_id = self._search_regex(
-            r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs')
-        video_id = self._search_regex(
-            r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id)
-
         partial_view = False
         adult_view = False
         for _ in range(2):
+            data = self._download_json(
+                'https://api.m.afreecatv.com/station/video/a/view',
+                video_id, headers={'Referer': url}, data=urlencode_postdata({
+                    'nTitleNo': video_id,
+                    'nApiLevel': 10,
+                }))['data']
+            if traverse_obj(data, ('code', {int})) == -6221:
+                raise ExtractorError('The VOD does not exist', expected=True)
             query = {
                 'nTitleNo': video_id,
-                'nStationNo': station_id,
-                'nBbsNo': bbs_id,
+                'nStationNo': data['station_no'],
+                'nBbsNo': data['bbs_no'],
             }
             if partial_view:
                 query['partialView'] = 'SKIP_ADULT'
diff --git a/hypervideo_dl/extractor/airtv.py b/hypervideo_dl/extractor/airtv.py
new file mode 100644
index 0000000..0b73a96
--- /dev/null
+++ b/hypervideo_dl/extractor/airtv.py
@@ -0,0 +1,96 @@
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    mimetype2ext,
+    parse_iso8601,
+    traverse_obj
+)
+
+
+class AirTVIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.air\.tv/watch\?v=(?P<id>\w+)'
+    _TESTS = [{
+        # without youtube_id
+        'url': 'https://www.air.tv/watch?v=W87jcWleSn2hXZN47zJZsQ',
+        'info_dict': {
+            'id': 'W87jcWleSn2hXZN47zJZsQ',
+            'ext': 'mp4',
+            'release_date': '20221003',
+            'release_timestamp': 1664792603,
+            'channel_id': 'vgfManQlRQKgoFQ8i8peFQ',
+            'title': 'md5:c12d49ed367c3dadaa67659aff43494c',
+            'upload_date': '20221003',
+            'duration': 151,
+            'view_count': int,
+            'thumbnail': 'https://cdn-sp-gcs.air.tv/videos/W/8/W87jcWleSn2hXZN47zJZsQ/b13fc56464f47d9d62a36d110b9b5a72-4096x2160_9.jpg',
+            'timestamp': 1664792603,
+        }
+    }, {
+        # with youtube_id
+        'url': 'https://www.air.tv/watch?v=sv57EC8tRXG6h8dNXFUU1Q',
+        'info_dict': {
+            'id': '2ZTqmpee-bQ',
+            'ext': 'mp4',
+            'comment_count': int,
+            'tags': 'count:11',
+            'channel_follower_count': int,
+            'like_count': int,
+            'uploader': 'Newsflare',
+            'thumbnail': 'https://i.ytimg.com/vi_webp/2ZTqmpee-bQ/maxresdefault.webp',
+            'availability': 'public',
+            'title': 'Geese Chase Alligator Across Golf Course',
+            'uploader_id': 'NewsflareBreaking',
+            'channel_url': 'https://www.youtube.com/channel/UCzSSoloGEz10HALUAbYhngQ',
+            'description': 'md5:99b21d9cea59330149efbd9706e208f5',
+            'age_limit': 0,
+            'channel_id': 'UCzSSoloGEz10HALUAbYhngQ',
+            'uploader_url': 'http://www.youtube.com/user/NewsflareBreaking',
+            'view_count': int,
+            'categories': ['News & Politics'],
+            'live_status': 'not_live',
+            'playable_in_embed': True,
+            'channel': 'Newsflare',
+            'duration': 37,
+            'upload_date': '20180511',
+        }
+    }]
+
+    def _get_formats_and_subtitle(self, json_data, video_id):
+        formats, subtitles = [], {}
+        for source in traverse_obj(json_data, 'sources', 'sources_desktop', ...):
+            ext = determine_ext(source.get('src'), mimetype2ext(source.get('type')))
+            if ext == 'm3u8':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(source.get('src'), video_id)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
+            else:
+                formats.append({'url': source.get('src'), 'ext': ext})
+        return formats, subtitles
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['initialState']['videos'][display_id]
+        if nextjs_json.get('youtube_id'):
+            return self.url_result(
+                f'https://www.youtube.com/watch?v={nextjs_json.get("youtube_id")}', YoutubeIE)
+
+        formats, subtitles = self._get_formats_and_subtitle(nextjs_json, display_id)
+        return {
+            'id': display_id,
+            'title': nextjs_json.get('title') or self._html_search_meta('og:title', webpage),
+            'formats': formats,
+            'subtitles': subtitles,
+            'description': nextjs_json.get('description') or None,
+            'duration': int_or_none(nextjs_json.get('duration')),
+            'thumbnails': [
+                {'url': thumbnail}
+                for thumbnail in traverse_obj(nextjs_json, ('default_thumbnails', ...))],
+            'channel_id': traverse_obj(nextjs_json, 'channel', 'channel_slug'),
+            'timestamp': parse_iso8601(nextjs_json.get('created')),
+            'release_timestamp': parse_iso8601(nextjs_json.get('published')),
+            'view_count': int_or_none(nextjs_json.get('views')),
+        }
diff --git a/hypervideo_dl/extractor/aitube.py b/hypervideo_dl/extractor/aitube.py
new file mode 100644
index 0000000..89a6450
--- /dev/null
+++ b/hypervideo_dl/extractor/aitube.py
@@ -0,0 +1,60 @@
+from .common import InfoExtractor
+from ..utils import int_or_none, merge_dicts
+
+
+class AitubeKZVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://aitube\.kz/(?:video|embed/)\?(?:[^\?]+)?id=(?P<id>[\w-]+)'
+    _TESTS = [{
+        # id paramater as first parameter
+        'url': 'https://aitube.kz/video?id=9291d29b-c038-49a1-ad42-3da2051d353c&playlistId=d55b1f5f-ef2a-4f23-b646-2a86275b86b7&season=1',
+        'info_dict': {
+            'id': '9291d29b-c038-49a1-ad42-3da2051d353c',
+            'ext': 'mp4',
+            'duration': 2174.0,
+            'channel_id': '94962f73-013b-432c-8853-1bd78ca860fe',
+            'like_count': int,
+            'channel': 'ASTANA TV',
+            'comment_count': int,
+            'view_count': int,
+            'description': 'Смотреть любимые сериалы и видео, поделиться видео и сериалами с друзьями и близкими',
+            'thumbnail': 'https://cdn.static02.aitube.kz/kz.aitudala.aitube.staticaccess/files/ddf2a2ff-bee3-409b-b5f2-2a8202bba75b',
+            'upload_date': '20221102',
+            'timestamp': 1667370519,
+            'title': 'Ангел хранитель 1 серия',
+            'channel_follower_count': int,
+        }
+    }, {
+        # embed url
+        'url': 'https://aitube.kz/embed/?id=9291d29b-c038-49a1-ad42-3da2051d353c',
+        'only_matching': True,
+    }, {
+        # id parameter is not as first paramater
+        'url': 'https://aitube.kz/video?season=1&id=9291d29b-c038-49a1-ad42-3da2051d353c&playlistId=d55b1f5f-ef2a-4f23-b646-2a86275b86b7',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        nextjs_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['videoInfo']
+        json_ld_data = self._search_json_ld(webpage, video_id)
+
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            f'https://api-http.aitube.kz/kz.aitudala.aitube.staticaccess/video/{video_id}/video', video_id)
+
+        return merge_dicts({
+            'id': video_id,
+            'title': nextjs_data.get('title') or self._html_search_meta(['name', 'og:title'], webpage),
+            'description': nextjs_data.get('description'),
+            'formats': formats,
+            'subtitles': subtitles,
+            'view_count': (nextjs_data.get('viewCount')
+                           or int_or_none(self._html_search_meta('ya:ovs:views_total', webpage))),
+            'like_count': nextjs_data.get('likeCount'),
+            'channel': nextjs_data.get('channelTitle'),
+            'channel_id': nextjs_data.get('channelId'),
+            'thumbnail': nextjs_data.get('coverUrl'),
+            'comment_count': nextjs_data.get('commentCount'),
+            'channel_follower_count': int_or_none(nextjs_data.get('channelSubscriberCount')),
+        }, json_ld_data)
diff --git a/hypervideo_dl/extractor/amazon.py b/hypervideo_dl/extractor/amazon.py
diff --git a/hypervideo_dl/extractor/amazon.py b/hypervideo_dl/extractor/amazon.py
index 4d31706..a03f983 100644
--- a/hypervideo_dl/extractor/amazon.py
+++ b/hypervideo_dl/extractor/amazon.py
@@ -1,5 +1,17 @@
+import re
+
 from .common import InfoExtractor
-from ..utils import ExtractorError, int_or_none
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    float_or_none,
+    get_element_by_attribute,
+    get_element_by_class,
+    int_or_none,
+    js_to_json,
+    traverse_obj,
+    url_or_none,
+)
 
 
 class AmazonStoreIE(InfoExtractor):
@@ -9,7 +21,7 @@ class AmazonStoreIE(InfoExtractor):
         'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
         'info_dict': {
             'id': 'B098XNCHLD',
-            'title': 'md5:dae240564cbb2642170c02f7f0d7e472',
+            'title': str,
         },
         'playlist_mincount': 1,
         'playlist': [{
@@ -20,28 +32,32 @@ class AmazonStoreIE(InfoExtractor):
                 'thumbnail': r're:^https?://.*\.jpg$',
                 'duration': 34,
             },
-        }]
+        }],
+        'expected_warnings': ['Unable to extract data'],
     }, {
         'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
         'info_dict': {
             'id': 'B0863TXGM3',
-            'title': 'md5:d1d3352428f8f015706c84b31e132169',
+            'title': str,
         },
         'playlist_mincount': 4,
+        'expected_warnings': ['Unable to extract data'],
     }, {
         'url': 'https://www.amazon.com/dp/B0845NXCXF/',
         'info_dict': {
             'id': 'B0845NXCXF',
-            'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e',
+            'title': str,
         },
         'playlist_mincount': 1,
+        'expected_warnings': ['Unable to extract data'],
     }, {
         'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ',
         'info_dict': {
             'id': 'B08WX337PQ',
-            'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e',
+            'title': str,
         },
         'playlist_mincount': 1,
+        'expected_warnings': ['Unable to extract data'],
     }]
 
     def _real_extract(self, url):
@@ -52,7 +68,7 @@ class AmazonStoreIE(InfoExtractor):
             try:
                 data_json = self._search_json(
                     r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id,
-                    transform_source=lambda x: x.replace(R'\\u', R'\u'))
+                    transform_source=js_to_json)
             except ExtractorError as e:
                 retry.error = e
 
@@ -66,3 +82,89 @@ class AmazonStoreIE(InfoExtractor):
             'width': int_or_none(video.get('videoWidth')),
         } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
         return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title'))
+
+
+class AmazonReviewsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P<id>[^/&#$?]+)'
+    _TESTS = [{
+        'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
+        'info_dict': {
+            'id': 'R10VE9VUSY19L3',
+            'ext': 'mp4',
+            'title': 'Get squad #Suspicious',
+            'description': 'md5:7012695052f440a1e064e402d87e0afb',
+            'uploader': 'Kimberly Cronkright',
+            'average_rating': 1.0,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'expected_warnings': ['Review body was not found in webpage'],
+    }, {
+        'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
+        'info_dict': {
+            'id': 'R10VE9VUSY19L3',
+            'ext': 'mp4',
+            'title': 'Get squad #Suspicious',
+            'description': 'md5:7012695052f440a1e064e402d87e0afb',
+            'uploader': 'Kimberly Cronkright',
+            'average_rating': 1.0,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'expected_warnings': ['Review body was not found in webpage'],
+    }, {
+        'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
+        'info_dict': {
+            'id': 'RV1CO8JN5VGXV',
+            'ext': 'mp4',
+            'title': 'Not sure about its durability',
+            'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
+            'uploader': 'Shoaib Gulzar',
+            'average_rating': 2.0,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'expected_warnings': ['Review body was not found in webpage'],
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        for retry in self.RetryManager():
+            webpage = self._download_webpage(url, video_id)
+            review_body = get_element_by_attribute('data-hook', 'review-body', webpage)
+            if not review_body:
+                retry.error = ExtractorError('Review body was not found in webpage', expected=True)
+
+        formats, subtitles = [], {}
+
+        manifest_url = self._search_regex(
+            r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None)
+        if url_or_none(manifest_url):
+            fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
+                manifest_url, video_id, 'mp4', fatal=False)
+            formats.extend(fmts)
+
+        video_url = self._search_regex(
+            r'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None)
+        if url_or_none(video_url):
+            formats.append({
+                'url': video_url,
+                'ext': 'mp4',
+                'format_id': 'http-mp4',
+            })
+
+        if not formats:
+            self.raise_no_formats('No video found for this customer review', expected=True)
+
+        return {
+            'id': video_id,
+            'title': (clean_html(get_element_by_attribute('data-hook', 'review-title', webpage))
+                      or self._html_extract_title(webpage)),
+            'description': clean_html(traverse_obj(re.findall(
+                r'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', review_body), -1)),
+            'uploader': clean_html(get_element_by_class('a-profile-name', webpage)),
+            'average_rating': float_or_none(clean_html(get_element_by_attribute(
+                'data-hook', 'review-star-rating', webpage) or '').partition(' ')[0]),
+            'thumbnail': self._search_regex(
+                r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/hypervideo_dl/extractor/amazonminitv.py b/hypervideo_dl/extractor/amazonminitv.py
index 7309968..b57d985 100644
--- a/hypervideo_dl/extractor/amazonminitv.py
+++ b/hypervideo_dl/extractor/amazonminitv.py
@@ -191,7 +191,7 @@ query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!,
 class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE):
     IE_NAME = 'amazonminitv:season'
     _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)'
-    IE_DESC = 'Amazon MiniTV Series, "minitv:season:" prefix'
+    IE_DESC = 'Amazon MiniTV Season, "minitv:season:" prefix'
     _TESTS = [{
         'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0',
         'playlist_mincount': 6,
@@ -250,6 +250,7 @@ query getEpisodes($sessionIdToken: String!, $clientId: String, $episodeOrSeasonI
 class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE):
     IE_NAME = 'amazonminitv:series'
     _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)'
+    IE_DESC = 'Amazon MiniTV Series, "minitv:series:" prefix'
     _TESTS = [{
         'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0',
         'playlist_mincount': 3,
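The new AmazonReviewsIE (like AmazonStoreIE before it) leans on yt-dlp's RetryManager: setting retry.error inside the loop marks the attempt failed and triggers another iteration until the retry budget is exhausted. A stripped-down sketch of that contract (hypothetical reimplementation, not the real class, which also sleeps and reports between attempts):

    class RetryManager:
        def __init__(self, attempts=3):
            self.attempts, self.error = attempts, None

        def __iter__(self):
            for _ in range(self.attempts):
                self.error = None
                yield self
                if self.error is None:  # attempt succeeded, stop retrying
                    return
            raise self.error  # out of attempts: surface the last error

    # usage mirrors the extractor above:
    # for retry in RetryManager():
    #     page = fetch(url)
    #     if 'review-body' not in page:
    #         retry.error = RuntimeError('Review body was not found in webpage')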
diff --git a/hypervideo_dl/extractor/americastestkitchen.py b/hypervideo_dl/extractor/americastestkitchen.py
index abda55d..e889458 100644
--- a/hypervideo_dl/extractor/americastestkitchen.py
+++ b/hypervideo_dl/extractor/americastestkitchen.py
@@ -11,7 +11,7 @@ from ..utils import (
 
 
 class AmericasTestKitchenIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers',
         'md5': 'b861c3e365ac38ad319cfd509c30577f',
@@ -72,6 +72,12 @@ class AmericasTestKitchenIE(InfoExtractor):
     }, {
         'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington',
         'only_matching': True,
+    }, {
+        'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -100,7 +106,7 @@ class AmericasTestKitchenIE(InfoExtractor):
 
 
 class AmericasTestKitchenSeasonIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com(?P<show>/cookscountry)?/episodes/browse/season_(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|(?P<cooks>cooks(?:country|illustrated)))\.com(?:(?:/(?P<show2>cooks(?:country|illustrated)))?(?:/?$|(?<!ated)(?<!ated\.com)/episodes/browse/season_(?P<season>\d+)))'
     _TESTS = [{
         # ATK Season
         'url': 'https://www.americastestkitchen.com/episodes/browse/season_1',
@@ -117,29 +123,73 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
             'title': 'Season 12',
         },
         'playlist_count': 13,
+    }, {
+        # America's Test Kitchen Series
+        'url': 'https://www.americastestkitchen.com/',
+        'info_dict': {
+            'id': 'americastestkitchen',
+            'title': 'America\'s Test Kitchen',
+        },
+        'playlist_count': 558,
+    }, {
+        # Cooks Country Series
+        'url': 'https://www.americastestkitchen.com/cookscountry',
+        'info_dict': {
+            'id': 'cookscountry',
+            'title': 'Cook\'s Country',
+        },
+        'playlist_count': 199,
+    }, {
+        'url': 'https://www.americastestkitchen.com/cookscountry/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.cookscountry.com/episodes/browse/season_12',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.cookscountry.com',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.americastestkitchen.com/cooksillustrated/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.cooksillustrated.com',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        show_path, season_number = self._match_valid_url(url).group('show', 'id')
-        season_number = int(season_number)
+        season_number, show1, show = self._match_valid_url(url).group('season', 'show', 'show2')
+        show_path = ('/' + show) if show else ''
+        show = show or show1
+        season_number = int_or_none(season_number)
+
+        slug, title = {
+            'americastestkitchen': ('atk', 'America\'s Test Kitchen'),
+            'cookscountry': ('cco', 'Cook\'s Country'),
+            'cooksillustrated': ('cio', 'Cook\'s Illustrated'),
+        }[show]
 
-        slug = 'cco' if show_path == '/cookscountry' else 'atk'
+        facet_filters = [
+            'search_document_klass:episode',
+            'search_show_slug:' + slug,
+        ]
 
-        season = 'Season %d' % season_number
+        if season_number:
+            playlist_id = 'season_%d' % season_number
+            playlist_title = 'Season %d' % season_number
+            facet_filters.append('search_season_list:' + playlist_title)
+        else:
+            playlist_id = show
+            playlist_title = title
 
         season_search = self._download_json(
             'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug,
-            season, headers={
+            playlist_id, headers={
                 'Origin': 'https://www.americastestkitchen.com',
                 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805',
                 'X-Algolia-Application-Id': 'Y1FNZXUI30',
             }, query={
-                'facetFilters': json.dumps([
-                    'search_season_list:' + season,
-                    'search_document_klass:episode',
-                    'search_show_slug:' + slug,
-                ]),
-                'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug,
+                'facetFilters': json.dumps(facet_filters),
+                'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug,
                 'attributesToHighlight': '',
                 'hitsPerPage': 1000,
             })
@@ -162,4 +212,4 @@ class AmericasTestKitchenSeasonIE(InfoExtractor):
         }
 
         return self.playlist_result(
-            entries(), 'season_%d' % season_number, season)
+            entries(), playlist_id, playlist_title)
diff --git a/hypervideo_dl/extractor/amp.py b/hypervideo_dl/extractor/amp.py
index b0cbd77..0d259c5 100644
--- a/hypervideo_dl/extractor/amp.py
+++ b/hypervideo_dl/extractor/amp.py
@@ -5,6 +5,7 @@ from ..utils import (
     int_or_none,
     mimetype2ext,
     parse_iso8601,
+    strip_jsonp,
     unified_timestamp,
     url_or_none,
 )
@@ -15,7 +16,7 @@ class AMPIE(InfoExtractor):  # XXX: Conventionally, base classes should end with
     def _extract_feed_info(self, url):
         feed = self._download_json(
             url, None, 'Downloading Akamai AMP feed',
-            'Unable to download Akamai AMP feed')
+            'Unable to download Akamai AMP feed', transform_source=strip_jsonp)
         item = feed.get('channel', {}).get('item')
         if not item:
             raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error']))
@@ -73,8 +74,10 @@ class AMPIE(InfoExtractor):  # XXX: Conventionally, base classes should end with
                     media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
                     video_id, f4m_id='hds', fatal=False))
             elif ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
             else:
                 formats.append({
                     'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
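The reworked season/show extractor above queries one Algolia index per show slug and narrows it with facet filters; omitting the season facet is what makes whole-show playlists work. A standalone sketch of the same request (the credentials are the public ones visible in the diff; the helper itself and the trimmed query parameters are illustrative):

    import json
    import urllib.parse
    import urllib.request

    def atk_episode_search(slug, season=None):
        facets = ['search_document_klass:episode', f'search_show_slug:{slug}']
        if season:
            facets.append(f'search_season_list:Season {season}')
        query = urllib.parse.urlencode({
            'facetFilters': json.dumps(facets),
            'hitsPerPage': 1000,
        })
        req = urllib.request.Request(
            f'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_{slug}_season_desc_production?{query}',
            headers={
                'Origin': 'https://www.americastestkitchen.com',
                'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805',
                'X-Algolia-Application-Id': 'Y1FNZXUI30',
            })
        return json.load(urllib.request.urlopen(req))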
diff --git a/hypervideo_dl/extractor/anchorfm.py b/hypervideo_dl/extractor/anchorfm.py
new file mode 100644
index 0000000..52f2ad0
--- /dev/null
+++ b/hypervideo_dl/extractor/anchorfm.py
@@ -0,0 +1,98 @@
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    float_or_none,
+    int_or_none,
+    str_or_none,
+    traverse_obj,
+    unified_timestamp
+)
+
+
+class AnchorFMEpisodeIE(InfoExtractor):
+    _VALID_URL = r'https?://anchor\.fm/(?P<channel_name>\w+)/(?:embed/)?episodes/[\w-]+-(?P<episode_id>\w+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})']
+    _TESTS = [{
+        'url': 'https://anchor.fm/lovelyti/episodes/Chrisean-Rock-takes-to-twitter-to-announce-shes-pregnant--Blueface-denies-he-is-the-father-e1tpt3d',
+        'info_dict': {
+            'id': 'e1tpt3d',
+            'ext': 'mp3',
+            'title': ' Chrisean Rock takes to twitter to announce she\'s pregnant, Blueface denies he is the father!',
+            'description': 'md5:207d167de3e28ceb4ddc1ebf5a30044c',
+            'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_nologo/1034827/1034827-1658438968460-5f3bfdf3601e8.jpg',
+            'duration': 624.718,
+            'uploader': 'Lovelyti ',
+            'uploader_id': '991541',
+            'channel': 'lovelyti',
+            'modified_date': '20230121',
+            'modified_timestamp': 1674285178,
+            'release_date': '20230121',
+            'release_timestamp': 1674285179,
+            'episode_id': 'e1tpt3d',
+        }
+    }, {
+        # embed url
+        'url': 'https://anchor.fm/apakatatempo/embed/episodes/S2E75-Perang-Bintang-di-Balik-Kasus-Ferdy-Sambo-dan-Ismail-Bolong-e1shjqd',
+        'info_dict': {
+            'id': 'e1shjqd',
+            'ext': 'mp3',
+            'title': 'S2E75 Perang Bintang di Balik Kasus Ferdy Sambo dan Ismail Bolong',
+            'description': 'md5:9e95ad9293bf00178bf8d33e9cb92c41',
+            'duration': 1042.008,
+            'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_episode400/2627805/2627805-1671590688729-4db3882ac9e4b.jpg',
+            'release_date': '20221221',
+            'release_timestamp': 1671595916,
+            'modified_date': '20221221',
+            'modified_timestamp': 1671590834,
+            'channel': 'apakatatempo',
+            'uploader': 'Podcast Tempo',
+            'uploader_id': '2585461',
+            'season': 'Season 2',
+            'season_number': 2,
+            'episode_id': 'e1shjqd',
+        }
+    }]
+
+    _WEBPAGE_TESTS = [{
+        'url': 'https://podcast.tempo.co/podcast/192/perang-bintang-di-balik-kasus-ferdy-sambo-dan-ismail-bolong',
+        'info_dict': {
+            'id': 'e1shjqd',
+            'ext': 'mp3',
+            'release_date': '20221221',
+            'duration': 1042.008,
+            'season': 'Season 2',
+            'modified_timestamp': 1671590834,
+            'uploader_id': '2585461',
+            'modified_date': '20221221',
+            'description': 'md5:9e95ad9293bf00178bf8d33e9cb92c41',
+            'season_number': 2,
+            'title': 'S2E75 Perang Bintang di Balik Kasus Ferdy Sambo dan Ismail Bolong',
+            'release_timestamp': 1671595916,
+            'episode_id': 'e1shjqd',
+            'thumbnail': 'https://s3-us-west-2.amazonaws.com/anchor-generated-image-bank/production/podcast_uploaded_episode400/2627805/2627805-1671590688729-4db3882ac9e4b.jpg',
+            'uploader': 'Podcast Tempo',
+            'channel': 'apakatatempo',
+        }
+    }]
+
+    def _real_extract(self, url):
+        channel_name, episode_id = self._match_valid_url(url).group('channel_name', 'episode_id')
+        api_data = self._download_json(f'https://anchor.fm/api/v3/episodes/{episode_id}', episode_id)
+
+        return {
+            'id': episode_id,
+            'title': traverse_obj(api_data, ('episode', 'title')),
+            'url': traverse_obj(api_data, ('episode', 'episodeEnclosureUrl'), ('episodeAudios', 0, 'url')),
+            'ext': 'mp3',
+            'vcodec': 'none',
+            'thumbnail': traverse_obj(api_data, ('episode', 'episodeImage')),
+            'description': clean_html(traverse_obj(api_data, ('episode', ('description', 'descriptionPreview')), get_all=False)),
+            'duration': float_or_none(traverse_obj(api_data, ('episode', 'duration')), 1000),
+            'modified_timestamp': unified_timestamp(traverse_obj(api_data, ('episode', 'modified'))),
+            'release_timestamp': int_or_none(traverse_obj(api_data, ('episode', 'publishOnUnixTimestamp'))),
+            'episode_id': episode_id,
+            'uploader': traverse_obj(api_data, ('creator', 'name')),
+            'uploader_id': str_or_none(traverse_obj(api_data, ('creator', 'userId'))),
+            'season_number': int_or_none(traverse_obj(api_data, ('episode', 'podcastSeasonNumber'))),
+            'channel': channel_name or traverse_obj(api_data, ('creator', 'vanitySlug')),
+        }
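The Anchor extractor is a thin wrapper over Anchor's public episode API. A minimal sketch of the same lookup, using only the endpoint and field names that appear in the diff (the helper name and returned shape are illustrative; note the API reports duration in milliseconds, hence the float_or_none(..., 1000) above):

    import json
    import urllib.request

    def fetch_anchor_episode(episode_id):
        with urllib.request.urlopen(f'https://anchor.fm/api/v3/episodes/{episode_id}') as resp:
            api_data = json.load(resp)
        episode = api_data.get('episode') or {}
        return {
            'title': episode.get('title'),
            'audio_url': episode.get('episodeEnclosureUrl'),
            'duration_s': (episode.get('duration') or 0) / 1000,
        }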
diff --git a/hypervideo_dl/extractor/antenna.py b/hypervideo_dl/extractor/antenna.py
new file mode 100644
index 0000000..c78717a
--- /dev/null
+++ b/hypervideo_dl/extractor/antenna.py
@@ -0,0 +1,143 @@
+import urllib.parse
+
+from .common import InfoExtractor
+from ..networking import HEADRequest
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    make_archive_id,
+    scale_thumbnails_to_max_format_width,
+)
+
+
+class AntennaBaseIE(InfoExtractor):
+    def _download_and_extract_api_data(self, video_id, netloc, cid=None):
+        info = self._download_json(f'{self.http_scheme()}//{netloc}{self._API_PATH}',
+                                   video_id, query={'cid': cid or video_id})
+        if not info.get('url'):
+            raise ExtractorError(f'No source found for {video_id}')
+
+        ext = determine_ext(info['url'])
+        if ext == 'm3u8':
+            formats, subs = self._extract_m3u8_formats_and_subtitles(info['url'], video_id, 'mp4')
+        else:
+            formats, subs = [{'url': info['url'], 'format_id': ext}], {}
+
+        thumbnails = scale_thumbnails_to_max_format_width(
+            formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+') if info.get('thumb') else []
+        return {
+            'id': video_id,
+            'title': info.get('title'),
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'subtitles': subs,
+        }
+
+
+class AntennaGrWatchIE(AntennaBaseIE):
+    IE_NAME = 'antenna:watch'
+    IE_DESC = 'antenna.gr and ant1news.gr videos'
+    _VALID_URL = r'https?://(?P<netloc>(?:www\.)?(?:antenna|ant1news)\.gr)/watch/(?P<id>\d+)/'
+    _API_PATH = '/templates/data/player'
+
+    _TESTS = [{
+        'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45',
+        'md5': 'c472d9dd7cd233c63aff2ea42201cda6',
+        'info_dict': {
+            'id': '1506168',
+            'ext': 'mp4',
+            'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a',
+            'description': 'md5:18665af715a6dcfeac1d6153a44f16b0',
+            'thumbnail': r're:https://ant1media\.azureedge\.net/imgHandler/\d+/26d46bf6-8158-4f02-b197-7096c714b2de\.jpg',
+        },
+    }, {
+        'url': 'https://www.antenna.gr/watch/1643812/oi-prodotes-epeisodio-01',
+        'md5': '8f6f7dd3b1dba4d835ba990e25f31243',
+        'info_dict': {
+            'id': '1643812',
+            'ext': 'mp4',
+            'format_id': 'mp4',
+            'title': 'ΟΙ ΠΡΟΔΟΤΕΣ – ΕΠΕΙΣΟΔΙΟ 01',
+            'thumbnail': r're:https://ant1media\.azureedge\.net/imgHandler/\d+/b3d63096-e72d-43c4-87a0-00d4363d242f\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id, netloc = self._match_valid_url(url).group('id', 'netloc')
+        webpage = self._download_webpage(url, video_id)
+        info = self._download_and_extract_api_data(video_id, netloc)
+        info['description'] = self._og_search_description(webpage, default=None)
+        info['_old_archive_ids'] = [make_archive_id('Ant1NewsGrWatch', video_id)]
+        return info
+
+
+class Ant1NewsGrArticleIE(AntennaBaseIE):
+    IE_NAME = 'ant1newsgr:article'
+    IE_DESC = 'ant1news.gr articles'
+    _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/'
+
+    _TESTS = [{
+        'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron',
+        'md5': '294f18331bb516539d72d85a82887dcc',
+        'info_dict': {
+            'id': '_xvg/m_cmbatw=',
+            'ext': 'mp4',
+            'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411',
+            'timestamp': 1603092840,
+            'upload_date': '20201019',
+            'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg',
+        },
+    }, {
+        'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn',
+        'info_dict': {
+            'id': '620286',
+            'title': 'md5:91fe569e952e4d146485740ae927662b',
+        },
+        'playlist_mincount': 2,
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
+        embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage))
+        if not embed_urls:
+            raise ExtractorError('no videos found for %s' % video_id, expected=True)
+        return self.playlist_from_matches(
+            embed_urls, video_id, info.get('title'), ie=Ant1NewsGrEmbedIE.ie_key(),
+            video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')})
+
+
+class Ant1NewsGrEmbedIE(AntennaBaseIE):
+    IE_NAME = 'ant1newsgr:embed'
+    IE_DESC = 'ant1news.gr embedded videos'
+    _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player'
+    _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)'
+    _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)']
+    _API_PATH = '/news/templates/data/jsonPlayer'
+
+    _TESTS = [{
+        'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377',
+        'md5': 'dfc58c3a11a5a9aad2ba316ed447def3',
+        'info_dict': {
+            'id': '3f_li_c_az_jw_y_u=',
+            'ext': 'mp4',
+            'title': 'md5:a30c93332455f53e1e84ae0724f0adf7',
+            'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/bbe31201-3f09-4a4e-87f5-8ad2159fffe2.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        canonical_url = self._request_webpage(
+            HEADRequest(url), video_id,
+            note='Resolve canonical player URL',
+            errnote='Could not resolve canonical player URL').url
+        _, netloc, _, _, query, _ = urllib.parse.urlparse(canonical_url)
+        cid = urllib.parse.parse_qs(query)['cid'][0]
+
+        return self._download_and_extract_api_data(video_id, netloc, cid=cid)
diff --git a/hypervideo_dl/extractor/anvato.py b/hypervideo_dl/extractor/anvato.py
index 79bfe41..0df5033 100644
--- a/hypervideo_dl/extractor/anvato.py
+++ b/hypervideo_dl/extractor/anvato.py
@@ -336,7 +336,7 @@ class AnvatoIE(InfoExtractor):
             elif media_format == 'm3u8-variant' or ext == 'm3u8':
                 # For some videos the initial m3u8 URL returns JSON instead
                 manifest_json = self._download_json(
-                    video_url, video_id, note='Downloading manifest JSON', errnote=False)
+                    video_url, video_id, note='Downloading manifest JSON', fatal=False)
                 if manifest_json:
                     video_url = manifest_json.get('master_m3u8')
                     if not video_url:
@@ -392,14 +392,6 @@ class AnvatoIE(InfoExtractor):
             url = smuggle_url(url, {'token': anvplayer_data['token']})
             yield cls.url_result(url, AnvatoIE, video_id)
 
-    def _extract_anvato_videos(self, webpage, video_id):
-        anvplayer_data = self._parse_json(
-            self._html_search_regex(
-                self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
-            video_id)
-        return self._get_anvato_videos(
-            anvplayer_data['accessKey'], anvplayer_data['video'], 'default')  # cbslocal token = 'default'
-
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         self._initialize_geo_bypass({
diff --git a/hypervideo_dl/extractor/archiveorg.py b/hypervideo_dl/extractor/archiveorg.py
index 90dda9f..2541cd6 100644
--- a/hypervideo_dl/extractor/archiveorg.py
+++ b/hypervideo_dl/extractor/archiveorg.py
@@ -3,12 +3,14 @@ import re
 import urllib.parse
 
 from .common import InfoExtractor
+from .naver import NaverBaseIE
 from .youtube import YoutubeBaseInfoExtractor, YoutubeIE
-from ..compat import compat_HTTPError, compat_urllib_parse_unquote
+from ..compat import compat_urllib_parse_unquote
+from ..networking import HEADRequest
+from ..networking.exceptions import HTTPError
 from ..utils import (
     KNOWN_EXTENSIONS,
     ExtractorError,
-    HEADRequest,
     bug_reports_message,
     clean_html,
     dict_get,
@@ -897,7 +899,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
                 video_id, note='Fetching archived video file url', expected_status=True)
         except ExtractorError as e:
             # HTTP Error 404 is expected if the video is not saved.
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                 self.raise_no_formats(
                     'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True)
             else:
@@ -924,7 +926,7 @@ class YoutubeWebArchiveIE(InfoExtractor):
         info['thumbnails'] = self._extract_thumbnails(video_id)
 
         if urlh:
-            url = compat_urllib_parse_unquote(urlh.geturl())
+            url = compat_urllib_parse_unquote(urlh.url)
             video_file_url_qs = parse_qs(url)
             # Attempt to recover any ext & format info from playback url & response headers
             format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}
@@ -945,3 +947,237 @@ class YoutubeWebArchiveIE(InfoExtractor):
         if not info.get('title'):
             info['title'] = video_id
         return info
+
+
+class VLiveWebArchiveIE(InfoExtractor):
+    IE_NAME = 'web.archive:vlive'
+    IE_DESC = 'web.archive.org saved vlive videos'
+    _VALID_URL = r'''(?x)
+            (?:https?://)?web\.archive\.org/
+            (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)?  # /web and the version index is optional
+            (?:https?(?::|%3[Aa])//)?(?:
+                (?:(?:www|m)\.)?vlive\.tv(?::(?:80|443))?/(?:video|embed)/(?P<id>[0-9]+)  # VLive URL
+            )
+        '''
+    _TESTS = [{
+        'url': 'https://web.archive.org/web/20221221144331/http://www.vlive.tv/video/1326',
+        'md5': 'cc7314812855ce56de70a06a27314983',
+        'info_dict': {
+            'id': '1326',
+            'ext': 'mp4',
+            'title': "Girl's Day's Broadcast",
+            'creator': "Girl's Day",
+            'view_count': int,
+            'uploader_id': 'muploader_a',
+            'uploader_url': None,
+            'uploader': None,
+            'upload_date': '20150817',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+            'timestamp': 1439816449,
+            'like_count': int,
+            'channel': 'Girl\'s Day',
+            'channel_id': 'FDF27',
+            'comment_count': int,
+            'release_timestamp': 1439818140,
+            'release_date': '20150817',
+            'duration': 1014,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://web.archive.org/web/20221221182103/http://www.vlive.tv/video/16937',
+        'info_dict': {
+            'id': '16937',
+            'ext': 'mp4',
+            'title': '첸백시 걍방',
+            'creator': 'EXO',
+            'view_count': int,
+            'subtitles': 'mincount:12',
+            'uploader_id': 'muploader_j',
+            'uploader_url': 'http://vlive.tv',
+            'uploader': None,
+            'upload_date': '20161112',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+            'timestamp': 1478923074,
+            'like_count': int,
+            'channel': 'EXO',
+            'channel_id': 'F94BD',
+            'comment_count': int,
+            'release_timestamp': 1478924280,
+            'release_date': '20161112',
+            'duration': 906,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870',
+        'info_dict': {
+            'id': '101870',
+            'ext': 'mp4',
+            'title': '[ⓓ xV] “레벨이들 매력에 반해? 안 반해?” 움직이는 HD 포토 (레드벨벳:Red Velvet)',
+            'creator': 'Dispatch',
+            'view_count': int,
+            'subtitles': 'mincount:6',
+            'uploader_id': 'V__FRA08071',
+            'uploader_url': 'http://vlive.tv',
+            'uploader': None,
+            'upload_date': '20181130',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+            'timestamp': 1543601327,
+            'like_count': int,
+            'channel': 'Dispatch',
+            'channel_id': 'C796F3',
+            'comment_count': int,
+            'release_timestamp': 1543601040,
+            'release_date': '20181130',
+            'duration': 279,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    # The wayback machine has special timestamp and "mode" values:
+    # timestamp:
+    #   1 = the first capture
+    #   2 = the last capture
+    # mode:
+    #   id_ = Identity - perform no alterations of the original resource, return it as it was archived.
+    _WAYBACK_BASE_URL = 'https://web.archive.org/web/2id_/'
+
+    def _download_archived_page(self, url, video_id, *, timestamp='2', **kwargs):
+        for retry in self.RetryManager():
+            try:
+                return self._download_webpage(f'https://web.archive.org/web/{timestamp}id_/{url}', video_id, **kwargs)
+            except ExtractorError as e:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 404:
+                    raise ExtractorError('Page was not archived', expected=True)
+                retry.error = e
+                continue
+
+    def _download_archived_json(self, url, video_id, **kwargs):
+        page = self._download_archived_page(url, video_id, **kwargs)
+        if not page:
+            raise ExtractorError('Page was not archived', expected=True)
+        else:
+            return self._parse_json(page, video_id)
+
+    def _extract_formats_from_m3u8(self, m3u8_url, params, video_id):
+        m3u8_doc = self._download_archived_page(m3u8_url, video_id, note='Downloading m3u8', query=params, fatal=False)
+        if not m3u8_doc:
+            return
+
+        # M3U8 document should be changed to archive domain
+        m3u8_doc = m3u8_doc.splitlines()
+        url_base = m3u8_url.rsplit('/', 1)[0]
+        first_segment = None
+        for i, line in enumerate(m3u8_doc):
+            if not line.startswith('#'):
+                m3u8_doc[i] = f'{self._WAYBACK_BASE_URL}{url_base}/{line}?{urllib.parse.urlencode(params)}'
+                first_segment = first_segment or m3u8_doc[i]
+
+        # Segments may not have been archived. See https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870
+        urlh = self._request_webpage(HEADRequest(first_segment), video_id, errnote=False,
+                                     fatal=False, note='Check first segment availability')
+        if urlh:
+            formats, subtitles = self._parse_m3u8_formats_and_subtitles('\n'.join(m3u8_doc), ext='mp4', video_id=video_id)
+            if subtitles:
+                self._report_ignoring_subs('m3u8')
+            return formats
+
+    # Closely follows the logic of the ArchiveTeam grab script
+    # See: https://github.com/ArchiveTeam/vlive-grab/blob/master/vlive.lua
+    def _real_extract(self, url):
+        video_id, url_date = self._match_valid_url(url).group('id', 'date')
+
+        webpage = self._download_archived_page(f'https://www.vlive.tv/video/{video_id}', video_id, timestamp=url_date)
+
+        player_info = self._search_json(r'__PRELOADED_STATE__\s*=', webpage, 'player info', video_id)
+        user_country = traverse_obj(player_info, ('common', 'userCountry'))
+
+        main_script_url = self._search_regex(r'<script\s+src="([^"]+/js/main\.[^"]+\.js)"', webpage, 'main script url')
+        main_script = self._download_archived_page(main_script_url, video_id, note='Downloading main script')
+        app_id = self._search_regex(r'appId\s*=\s*"([^"]+)"', main_script, 'app id')
+
+        inkey = self._download_archived_json(
+            f'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/{video_id}/inkey', video_id, note='Fetching inkey', query={
+                'appId': app_id,
+                'platformType': 'PC',
+                'gcc': user_country,
+                'locale': 'en_US',
+            }, fatal=False)
+
+        vod_id = traverse_obj(player_info, ('postDetail', 'post', 'officialVideo', 'vodId'))
+
+        vod_data = self._download_archived_json(
+            f'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{vod_id}', video_id, note='Fetching vod data', query={
+                'key': inkey.get('inkey'),
+                'pid': 'rmcPlayer_16692457559726800',  # partially unix time and partially random. Fixed value used by archiveteam project
+                'sid': '2024',
+                'ver': '2.0',
+                'devt': 'html5_pc',
+                'doct': 'json',
+                'ptc': 'https',
+                'sptc': 'https',
+                'cpt': 'vtt',
+                'ctls': '%7B%22visible%22%3A%7B%22fullscreen%22%3Atrue%2C%22logo%22%3Afalse%2C%22playbackRate%22%3Afalse%2C%22scrap%22%3Afalse%2C%22playCount%22%3Atrue%2C%22commentCount%22%3Atrue%2C%22title%22%3Atrue%2C%22writer%22%3Atrue%2C%22expand%22%3Afalse%2C%22subtitles%22%3Atrue%2C%22thumbnails%22%3Atrue%2C%22quality%22%3Atrue%2C%22setting%22%3Atrue%2C%22script%22%3Afalse%2C%22logoDimmed%22%3Atrue%2C%22badge%22%3Atrue%2C%22seekingTime%22%3Atrue%2C%22muted%22%3Atrue%2C%22muteButton%22%3Afalse%2C%22viewerNotice%22%3Afalse%2C%22linkCount%22%3Afalse%2C%22createTime%22%3Afalse%2C%22thumbnail%22%3Atrue%7D%2C%22clicked%22%3A%7B%22expand%22%3Afalse%2C%22subtitles%22%3Afalse%7D%7D',
+                'pv': '4.26.9',
+                'dr': '1920x1080',
+                'cpl': 'en_US',
+                'lc': 'en_US',
+                'adi': '%5B%7B%22type%22%3A%22pre%22%2C%22exposure%22%3Afalse%2C%22replayExposure%22%3Afalse%7D%5D',
+                'adu': '%2F',
+                'videoId': vod_id,
+                'cc': user_country,
+            })
+
+        formats = []
+
+        streams = traverse_obj(vod_data, ('streams', ...))
+        if len(streams) > 1:
+            self.report_warning('Multiple streams found. Only the first stream will be downloaded.')
+        stream = streams[0]
+
+        max_stream = max(
+            stream.get('videos') or [],
+            key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None)
+        if max_stream is not None:
+            params = {arg.get('name'): arg.get('value') for arg in stream.get('keys', []) if arg.get('type') == 'param'}
+            formats = self._extract_formats_from_m3u8(max_stream.get('source'), params, video_id) or []
+
+        # For parts of the project MP4 files were archived
+        max_video = max(
+            traverse_obj(vod_data, ('videos', 'list', ...)),
+            key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None)
+        if max_video is not None:
+            video_url = self._WAYBACK_BASE_URL + max_video.get('source')
+            urlh = self._request_webpage(HEADRequest(video_url), video_id, errnote=False,
+                                         fatal=False, note='Check video availability')
+            if urlh:
+                formats.append({'url': video_url})
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            **traverse_obj(player_info, ('postDetail', 'post', {
+                'title': ('officialVideo', 'title', {str}),
+                'creator': ('author', 'nickname', {str}),
+                'channel': ('channel', 'channelName', {str}),
+                'channel_id': ('channel', 'channelCode', {str}),
+                'duration': ('officialVideo', 'playTime', {int_or_none}),
+                'view_count': ('officialVideo', 'playCount', {int_or_none}),
+                'like_count': ('officialVideo', 'likeCount', {int_or_none}),
+                'comment_count': ('officialVideo', 'commentCount', {int_or_none}),
+                'timestamp': ('officialVideo', 'createdAt', {lambda x: int_or_none(x, scale=1000)}),
+                'release_timestamp': ('officialVideo', 'willStartAt', {lambda x: int_or_none(x, scale=1000)}),
+            })),
+            **traverse_obj(vod_data, ('meta', {
+                'uploader_id': ('user', 'id', {str}),
+                'uploader': ('user', 'name', {str}),
+                'uploader_url': ('user', 'url', {url_or_none}),
+                'thumbnail': ('cover', 'source', {url_or_none}),
+            }), expected_type=lambda x: x or None),
+            **NaverBaseIE.process_subtitles(vod_data, lambda x: [self._WAYBACK_BASE_URL + x]),
+        }
diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py
index 0a8a874..ca1faa7 100644
--- a/hypervideo_dl/extractor/ard.py
+++ b/hypervideo_dl/extractor/ard.py
@@ -13,6 +13,7 @@ from ..utils import (
     try_get,
     unified_strdate,
     unified_timestamp,
+    update_url,
     update_url_query,
     url_or_none,
     xpath_text,
@@ -46,6 +47,9 @@ class ARDMediathekBaseIE(InfoExtractor):
                 subtitles['de'] = [{
                     'ext': 'ttml',
                     'url': subtitle_url,
+                }, {
+                    'ext': 'vtt',
+                    'url': subtitle_url.replace('/ebutt/', '/webvtt/') + '.vtt',
                 }]
 
         return {
@@ -286,16 +290,16 @@ class ARDMediathekIE(ARDMediathekBaseIE):
 class ARDIE(InfoExtractor):
     _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
     _TESTS = [{
-        # available till 7.01.2022
-        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
-        'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
+        # available till 7.12.2023
+        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
+        'md5': 'a438f671e87a7eba04000336a119ccc4',
         'info_dict': {
-            'id': 'maischberger-die-woche-video100',
-            'display_id': 'maischberger-die-woche-video100',
+            'id': 'maischberger-video-424',
+            'display_id': 'maischberger-video-424',
             'ext': 'mp4',
-            'duration': 3687.0,
-            'title': 'maischberger. die woche vom 7. Januar 2021',
-            'upload_date': '20210107',
+            'duration': 4452.0,
+            'title': 'maischberger am 07.12.2022',
+            'upload_date': '20221207',
             'thumbnail': r're:^https?://.*\.jpg$',
         },
     }, {
@@ -405,6 +409,23 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
                     (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
 
     _TESTS = [{
+        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI',
+        'md5': '3fd5fead7a370a819341129c8d713136',
+        'info_dict': {
+            'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen',
+            'id': '12172961',
+            'title': 'Wolfsland - Die traurigen Schwestern',
+            'description': r're:^Als der Polizeiobermeister Raaben',
+            'duration': 5241,
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957',
+            'timestamp': 1670710500,
+            'upload_date': '20221210',
+            'ext': 'mp4',
+            'age_limit': 12,
+            'episode': 'Wolfsland - Die traurigen Schwestern',
+            'series': 'Filme im MDR'
+        },
+    }, {
         'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
         'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
         'info_dict': {
@@ -421,7 +442,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
         'skip': 'Error',
     }, {
         'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
-        'md5': 'f1837e563323b8a642a8ddeff0131f51',
+        'md5': '1e73ded21cb79bac065117e80c81dc88',
         'info_dict': {
             'id': '10049223',
             'ext': 'mp4',
@@ -429,13 +450,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
             'timestamp': 1636398000,
             'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
             'upload_date': '20211108',
-        },
-    }, {
-        'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1',
-        'playlist_count': 6,
-        'info_dict': {
-            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw',
-            'title': 'beforeigners/beforeigners/staffel-1',
+            'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste',
+            'duration': 915,
+            'episode': 'tagesschau, 20:00 Uhr',
+            'series': 'tagesschau',
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49',
         },
     }, {
         'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@@ -599,6 +618,9 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
             show {
                 title
             }
+            image {
+                src
+            }
             synopsis
             title
             tracking {
@@ -637,6 +659,15 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
             'description': description,
             'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
             'series': try_get(player_page, lambda x: x['show']['title']),
+            'thumbnail': (media_collection.get('_previewImage')
+                          or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
+                          or self.get_thumbnail_from_html(display_id, url)),
         })
         info.update(self._ARD_extract_episode_info(info['title']))
         return info
+
+    def get_thumbnail_from_html(self, display_id, url):
+        webpage = self._download_webpage(url, display_id, fatal=False) or ''
+        return (
+            self._og_search_thumbnail(webpage, default=None)
+            or self._html_search_meta('thumbnailUrl', webpage, default=None))
diff --git a/hypervideo_dl/extractor/arte.py b/hypervideo_dl/extractor/arte.py
index 54e4d2d..e3cc5af 100644
--- a/hypervideo_dl/extractor/arte.py
+++ b/hypervideo_dl/extractor/arte.py
@@ -65,6 +65,21 @@ class ArteTVIE(ArteTVBaseIE):
     }, {
         'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE',
         'only_matching': True,
+    }, {
+        'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/',
+        'info_dict': {
+            'id': '110203-006-A',
+            'chapters': 'count:16',
+            'description': 'md5:cf592f1df52fe52007e3f8eac813c084',
+            'alt_title': 'Zaz',
+            'title': 'Baloise Session 2022',
+            'timestamp': 1668445200,
+            'duration': 4054,
+            'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/ubQjmVCGyRx3hmBuZEK9QZ/940x530',
+            'upload_date': '20221114',
+            'ext': 'mp4',
+        },
+        'expected_warnings': ['geo restricted']
     }]
 
     _GEO_BYPASS = True
@@ -180,10 +195,6 @@ class ArteTVIE(ArteTVBaseIE):
             else:
                 self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}')
 
-        # TODO: chapters from stream['segments']?
-        # The JS also looks for chapters in config['data']['attributes']['chapters'],
-        # but I am yet to find a video having those
-
         formats.extend(secondary_formats)
         self._remove_duplicate_formats(formats)
 
@@ -205,6 +216,11 @@ class ArteTVIE(ArteTVBaseIE):
                 {'url': image['url'], 'id': image.get('caption')}
                 for image in metadata.get('images') or [] if url_or_none(image.get('url'))
             ],
+            # TODO: chapters may also be in stream['segments']?
+            'chapters': traverse_obj(config, ('data', 'attributes', 'chapters', 'elements', ..., {
+                'start_time': 'startTime',
+                'title': 'title',
+            })) or None,
        }
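The new `chapters` expression in the Arte hunk maps each element of config['data']['attributes']['chapters']['elements'] down to the two keys the player needs. A plain-Python equivalent of that traverse_obj call, for readers unfamiliar with its dict-mapping form (helper name is illustrative):

    def extract_chapters(config):
        elements = (((config.get('data') or {}).get('attributes') or {})
                    .get('chapters') or {}).get('elements') or []
        chapters = [{'start_time': e.get('startTime'), 'title': e.get('title')}
                    for e in elements if isinstance(e, dict)]
        return chapters or None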
diff --git a/hypervideo_dl/extractor/atresplayer.py b/hypervideo_dl/extractor/atresplayer.py
index a20e7f9..3a44e52 100644
--- a/hypervideo_dl/extractor/atresplayer.py
+++ b/hypervideo_dl/extractor/atresplayer.py
@@ -1,5 +1,5 @@
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -34,8 +34,8 @@ class AtresPlayerIE(InfoExtractor):
     _API_BASE = 'https://api.atresplayer.com/'
 
     def _handle_error(self, e, code):
-        if isinstance(e.cause, compat_HTTPError) and e.cause.code == code:
-            error = self._parse_json(e.cause.read(), None)
+        if isinstance(e.cause, HTTPError) and e.cause.status == code:
+            error = self._parse_json(e.cause.response.read(), None)
             if error.get('error') == 'required_registered':
                 self.raise_login_required()
             raise ExtractorError(error['error_description'], expected=True)
diff --git a/hypervideo_dl/extractor/bandcamp.py b/hypervideo_dl/extractor/bandcamp.py
index de81e0d..e89b3a6 100644
--- a/hypervideo_dl/extractor/bandcamp.py
+++ b/hypervideo_dl/extractor/bandcamp.py
@@ -29,11 +29,18 @@ class BandcampIE(InfoExtractor):
         'info_dict': {
             'id': '1812978515',
             'ext': 'mp3',
-            'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
+            'title': 'youtube-dl "\'/\\ä↭ - youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
             'duration': 9.8485,
-            'uploader': 'youtube-dl "\'/\\ä↭',
+            'uploader': 'youtube-dl "\'/\\ä↭',
             'upload_date': '20121129',
             'timestamp': 1354224127,
+            'track': 'youtube-dl "\'/\\ä↭ - youtube-dl test song "\'/\\ä↭',
+            'album_artist': 'youtube-dl "\'/\\ä↭',
+            'track_id': '1812978515',
+            'artist': 'youtube-dl "\'/\\ä↭',
+            'uploader_url': 'https://youtube-dl.bandcamp.com',
+            'uploader_id': 'youtube-dl',
+            'thumbnail': 'https://f4.bcbits.com/img/a3216802731_5.jpg',
         },
         '_skip': 'There is a limit of 200 free downloads / month for the test song'
     }, {
        # free download
        'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
        'info_dict': {
            'id': '2650410135',
-            'ext': 'aiff',
+            'ext': 'm4a',
+            'acodec': r're:[fa]lac',
             'title': 'Ben Prunty - Lanius (Battle)',
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'Ben Prunty',
@@ -54,7 +62,10 @@ class BandcampIE(InfoExtractor):
             'track_number': 1,
             'track_id': '2650410135',
             'artist': 'Ben Prunty',
+            'album_artist': 'Ben Prunty',
             'album': 'FTL: Advanced Edition Soundtrack',
+            'uploader_url': 'https://benprunty.bandcamp.com',
+            'uploader_id': 'benprunty',
         },
     }, {
         # no free download, mp3 128
@@ -75,7 +86,34 @@ class BandcampIE(InfoExtractor):
             'track_number': 5,
             'track_id': '2584466013',
             'artist': 'Mastodon',
+            'album_artist': 'Mastodon',
             'album': 'Call of the Mastodon',
+            'uploader_url': 'https://relapsealumni.bandcamp.com',
+            'uploader_id': 'relapsealumni',
+        },
+    }, {
+        # track from compilation album (artist/album_artist difference)
+        'url': 'https://diskotopia.bandcamp.com/track/safehouse',
+        'md5': '19c5337bca1428afa54129f86a2f6a69',
+        'info_dict': {
+            'id': '1978174799',
+            'ext': 'mp3',
+            'title': 'submerse - submerse - Safehouse',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'submerse',
+            'timestamp': 1480779297,
+            'upload_date': '20161203',
+            'release_timestamp': 1481068800,
+            'release_date': '20161207',
+            'duration': 154.066,
+            'track': 'submerse - Safehouse',
+            'track_number': 3,
+            'track_id': '1978174799',
+            'artist': 'submerse',
+            'album_artist': 'Diskotopia',
+            'album': 'DSK F/W 2016-2017 Free Compilation',
+            'uploader_url': 'https://diskotopia.bandcamp.com',
+            'uploader_id': 'diskotopia',
         },
     }]
 
@@ -121,6 +159,9 @@ class BandcampIE(InfoExtractor):
         embed = self._extract_data_attr(webpage, title, 'embed', False)
         current = tralbum.get('current') or {}
         artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
+        album_artist = self._html_search_regex(
+            r'<h3 class="albumTitle">[\S\s]*?by\s*<span>\s*<a href="[^>]+">\s*([^>]+?)\s*</a>',
+            webpage, 'album artist', fatal=False)
         timestamp = unified_timestamp(
             current.get('publish_date') or tralbum.get('album_publish_date'))
 
@@ -205,6 +246,7 @@ class BandcampIE(InfoExtractor):
             'track_id': track_id,
             'artist': artist,
             'album': embed.get('album_title'),
+            'album_artist': album_artist,
             'formats': formats,
         }
diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py
index 9d28e70..a55cdef 100644
--- a/hypervideo_dl/extractor/bbc.py
+++ b/hypervideo_dl/extractor/bbc.py
@@ -2,11 +2,11 @@ import functools
 import itertools
 import json
 import re
-import urllib.error
 import xml.etree.ElementTree
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str, compat_urlparse
+from ..compat import compat_str, compat_urlparse
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     OnDemandPagedList,
@@ -277,7 +277,7 @@ class BBCCoUkIE(InfoExtractor):
             post_url, None, 'Logging in', data=urlencode_postdata(login_form),
             headers={'Referer': self._LOGIN_URL})
 
-        if self._LOGIN_URL in urlh.geturl():
+        if self._LOGIN_URL in urlh.url:
             error = clean_html(get_element_by_class('form-message', response))
             if error:
                 raise ExtractorError(
@@ -388,8 +388,8 @@ class BBCCoUkIE(InfoExtractor):
                         href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                         m3u8_id=format_id, fatal=False)
                 except ExtractorError as e:
-                    if not (isinstance(e.exc_info[1], urllib.error.HTTPError)
-                            and e.exc_info[1].code in (403, 404)):
+                    if not (isinstance(e.exc_info[1], HTTPError)
+                            and e.exc_info[1].status in (403, 404)):
                         raise
                     fmts = []
                 formats.extend(fmts)
@@ -472,7 +472,7 @@ class BBCCoUkIE(InfoExtractor):
                 return programme_id, title, description, duration, formats, subtitles
         except ExtractorError as ee:
-            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+            if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
                 raise
 
         # fallback to legacy playlist
@@ -983,7 +983,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
                 # Some playlist URL may fail with 500, at the same time
                 # the other one may work fine (e.g.
                 # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
                if isinstance(e.cause, HTTPError) and e.cause.status == 500:
                     continue
                 raise
             if entry:
'https://beatbump.ml/playlist/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'playlist_mincount': 1, + 'params': {'flatplaylist': True}, + 'info_dict': { + 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds', + 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', + 'view_count': int, + 'channel_url': 'https://www.youtube.com/@NoCopyrightSounds', + 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'title': 'NCS : All Releases 💿', + 'uploader': 'NoCopyrightSounds', + 'availability': 'public', + 'channel': 'NoCopyrightSounds', + 'tags': [], + 'modified_date': '20221225', + 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + } + }] + + def _real_extract(self, url): + id_ = self._match_id(url) + return self.url_result(f'https://music.youtube.com/browse/{id_}', YoutubeTabIE, id_) diff --git a/hypervideo_dl/extractor/bfmtv.py b/hypervideo_dl/extractor/bfmtv.py index d86d283..a7be0e6 100644 --- a/hypervideo_dl/extractor/bfmtv.py +++ b/hypervideo_dl/extractor/bfmtv.py @@ -5,7 +5,7 @@ from ..utils import extract_attributes class BFMTVBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?bfmtv\.com/' + _VALID_URL_BASE = r'https?://(?:www\.|rmc\.)?bfmtv\.com/' _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html' _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block"[^>]*>)' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' @@ -31,6 +31,9 @@ class BFMTVIE(BFMTVBaseIE): 'uploader_id': '876450610001', 'upload_date': '20201002', 'timestamp': 1601629620, + 'duration': 44.757, + 'tags': ['bfmactu', 'politique'], + 'thumbnail': 'https://cf-images.eu-west-1.prod.boltdns.net/v1/static/876450610001/5041f4c1-bc48-4af8-a256-1b8300ad8ef0/cf2f9114-e8e2-4494-82b4-ab794ea4bc7d/1920x1080/match/image.jpg', }, }] @@ -81,6 +84,20 @@ class BFMTVArticleIE(BFMTVBaseIE): }, { 'url': 'https://www.bfmtv.com/sante/covid-19-oui-le-vaccin-de-pfizer-distribue-en-france-a-bien-ete-teste-sur-des-personnes-agees_AN-202101060275.html', 'only_matching': True, + }, { + 'url': 'https://rmc.bfmtv.com/actualites/societe/transports/ce-n-est-plus-tout-rentable-le-bioethanol-e85-depasse-1eu-le-litre-des-automobilistes-regrettent_AV-202301100268.html', + 'info_dict': { + 'id': '6318445464112', + 'ext': 'mp4', + 'title': 'Le plein de bioéthanol fait de plus en plus mal à la pompe', + 'description': None, + 'uploader_id': '876630703001', + 'upload_date': '20230110', + 'timestamp': 1673341692, + 'duration': 109.269, + 'tags': ['rmc', 'show', 'apolline de malherbe', 'info', 'talk', 'matinale', 'radio'], + 'thumbnail': 'https://cf-images.eu-west-1.prod.boltdns.net/v1/static/876630703001/5bef74b8-9d5e-4480-a21f-60c2e2480c46/96c88b74-f9db-45e1-8040-e199c5da216c/1920x1080/match/image.jpg' + } }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/bibeltv.py b/hypervideo_dl/extractor/bibeltv.py index fd20aad..34464da 100644 --- a/hypervideo_dl/extractor/bibeltv.py +++ b/hypervideo_dl/extractor/bibeltv.py @@ -1,27 +1,197 @@ +from functools import partial + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + format_field, + int_or_none, + js_to_json, + orderedSet, + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class BibelTVBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['AT', 'CH', 'DE'] + _GEO_BYPASS = False + + API_URL = 'https://www.bibeltv.de/mediathek/api' + AUTH_TOKEN = 
'j88bRXY8DsEqJ9xmTdWhrByVi5Hm' + + def _extract_formats_and_subtitles(self, data, crn_id, *, is_live=False): + formats = [] + subtitles = {} + for media_url in traverse_obj(data, (..., 'src', {url_or_none})): + media_ext = determine_ext(media_url) + if media_ext == 'm3u8': + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + media_url, crn_id, live=is_live) + formats.extend(m3u8_formats) + subtitles.update(m3u8_subs) + elif media_ext == 'mpd': + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(media_url, crn_id) + formats.extend(mpd_formats) + subtitles.update(mpd_subs) + elif media_ext == 'mp4': + formats.append({'url': media_url}) + else: + self.report_warning(f'Unknown format {media_ext!r}') + + return formats, subtitles + + @staticmethod + def _extract_base_info(data): + return { + 'id': data['crn'], + **traverse_obj(data, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {partial(int_or_none, scale=1000)}), + 'timestamp': ('schedulingStart', {parse_iso8601}), + 'season_number': 'seasonNumber', + 'episode_number': 'episodeNumber', + 'view_count': 'viewCount', + 'like_count': 'likeCount', + }), + 'thumbnails': orderedSet(traverse_obj(data, ('images', ..., { + 'url': ('url', {url_or_none}), + }))), + } + + def _extract_url_info(self, data): + return { + '_type': 'url', + 'url': format_field(data, 'slug', 'https://www.bibeltv.de/mediathek/videos/%s'), + **self._extract_base_info(data), + } + + def _extract_video_info(self, data): + crn_id = data['crn'] + if data.get('drm'): + self.report_drm(crn_id) + + json_data = self._download_json( + format_field(data, 'id', f'{self.API_URL}/video/%s'), crn_id, + headers={'Authorization': self.AUTH_TOKEN}, fatal=False, + errnote='No formats available') or {} + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(json_data, ('video', 'videoUrls', ...)), crn_id) + + return { + '_type': 'video', + **self._extract_base_info(data), + 'formats': formats, + 'subtitles': subtitles, + } + + +class BibelTVVideoIE(BibelTVBaseIE): + IE_DESC = 'BibelTV single video' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?P<id>\d+)[\w-]+' + IE_NAME = 'bibeltv:video' -class BibelTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P<id>\d+)' _TESTS = [{ - 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch', - 'md5': '252f908192d611de038b8504b08bf97f', + 'url': 'https://www.bibeltv.de/mediathek/videos/344436-alte-wege', + 'md5': 'ec1c07efe54353780512e8a4103b612e', 'info_dict': { - 'id': 'ref:329703', + 'id': '344436', 'ext': 'mp4', - 'title': 'Sprachkurs in Malaiisch', - 'description': 'md5:3e9f197d29ee164714e67351cf737dfe', - 'timestamp': 1608316701, - 'uploader_id': '5840105145001', - 'upload_date': '20201218', - } + 'title': 'Alte Wege', + 'description': 'md5:2f4eb7294c9797a47b8fd13cccca22e9', + 'timestamp': 1677877071, + 'duration': 150.0, + 'upload_date': '20230303', + 'thumbnail': r're:https://bibeltv\.imgix\.net/[\w-]+\.jpg', + 'episode': 'Episode 1', + 'episode_number': 1, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'format': '6', + }, + }] + + def _real_extract(self, url): + crn_id = self._match_id(url) + video_data = traverse_obj( + self._search_nextjs_data(self._download_webpage(url, crn_id), crn_id), + ('props', 'pageProps', 'videoPageData', 'videos', 0, {dict})) + if not video_data: + raise ExtractorError('Missing video data.') + + return 
self._extract_video_info(video_data) + + +class BibelTVSeriesIE(BibelTVBaseIE): + IE_DESC = 'BibelTV series playlist' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/serien/(?P<id>\d+)[\w-]+' + IE_NAME = 'bibeltv:series' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/serien/333485-ein-wunder-fuer-jeden-tag', + 'playlist_mincount': 400, + 'info_dict': { + 'id': '333485', + 'title': 'Ein Wunder für jeden Tag', + 'description': 'Tägliche Kurzandacht mit Déborah Rosenkranz.', + }, + }] + + def _real_extract(self, url): + crn_id = self._match_id(url) + webpage = self._download_webpage(url, crn_id) + nextjs_data = self._search_nextjs_data(webpage, crn_id) + series_data = traverse_obj(nextjs_data, ('props', 'pageProps', 'seriePageData', {dict})) + if not series_data: + raise ExtractorError('Missing series data.') + + return self.playlist_result( + traverse_obj(series_data, ('videos', ..., {dict}, {self._extract_url_info})), + crn_id, series_data.get('title'), clean_html(series_data.get('description'))) + + +class BibelTVLiveIE(BibelTVBaseIE): + IE_DESC = 'BibelTV live program' + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/livestreams/(?P<id>[\w-]+)' + IE_NAME = 'bibeltv:live' + + _TESTS = [{ + 'url': 'https://www.bibeltv.de/livestreams/bibeltv/', + 'info_dict': { + 'id': 'bibeltv', + 'ext': 'mp4', + 'title': 're:Bibel TV', + 'live_status': 'is_live', + 'thumbnail': 'https://streampreview.bibeltv.de/bibeltv.webp', + }, + 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374', + 'url': 'https://www.bibeltv.de/livestreams/impuls/', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s' def _real_extract(self, url): - crn_id = self._match_id(url) - return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew') + stream_id = self._match_id(url) + webpage = self._download_webpage(url, stream_id) + stream_data = self._search_json( + r'\\"video\\":', webpage, 'bibeltvData', stream_id, + transform_source=lambda jstring: js_to_json(jstring.replace('\\"', '"'))) + + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(stream_data, ('src', ...)), stream_id, is_live=True) + + return { + 'id': stream_id, + 'title': stream_data.get('title'), + 'thumbnail': stream_data.get('poster'), + 'is_live': True, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py index bc04241..cb7ab2a 100644 --- a/hypervideo_dl/extractor/bilibili.py +++ b/hypervideo_dl/extractor/bilibili.py @@ -1,11 +1,14 @@ import base64 import functools +import hashlib import itertools import math -import urllib.error +import time import urllib.parse from .common import InfoExtractor, SearchInfoExtractor +from ..dependencies import Cryptodome +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, GeoRestrictedError, @@ -15,14 +18,20 @@ from ..utils import ( float_or_none, format_field, int_or_none, + join_nonempty, make_archive_id, + merge_dicts, mimetype2ext, parse_count, parse_qs, qualities, + smuggle_url, srt_subtitles_timecode, str_or_none, traverse_obj, + try_call, + unified_timestamp, + unsmuggle_url, url_or_none, urlencode_postdata, ) @@ -77,7 +86,7 @@ class BilibiliBaseIE(InfoExtractor): f'{line["content"]}\n\n') return srt_data - def _get_subtitles(self, video_id, initial_state, cid): + def _get_subtitles(self, video_id, aid, cid): 
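+        # the subtitle list is now fetched from the player API using aid/cid
+        # instead of being read from the embedded __INITIAL_STATE__ JSON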
subtitles = { 'danmaku': [{ 'ext': 'xml', @@ -85,7 +94,8 @@ class BilibiliBaseIE(InfoExtractor): }] } - for s in traverse_obj(initial_state, ('videoData', 'subtitle', 'list')) or []: + video_info_json = self._download_json(f'https://api.bilibili.com/x/player/v2?aid={aid}&cid={cid}', video_id) + for s in traverse_obj(video_info_json, ('data', 'subtitle', 'subtitles', ...)): subtitles.setdefault(s['lan'], []).append({ 'ext': 'srt', 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)) @@ -126,9 +136,20 @@ class BilibiliBaseIE(InfoExtractor): for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))): yield from children + def _get_episodes_from_season(self, ss_id, url): + season_info = self._download_json( + 'https://api.bilibili.com/pgc/web/season/section', ss_id, + note='Downloading season info', query={'season_id': ss_id}, + headers={'Referer': url, **self.geo_verification_headers()}) + + for entry in traverse_obj(season_info, ( + 'result', 'main_section', 'episodes', + lambda _, v: url_or_none(v['share_url']) and v['id'])): + yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}') + class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P<id>[^/?#&]+)' + _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -276,19 +297,60 @@ class BiliBiliIE(BilibiliBaseIE): 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, 'params': {'skip_download': True}, + }, { + 'note': 'video redirects to festival page', + 'url': 'https://www.bilibili.com/video/BV1wP4y1P72h', + 'info_dict': { + 'id': 'BV1wP4y1P72h', + 'ext': 'mp4', + 'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】', + 'timestamp': 1643947497, + 'upload_date': '20220204', + 'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6', + 'uploader': '叨叨冯聊音乐', + 'duration': 246.719, + 'uploader_id': '528182630', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'params': {'skip_download': True}, + }, { + 'note': 'newer festival video', + 'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f', + 'info_dict': { + 'id': 'BV1ay4y1d77f', + 'ext': 'mp4', + 'title': '【崩坏3新春剧场】为特别的你送上祝福!', + 'timestamp': 1674273600, + 'upload_date': '20230121', + 'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8', + 'uploader': '果蝇轰', + 'duration': 1111.722, + 'uploader_id': '8469526', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] - video_data = initial_state['videoData'] + is_festival = 'videoData' not in initial_state + if is_festival: + video_data = initial_state['videoInfo'] + else: + play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + video_data = initial_state['videoData'] + video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. 
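+        # e.g. appending ?p=2 to the video URL selects the second part; the
+        # per-part metadata comes from the pagelist API queried below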
- page_list_json = traverse_obj( + page_list_json = not is_festival and traverse_obj( self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, @@ -303,106 +365,143 @@ class BiliBiliIE(BilibiliBaseIE): getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}') if is_anthology: - title += f' p{part_id:02d} {traverse_obj(page_list_json, ((part_id or 1) - 1, "part")) or ""}' + part_id = part_id or 1 + title += f' p{part_id:02d} {traverse_obj(page_list_json, (part_id - 1, "part")) or ""}' aid = video_data.get('aid') old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') + festival_info = {} + if is_festival: + play_info = self._download_json( + 'https://api.bilibili.com/x/player/playurl', video_id, + query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, + note='Extracting festival video formats')['data'] + + festival_info = traverse_obj(initial_state, { + 'uploader': ('videoInfo', 'upName'), + 'uploader_id': ('videoInfo', 'upMid', {str_or_none}), + 'like_count': ('videoStatus', 'like', {int_or_none}), + 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'), + }, get_all=False) + return { + **traverse_obj(initial_state, { + 'uploader': ('upData', 'name'), + 'uploader_id': ('upData', 'mid', {str_or_none}), + 'like_count': ('videoData', 'stat', 'like', {int_or_none}), + 'tags': ('tags', ..., 'tag_name'), + 'thumbnail': ('videoData', 'pic', {url_or_none}), + }), + **festival_info, + **traverse_obj(video_data, { + 'description': 'desc', + 'timestamp': ('pubdate', {int_or_none}), + 'view_count': (('viewCount', ('stat', 'view')), {int_or_none}), + 'comment_count': ('stat', 'reply', {int_or_none}), + }, get_all=False), 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', 'formats': self.extract_formats(play_info), '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, 'title': title, - 'description': traverse_obj(initial_state, ('videoData', 'desc')), - 'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')), - 'uploader': traverse_obj(initial_state, ('upData', 'name')), - 'uploader_id': traverse_obj(initial_state, ('upData', 'mid')), - 'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')), - 'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')), - 'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')), - 'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')), - 'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'chapters': self._get_chapters(aid, cid), - 'subtitles': self.extract_subtitles(video_id, initial_state, cid), + 'subtitles': self.extract_subtitles(video_id, aid, cid), '__post_extractor': self.extract_comments(aid), 'http_headers': {'Referer': url}, } class BiliBiliBangumiIE(BilibiliBaseIE): - _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P<id>(?:ss|ep)\d+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?P<id>ep\d+)' _TESTS = [{ - 'url': 'https://www.bilibili.com/bangumi/play/ss897', + 'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'info_dict': { - 'id': 'ss897', + 'id': '267851', 'ext': 'mp4', - 'series': '神的记事本', - 'season': '神的记事本', - 'season_id': 897, + 'series': '鬼灭之刃', + 'series_id': '4358', + 'season': '鬼灭之刃', + 
'season_id': '26801',
+            'season_number': 1,
-            'episode': '你与旅行包',
-            'episode_number': 2,
-            'title': '神的记事本:第2话 你与旅行包',
-            'duration': 1428.487,
-            'timestamp': 1310809380,
-            'upload_date': '20110716',
-            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+            'episode': '残酷',
+            'episode_id': '267851',
+            'episode_number': 1,
+            'title': '1 残酷',
+            'duration': 1425.256,
+            'timestamp': 1554566400,
+            'upload_date': '20190406',
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$'
         },
-    }, {
-        'url': 'https://www.bilibili.com/bangumi/play/ep508406',
-        'only_matching': True,
+        'skip': 'According to the copyright owner\'s request, you may only watch the video after you become a premium member.'
     }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
+        episode_id = video_id[2:]
         webpage = self._download_webpage(url, video_id)

         if '您所在的地区无法观看本片' in webpage:
             raise GeoRestrictedError('This video is restricted')
-        elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage
-              or '正在观看预览,大会员免费看全片' in webpage):
+        elif '正在观看预览,大会员免费看全片' in webpage:
             self.raise_login_required('This video is for premium members only')

-        play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
+        headers = {'Referer': url, **self.geo_verification_headers()}
+        play_info = self._download_json(
+            'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id,
+            'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
+            headers=headers)
+        premium_only = play_info.get('code') == -10403
+        play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {}

         formats = self.extract_formats(play_info)
-        if (not formats and '成为大会员抢先看' in webpage
-                and play_info.get('durl') and not play_info.get('dash')):
+        if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage):
             self.raise_login_required('This video is for premium members only')

-        initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
+        bangumi_info = self._download_json(
+            'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details',
+            query={'ep_id': episode_id}, headers=headers)['result']

-        season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id'))
+        episode_number, episode_info = next((
+            (idx, ep) for idx, ep in enumerate(traverse_obj(
+                bangumi_info, ('episodes', ..., {dict})), 1)
+            if str_or_none(ep.get('id')) == episode_id), (1, {}))
+
+        season_id = bangumi_info.get('season_id')
         season_number = season_id and next((
             idx + 1 for idx, e in enumerate(
-                traverse_obj(initial_state, ('mediaInfo', 'seasons', ...)))
+                traverse_obj(bangumi_info, ('seasons', ...)))
             if e.get('season_id') == season_id
         ), None)

+        aid = episode_info.get('aid')
+
         return {
             'id': video_id,
             'formats': formats,
-            'title': traverse_obj(initial_state, 'h1Title'),
-            'episode': traverse_obj(initial_state, ('epInfo', 'long_title')),
-            'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))),
-            'series': traverse_obj(initial_state, ('mediaInfo', 'series')),
-            'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')),
-            'season_id': season_id,
+            **traverse_obj(bangumi_info, {
+                'series': ('series', 'series_title', {str}),
+                'series_id': ('series', 'series_id', {str_or_none}),
+                'thumbnail': ('square_cover', {url_or_none}),
+            }),
+            'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info),
+            'episode': episode_info.get('long_title'),
+            'episode_id': episode_id,
+            'episode_number': 
int_or_none(episode_info.get('title')) or episode_number, + 'season_id': str_or_none(season_id), 'season_number': season_number, - 'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')), - 'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')), + 'timestamp': int_or_none(episode_info.get('pub_time')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), - 'subtitles': self.extract_subtitles( - video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))), - '__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))), - 'http_headers': {'Referer': url, **self.geo_verification_headers()}, + 'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')), + '__post_extractor': self.extract_comments(aid), + 'http_headers': headers, } -class BiliBiliBangumiMediaIE(InfoExtractor): +class BiliBiliBangumiMediaIE(BilibiliBaseIE): _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', @@ -415,16 +514,26 @@ class BiliBiliBangumiMediaIE(InfoExtractor): def _real_extract(self, url): media_id = self._match_id(url) webpage = self._download_webpage(url, media_id) + ss_id = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id'] + + return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id) + + +class BiliBiliBangumiSeasonIE(BilibiliBaseIE): + _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ss26801', + 'info_dict': { + 'id': '26801' + }, + 'playlist_mincount': 26 + }] - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) - episode_list = self._download_json( - 'https://api.bilibili.com/pgc/web/season/section', media_id, - query={'season_id': initial_state['mediaInfo']['season_id']}, - note='Downloading season info')['result']['main_section']['episodes'] + def _real_extract(self, url): + ss_id = self._match_id(url) - return self.playlist_result(( - self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid']) - for entry in episode_list), media_id) + return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id) class BilibiliSpaceBaseIE(InfoExtractor): @@ -447,21 +556,65 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): 'id': '3985676', }, 'playlist_mincount': 178, + }, { + 'url': 'https://space.bilibili.com/313580179/video', + 'info_dict': { + 'id': '313580179', + }, + 'playlist_mincount': 92, }] + def _extract_signature(self, playlist_id): + session_data = self._download_json('https://api.bilibili.com/x/web-interface/nav', playlist_id, fatal=False) + + key_from_url = lambda x: x[x.rfind('/') + 1:].split('.')[0] + img_key = traverse_obj( + session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100' + sub_key = traverse_obj( + session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6' + + session_key = img_key + sub_key + + signature_values = [] + for position in ( + 46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, + 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, + 57, 62, 11, 36, 20, 34, 44, 52 + ): + char_at_position = try_call(lambda: session_key[position]) + if char_at_position: + 
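                # pick characters of img_key + sub_key at the fixed positions above;
+                # the first 32 characters of the result form the w_rid signing salt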
signature_values.append(char_at_position) + + return ''.join(signature_values)[:32] + def _real_extract(self, url): playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video') if not is_video_url: self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. ' 'To download audios, add a "/audio" to the URL') + signature = self._extract_signature(playlist_id) + def fetch_page(page_idx): + query = { + 'keyword': '', + 'mid': playlist_id, + 'order': 'pubdate', + 'order_avoided': 'true', + 'platform': 'web', + 'pn': page_idx + 1, + 'ps': 30, + 'tid': 0, + 'web_location': 1550101, + 'wts': int(time.time()), + } + query['w_rid'] = hashlib.md5(f'{urllib.parse.urlencode(query)}{signature}'.encode()).hexdigest() + try: - response = self._download_json('https://api.bilibili.com/x/space/arc/search', - playlist_id, note=f'Downloading page {page_idx}', - query={'mid': playlist_id, 'pn': page_idx + 1, 'jsonp': 'jsonp'}) + response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search', + playlist_id, note=f'Downloading page {page_idx}', query=query) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412: + if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError( 'Request is blocked by server (412), please add cookies, wait and try later.', expected=True) raise @@ -489,9 +642,9 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE): _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio' _TESTS = [{ - 'url': 'https://space.bilibili.com/3985676/audio', + 'url': 'https://space.bilibili.com/313580179/audio', 'info_dict': { - 'id': '3985676', + 'id': '313580179', }, 'playlist_mincount': 1, }] @@ -880,35 +1033,24 @@ class BiliIntlBaseIE(InfoExtractor): return formats - def _extract_video_info(self, video_data, *, ep_id=None, aid=None): + def _parse_video_metadata(self, video_data): return { - 'id': ep_id or aid, 'title': video_data.get('title_display') or video_data.get('title'), 'thumbnail': video_data.get('cover'), 'episode_number': int_or_none(self._search_regex( r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), - 'formats': self._get_formats(ep_id=ep_id, aid=aid), - 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid), - 'extractor_key': BiliIntlIE.ie_key(), } def _perform_login(self, username, password): - try: - from Cryptodome.PublicKey import RSA - from Cryptodome.Cipher import PKCS1_v1_5 - except ImportError: - try: - from Crypto.PublicKey import RSA - from Crypto.Cipher import PKCS1_v1_5 - except ImportError: - raise ExtractorError('pycryptodomex not found. Please install', expected=True) + if not Cryptodome.RSA: + raise ExtractorError('pycryptodomex not found. 
Please install', expected=True) key_data = self._download_json( 'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None, note='Downloading login key', errnote='Unable to download login key')['data'] - public_key = RSA.importKey(key_data['key']) - password_hash = PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8')) + public_key = Cryptodome.RSA.importKey(key_data['key']) + password_hash = Cryptodome.PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8')) login_post = self._download_json( 'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({ 'username': username, @@ -935,6 +1077,23 @@ class BiliIntlIE(BiliIntlBaseIE): 'title': 'E2 - The First Night', 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', 'episode_number': 2, + 'upload_date': '20201009', + 'episode': 'Episode 2', + 'timestamp': 1602259500, + 'description': 'md5:297b5a17155eb645e14a14b385ab547e', + 'chapters': [{ + 'start_time': 0, + 'end_time': 76.242, + 'title': '<Untitled Chapter 1>' + }, { + 'start_time': 76.242, + 'end_time': 161.161, + 'title': 'Intro' + }, { + 'start_time': 1325.742, + 'end_time': 1403.903, + 'title': 'Outro' + }], } }, { # Non-Bstation page @@ -945,6 +1104,23 @@ class BiliIntlIE(BiliIntlBaseIE): 'title': 'E3 - Who?', 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', 'episode_number': 3, + 'description': 'md5:e1a775e71a35c43f141484715470ad09', + 'episode': 'Episode 3', + 'upload_date': '20211219', + 'timestamp': 1639928700, + 'chapters': [{ + 'start_time': 0, + 'end_time': 88.0, + 'title': '<Untitled Chapter 1>' + }, { + 'start_time': 88.0, + 'end_time': 156.0, + 'title': 'Intro' + }, { + 'start_time': 1173.0, + 'end_time': 1259.535, + 'title': 'Outro' + }], } }, { # Subtitle with empty content @@ -958,6 +1134,78 @@ class BiliIntlIE(BiliIntlBaseIE): }, 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.' 
}, { + 'url': 'https://www.bilibili.tv/en/video/2041863208', + 'info_dict': { + 'id': '2041863208', + 'ext': 'mp4', + 'timestamp': 1670874843, + 'description': 'Scheduled for April 2023.\nStudio: ufotable', + 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$', + 'upload_date': '20221212', + 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation', + }, + }, { + # episode comment extraction + 'url': 'https://www.bilibili.tv/en/play/34580/340317', + 'info_dict': { + 'id': '340317', + 'ext': 'mp4', + 'timestamp': 1604057820, + 'upload_date': '20201030', + 'episode_number': 5, + 'title': 'E5 - My Own Steel', + 'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2', + 'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode': 'Episode 5', + 'comment_count': int, + 'chapters': [{ + 'start_time': 0, + 'end_time': 61.0, + 'title': '<Untitled Chapter 1>' + }, { + 'start_time': 61.0, + 'end_time': 134.0, + 'title': 'Intro' + }, { + 'start_time': 1290.0, + 'end_time': 1379.0, + 'title': 'Outro' + }], + }, + 'params': { + 'getcomments': True + } + }, { + # user generated content comment extraction + 'url': 'https://www.bilibili.tv/en/video/2045730385', + 'info_dict': { + 'id': '2045730385', + 'ext': 'mp4', + 'description': 'md5:693b6f3967fb4e7e7764ea817857c33a', + 'timestamp': 1667891924, + 'upload_date': '20221108', + 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation', + 'comment_count': int, + 'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg', + }, + 'params': { + 'getcomments': True + } + }, { + # episode id without intro and outro + 'url': 'https://www.bilibili.tv/en/play/1048837/11246489', + 'info_dict': { + 'id': '11246489', + 'ext': 'mp4', + 'title': 'E1 - Operation \'Strix\' <Owl>', + 'description': 'md5:b4434eb1a9a97ad2bccb779514b89f17', + 'timestamp': 1649516400, + 'thumbnail': 'https://pic.bstarstatic.com/ogv/62cb1de23ada17fb70fbe7bdd6ff29c29da02a64.png', + 'episode': 'Episode 1', + 'episode_number': 1, + 'upload_date': '20220409', + }, + }, { 'url': 'https://www.biliintl.com/en/play/34613/341736', 'only_matching': True, }, { @@ -974,36 +1222,139 @@ class BiliIntlIE(BiliIntlBaseIE): 'only_matching': True, }] - def _real_extract(self, url): - season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') - video_id = ep_id or aid + def _make_url(video_id, series_id=None): + if series_id: + return f'https://www.bilibili.tv/en/play/{series_id}/{video_id}' + return f'https://www.bilibili.tv/en/video/{video_id}' + + def _extract_video_metadata(self, url, video_id, season_id): + url, smuggled_data = unsmuggle_url(url, {}) + if smuggled_data.get('title'): + return smuggled_data + webpage = self._download_webpage(url, video_id) # Bstation layout initial_data = ( self._search_json(r'window\.__INITIAL_(?:DATA|STATE)__\s*=', webpage, 'preload state', video_id, default={}) or self._search_nuxt_data(webpage, video_id, '__initialState', fatal=False, traverse=None)) video_data = traverse_obj( - initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict) + initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict) or {} if season_id and not video_data: # Non-Bstation layout, read through episode list season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) - video_data = traverse_obj(season_json, - ('sections', ..., 
'episodes', lambda _, v: str(v['episode_id']) == ep_id),
-                                  expected_type=dict, get_all=False)
-        return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid)
+            video_data = traverse_obj(season_json, (
+                'sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == video_id
+            ), expected_type=dict, get_all=False)
+
+        # XXX: the webpage metadata may not be accurate; it is only used as a
+        # fallback so that extraction does not crash when video_data is not found
+        return merge_dicts(
+            self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {
+                'title': self._html_search_meta('og:title', webpage),
+                'description': self._html_search_meta('og:description', webpage)
+            })
+
+    def _get_comments_reply(self, root_id, next_id=0, display_id=None):
+        comment_api_raw_data = self._download_json(
+            'https://api.bilibili.tv/reply/web/detail', display_id,
+            note=f'Downloading reply comment of {root_id} - {next_id}',
+            query={
+                'platform': 'web',
+                'ps': 20,  # replies per page (default: 3)
+                'root': root_id,
+                'next': next_id,
+            })
+
+        for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
+            yield {
+                'author': traverse_obj(replies, ('member', 'name')),
+                'author_id': traverse_obj(replies, ('member', 'mid')),
+                'author_thumbnail': traverse_obj(replies, ('member', 'face')),
+                'text': traverse_obj(replies, ('content', 'message')),
+                'id': replies.get('rpid'),
+                'like_count': int_or_none(replies.get('like_count')),
+                'parent': replies.get('parent'),
+                'timestamp': unified_timestamp(replies.get('ctime_text'))
+            }
+
+        if not traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
+            yield from self._get_comments_reply(
+                root_id, comment_api_raw_data['data']['cursor']['next'], display_id)
+
+    def _get_comments(self, video_id, ep_id):
+        for i in itertools.count(0):
+            comment_api_raw_data = self._download_json(
+                'https://api.bilibili.tv/reply/web/root', video_id,
+                note=f'Downloading comment page {i + 1}',
+                query={
+                    'platform': 'web',
+                    'pn': i,  # page number
+                    'ps': 20,  # comments per page (default: 20)
+                    'oid': video_id,
+                    'type': 3 if ep_id else 1,  # 1: user generated content, 3: series content
+                    'sort_type': 1,  # 1: best, 2: recent
+                })
+
+            for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)):
+                yield {
+                    'author': traverse_obj(replies, ('member', 'name')),
+                    'author_id': traverse_obj(replies, ('member', 'mid')),
+                    'author_thumbnail': traverse_obj(replies, ('member', 'face')),
+                    'text': traverse_obj(replies, ('content', 'message')),
+                    'id': replies.get('rpid'),
+                    'like_count': int_or_none(replies.get('like_count')),
+                    'timestamp': unified_timestamp(replies.get('ctime_text')),
+                    'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))),
+                }
+                if replies.get('count'):
+                    yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id)
+
+            if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')):
+                break
+
+    def _real_extract(self, url):
+        season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
+        video_id = ep_id or aid
+        chapters = None
+
+        if ep_id:
+            intro_ending_json = self._call_api(
+                f'/web/v2/ogv/play/episode?episode_id={ep_id}&platform=web',
+                video_id, fatal=False) or {}
+            if intro_ending_json.get('skip'):
+                # FIXME: the start and end times seem to be off by a few seconds,
+                # even though they are correct according to ogv.*.js
+                # ref: https://p.bstarstatic.com/fe-static/bstar-web-new/assets/ogv.2b147442.js
+                chapters = [{
+                    'start_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 
'opening_start_time')), 1000), + 'end_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'opening_end_time')), 1000), + 'title': 'Intro' + }, { + 'start_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'ending_start_time')), 1000), + 'end_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'ending_end_time')), 1000), + 'title': 'Outro' + }] + + return { + 'id': video_id, + **self._extract_video_metadata(url, video_id, season_id), + 'formats': self._get_formats(ep_id=ep_id, aid=aid), + 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), + 'chapters': chapters, + '__post_extractor': self.extract_comments(video_id, ep_id) + } class BiliIntlSeriesIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?play/(?P<id>\d+)/?(?:[?#]|$)' + IE_NAME = 'biliIntl:series' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?(?:play|media)/(?P<id>\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613', 'playlist_mincount': 15, 'info_dict': { 'id': '34613', - 'title': 'Fly Me to the Moon', - 'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627', - 'categories': ['Romance', 'Comedy', 'Slice of life'], + 'title': 'TONIKAWA: Over the Moon For You', + 'description': 'md5:297b5a17155eb645e14a14b385ab547e', + 'categories': ['Slice of life', 'Comedy', 'Romance'], 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', 'view_count': int, }, @@ -1011,6 +1362,17 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): 'skip_download': True, }, }, { + 'url': 'https://www.bilibili.tv/en/media/1048837', + 'info_dict': { + 'id': '1048837', + 'title': 'SPY×FAMILY', + 'description': 'md5:b4434eb1a9a97ad2bccb779514b89f17', + 'categories': ['Adventure', 'Action', 'Comedy'], + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.jpg$', + 'view_count': int, + }, + 'playlist_mincount': 25, + }, { 'url': 'https://www.biliintl.com/en/play/34613', 'only_matching': True, }, { @@ -1020,9 +1382,12 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): def _entries(self, series_id): series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id) - for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]): - episode_id = str(episode.get('episode_id')) - yield self._extract_video_info(episode, ep_id=episode_id) + for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict): + episode_id = str(episode['episode_id']) + yield self.url_result(smuggle_url( + BiliIntlIE._make_url(episode_id, series_id), + self._parse_video_metadata(episode) + ), BiliIntlIE, episode_id) def _real_extract(self, url): series_id = self._match_id(url) @@ -1034,7 +1399,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): class BiliLiveIE(InfoExtractor): - _VALID_URL = r'https?://live.bilibili.com/(?P<id>\d+)' + _VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)' _TESTS = [{ 'url': 'https://live.bilibili.com/196', @@ -1050,6 +1415,9 @@ class BiliLiveIE(InfoExtractor): }, { 'url': 'https://live.bilibili.com/196?broadcast_type=0&is_room_feed=1?spm_id_from=333.999.space_home.strengthen_live_card.click', 'only_matching': True + }, { + 'url': 'https://live.bilibili.com/blanc/196', + 'only_matching': True }] _FORMATS = { @@ -1111,6 +1479,7 @@ class BiliLiveIE(InfoExtractor): 'thumbnail': room_data.get('user_cover'), 'timestamp': stream_data.get('live_time'), 'formats': formats, + 'is_live': True, 'http_headers': { 
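+            # the stream host appears to require the room page URL as Referer,
+            # so it is passed through http_headers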
'Referer': url, }, diff --git a/hypervideo_dl/extractor/bitchute.py b/hypervideo_dl/extractor/bitchute.py index 10e7b0b..0805b8b 100644 --- a/hypervideo_dl/extractor/bitchute.py +++ b/hypervideo_dl/extractor/bitchute.py @@ -2,9 +2,9 @@ import functools import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, OnDemandPagedList, clean_html, get_element_by_class, @@ -77,7 +77,10 @@ class BitChuteIE(InfoExtractor): def _check_format(self, video_url, video_id): urls = orderedSet( re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url) - for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153')) + for host in (r'\g<2>', 'seed122', 'seed125', 'seed126', 'seed128', + 'seed132', 'seed150', 'seed151', 'seed152', 'seed153', + 'seed167', 'seed171', 'seed177', 'seed305', 'seed307', + 'seedp29xb', 'zb10-7gsop1v78')) for url in urls: try: response = self._request_webpage( diff --git a/hypervideo_dl/extractor/blerp.py b/hypervideo_dl/extractor/blerp.py new file mode 100644 index 0000000..4631ad2 --- /dev/null +++ b/hypervideo_dl/extractor/blerp.py @@ -0,0 +1,167 @@ +import json + +from .common import InfoExtractor +from ..utils import strip_or_none, traverse_obj + + +class BlerpIE(InfoExtractor): + IE_NAME = 'blerp' + _VALID_URL = r'https?://(?:www\.)?blerp\.com/soundbites/(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ + 'url': 'https://blerp.com/soundbites/6320fe8745636cb4dd677a5a', + 'info_dict': { + 'id': '6320fe8745636cb4dd677a5a', + 'title': 'Samsung Galaxy S8 Over the Horizon Ringtone 2016', + 'uploader': 'luminousaj', + 'uploader_id': '5fb81e51aa66ae000c395478', + 'ext': 'mp3', + 'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'], + } + }, { + 'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f', + 'info_dict': { + 'id': '5bc94ef4796001000498429f', + 'title': 'Yee', + 'uploader': '179617322678353920', + 'uploader_id': '5ba99cf71386730004552c42', + 'ext': 'mp3', + 'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee'] + } + }] + + _GRAPHQL_OPERATIONNAME = "webBitePageGetBite" + _GRAPHQL_QUERY = ( + '''query webBitePageGetBite($_id: MongoID!) 
{ + web { + biteById(_id: $_id) { + ...bitePageFrag + __typename + } + __typename + } + } + + fragment bitePageFrag on Bite { + _id + title + userKeywords + keywords + color + visibility + isPremium + owned + price + extraReview + isAudioExists + image { + filename + original { + url + __typename + } + __typename + } + userReactions { + _id + reactions + createdAt + __typename + } + topReactions + totalSaveCount + saved + blerpLibraryType + license + licenseMetaData + playCount + totalShareCount + totalFavoriteCount + totalAddedToBoardCount + userCategory + userAudioQuality + audioCreationState + transcription + userTranscription + description + createdAt + updatedAt + author + listingType + ownerObject { + _id + username + profileImage { + filename + original { + url + __typename + } + __typename + } + __typename + } + transcription + favorited + visibility + isCurated + sourceUrl + audienceRating + strictAudienceRating + ownerId + reportObject { + reportedContentStatus + __typename + } + giphy { + mp4 + gif + __typename + } + audio { + filename + original { + url + __typename + } + mp3 { + url + __typename + } + __typename + } + __typename + } + + ''') + + def _real_extract(self, url): + audio_id = self._match_id(url) + + data = { + 'operationName': self._GRAPHQL_OPERATIONNAME, + 'query': self._GRAPHQL_QUERY, + 'variables': { + '_id': audio_id + } + } + + headers = { + 'Content-Type': 'application/json' + } + + json_result = self._download_json('https://api.blerp.com/graphql', + audio_id, data=json.dumps(data).encode('utf-8'), headers=headers) + + bite_json = json_result['data']['web']['biteById'] + + info_dict = { + 'id': bite_json['_id'], + 'url': bite_json['audio']['mp3']['url'], + 'title': bite_json['title'], + 'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none), + 'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none), + 'ext': 'mp3', + 'tags': list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None) + } + + return info_dict diff --git a/hypervideo_dl/extractor/boxcast.py b/hypervideo_dl/extractor/boxcast.py new file mode 100644 index 0000000..51f9eb7 --- /dev/null +++ b/hypervideo_dl/extractor/boxcast.py @@ -0,0 +1,102 @@ +from .common import InfoExtractor +from ..utils import ( + js_to_json, + traverse_obj, + unified_timestamp +) + + +class BoxCastVideoIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://boxcast\.tv/(?: + view-embed/| + channel/\w+\?(?:[^#]+&)?b=| + video-portal/(?:\w+/){2} + )(?P<id>[\w-]+)''' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://boxcast\.tv/view-embed/[\w-]+)'] + _TESTS = [{ + 'url': 'https://boxcast.tv/view-embed/in-the-midst-of-darkness-light-prevails-an-interdisciplinary-symposium-ozmq5eclj50ujl4bmpwx', + 'info_dict': { + 'id': 'da1eqqgkacngd5djlqld', + 'ext': 'mp4', + 'thumbnail': r're:https?://uploads\.boxcast\.com/(?:[\w+-]+/){3}.+\.png$', + 'title': 'In the Midst of Darkness Light Prevails: An Interdisciplinary Symposium', + 'release_timestamp': 1670686812, + 'release_date': '20221210', + 'uploader_id': 're8w0v8hohhvpqtbskpe', + 'uploader': 'Children\'s Health Defense', + } + }, { + 'url': 'https://boxcast.tv/video-portal/vctwevwntun3o0ikq7af/rvyblnn0fxbfjx5nwxhl/otbpltj2kzkveo2qz3ad', + 'info_dict': { + 'id': 'otbpltj2kzkveo2qz3ad', + 'ext': 'mp4', + 'uploader_id': 'vctwevwntun3o0ikq7af', + 'uploader': 'Legacy Christian Church', + 'title': 'The Quest | 1: Beginner\'s Bay | Jamie Schools', + 
'thumbnail': r're:https?://uploads.boxcast.com/(?:[\w-]+/){3}.+\.jpg' + } + }, { + 'url': 'https://boxcast.tv/channel/z03fqwaeaby5lnaawox2?b=ssihlw5gvfij2by8tkev', + 'info_dict': { + 'id': 'ssihlw5gvfij2by8tkev', + 'ext': 'mp4', + 'thumbnail': r're:https?://uploads.boxcast.com/(?:[\w-]+/){3}.+\.jpg$', + 'release_date': '20230101', + 'uploader_id': 'ds25vaazhlu4ygcvffid', + 'release_timestamp': 1672543201, + 'uploader': 'Lighthouse Ministries International - Beltsville, Maryland', + 'description': 'md5:ac23e3d01b0b0be592e8f7fe0ec3a340', + 'title': 'New Year\'s Eve CROSSOVER Service at LHMI | December 31, 2022', + } + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://childrenshealthdefense.eu/live-stream/', + 'info_dict': { + 'id': 'da1eqqgkacngd5djlqld', + 'ext': 'mp4', + 'thumbnail': r're:https?://uploads\.boxcast\.com/(?:[\w+-]+/){3}.+\.png$', + 'title': 'In the Midst of Darkness Light Prevails: An Interdisciplinary Symposium', + 'release_timestamp': 1670686812, + 'release_date': '20221210', + 'uploader_id': 're8w0v8hohhvpqtbskpe', + 'uploader': 'Children\'s Health Defense', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + webpage_json_data = self._search_json( + r'var\s*BOXCAST_PRELOAD\s*=', webpage, 'broadcast data', display_id, + transform_source=js_to_json, default={}) + + # Ref: https://support.boxcast.com/en/articles/4235158-build-a-custom-viewer-experience-with-boxcast-api + broadcast_json_data = ( + traverse_obj(webpage_json_data, ('broadcast', 'data')) + or self._download_json(f'https://api.boxcast.com/broadcasts/{display_id}', display_id)) + view_json_data = ( + traverse_obj(webpage_json_data, ('view', 'data')) + or self._download_json(f'https://api.boxcast.com/broadcasts/{display_id}/view', + display_id, fatal=False) or {}) + + formats, subtitles = [], {} + if view_json_data.get('status') == 'recorded': + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + view_json_data['playlist'], display_id) + + return { + 'id': str(broadcast_json_data['id']), + 'title': (broadcast_json_data.get('name') + or self._html_search_meta(['og:title', 'twitter:title'], webpage)), + 'description': (broadcast_json_data.get('description') + or self._html_search_meta(['og:description', 'twitter:description'], webpage) + or None), + 'thumbnail': (broadcast_json_data.get('preview') + or self._html_search_meta(['og:image', 'twitter:image'], webpage)), + 'formats': formats, + 'subtitles': subtitles, + 'release_timestamp': unified_timestamp(broadcast_json_data.get('streamed_at')), + 'uploader': broadcast_json_data.get('account_name'), + 'uploader_id': broadcast_json_data.get('account_id'), + } diff --git a/hypervideo_dl/extractor/brainpop.py b/hypervideo_dl/extractor/brainpop.py new file mode 100644 index 0000000..1200437 --- /dev/null +++ b/hypervideo_dl/extractor/brainpop.py @@ -0,0 +1,318 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import ( + classproperty, + int_or_none, + traverse_obj, + urljoin +) + + +class BrainPOPBaseIE(InfoExtractor): + _NETRC_MACHINE = 'brainpop' + _ORIGIN = '' # So that _VALID_URL doesn't crash + _LOGIN_ERRORS = { + 1502: 'The username and password you entered did not match.', # LOGIN_FAILED + 1503: 'Payment method is expired.', # LOGIN_FAILED_ACCOUNT_NOT_ACTIVE + 1506: 'Your BrainPOP plan has expired.', # LOGIN_FAILED_ACCOUNT_EXPIRED + 1507: 'Terms not accepted.', # LOGIN_FAILED_TERMS_NOT_ACCEPTED + 1508: 'Account not activated.', # 
LOGIN_FAILED_SUBSCRIPTION_NOT_ACTIVE
+        1512: 'The maximum number of devices permitted are logged in with your account right now.',  # LOGIN_FAILED_LOGIN_LIMIT_REACHED
+        1513: 'You are trying to access your account from outside of its allowed IP range.',  # LOGIN_FAILED_INVALID_IP
+        1514: 'Individual accounts are not included in your plan. Try again with your shared username and password.',  # LOGIN_FAILED_MBP_DISABLED
+        1515: 'Account not activated.',  # LOGIN_FAILED_TEACHER_NOT_ACTIVE
+        1523: 'That username and password won\'t work on this BrainPOP site.',  # LOGIN_FAILED_NO_ACCESS
+        1524: 'You\'ll need to join a class before you can login.',  # LOGIN_FAILED_STUDENT_NO_PERIOD
+        1526: 'Your account is locked. Reset your password, or ask a teacher or administrator for help.',  # LOGIN_FAILED_ACCOUNT_LOCKED
+    }
+
+    @classproperty
+    def _VALID_URL(cls):
+        root = re.escape(cls._ORIGIN).replace(r'https:', r'https?:').replace(r'www\.', r'(?:www\.)?')
+        return rf'{root}/(?P<slug>[^/]+/[^/]+/(?P<id>[^/?#&]+))'
+
+    def _assemble_formats(self, slug, format_id, display_id, token='', extra_fields={}):
+        formats = self._extract_m3u8_formats(
+            f'{urljoin(self._HLS_URL, slug)}.m3u8?{token}',
+            display_id, 'mp4', m3u8_id=f'{format_id}-hls', fatal=False)
+        formats.append({
+            'format_id': format_id,
+            'url': f'{urljoin(self._VIDEO_URL, slug)}?{token}',
+        })
+        for f in formats:
+            f.update(extra_fields)
+        return formats
+
+    def _extract_adaptive_formats(self, data, token, display_id, key_format='%s', extra_fields={}):
+        formats = []
+        additional_key_formats = {
+            '%s': {},
+            'ad_%s': {
+                'format_note': 'Audio description',
+                'source_preference': -2
+            }
+        }
+        for additional_key_format, additional_key_fields in additional_key_formats.items():
+            for key_quality, key_index in enumerate(('high', 'low')):
+                full_key_index = additional_key_format % (key_format % key_index)
+                if data.get(full_key_index):
+                    formats.extend(self._assemble_formats(data[full_key_index], full_key_index, display_id, token, {
+                        'quality': -1 - key_quality,
+                        **additional_key_fields,
+                        **extra_fields
+                    }))
+        return formats
+
+    def _perform_login(self, username, password):
+        login_res = self._download_json(
+            'https://api.brainpop.com/api/login', None,
+            data=json.dumps({'username': username, 'password': password}).encode(),
+            headers={
+                'Content-Type': 'application/json',
+                'Referer': self._ORIGIN
+            }, note='Logging in', errnote='Unable to log in', expected_status=400)
+        status_code = int_or_none(login_res['status_code'])
+        if status_code != 1505:
+            error_message = (self._LOGIN_ERRORS.get(status_code) or login_res.get('message')
+                             or f'Got status code {status_code}')
+            self.report_warning(f'Unable to login: {error_message}')
+
+
+class BrainPOPIE(BrainPOPBaseIE):
+    _ORIGIN = 'https://www.brainpop.com'
+    _VIDEO_URL = 'https://svideos.brainpop.com'
+    _HLS_URL = 'https://hls.brainpop.com'
+    _CDN_URL = 'https://cdn.brainpop.com'
+    _TESTS = [{
+        'url': 'https://www.brainpop.com/health/conflictresolution/martinlutherkingjr/movie?ref=null',
+        'md5': '3ead374233ae74c7f1b0029a01c972f0',
+        'info_dict': {
+            'id': '1f3259fa457292b4',
+            'ext': 'mp4',
+            'title': 'Martin Luther King, Jr.',
+            'display_id': 'martinlutherkingjr',
+            'description': 'md5:f403dbb2bf3ccc7cf4c59d9e43e3c349',
+        },
+    }, {
+        'url': 'https://www.brainpop.com/science/space/bigbang/',
+        'md5': '9a1ff0e77444dd9e437354eb669c87ec',
+        'info_dict': {
+            'id': 'acae52cd48c99acf',
+            'ext': 'mp4',
+            'title': 'Big Bang',
+            'display_id': 'bigbang',
+            'description': 'md5:3e53b766b0f116f631b13f4cae185d38',
+        },
+        'skip': 
'Requires login', + }] + + def _real_extract(self, url): + slug, display_id = self._match_valid_url(url).group('slug', 'id') + movie_data = self._download_json( + f'https://api.brainpop.com/api/content/published/bp/en/{slug}/movie?full=1', display_id, + 'Downloading movie data JSON', 'Unable to download movie data')['data'] + topic_data = traverse_obj(self._download_json( + f'https://api.brainpop.com/api/content/published/bp/en/{slug}?full=1', display_id, + 'Downloading topic data JSON', 'Unable to download topic data', fatal=False), + ('data', 'topic'), expected_type=dict) or movie_data['topic'] + + if not traverse_obj(movie_data, ('access', 'allow')): + reason = traverse_obj(movie_data, ('access', 'reason')) + if 'logged' in reason: + self.raise_login_required(reason, metadata_available=True) + else: + self.raise_no_formats(reason, video_id=display_id) + movie_feature = movie_data['feature'] + movie_feature_data = movie_feature['data'] + + formats, subtitles = [], {} + formats.extend(self._extract_adaptive_formats(movie_feature_data, movie_feature_data.get('token', ''), display_id, '%s_v2', { + 'language': movie_feature.get('language') or 'en', + 'language_preference': 10 + })) + for lang, localized_feature in traverse_obj(movie_feature, 'localization', default={}, expected_type=dict).items(): + formats.extend(self._extract_adaptive_formats(localized_feature, localized_feature.get('token', ''), display_id, '%s_v2', { + 'language': lang, + 'language_preference': -10 + })) + + # TODO: Do localization fields also have subtitles? + for name, url in movie_feature_data.items(): + lang = self._search_regex( + r'^subtitles_(?P<lang>\w+)$', name, 'subtitle metadata', default=None) + if lang and url: + subtitles.setdefault(lang, []).append({ + 'url': urljoin(self._CDN_URL, url) + }) + + return { + 'id': topic_data['topic_id'], + 'display_id': display_id, + 'title': topic_data.get('name'), + 'description': topic_data.get('synopsis'), + 'formats': formats, + 'subtitles': subtitles, + } + + +class BrainPOPLegacyBaseIE(BrainPOPBaseIE): + def _parse_js_topic_data(self, topic_data, display_id, token): + movie_data = topic_data['movies'] + # TODO: Are there non-burned subtitles? 
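+        # unlike BrainPOPIE above, no separate subtitle tracks are extracted
+        # for the legacy sites; only the video formats are returned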
+ formats = self._extract_adaptive_formats(movie_data, token, display_id) + + return { + 'id': topic_data['EntryID'], + 'display_id': display_id, + 'title': topic_data.get('name'), + 'alt_title': topic_data.get('title'), + 'description': topic_data.get('synopsis'), + 'formats': formats, + } + + def _real_extract(self, url): + slug, display_id = self._match_valid_url(url).group('slug', 'id') + webpage = self._download_webpage(url, display_id) + topic_data = self._search_json( + r'var\s+content\s*=\s*', webpage, 'content data', + display_id, end_pattern=';')['category']['unit']['topic'] + token = self._search_regex(r'ec_token\s*:\s*[\'"]([^\'"]+)', webpage, 'video token') + return self._parse_js_topic_data(topic_data, display_id, token) + + +class BrainPOPJrIE(BrainPOPLegacyBaseIE): + _ORIGIN = 'https://jr.brainpop.com' + _VIDEO_URL = 'https://svideos-jr.brainpop.com' + _HLS_URL = 'https://hls-jr.brainpop.com' + _CDN_URL = 'https://cdn-jr.brainpop.com' + _TESTS = [{ + 'url': 'https://jr.brainpop.com/health/feelingsandsel/emotions/', + 'md5': '04e0561bb21770f305a0ce6cf0d869ab', + 'info_dict': { + 'id': '347', + 'ext': 'mp4', + 'title': 'Emotions', + 'display_id': 'emotions', + }, + }, { + 'url': 'https://jr.brainpop.com/science/habitats/arctichabitats/', + 'md5': 'b0ed063bbd1910df00220ee29340f5d6', + 'info_dict': { + 'id': '29', + 'ext': 'mp4', + 'title': 'Arctic Habitats', + 'display_id': 'arctichabitats', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPELLIE(BrainPOPLegacyBaseIE): + _ORIGIN = 'https://ell.brainpop.com' + _VIDEO_URL = 'https://svideos-esl.brainpop.com' + _HLS_URL = 'https://hls-esl.brainpop.com' + _CDN_URL = 'https://cdn-esl.brainpop.com' + _TESTS = [{ + 'url': 'https://ell.brainpop.com/level1/unit1/lesson1/', + 'md5': 'a2012700cfb774acb7ad2e8834eed0d0', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': 'Lesson 1', + 'display_id': 'lesson1', + 'alt_title': 'Personal Pronouns', + }, + }, { + 'url': 'https://ell.brainpop.com/level3/unit6/lesson5/', + 'md5': 'be19c8292c87b24aacfb5fda2f3f8363', + 'info_dict': { + 'id': '101', + 'ext': 'mp4', + 'title': 'Lesson 5', + 'display_id': 'lesson5', + 'alt_title': 'Review: Unit 6', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPEspIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Español' + _ORIGIN = 'https://esp.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/mx' + _TESTS = [{ + 'url': 'https://esp.brainpop.com/ciencia/la_diversidad_de_la_vida/ecosistemas/', + 'md5': 'cb3f062db2b3c5240ddfcfde7108f8c9', + 'info_dict': { + 'id': '3893', + 'ext': 'mp4', + 'title': 'Ecosistemas', + 'display_id': 'ecosistemas', + 'description': 'md5:80fc55b07e241f8c8f2aa8d74deaf3c3', + }, + }, { + 'url': 'https://esp.brainpop.com/espanol/la_escritura/emily_dickinson/', + 'md5': '98c1b9559e0e33777209c425cda7dac4', + 'info_dict': { + 'id': '7146', + 'ext': 'mp4', + 'title': 'Emily Dickinson', + 'display_id': 'emily_dickinson', + 'description': 'md5:2795ad87b1d239c9711c1e92ab5a978b', + }, + 'skip': 'Requires login', + }] + + +class BrainPOPFrIE(BrainPOPLegacyBaseIE): + IE_DESC = 'BrainPOP Français' + _ORIGIN = 'https://fr.brainpop.com' + _VIDEO_URL = 'https://svideos.brainpop.com' + _HLS_URL = 'https://hls.brainpop.com' + _CDN_URL = 'https://cdn.brainpop.com/fr' + _TESTS = [{ + 'url': 'https://fr.brainpop.com/sciencesdelaterre/energie/sourcesdenergie/', + 'md5': '97e7f48af8af93f8a2be11709f239371', + 'info_dict': { + 'id': '1651', + 'ext': 'mp4', + 
+            'title': 'Sources d\'énergie',
+            'display_id': 'sourcesdenergie',
+            'description': 'md5:7eece350f019a21ef9f64d4088b2d857',
+        },
+    }, {
+        'url': 'https://fr.brainpop.com/francais/ecrire/plagiat/',
+        'md5': '0cf2b4f89804d0dd4a360a51310d445a',
+        'info_dict': {
+            'id': '5803',
+            'ext': 'mp4',
+            'title': 'Plagiat',
+            'display_id': 'plagiat',
+            'description': 'md5:4496d87127ace28e8b1eda116e77cd2b',
+        },
+        'skip': 'Requires login',
+    }]
+
+
+class BrainPOPIlIE(BrainPOPLegacyBaseIE):
+    IE_DESC = 'BrainPOP Hebrew'
+    _ORIGIN = 'https://il.brainpop.com'
+    _VIDEO_URL = 'https://svideos.brainpop.com'
+    _HLS_URL = 'https://hls.brainpop.com'
+    _CDN_URL = 'https://cdn.brainpop.com/he'
+    _TESTS = [{
+        'url': 'https://il.brainpop.com/category_9/subcategory_150/subjects_3782/',
+        'md5': '9e4ea9dc60ecd385a6e5ca12ccf31641',
+        'info_dict': {
+            'id': '3782',
+            'ext': 'mp4',
+            'title': 'md5:e993632fcda0545d9205602ec314ad67',
+            'display_id': 'subjects_3782',
+            'description': 'md5:4cc084a8012beb01f037724423a4d4ed',
+        },
+    }]
diff --git a/hypervideo_dl/extractor/bravotv.py b/hypervideo_dl/extractor/bravotv.py
index d489584..419fe8c 100644
--- a/hypervideo_dl/extractor/bravotv.py
+++ b/hypervideo_dl/extractor/bravotv.py
@@ -1,117 +1,189 @@
-import re
-
 from .adobepass import AdobePassIE
+from ..networking import HEADRequest
 from ..utils import (
-    smuggle_url,
-    update_url_query,
-    int_or_none,
+    extract_attributes,
     float_or_none,
-    try_get,
-    dict_get,
+    get_element_html_by_class,
+    int_or_none,
+    merge_dicts,
+    parse_age_limit,
+    remove_end,
+    str_or_none,
+    traverse_obj,
+    unescapeHTML,
+    unified_timestamp,
+    update_url_query,
+    url_or_none,
 )
 
 
 class BravoTVIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
     _TESTS = [{
         'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is',
-        'md5': 'e34684cfea2a96cd2ee1ef3a60909de9',
         'info_dict': {
-            'id': 'epL0pmK1kQlT',
+            'id': '3923059',
             'ext': 'mp4',
             'title': 'The Top Chef Season 16 Winner Is...',
             'description': 'Find out who takes the title of Top Chef!',
-            'uploader': 'NBCU-BRAV',
             'upload_date': '20190314',
             'timestamp': 1552591860,
             'season_number': 16,
             'episode_number': 15,
             'series': 'Top Chef',
             'episode': 'The Top Chef Season 16 Winner Is...',
-            'duration': 190.0,
-        }
+            'duration': 190.357,
+            'season': 'Season 16',
+            'thumbnail': r're:^https://.+\.jpg',
+        },
+        'params': {'skip_download': 'm3u8'},
     }, {
-        'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
-        'only_matching': True,
+        'url': 'https://www.bravotv.com/top-chef/season-20/episode-1/london-calling',
+        'info_dict': {
+            'id': '9000234570',
+            'ext': 'mp4',
+            'title': 'London Calling',
+            'description': 'md5:5af95a8cbac1856bd10e7562f86bb759',
+            'upload_date': '20230310',
+            'timestamp': 1678410000,
+            'season_number': 20,
+            'episode_number': 1,
+            'series': 'Top Chef',
+            'episode': 'London Calling',
+            'duration': 3266.03,
+            'season': 'Season 20',
+            'chapters': 'count:7',
+            'thumbnail': r're:^https://.+\.jpg',
+            'age_limit': 14,
+        },
+        'params': {'skip_download': 'm3u8'},
+        'skip': 'This video requires AdobePass MSO credentials',
+    }, {
+        'url': 'https://www.oxygen.com/in-ice-cold-blood/season-1/closing-night',
+        'info_dict': {
+            'id': '3692045',
+            'ext': 'mp4',
+            'title': 'Closing Night',
+            'description': 'md5:3170065c5c2f19548d72a4cbc254af63',
+            'upload_date': '20180401',
+            'timestamp': 1522623600,
+            'season_number': 1,
+            'episode_number': 1,
+            'series': 'In Ice Cold Blood',
+            'episode': 'Closing Night',
+            'duration': 2629.051,
+            'season': 'Season 1',
+            'chapters': 'count:6',
+            'thumbnail': r're:^https://.+\.jpg',
+            'age_limit': 14,
+        },
+        'params': {'skip_download': 'm3u8'},
+        'skip': 'This video requires AdobePass MSO credentials',
     }, {
         'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2',
+        'info_dict': {
+            'id': '3974019',
+            'ext': 'mp4',
+            'title': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)',
+            'description': 'md5:f9d638dd6946a1c1c0533a9c6100eae5',
+            'upload_date': '20190617',
+            'timestamp': 1560790800,
+            'season_number': 2,
+            'episode_number': 16,
+            'series': 'In Ice Cold Blood',
+            'episode': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)',
+            'duration': 68.235,
+            'season': 'Season 2',
+            'thumbnail': r're:^https://.+\.jpg',
+            'age_limit': 14,
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
         'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        site, display_id = self._match_valid_url(url).groups()
+        site, display_id = self._match_valid_url(url).group('site', 'id')
         webpage = self._download_webpage(url, display_id)
-        settings = self._parse_json(self._search_regex(
-            r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'),
-            display_id)
-        info = {}
+        settings = self._search_json(
+            r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>', webpage, 'settings', display_id)
+        tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '')
         query = {
-            'mbr': 'true',
+            'manifest': 'm3u',
+            'formats': 'm3u,mpeg4',
         }
-        account_pid, release_pid = [None] * 2
-        tve = settings.get('ls_tve')
+
         if tve:
-            query['manifest'] = 'm3u'
-            mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage)
-            if mobj:
-                account_pid, tp_path = mobj.groups()
-                release_pid = tp_path.strip('/').split('/')[-1]
-            else:
-                account_pid = 'HNK2IC'
-                tp_path = release_pid = tve['release_pid']
-            if tve.get('entitlement') == 'auth':
-                adobe_pass = settings.get('tve_adobe_auth', {})
-                if site == 'bravotv':
-                    site = 'bravo'
+            account_pid = tve.get('data-mpx-media-account-pid') or 'HNK2IC'
+            account_id = tve['data-mpx-media-account-id']
+            metadata = self._parse_json(
+                tve.get('data-normalized-video', ''), display_id, fatal=False, transform_source=unescapeHTML)
+            video_id = tve.get('data-guid') or metadata['guid']
+            if tve.get('data-entitlement') == 'auth':
+                auth = traverse_obj(settings, ('tve_adobe_auth', {dict})) or {}
+                site = remove_end(site, 'tv')
+                release_pid = tve['data-release-pid']
                 resource = self._get_mvpd_resource(
-                    adobe_pass.get('adobePassResourceId') or site,
-                    tve['title'], release_pid, tve.get('rating'))
-                query['auth'] = self._extract_mvpd_auth(
-                    url, release_pid,
-                    adobe_pass.get('adobePassRequestorId') or site, resource)
-        else:
-            shared_playlist = settings['ls_playlist']
-            account_pid = shared_playlist['account_pid']
-            metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']]
-            tp_path = release_pid = metadata.get('release_pid')
-            if not release_pid:
-                release_pid = metadata['guid']
-                tp_path = 'media/guid/2140479951/' + release_pid
-            info.update({
-                'title': metadata['title'],
-                'description': metadata.get('description'),
-                'season_number': int_or_none(metadata.get('season_num')),
-                'episode_number': int_or_none(metadata.get('episode_num')),
-            })
-            query['switch'] = 'progressive'
+                    tve.get('data-adobe-pass-resource-id') or auth.get('adobePassResourceId') or site,
+                    tve['data-title'], release_pid, tve.get('data-rating'))
+                query.update({
+                    'switch': 'HLSServiceSecure',
+                    'auth': self._extract_mvpd_auth(
+                        url, release_pid, auth.get('adobePassRequestorId') or site, resource),
+                })
 
-        tp_url = 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path)
+        else:
+            ls_playlist = traverse_obj(settings, ('ls_playlist', ..., {dict}), get_all=False) or {}
+            account_pid = ls_playlist.get('mpxMediaAccountPid') or 'PHSl-B'
+            account_id = ls_playlist['mpxMediaAccountId']
+            video_id = ls_playlist['defaultGuid']
+            metadata = traverse_obj(
+                ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, {dict}), get_all=False)
 
+        tp_url = f'https://link.theplatform.com/s/{account_pid}/media/guid/{account_id}/{video_id}'
         tp_metadata = self._download_json(
-            update_url_query(tp_url, {'format': 'preview'}),
-            display_id, fatal=False)
-        if tp_metadata:
-            info.update({
-                'title': tp_metadata.get('title'),
-                'description': tp_metadata.get('description'),
-                'duration': float_or_none(tp_metadata.get('duration'), 1000),
-                'season_number': int_or_none(
-                    dict_get(tp_metadata, ('pl1$seasonNumber', 'nbcu$seasonNumber'))),
-                'episode_number': int_or_none(
-                    dict_get(tp_metadata, ('pl1$episodeNumber', 'nbcu$episodeNumber'))),
-                # For some reason the series is sometimes wrapped into a single element array.
-                'series': try_get(
-                    dict_get(tp_metadata, ('pl1$show', 'nbcu$show')),
-                    lambda x: x[0] if isinstance(x, list) else x,
-                    expected_type=str),
-                'episode': dict_get(
-                    tp_metadata, ('pl1$episodeName', 'nbcu$episodeName', 'title')),
-            })
+            update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False)
+
+        seconds_or_none = lambda x: float_or_none(x, 1000)
+        chapters = traverse_obj(tp_metadata, ('chapters', ..., {
+            'start_time': ('startTime', {seconds_or_none}),
+            'end_time': ('endTime', {seconds_or_none}),
+        }))
+        # prune pointless single chapters that span the entire duration from short videos
+        if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')):
+            chapters = None
 
-        info.update({
-            '_type': 'url_transparent',
-            'id': release_pid,
-            'url': smuggle_url(update_url_query(tp_url, query), {'force_smil_url': True}),
-            'ie_key': 'ThePlatform',
-        })
-        return info
+        m3u8_url = self._request_webpage(HEADRequest(
+            update_url_query(f'{tp_url}/stream.m3u8', query)), video_id, 'Checking m3u8 URL').url
+        if 'mpeg_cenc' in m3u8_url:
+            self.report_drm(video_id)
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls')
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            'chapters': chapters,
+            **merge_dicts(traverse_obj(tp_metadata, {
+                'title': 'title',
+                'description': 'description',
+                'duration': ('duration', {seconds_or_none}),
+                'timestamp': ('pubDate', {seconds_or_none}),
+                'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}),
+                'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}),
+                'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}),
+                'episode': (('title', 'pl1$episodeNumber', 'nbcu$episodeNumber'), {str_or_none}),
+                'age_limit': ('ratings', ..., 'rating', {parse_age_limit}),
+            }, get_all=False), traverse_obj(metadata, {
+                'title': 'title',
+                'description': 'description',
+                'duration': ('durationInSeconds', {int_or_none}),
+                'timestamp': ('airDate', {unified_timestamp}),
+                'thumbnail': ('thumbnailUrl', {url_or_none}),
+                'season_number': ('seasonNumber', {int_or_none}),
+                'episode_number': ('episodeNumber', {int_or_none}),
+                'episode': 'episodeTitle',
+                'series': 'show',
+            }))
+        }
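Note: the rewritten BravoTV extractor detects DRM up front by issuing a HEAD request for the ThePlatform stream and checking the redirect target for 'mpeg_cenc' before parsing any formats. A minimal standalone sketch of the same check outside the extractor framework (URL shape and query values are taken from the diff above; everything else is illustrative):

    import urllib.parse
    import urllib.request

    def resolve_m3u8_url(tp_url, query):
        # HEAD the stream endpoint; ThePlatform redirects to the actual manifest
        url = f'{tp_url}/stream.m3u8?{urllib.parse.urlencode(query)}'
        req = urllib.request.Request(url, method='HEAD')
        with urllib.request.urlopen(req) as resp:
            final_url = resp.url
        if 'mpeg_cenc' in final_url:
            raise RuntimeError('Stream is CENC-encrypted (DRM)')
        return final_url

    # e.g. resolve_m3u8_url('https://link.theplatform.com/s/HNK2IC/media/guid/<account_id>/<guid>',
    #                       {'manifest': 'm3u', 'formats': 'm3u,mpeg4'})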
diff --git a/hypervideo_dl/extractor/brightcove.py b/hypervideo_dl/extractor/brightcove.py
index 2b7ddca..61b1841 100644
--- a/hypervideo_dl/extractor/brightcove.py
+++ b/hypervideo_dl/extractor/brightcove.py
@@ -7,10 +7,10 @@ from .adobepass import AdobePassIE
 from .common import InfoExtractor
 from ..compat import (
     compat_etree_fromstring,
-    compat_HTTPError,
     compat_parse_qs,
     compat_urlparse,
 )
+from ..networking.exceptions import HTTPError
 from ..utils import (
     clean_html,
     dict_get,
@@ -575,6 +575,7 @@ class BrightcoveNewBaseIE(AdobePassIE):
             self.raise_no_formats(
                 error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
 
+        headers.pop('Authorization', None)  # or else http formats will give error 400
         for f in formats:
             f.setdefault('http_headers', {}).update(headers)
 
@@ -895,8 +896,9 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
                 store_pk(policy_key)
                 return policy_key
 
-        api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id)
-        headers = {}
+        token = smuggled_data.get('token')
+        api_url = f'https://{"edge-auth" if token else "edge"}.api.brightcove.com/playback/v1/accounts/{account_id}/{content_type}s/{video_id}'
+        headers = {'Authorization': f'Bearer {token}'} if token else {}
         referrer = smuggled_data.get('referrer')  # XXX: notice the spelling/case of the key
         if referrer:
             headers.update({
@@ -913,8 +915,8 @@ class BrightcoveNewIE(BrightcoveNewBaseIE):
                 json_data = self._download_json(api_url, video_id, headers=headers)
                 break
             except ExtractorError as e:
-                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
-                    json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
+                if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
+                    json_data = self._parse_json(e.cause.response.read().decode(), video_id)[0]
                     message = json_data.get('message') or json_data['error_code']
                     if json_data.get('error_subcode') == 'CLIENT_GEO':
                         self.raise_geo_restricted(msg=message)
diff --git a/hypervideo_dl/extractor/callin.py b/hypervideo_dl/extractor/callin.py
index e966876..c77179c 100644
--- a/hypervideo_dl/extractor/callin.py
+++ b/hypervideo_dl/extractor/callin.py
@@ -1,9 +1,5 @@
 from .common import InfoExtractor
-from ..utils import (
-    traverse_obj,
-    float_or_none,
-    int_or_none
-)
+from ..utils import float_or_none, int_or_none, make_archive_id, traverse_obj
 
 
 class CallinIE(InfoExtractor):
@@ -35,6 +31,54 @@ class CallinIE(InfoExtractor):
             'episode_number': 1,
             'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
         }
+    }, {
+        'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW',
+        'md5': '14ede27ee2c957b7e4db93140fc0745c',
+        'info_dict': {
+            'id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5',
+            'ext': 'ts',
+            'title': 'FCC Commissioner Brendan Carr on Elon’s Starlink',
+            'description': 'Or, why the government doesn’t like SpaceX',
+            'channel': 'The Pull Request',
+            'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa',
+            'duration': 3182.472,
+            'series_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638',
+            'uploader_url': 'http://thepullrequest.com',
+            'upload_date': '20220902',
+            'episode': 'FCC Commissioner Brendan Carr on Elon’s Starlink',
+            'display_id': 'fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW',
+            'series': 'The Pull Request',
+            'channel_id': '7e9c23156e4aecfdcaef46bfb2ed7ca268509622ec006c0f0f25d90e34496638',
+            'view_count': int,
+            'uploader': 'Antonio García Martínez',
+            'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/1ade9142625344045dc17cf523469ced1d93610762f4c886d06aa190a2f979e8.png',
+            'episode_id': 'c3dab47f237bf953d180d3f243477a84302798be0e0b29bc9ade6d60a69f04f5',
+            'timestamp': 1662100688.005,
+        }
+    }, {
+        'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA',
+        'md5': '16f704ddbf82a27e3930533b12062f07',
+        'info_dict': {
+            'id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c',
+            'ext': 'ts',
+            'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?',
+            'description': 'Let’s talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.',
+            'channel': 'The DEBRIEF With Briahna Joy Gray',
+            'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm',
+            'duration': 10043.16,
+            'series_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7',
+            'uploader_url': 'http://patreon.com/badfaithpodcast',
+            'upload_date': '20220826',
+            'episode': 'Episode 81- Elites MELT DOWN over Student Debt Victory? Rumble in NYC?',
+            'display_id': 'episode-',
+            'series': 'The DEBRIEF With Briahna Joy Gray',
+            'channel_id': '61cea58444465fd26674069703bd8322993bc9e5b4f1a6d0872690554a046ff7',
+            'view_count': int,
+            'uploader': 'Briahna Gray',
+            'thumbnail': 'https://d1z76fhpoqkd01.cloudfront.net/shows/legacy/461ea0d86172cb6aff7d6c80fd49259cf5e64bdf737a4650f8bc24cf392ca218.png',
+            'episode_id': '8d06f869798f93a7814e380bceabea72d501417e620180416ff6bd510596e83c',
+            'timestamp': 1661476708.282,
+        }
     }]
 
     def try_get_user_name(self, d):
@@ -86,6 +130,7 @@ class CallinIE(InfoExtractor):
 
         return {
             'id': id,
+            '_old_archive_ids': [make_archive_id(self, display_id.rsplit('-', 1)[-1])],
             'display_id': display_id,
             'title': title,
             'formats': formats,
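Note: the callin change preserves download-archive continuity after the site's video IDs changed; make_archive_id() rebuilds the old-style archive entry from the trailing segment of the display ID. A hedged sketch of the equivalent logic (yt-dlp's helper formats entries as '<lowercased extractor key> <id>'):

    def make_archive_id(ie_key, video_id):
        # archive entry format used by the download archive
        return f'{ie_key.lower()} {video_id}'

    display_id = 'fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW'
    print(make_archive_id('Callin', display_id.rsplit('-', 1)[-1]))  # 'callin PrumRdSQJW'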
diff --git a/hypervideo_dl/extractor/camfm.py b/hypervideo_dl/extractor/camfm.py
new file mode 100644
index 0000000..a9850f4
--- /dev/null
+++ b/hypervideo_dl/extractor/camfm.py
@@ -0,0 +1,85 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    get_element_by_class,
+    get_elements_by_class,
+    join_nonempty,
+    traverse_obj,
+    unified_timestamp,
+    urljoin,
+)
+
+
+class CamFMShowIE(InfoExtractor):
+    _VALID_URL = r'https://(?:www\.)?camfm\.co\.uk/shows/(?P<id>[^/]+)'
+    _TESTS = [{
+        'playlist_mincount': 5,
+        'url': 'https://camfm.co.uk/shows/soul-mining/',
+        'info_dict': {
+            'id': 'soul-mining',
+            'thumbnail': 'md5:6a873091f92c936f23bdcce80f75e66a',
+            'title': 'Soul Mining',
+            'description': 'Telling the stories of jazz, funk and soul from all corners of the world.',
+        },
+    }]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+        page = self._download_webpage(url, show_id)
+
+        return {
+            '_type': 'playlist',
+            'id': show_id,
+            'entries': [self.url_result(urljoin('https://camfm.co.uk', i), CamFMEpisodeIE)
+                        for i in re.findall(r"javascript:popup\('(/player/[^']+)', 'listen'", page)],
+            'thumbnail': urljoin('https://camfm.co.uk', self._search_regex(
+                r'<img[^>]+class="thumb-expand"[^>]+src="([^"]+)"', page, 'thumbnail', fatal=False)),
+            'title': self._html_search_regex('<h1>([^<]+)</h1>', page, 'title', fatal=False),
+            'description': clean_html(get_element_by_class('small-12 medium-8 cell', page))
+        }
+
+
+class CamFMEpisodeIE(InfoExtractor):
+    _VALID_URL = r'https://(?:www\.)?camfm\.co\.uk/player/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'https://camfm.co.uk/player/43336',
+        'skip': 'Episode will expire - don\'t actually know when, but it will go eventually',
+        'info_dict': {
+            'id': '43336',
+            'title': 'AITAA: Am I the Agony Aunt? - 19:00 Tue 16/05/2023',
+            'ext': 'mp3',
+            'upload_date': '20230516',
+            'description': 'md5:f165144f94927c0f1bfa2ee6e6ab7bbf',
+            'timestamp': 1684263600,
+            'series': 'AITAA: Am I the Agony Aunt?',
+            'thumbnail': 'md5:5980a831360d0744c3764551be3d09c1',
+            'categories': ['Entertainment'],
+        }
+    }]
+
+    def _real_extract(self, url):
+        episode_id = self._match_id(url)
+        page = self._download_webpage(url, episode_id)
+        audios = self._parse_html5_media_entries('https://audio.camfm.co.uk', page, episode_id)
+
+        caption = get_element_by_class('caption', page)
+        series = clean_html(re.sub(r'<span[^<]+<[^<]+>', '', caption))
+
+        card_section = get_element_by_class('card-section', page)
+        date = self._html_search_regex('>Aired at ([^<]+)<', card_section, 'air date', fatal=False)
+
+        return {
+            'id': episode_id,
+            'title': join_nonempty(series, date, delim=' - '),
+            'formats': traverse_obj(audios, (..., 'formats', ...)),
+            'timestamp': unified_timestamp(date),  # XXX: Does not account for UK's daylight savings
+            'series': series,
+            'description': clean_html(re.sub(r'<b>[^<]+</b><br[^>]+/>', '', card_section)),
+            'thumbnail': urljoin('https://camfm.co.uk', self._search_regex(
+                r'<div[^>]+class="cover-art"[^>]+style="[^"]+url\(\'([^\']+)',
+                page, 'thumbnail', fatal=False)),
+            'categories': get_elements_by_class('label', caption),
+            'was_live': True,
+        }
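Note: the new CamFM extractor itself flags (in the XXX comment) that unified_timestamp() treats the scraped 'Aired at' wall-clock time as UTC, so BST air times come out an hour late. An illustrative sketch of the skew and a zoneinfo-based correction (not what the code above does):

    from datetime import datetime
    from zoneinfo import ZoneInfo

    naive = datetime.strptime('19:00 Tue 16/05/2023', '%H:%M %a %d/%m/%Y')
    as_utc = naive.replace(tzinfo=ZoneInfo('UTC'))               # 1684263600, the value in the test above
    as_london = naive.replace(tzinfo=ZoneInfo('Europe/London'))  # 1684260000, the true epoch during BST
    print(int(as_utc.timestamp()), int(as_london.timestamp()))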
diff --git a/hypervideo_dl/extractor/cammodels.py b/hypervideo_dl/extractor/cammodels.py
index 0509057..135b315 100644
--- a/hypervideo_dl/extractor/cammodels.py
+++ b/hypervideo_dl/extractor/cammodels.py
@@ -1,9 +1,5 @@
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    int_or_none,
-    url_or_none,
-)
+from ..utils import int_or_none, url_or_none
 
 
 class CamModelsIE(InfoExtractor):
@@ -17,32 +13,11 @@ class CamModelsIE(InfoExtractor):
 
     def _real_extract(self, url):
         user_id = self._match_id(url)
-        webpage = self._download_webpage(
-            url, user_id, headers=self.geo_verification_headers())
-
-        manifest_root = self._html_search_regex(
-            r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None)
-
-        if not manifest_root:
-            ERRORS = (
-                ("I'm offline, but let's stay connected", 'This user is currently offline'),
-                ('in a private show', 'This user is in a private show'),
-                ('is currently performing LIVE', 'This model is currently performing live'),
-            )
-            for pattern, message in ERRORS:
-                if pattern in webpage:
-                    error = message
-                    expected = True
-                    break
-            else:
-                error = 'Unable to find manifest URL root'
-                expected = False
-            raise ExtractorError(error, expected=expected)
-
         manifest = self._download_json(
-            '%s%s.json' % (manifest_root, user_id), user_id)
+            'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id)
 
         formats = []
+        thumbnails = []
         for format_id, format_dict in manifest['formats'].items():
             if not isinstance(format_dict, dict):
                 continue
@@ -82,12 +57,20 @@ class CamModelsIE(InfoExtractor):
                     'quality': -10,
                 })
             else:
+                if format_id == 'jpeg':
+                    thumbnails.append({
+                        'url': f['url'],
+                        'width': f['width'],
+                        'height': f['height'],
+                        'format_id': f['format_id'],
+                    })
                 continue
             formats.append(f)
 
         return {
            'id': user_id,
            'title': user_id,
+           'thumbnails': thumbnails,
            'is_live': True,
            'formats': formats,
            'age_limit': 18
diff --git a/hypervideo_dl/extractor/canalplus.py b/hypervideo_dl/extractor/canalplus.py
index b7e2f9d..3ff5c3f 100644
--- a/hypervideo_dl/extractor/canalplus.py
+++ b/hypervideo_dl/extractor/canalplus.py
@@ -64,7 +64,7 @@ class CanalplusIE(InfoExtractor):
         # response = self._request_webpage(
         #     HEADRequest(fmt_url), video_id,
         #     'Checking if the video is georestricted')
-        # if '/blocage' in response.geturl():
+        # if '/blocage' in response.url:
         #     raise ExtractorError(
         #         'The video is not available in your country',
         #         expected=True)
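Note: many hunks in this commit are mechanical fallout from hypervideo's networking rewrite: response objects expose .url instead of urllib's .geturl(), and HTTP errors carry .status and a readable .response object instead of .code and file-like reads (see canalplus above, and brightcove, cinetecamilano and ciscowebex in this section). A runnable sketch of the attribute mapping using a stand-in object (the real classes live in ..networking):

    from types import SimpleNamespace

    # stand-in for the new Response/HTTPError objects (illustrative only)
    resp = SimpleNamespace(url='https://example.com/final', status=403,
                           response=SimpleNamespace(read=lambda: b'{"error": "geo"}'))

    final_url = resp.url              # was: resp.geturl()
    if resp.status in (401, 403):     # was: resp.code
        body = resp.response.read()   # was: resp.read()
    print(final_url, body)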
diff --git a/hypervideo_dl/extractor/cbc.py b/hypervideo_dl/extractor/cbc.py
index a9f6cd2..e66071f 100644
--- a/hypervideo_dl/extractor/cbc.py
+++ b/hypervideo_dl/extractor/cbc.py
@@ -2,20 +2,23 @@ import re
 import json
 import base64
 import time
+import urllib.parse
 
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
 )
 from ..utils import (
+    ExtractorError,
     int_or_none,
     join_nonempty,
     js_to_json,
     orderedSet,
+    parse_iso8601,
     smuggle_url,
     strip_or_none,
+    traverse_obj,
     try_get,
-    ExtractorError,
 )
@@ -159,7 +162,7 @@ class CBCPlayerIE(InfoExtractor):
             'upload_date': '20160210',
             'uploader': 'CBCC-NEW',
         },
-        'skip': 'Geo-restricted to Canada',
+        'skip': 'Geo-restricted to Canada and no longer available',
     }, {
         # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
         'url': 'http://www.cbc.ca/player/play/2657631896',
@@ -172,6 +175,9 @@ class CBCPlayerIE(InfoExtractor):
             'timestamp': 1425704400,
             'upload_date': '20150307',
             'uploader': 'CBCC-NEW',
+            'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
+            'chapters': [],
+            'duration': 494.811,
         },
     }, {
         'url': 'http://www.cbc.ca/player/play/2164402062',
@@ -184,6 +190,28 @@ class CBCPlayerIE(InfoExtractor):
             'timestamp': 1320410746,
             'upload_date': '20111104',
             'uploader': 'CBCC-NEW',
+            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
+            'chapters': [],
+            'duration': 186.867,
+        },
+    }, {
+        # Has subtitles
+        # These broadcasts expire after ~1 month, can find new test URL here:
+        # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
+        'url': 'http://www.cbc.ca/player/play/2249992771553',
+        'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd',
+        'info_dict': {
+            'id': '2249992771553',
+            'ext': 'mp4',
+            'title': 'The National | Women’s soccer pay, Florida seawater, Swift quake',
+            'description': 'md5:adba28011a56cfa47a080ff198dad27a',
+            'timestamp': 1690596000,
+            'duration': 2716.333,
+            'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
+            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg',
+            'uploader': 'CBCC-NEW',
+            'chapters': 'count:5',
+            'upload_date': '20230729',
         },
     }]
 
@@ -197,12 +225,45 @@ class CBCPlayerIE(InfoExtractor):
                 'force_smil_url': True
             }),
             'id': video_id,
+            '_format_sort_fields': ('res', 'proto')  # Prioritize direct http formats over HLS
         }
+
+
+class CBCPlayerPlaylistIE(InfoExtractor):
+    IE_NAME = 'cbc.ca:player:playlist'
+    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:player/)(?!play/)(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url': 'https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast',
+        'playlist_mincount': 25,
+        'info_dict': {
+            'id': 'news/tv shows/the national/latest broadcast',
+        }
+    }, {
+        'url': 'https://www.cbc.ca/player/news/Canada/North',
+        'playlist_mincount': 25,
+        'info_dict': {
+            'id': 'news/canada/north',
+        }
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = urllib.parse.unquote(self._match_id(url)).lower()
+        webpage = self._download_webpage(url, playlist_id)
+        json_content = self._search_json(
+            r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', playlist_id)
+
+        def entries():
+            for video_id in traverse_obj(json_content, (
+                'video', 'clipsByCategory', lambda k, _: k.lower() == playlist_id, 'items', ..., 'id'
+            )):
+                yield self.url_result(f'https://www.cbc.ca/player/play/{video_id}', CBCPlayerIE)
+
+        return self.playlist_result(entries(), playlist_id)
 
 
 class CBCGemIE(InfoExtractor):
     IE_NAME = 'gem.cbc.ca'
-    _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)'
+    _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>[0-9a-z-]+/s[0-9]+[a-z][0-9]+)'
     _TESTS = [{
         # This is a normal, public, TV show video
         'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01',
@@ -245,6 +306,9 @@ class CBCGemIE(InfoExtractor):
         },
         'params': {'format': 'bv'},
         'skip': 'Geo-restricted to Canada',
+    }, {
+        'url': 'https://gem.cbc.ca/nadiyas-family-favourites/s01e01',
+        'only_matching': True,
     }]
 
     _GEO_COUNTRIES = ['CA']
@@ -346,7 +410,9 @@ class CBCGemIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id)
+        video_info = self._download_json(
+            f'https://services.radio-canada.ca/ott/cbc-api/v2/assets/{video_id}',
+            video_id, expected_status=426)
 
         email, password = self._get_login_info()
         if email and password:
@@ -401,7 +467,7 @@ class CBCGemIE(InfoExtractor):
 
 class CBCGemPlaylistIE(InfoExtractor):
     IE_NAME = 'gem.cbc.ca:playlist'
-    _VALID_URL = r'https?://gem\.cbc\.ca/media/(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)'
+    _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P<id>(?P<show>[0-9a-z-]+)/s(?P<season>[0-9]+))/?(?:[?#]|$)'
     _TESTS = [{
         # TV show playlist, all public videos
         'url': 'https://gem.cbc.ca/media/schitts-creek/s06',
@@ -411,6 +477,9 @@ class CBCGemPlaylistIE(InfoExtractor):
             'title': 'Season 6',
             'description': 'md5:6a92104a56cbeb5818cc47884d4326a2',
         },
+    }, {
+        'url': 'https://gem.cbc.ca/schitts-creek/s06',
+        'only_matching': True,
     }]
     _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/shows/'
 
@@ -418,7 +487,7 @@ class CBCGemPlaylistIE(InfoExtractor):
         match = self._match_valid_url(url)
         season_id = match.group('id')
         show = match.group('show')
-        show_info = self._download_json(self._API_BASE + show, season_id)
+        show_info = self._download_json(self._API_BASE + show, season_id, expected_status=426)
         season = int(match.group('season'))
 
         season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None)
@@ -470,49 +539,90 @@ class CBCGemLiveIE(InfoExtractor):
 class CBCGemLiveIE(InfoExtractor):
     IE_NAME = 'gem.cbc.ca:live'
-    _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)'
-    _TEST = {
-        'url': 'https://gem.cbc.ca/live/920604739687',
-        'info_dict': {
-            'title': 'Ottawa',
-            'description': 'The live TV channel and local programming from Ottawa',
-            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg',
-            'is_live': True,
-            'id': 'AyqZwxRqh8EH',
-            'ext': 'mp4',
-            'timestamp': 1492106160,
-            'upload_date': '20170413',
-            'uploader': 'CBCC-NEW',
+    _VALID_URL = r'https?://gem\.cbc\.ca/live(?:-event)?/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'https://gem.cbc.ca/live/920604739687',
+            'info_dict': {
+                'title': 'Ottawa',
+                'description': 'The live TV channel and local programming from Ottawa',
+                'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg',
+                'is_live': True,
+                'id': 'AyqZwxRqh8EH',
+                'ext': 'mp4',
+                'timestamp': 1492106160,
+                'upload_date': '20170413',
+                'uploader': 'CBCC-NEW',
+            },
+            'skip': 'Live might have ended',
         },
-        'skip': 'Live might have ended',
-    }
-
-    # It's unclear where the chars at the end come from, but they appear to be
-    # constant. Might need updating in the future.
-    # There are two URLs, some livestreams are in one, and some
-    # in the other. The JSON schema is the same for both.
-    _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT']
+        {
+            'url': 'https://gem.cbc.ca/live/44',
+            'info_dict': {
+                'id': '44',
+                'ext': 'mp4',
+                'is_live': True,
+                'title': r're:^Ottawa [0-9\-: ]+',
+                'description': 'The live TV channel and local programming from Ottawa',
+                'live_status': 'is_live',
+                'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*'
+            },
+            'params': {'skip_download': True},
+            'skip': 'Live might have ended',
+        },
+        {
+            'url': 'https://gem.cbc.ca/live-event/10835',
+            'info_dict': {
+                'id': '10835',
+                'ext': 'mp4',
+                'is_live': True,
+                'title': r're:^The National \| Biden’s trip wraps up, Paltrow testifies, Bird flu [0-9\-: ]+',
+                'description': 'March 24, 2023 | President Biden’s Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.',
+                'live_status': 'is_live',
+                'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*',
+                'timestamp': 1679706000,
+                'upload_date': '20230325',
+            },
+            'params': {'skip_download': True},
+            'skip': 'Live might have ended',
+        }
+    ]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data']
 
-        for api_url in self._API_URLS:
-            video_info = next((
-                stream for stream in self._download_json(api_url, video_id)['entries']
-                if stream.get('guid') == video_id), None)
-            if video_info:
-                break
-        else:
+        # Two types of metadata JSON
+        if not video_info.get('formattedIdMedia'):
+            video_info = traverse_obj(
+                video_info, (('freeTv', ('streams', ...)), 'items', lambda _, v: v['key'] == video_id, {dict}),
+                get_all=False, default={})
+
+        video_stream_id = video_info.get('formattedIdMedia')
+        if not video_stream_id:
             raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True)
 
+        stream_data = self._download_json(
+            'https://services.radio-canada.ca/media/validation/v2/', video_id, query={
+                'appCode': 'mpx',
+                'connectionType': 'hd',
+                'deviceType': 'ipad',
+                'idMedia': video_stream_id,
+                'multibitrate': 'true',
+                'output': 'json',
+                'tech': 'hls',
+                'manifestType': 'desktop',
+            })
+
         return {
-            '_type': 'url_transparent',
-            'ie_key': 'ThePlatform',
-            'url': video_info['content'][0]['url'],
             'id': video_id,
-            'title': video_info.get('title'),
-            'description': video_info.get('description'),
-            'tags': try_get(video_info, lambda x: x['keywords'].split(', ')),
-            'thumbnail': video_info.get('cbc$staticImage'),
+            'formats': self._extract_m3u8_formats(stream_data['url'], video_id, 'mp4', live=True),
             'is_live': True,
+            **traverse_obj(video_info, {
+                'title': 'title',
+                'description': 'description',
+                'thumbnail': ('images', 'card', 'url'),
+                'timestamp': ('airDate', {parse_iso8601}),
+            })
         }
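Note: CBCGemLiveIE now resolves streams through Radio-Canada's media validation service instead of the retired tpfeed.cbc.ca feeds. A minimal standalone sketch of the same lookup (query values copied from the diff above; id_media is the formattedIdMedia value scraped from the page's Next.js data):

    import json
    import urllib.parse
    import urllib.request

    def get_live_hls_url(id_media):
        query = urllib.parse.urlencode({
            'appCode': 'mpx', 'connectionType': 'hd', 'deviceType': 'ipad',
            'idMedia': id_media, 'multibitrate': 'true', 'output': 'json',
            'tech': 'hls', 'manifestType': 'desktop',
        })
        with urllib.request.urlopen(f'https://services.radio-canada.ca/media/validation/v2/?{query}') as resp:
            return json.load(resp)['url']  # master m3u8, fed to the HLS extractor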
diff --git a/hypervideo_dl/extractor/cbs.py b/hypervideo_dl/extractor/cbs.py
index 9aacd50..1c0dbde 100644
--- a/hypervideo_dl/extractor/cbs.py
+++ b/hypervideo_dl/extractor/cbs.py
@@ -1,8 +1,14 @@
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
 from .theplatform import ThePlatformFeedIE
+from .youtube import YoutubeIE
 from ..utils import (
     ExtractorError,
+    extract_attributes,
+    get_element_html_by_id,
     int_or_none,
     find_xpath_attr,
+    smuggle_url,
     xpath_element,
     xpath_text,
     update_url_query,
@@ -162,3 +168,110 @@ class CBSIE(CBSBaseIE):
             'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
             'thumbnail': url_or_none(xpath_text(video_data, 'previewImageURL')),
         })
+
+
+class ParamountPressExpressIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?paramountpressexpress\.com(?:/[\w-]+)+/(?P<yt>yt-)?video/?\?watch=(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://www.paramountpressexpress.com/cbs-entertainment/shows/survivor/video/?watch=pnzew7e2hx',
+        'md5': '56631dbcadaab980d1fc47cb7b76cba4',
+        'info_dict': {
+            'id': '6322981580112',
+            'ext': 'mp4',
+            'title': 'I’m Felicia',
+            'description': 'md5:88fad93f8eede1c9c8f390239e4c6290',
+            'uploader_id': '6055873637001',
+            'upload_date': '20230320',
+            'timestamp': 1679334960,
+            'duration': 49.557,
+            'thumbnail': r're:^https://.+\.jpg',
+            'tags': [],
+        },
+    }, {
+        'url': 'https://www.paramountpressexpress.com/cbs-entertainment/video/?watch=2s5eh8kppc',
+        'md5': 'edcb03e3210b88a3e56c05aa863e0e5b',
+        'info_dict': {
+            'id': '6323036027112',
+            'ext': 'mp4',
+            'title': '‘Y&R’ Set Visit: Jerry O’Connell Quizzes Cast on Pre-Love Scene Rituals and More',
+            'description': 'md5:b929867a357aac5544b783d834c78383',
+            'uploader_id': '6055873637001',
+            'upload_date': '20230321',
+            'timestamp': 1679430180,
+            'duration': 132.032,
+            'thumbnail': r're:^https://.+\.jpg',
+            'tags': [],
+        },
+    }, {
+        'url': 'https://www.paramountpressexpress.com/paramount-plus/yt-video/?watch=OX9wJWOcqck',
+        'info_dict': {
+            'id': 'OX9wJWOcqck',
+            'ext': 'mp4',
+            'title': 'Rugrats | Season 2 Official Trailer | Paramount+',
+            'description': 'md5:1f7e26f5625a9f0d6564d9ad97a9f7de',
+            'uploader': 'Paramount Plus',
+            'uploader_id': '@paramountplus',
+            'uploader_url': 'http://www.youtube.com/@paramountplus',
+            'channel': 'Paramount Plus',
+            'channel_id': 'UCrRttZIypNTA1Mrfwo745Sg',
+            'channel_url': 'https://www.youtube.com/channel/UCrRttZIypNTA1Mrfwo745Sg',
+            'upload_date': '20230316',
+            'duration': 88,
+            'age_limit': 0,
+            'availability': 'public',
+            'live_status': 'not_live',
+            'playable_in_embed': True,
+            'view_count': int,
+            'like_count': int,
+            'channel_follower_count': int,
+            'thumbnail': 'https://i.ytimg.com/vi/OX9wJWOcqck/maxresdefault.jpg',
+            'categories': ['Entertainment'],
+            'tags': ['Rugrats'],
+        },
+    }, {
+        'url': 'https://www.paramountpressexpress.com/showtime/yt-video/?watch=_ljssSoDLkw',
+        'info_dict': {
+            'id': '_ljssSoDLkw',
+            'ext': 'mp4',
+            'title': 'Lavell Crawford: THEE Lavell Crawford Comedy Special Official Trailer | SHOWTIME',
+            'description': 'md5:39581bcc3fd810209b642609f448af70',
+            'uploader': 'SHOWTIME',
+            'uploader_id': '@Showtime',
+            'uploader_url': 'http://www.youtube.com/@Showtime',
+            'channel': 'SHOWTIME',
+            'channel_id': 'UCtwMWJr2BFPkuJTnSvCESSQ',
+            'channel_url': 'https://www.youtube.com/channel/UCtwMWJr2BFPkuJTnSvCESSQ',
+            'upload_date': '20230209',
+            'duration': 49,
+            'age_limit': 0,
+            'availability': 'public',
+            'live_status': 'not_live',
+            'playable_in_embed': True,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'channel_follower_count': int,
+            'thumbnail': 'https://i.ytimg.com/vi_webp/_ljssSoDLkw/maxresdefault.webp',
+            'categories': ['People & Blogs'],
+            'tags': 'count:27',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id, is_youtube = self._match_valid_url(url).group('id', 'yt')
+        if is_youtube:
+            return self.url_result(display_id, YoutubeIE)
+
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(
+            r'\bvideo_id\s*=\s*["\'](\d+)["\']\s*,', webpage, 'Brightcove ID')
+        token = self._search_regex(r'\btoken\s*=\s*["\']([\w.-]+)["\']', webpage, 'token')
+
+        player = extract_attributes(get_element_html_by_id('vcbrightcoveplayer', webpage) or '')
+        account_id = player.get('data-account') or '6055873637001'
+        player_id = player.get('data-player') or 'OtLKgXlO9F'
+        embed = player.get('data-embed') or 'default'
+
+        return self.url_result(smuggle_url(
+            f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}',
+            {'token': token}), BrightcoveNewIE)
diff --git a/hypervideo_dl/extractor/cbsnews.py b/hypervideo_dl/extractor/cbsnews.py
index 16edf3a..5a8ebb8 100644
--- a/hypervideo_dl/extractor/cbsnews.py
+++ b/hypervideo_dl/extractor/cbsnews.py
@@ -1,36 +1,153 @@
+import base64
 import re
+import urllib.error
+import urllib.parse
 import zlib
 
+from .anvato import AnvatoIE
 from .common import InfoExtractor
-from .cbs import CBSIE
-from ..compat import (
-    compat_b64decode,
-    compat_urllib_parse_unquote,
-)
+from .paramountplus import ParamountPlusIE
+from ..networking import HEADRequest
 from ..utils import (
+    ExtractorError,
+    UserNotLive,
+    determine_ext,
+    float_or_none,
+    format_field,
+    int_or_none,
+    make_archive_id,
+    mimetype2ext,
     parse_duration,
+    smuggle_url,
+    traverse_obj,
+    url_or_none,
 )
 
 
-class CBSNewsEmbedIE(CBSIE):  # XXX: Do not subclass from concrete IE
+class CBSNewsBaseIE(InfoExtractor):
+    _LOCALES = {
+        'atlanta': None,
+        'baltimore': 'BAL',
+        'boston': 'BOS',
+        'chicago': 'CHI',
+        'colorado': 'DEN',
+        'detroit': 'DET',
+        'losangeles': 'LA',
+        'miami': 'MIA',
+        'minnesota': 'MIN',
+        'newyork': 'NY',
+        'philadelphia': 'PHI',
+        'pittsburgh': 'PIT',
+        'sacramento': 'SAC',
+        'sanfrancisco': 'SF',
+        'texas': 'DAL',
+    }
+    _LOCALE_RE = '|'.join(map(re.escape, _LOCALES))
+    _ANVACK = '5VD6Eyd6djewbCmNwBFnsJj17YAvGRwl'
+
+    def _get_item(self, webpage, display_id):
+        return traverse_obj(self._search_json(
+            r'CBSNEWS\.defaultPayload\s*=', webpage, 'payload', display_id,
+            default={}), ('items', 0, {dict})) or {}
+
+    def _get_video_url(self, item):
+        return traverse_obj(item, 'video', 'video2', expected_type=url_or_none)
+
+    def _extract_playlist(self, webpage, playlist_id):
+        entries = [self.url_result(embed_url, CBSNewsEmbedIE) for embed_url in re.findall(
+            r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage)]
+        if entries:
+            return self.playlist_result(
+                entries, playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage),
+                self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage))
+
+    def _extract_video(self, item, video_url, video_id):
+        if mimetype2ext(item.get('format'), default=determine_ext(video_url)) == 'mp4':
+            formats = [{'url': video_url, 'ext': 'mp4'}]
+
+        else:
+            manifest = self._download_webpage(video_url, video_id, note='Downloading m3u8 information')
+
+            anvato_id = self._search_regex(r'anvato-(\d+)', manifest, 'Anvato ID', default=None)
+            # Prefer Anvato if available; cbsnews.com m3u8 formats are re-encoded from Anvato source
+            if anvato_id:
+                return self.url_result(
+                    smuggle_url(f'anvato:{self._ANVACK}:{anvato_id}', {'token': 'default'}),
+                    AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)])
+
+            formats, _ = self._parse_m3u8_formats_and_subtitles(
+                manifest, video_url, 'mp4', m3u8_id='hls', video_id=video_id)
+
+        def get_subtitles(subs_url):
+            return {
+                'en': [{
+                    'url': subs_url,
+                    'ext': 'dfxp',  # TTAF1
+                }],
+            } if url_or_none(subs_url) else None
+
+        episode_meta = traverse_obj(item, {
+            'season_number': ('season', {int_or_none}),
+            'episode_number': ('episode', {int_or_none}),
+        }) if item.get('isFullEpisode') else {}
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            **traverse_obj(item, {
+                'title': (None, ('fulltitle', 'title')),
+                'description': 'dek',
+                'timestamp': ('timestamp', {lambda x: float_or_none(x, 1000)}),
+                'duration': ('duration', {float_or_none}),
+                'subtitles': ('captions', {get_subtitles}),
+                'thumbnail': ('images', ('hd', 'sd'), {url_or_none}),
+                'is_live': ('type', {lambda x: x == 'live'}),
+            }, get_all=False),
+            **episode_meta,
+        }
+
+
+class CBSNewsEmbedIE(CBSNewsBaseIE):
     IE_NAME = 'cbsnews:embed'
     _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P<id>.+)'
     _TESTS = [{
         'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A',
-        'only_matching': True,
+        'info_dict': {
+            'id': '6ZP4cXvo9FaX3VLH7MF4CgY30JFpY_GA',
+            'ext': 'mp4',
+            'title': 'Cops investigate gorilla incident at Cincinnati Zoo',
+            'description': 'md5:fee7441ab8aaeb3c693482394738102b',
+            'duration': 350,
+            'timestamp': 1464719713,
+            'upload_date': '20160531',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {'skip_download': 'm3u8'},
     }]
 
     def _real_extract(self, url):
-        item = self._parse_json(zlib.decompress(compat_b64decode(
-            compat_urllib_parse_unquote(self._match_id(url))),
-            -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0]
-        return self._extract_video_info(item['mpxRefId'], 'cbsnews')
+        item = traverse_obj(self._parse_json(zlib.decompress(base64.b64decode(
+            urllib.parse.unquote(self._match_id(url))),
+            -zlib.MAX_WBITS).decode(), None), ('video', 'items', 0, {dict})) or {}
 
+        video_id = item['mpxRefId']
+        video_url = self._get_video_url(item)
+        if not video_url:
+            # Old embeds redirect user to ParamountPlus but most links are 404
+            pplus_url = f'https://www.paramountplus.com/shows/video/{video_id}'
+            try:
+                self._request_webpage(HEADRequest(pplus_url), video_id)
+                return self.url_result(pplus_url, ParamountPlusIE)
+            except ExtractorError:
+                self.raise_no_formats('This video is no longer available', True, video_id)
 
-class CBSNewsIE(CBSIE):  # XXX: Do not subclass from concrete IE
+        return self._extract_video(item, video_url, video_id)
+
+
+class CBSNewsIE(CBSNewsBaseIE):
     IE_NAME = 'cbsnews'
     IE_DESC = 'CBS News'
-    _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\da-z_-]+)'
+    _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\w-]+)'
 
     _TESTS = [
         {
@@ -47,10 +164,7 @@ class CBSNewsIE(CBSIE):  # XXX: Do not subclass from concrete IE
                 'timestamp': 1476046464,
                 'upload_date': '20161009',
             },
-            'params': {
-                # rtmp download
-                'skip_download': True,
-            },
+            'skip': 'This video is no longer available',
         },
         {
             'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
@@ -61,48 +175,234 @@ class CBSNewsIE(CBSIE):  # XXX: Do not subclass from concrete IE
                 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
                 'upload_date': '20140404',
                 'timestamp': 1396650660,
-                'uploader': 'CBSI-NEW',
                 'thumbnail': r're:^https?://.*\.jpg$',
                 'duration': 205,
                 'subtitles': {
                     'en': [{
-                        'ext': 'ttml',
+                        'ext': 'dfxp',
                     }],
                 },
             },
             'params': {
-                # m3u8 download
-                'skip_download': True,
+                'skip_download': 'm3u8',
             },
         },
         {
            # 48 hours
            'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/',
            'info_dict': {
+                'id': 'maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved',
                'title': 'Cold as Ice',
                'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?',
            },
            'playlist_mincount': 7,
        },
+        {
+            'url': 'https://www.cbsnews.com/video/032823-cbs-evening-news/',
+            'info_dict': {
+                'id': '_2wuO7hD9LwtyM_TwSnVwnKp6kxlcXgE',
+                'ext': 'mp4',
+                'title': 'CBS Evening News, March 28, 2023',
+                'description': 'md5:db20615aae54adc1d55a1fd69dc75d13',
+                'duration': 1189,
+                'timestamp': 1680042600,
+                'upload_date': '20230328',
+                'season': 'Season 2023',
+                'season_number': 2023,
+                'episode': 'Episode 83',
+                'episode_number': 83,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': 'm3u8',
+            },
+        },
     ]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        playlist = self._extract_playlist(webpage, display_id)
+        if playlist:
+            return playlist
 
+        item = self._get_item(webpage, display_id)
+        video_id = item.get('mpxRefId') or display_id
+        video_url = self._get_video_url(item)
+        if not video_url:
+            self.raise_no_formats('No video content was found', expected=True, video_id=video_id)
+
+        return self._extract_video(item, video_url, video_id)
+
+
+class CBSLocalBaseIE(CBSNewsBaseIE):
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
 
-        entries = []
-        for embed_url in re.findall(r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage):
-            entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key()))
-        if entries:
-            return self.playlist_result(
-                entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage),
-                playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage))
+        item = self._get_item(webpage, display_id)
+        video_id = item.get('mpxRefId') or display_id
+        anvato_id = None
+        video_url = self._get_video_url(item)
+
+        if not video_url:
+            anv_params = self._search_regex(
+                r'<iframe[^>]+\bdata-src="https?://w3\.mp\.lura\.live/player/prod/v3/anvload\.html\?key=([^"]+)"',
+                webpage, 'Anvato URL', default=None)
+
+            if not anv_params:
+                playlist = self._extract_playlist(webpage, display_id)
+                if playlist:
+                    return playlist
+                self.raise_no_formats('No video content was found', expected=True, video_id=video_id)
+
+            anv_data = self._parse_json(base64.urlsafe_b64decode(f'{anv_params}===').decode(), video_id)
+            anvato_id = anv_data['v']
+            return self.url_result(
+                smuggle_url(f'anvato:{anv_data.get("anvack") or self._ANVACK}:{anvato_id}', {
+                    'token': anv_data.get('token') or 'default',
+                }), AnvatoIE, url_transparent=True, _old_archive_ids=[make_archive_id(self, anvato_id)])
+
+        return self._extract_video(item, video_url, video_id)
+
 
-        item = self._parse_json(self._html_search_regex(
-            r'CBSNEWS\.defaultPayload\s*=\s*({.+})',
-            webpage, 'video JSON info'), display_id)['items'][0]
-        return self._extract_video_info(item['mpxRefId'], 'cbsnews')
+class CBSLocalIE(CBSLocalBaseIE):
+    _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/(?:live/)?video/(?P<id>[\w-]+)'
+    _TESTS = [{
+        # Anvato video via defaultPayload JSON
+        'url': 'https://www.cbsnews.com/newyork/video/1st-cannabis-dispensary-opens-in-queens/',
+        'info_dict': {
+            'id': '6376747',
+            'ext': 'mp4',
+            'title': '1st cannabis dispensary opens in Queens',
+            'description': 'The dispensary is women-owned and located in Jamaica.',
+            'uploader': 'CBS',
+            'duration': 20,
+            'timestamp': 1680193657,
+            'upload_date': '20230330',
+            'categories': ['Stations\\Spoken Word\\WCBSTV', 'Content\\Google', 'Content\\News', 'Content\\News\\Local News'],
+            'tags': 'count:11',
+            'thumbnail': 're:^https?://.*',
+            '_old_archive_ids': ['cbslocal 6376747'],
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        # cbsnews.com video via defaultPayload JSON
+        'url': 'https://www.cbsnews.com/newyork/live/video/20230330171655-the-city-is-sounding-the-alarm-on-dangerous-social-media-challenges/',
+        'info_dict': {
+            'id': 'sJqfw7YvgSC6ant2zVmzt3y1jYKoL5J3',
+            'ext': 'mp4',
+            'title': 'the city is sounding the alarm on dangerous social media challenges',
+            'description': 'md5:8eccc9b1b73be5138a52e9c4350d2cd6',
+            'thumbnail': 'https://images-cbsn.cbsnews.com/prod/2023/03/30/story_22509622_1680196925.jpg',
+            'duration': 41.0,
+            'timestamp': 1680196615,
+            'upload_date': '20230330',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+
+class CBSLocalArticleIE(CBSLocalBaseIE):
+    _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?:{CBSNewsBaseIE._LOCALE_RE})/news/(?P<id>[\w-]+)'
+    _TESTS = [{
+        # Anvato video via iframe embed
+        'url': 'https://www.cbsnews.com/newyork/news/mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service/',
+        'playlist_count': 2,
+        'info_dict': {
+            'id': 'mta-station-agents-leaving-their-booths-to-provide-more-direct-customer-service',
+            'title': 'MTA station agents begin leaving their booths to provide more direct customer service',
+            'description': 'The more than 2,200 agents will provide face-to-face customer service to passengers.',
+        },
+    }, {
+        'url': 'https://www.cbsnews.com/losangeles/news/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis/',
+        'md5': 'f0ee3081e3843f575fccef901199b212',
+        'info_dict': {
+            'id': '3401037',
+            'ext': 'mp4',
+            'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
+            'thumbnail': 're:^https?://.*',
+            'timestamp': 1463440500,
+            'upload_date': '20160516',
+        },
+        'skip': 'Video has been removed',
+    }]
+
+
+class CBSNewsLiveBaseIE(CBSNewsBaseIE):
+    def _get_id(self, url):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
+    def _real_extract(self, url):
+        video_id = self._get_id(url)
+        if not video_id:
+            raise ExtractorError('Livestream is not available', expected=True)
+
+        data = traverse_obj(self._download_json(
+            'https://feeds-cbsn.cbsnews.com/2.0/rundown/', video_id, query={
+                'partner': 'cbsnsite',
+                'edition': video_id,
+                'type': 'live',
+            }), ('navigation', 'data', 0, {dict}))
+
+        video_url = traverse_obj(data, (('videoUrlDAI', ('videoUrl', 'base')), {url_or_none}), get_all=False)
+        if not video_url:
+            raise UserNotLive(video_id=video_id)
+
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', m3u8_id='hls')
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            'is_live': True,
+            **traverse_obj(data, {
+                'title': 'headline',
+                'description': 'rundown_slug',
+                'thumbnail': ('images', 'thumbnail_url_hd', {url_or_none}),
+            }),
+        }
+
+
+class CBSLocalLiveIE(CBSNewsLiveBaseIE):
+    _VALID_URL = rf'https?://(?:www\.)?cbsnews\.com/(?P<id>{CBSNewsBaseIE._LOCALE_RE})/live/?(?:[?#]|$)'
+    _TESTS = [{
+        'url': 'https://www.cbsnews.com/losangeles/live/',
+        'info_dict': {
+            'id': 'CBSN-LA',
+            'ext': 'mp4',
+            'title': str,
+            'description': r're:KCBS/CBSN_LA.CRISPIN.\w+.RUNDOWN \w+ \w+',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'live_status': 'is_live',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    def _get_id(self, url):
+        return format_field(self._LOCALES, self._match_id(url), 'CBSN-%s')
+
+
+class CBSNewsLiveIE(CBSNewsLiveBaseIE):
+    IE_NAME = 'cbsnews:live'
+    IE_DESC = 'CBS News Livestream'
+    _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/?(?:[?#]|$)'
+    _TESTS = [{
+        'url': 'https://www.cbsnews.com/live/',
+        'info_dict': {
+            'id': 'CBSN-US',
+            'ext': 'mp4',
+            'title': str,
+            'description': r're:\w+ \w+ CRISPIN RUNDOWN',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'live_status': 'is_live',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    def _get_id(self, url):
+        return 'CBSN-US'
 
 
 class CBSNewsLiveVideoIE(InfoExtractor):
@@ -111,7 +411,7 @@ class CBSNewsLiveVideoIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[^/?#]+)'
 
     # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
         'info_dict': {
             'id': 'clinton-sanders-prepare-to-face-off-in-nh',
@@ -120,7 +420,7 @@ class CBSNewsLiveVideoIE(InfoExtractor):
             'duration': 334,
         },
         'skip': 'Video gone',
-    }
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
@@ -131,13 +431,13 @@ class CBSNewsLiveVideoIE(InfoExtractor):
             'dvr_slug': display_id,
         })
 
-        formats = self._extract_akamai_formats(video_info['url'], display_id)
-
         return {
             'id': display_id,
             'display_id': display_id,
-            'title': video_info['headline'],
-            'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'),
-            'duration': parse_duration(video_info.get('segmentDur')),
-            'formats': formats,
+            'formats': self._extract_akamai_formats(video_info['url'], display_id),
+            **traverse_obj(video_info, {
+                'title': 'headline',
+                'thumbnail': ('thumbnail_url_hd', {url_or_none}),
+                'duration': ('segmentDur', {parse_duration}),
+            }),
         }
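Note: CBSNewsEmbedIE's URL fragment is still a URL-quoted, base64-encoded, headerless-deflate JSON blob; the rewrite only swaps the compat helpers for the stdlib and wraps the result in traverse_obj. A sketch of the decode pipeline (-zlib.MAX_WBITS tells zlib the stream carries no zlib/gzip header):

    import base64
    import json
    import urllib.parse
    import zlib

    def decode_embed_fragment(fragment):
        raw = base64.b64decode(urllib.parse.unquote(fragment))
        payload = zlib.decompress(raw, -zlib.MAX_WBITS).decode()
        return json.loads(payload)['video']['items'][0]  # the 'item' dict used above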
diff --git a/hypervideo_dl/extractor/cda.py b/hypervideo_dl/extractor/cda.py
index d1212e6..1157114 100644
--- a/hypervideo_dl/extractor/cda.py
+++ b/hypervideo_dl/extractor/cda.py
@@ -4,6 +4,7 @@ import datetime
 import hashlib
 import hmac
 import json
+import random
 import re
 
 from .common import InfoExtractor
@@ -27,11 +28,10 @@ class CDAIE(InfoExtractor):
     _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
     _NETRC_MACHINE = 'cdapl'
 
-    _BASE_URL = 'http://www.cda.pl/'
+    _BASE_URL = 'https://www.cda.pl'
     _BASE_API_URL = 'https://api.cda.pl'
     _API_HEADERS = {
         'Accept': 'application/vnd.cda.public+json',
-        'User-Agent': 'pl.cda 1.0 (version 1.2.88 build 15306; Android 9; Xiaomi Redmi 3S)',
     }
     # hardcoded in the app
     _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q'
@@ -101,6 +101,38 @@ class CDAIE(InfoExtractor):
         }, **kwargs)
 
     def _perform_login(self, username, password):
+        app_version = random.choice((
+            '1.2.88 build 15306',
+            '1.2.174 build 18469',
+        ))
+        android_version = random.randrange(8, 14)
+        phone_model = random.choice((
+            # x-kom.pl top selling Android smartphones, as of 2022-12-26
+            # https://www.x-kom.pl/g-4/c/1590-smartfony-i-telefony.html?f201-system-operacyjny=61322-android
+            'ASUS ZenFone 8',
+            'Motorola edge 20 5G',
+            'Motorola edge 30 neo 5G',
+            'Motorola moto g22',
+            'OnePlus Nord 2T 5G',
+            'Samsung Galaxy A32 SM‑A325F',
+            'Samsung Galaxy M13',
+            'Samsung Galaxy S20 FE 5G',
+            'Xiaomi 11T',
+            'Xiaomi POCO M4 Pro',
+            'Xiaomi Redmi 10',
+            'Xiaomi Redmi 10C',
+            'Xiaomi Redmi 9C NFC',
+            'Xiaomi Redmi Note 10 Pro',
+            'Xiaomi Redmi Note 11 Pro',
+            'Xiaomi Redmi Note 11',
+            'Xiaomi Redmi Note 11S 5G',
+            'Xiaomi Redmi Note 11S',
+            'realme 10',
+            'realme 9 Pro+',
+            'vivo Y33s',
+        ))
+        self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'
+
         cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
         if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5:
             self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
@@ -138,9 +170,6 @@ class CDAIE(InfoExtractor):
         meta = self._download_json(
             f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video']
 
-        if meta.get('premium') and not meta.get('premium_free'):
-            self.report_drm(video_id)
-
         uploader = traverse_obj(meta, 'author', 'login')
 
         formats = [{
@@ -151,6 +180,10 @@ class CDAIE(InfoExtractor):
             'filesize': quality.get('length'),
         } for quality in meta['qualities'] if quality.get('file')]
 
+        if meta.get('premium') and not meta.get('premium_free') and not formats:
+            raise ExtractorError(
+                'Video requires CDA Premium - subscription needed', expected=True)
+
         return {
             'id': video_id,
             'title': meta.get('title'),
@@ -167,10 +200,10 @@ class CDAIE(InfoExtractor):
     def _web_extract(self, video_id, url):
         self._set_cookie('cda.pl', 'cda.player', 'html5')
         webpage = self._download_webpage(
-            self._BASE_URL + '/video/' + video_id, video_id)
+            f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)
 
         if 'Ten film jest dostępny dla użytkowników premium' in webpage:
-            raise ExtractorError('This video is only available for premium users.', expected=True)
+            self.raise_login_required('This video is only available for premium users')
 
         if re.search(r'niedostępn[ey] w(?: |\s+)Twoim kraju\s*<', webpage):
             self.raise_geo_restricted()
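Note: CDA logins now present a freshly randomized mobile-app User-Agent instead of the single hardcoded Redmi 3S string. The header always keeps the same shape; a sketch reusing values from the diff's pools:

    import random

    app_version = random.choice(('1.2.88 build 15306', '1.2.174 build 18469'))
    android_version = random.randrange(8, 14)
    phone_model = random.choice(('Xiaomi Redmi 10', 'OnePlus Nord 2T 5G'))  # subset of the full pool
    print(f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})')
    # e.g. 'pl.cda 1.0 (version 1.2.174 build 18469; Android 12; Xiaomi Redmi 10)'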
diff --git a/hypervideo_dl/extractor/ceskatelevize.py b/hypervideo_dl/extractor/ceskatelevize.py
index be2b0bb..8390160 100644
--- a/hypervideo_dl/extractor/ceskatelevize.py
+++ b/hypervideo_dl/extractor/ceskatelevize.py
@@ -1,20 +1,20 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_unquote,
-    compat_urllib_parse_urlparse,
-)
+from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse
+from ..networking import Request
 from ..utils import (
     ExtractorError,
     float_or_none,
-    sanitized_Request,
     str_or_none,
     traverse_obj,
     urlencode_postdata,
-    USER_AGENTS,
 )
 
+USER_AGENTS = {
+    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
+}
+
 
 class CeskaTelevizeIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
@@ -97,7 +97,7 @@ class CeskaTelevizeIE(InfoExtractor):
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
         webpage, urlh = self._download_webpage_handle(url, playlist_id)
-        parsed_url = compat_urllib_parse_urlparse(urlh.geturl())
+        parsed_url = compat_urllib_parse_urlparse(urlh.url)
         site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize')
         playlist_title = self._og_search_title(webpage, default=None)
         if site_name and playlist_title:
@@ -163,16 +163,16 @@ class CeskaTelevizeIE(InfoExtractor):
 
         entries = []
         for user_agent in (None, USER_AGENTS['Safari']):
-            req = sanitized_Request(
+            req = Request(
                 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/',
                 data=urlencode_postdata(data))
 
-            req.add_header('Content-type', 'application/x-www-form-urlencoded')
-            req.add_header('x-addr', '127.0.0.1')
-            req.add_header('X-Requested-With', 'XMLHttpRequest')
+            req.headers['Content-type'] = 'application/x-www-form-urlencoded'
+            req.headers['x-addr'] = '127.0.0.1'
+            req.headers['X-Requested-With'] = 'XMLHttpRequest'
             if user_agent:
-                req.add_header('User-Agent', user_agent)
-            req.add_header('Referer', url)
+                req.headers['User-Agent'] = user_agent
+            req.headers['Referer'] = url
 
             playlistpage = self._download_json(req, playlist_id, fatal=False)
 
@@ -183,8 +183,8 @@ class CeskaTelevizeIE(InfoExtractor):
             if playlist_url == 'error_region':
                 raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
 
-            req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
-            req.add_header('Referer', url)
+            req = Request(compat_urllib_parse_unquote(playlist_url))
+            req.headers['Referer'] = url
 
             playlist = self._download_json(req, playlist_id, fatal=False)
             if not playlist:
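Note: the ceskatelevize hunks illustrate the new ..networking.Request API: headers are set through a dict-like mapping rather than urllib's add_header(). A sketch of the pattern, assuming only what the diff shows (a mutable .headers attribute on Request):

    from hypervideo_dl.networking import Request

    req = Request('https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=b'...')
    req.headers['Content-type'] = 'application/x-www-form-urlencoded'
    req.headers['x-addr'] = '127.0.0.1'
    req.headers['X-Requested-With'] = 'XMLHttpRequest'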
diff --git a/hypervideo_dl/extractor/chilloutzone.py b/hypervideo_dl/extractor/chilloutzone.py
index 1a2f77c..ac4252f 100644
--- a/hypervideo_dl/extractor/chilloutzone.py
+++ b/hypervideo_dl/extractor/chilloutzone.py
@@ -1,93 +1,123 @@
-import json
+import base64
 
 from .common import InfoExtractor
-from .youtube import YoutubeIE
-from ..compat import compat_b64decode
 from ..utils import (
     clean_html,
-    ExtractorError
+    int_or_none,
+    traverse_obj,
 )
 
 
 class ChilloutzoneIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w|-]+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w-]+)\.html'
     _TESTS = [{
-        'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
+        'url': 'https://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
         'md5': 'a76f3457e813ea0037e5244f509e66d1',
         'info_dict': {
             'id': 'enemene-meck-alle-katzen-weg',
             'ext': 'mp4',
             'title': 'Enemene Meck - Alle Katzen weg',
             'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?',
+            'duration': 24,
         },
     }, {
         'note': 'Video hosted at YouTube',
-        'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html',
+        'url': 'https://www.chilloutzone.net/video/eine-sekunde-bevor.html',
         'info_dict': {
             'id': '1YVQaAgHyRU',
             'ext': 'mp4',
             'title': '16 Photos Taken 1 Second Before Disaster',
             'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814',
             'uploader': 'BuzzFeedVideo',
-            'uploader_id': 'BuzzFeedVideo',
+            'uploader_id': '@BuzzFeedVideo',
             'upload_date': '20131105',
+            'availability': 'public',
+            'thumbnail': 'https://i.ytimg.com/vi/1YVQaAgHyRU/maxresdefault.jpg',
+            'tags': 'count:41',
+            'like_count': int,
+            'playable_in_embed': True,
+            'channel_url': 'https://www.youtube.com/channel/UCpko_-a4wgz2u_DgDgd9fqA',
+            'chapters': 'count:6',
+            'live_status': 'not_live',
+            'view_count': int,
+            'categories': ['Entertainment'],
+            'age_limit': 0,
+            'channel_id': 'UCpko_-a4wgz2u_DgDgd9fqA',
+            'duration': 100,
+            'uploader_url': 'http://www.youtube.com/@BuzzFeedVideo',
+            'channel_follower_count': int,
+            'channel': 'BuzzFeedVideo',
         },
     }, {
-        'note': 'Video hosted at Vimeo',
-        'url': 'http://www.chilloutzone.net/video/icon-blending.html',
-        'md5': '2645c678b8dc4fefcc0e1b60db18dac1',
+        'url': 'https://www.chilloutzone.net/video/icon-blending.html',
+        'md5': '2f9d6850ec567b24f0f4fa143b9aa2f9',
         'info_dict': {
-            'id': '85523671',
+            'id': 'LLNkHpSjBfc',
             'ext': 'mp4',
-            'title': 'The Sunday Times - Icons',
-            'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}',
-            'uploader': 'Us',
-            'uploader_id': 'usfilms',
-            'upload_date': '20140131'
+            'title': 'The Sunday Times Making of Icons',
+            'description': 'md5:b9259fcf63a1669e42001e5db677f02a',
+            'uploader': 'MadFoxUA',
+            'uploader_id': '@MadFoxUA',
+            'upload_date': '20140204',
+            'channel_id': 'UCSZa9Y6-Vl7c11kWMcbAfCw',
+            'channel_url': 'https://www.youtube.com/channel/UCSZa9Y6-Vl7c11kWMcbAfCw',
+            'comment_count': int,
+            'uploader_url': 'http://www.youtube.com/@MadFoxUA',
+            'duration': 66,
+            'live_status': 'not_live',
+            'channel_follower_count': int,
+            'playable_in_embed': True,
+            'view_count': int,
+            'like_count': int,
+            'thumbnail': 'https://i.ytimg.com/vi/LLNkHpSjBfc/maxresdefault.jpg',
+            'categories': ['Comedy'],
+            'availability': 'public',
+            'tags': [],
+            'channel': 'MadFoxUA',
+            'age_limit': 0,
+        },
+    }, {
+        'url': 'https://www.chilloutzone.net/video/ordentlich-abgeschuettelt.html',
+        'info_dict': {
+            'id': 'ordentlich-abgeschuettelt',
+            'ext': 'mp4',
+            'title': 'Ordentlich abgeschüttelt',
+            'description': 'md5:d41541966b75d3d1e8ea77a94ea0d329',
+            'duration': 18,
         },
     }]
 
     def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        video_id = mobj.group('id')
-
+        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
+        b64_data = self._html_search_regex(
+            r'var cozVidData\s*=\s*"([^"]+)"', webpage, 'video data')
+        info = self._parse_json(base64.b64decode(b64_data).decode(), video_id)
 
-        base64_video_info = self._html_search_regex(
-            r'var cozVidData = "(.+?)";', webpage, 'video data')
-        decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8')
-        video_info_dict = json.loads(decoded_video_info)
-
-        # get video information from dict
-        video_url = video_info_dict['mediaUrl']
-        description = clean_html(video_info_dict.get('description'))
-        title = video_info_dict['title']
-        native_platform = video_info_dict['nativePlatform']
-        native_video_id = video_info_dict['nativeVideoId']
-        source_priority = video_info_dict['sourcePriority']
-
-        # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed)
-        if native_platform is None:
-            youtube_url = YoutubeIE._extract_url(webpage)
-            if youtube_url:
-                return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
+        video_url = info.get('mediaUrl')
+        native_platform = info.get('nativePlatform')
 
-        # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or
-        # the own CDN
-        if source_priority == 'native':
+        if native_platform and info.get('sourcePriority') == 'native':
+            native_video_id = info['nativeVideoId']
             if native_platform == 'youtube':
-                return self.url_result(native_video_id, ie='Youtube')
-            if native_platform == 'vimeo':
-                return self.url_result(
-                    'http://vimeo.com/' + native_video_id, ie='Vimeo')
+                return self.url_result(native_video_id, 'Youtube')
+            elif native_platform == 'vimeo':
+                return self.url_result(f'https://vimeo.com/{native_video_id}', 'Vimeo')
 
-        if not video_url:
-            raise ExtractorError('No video found')
+        elif not video_url:
+            # Possibly a standard youtube embed?
+            # TODO: Investigate if site still does this (there are no tests for it)
+            return self.url_result(url, 'Generic')
 
         return {
             'id': video_id,
             'url': video_url,
             'ext': 'mp4',
-            'title': title,
-            'description': description,
+            **traverse_obj(info, {
+                'title': 'title',
+                'description': ('description', {clean_html}),
+                'duration': ('videoLength', {int_or_none}),
+                'width': ('videoWidth', {int_or_none}),
+                'height': ('videoHeight', {int_or_none}),
+            }),
         }
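Note: chilloutzone still embeds its metadata as base64 JSON in a cozVidData page variable; the rewrite keeps that mechanism and only swaps the compat shim for the stdlib plus traverse_obj. Decode sketch (key names per the code above):

    import base64
    import json

    def parse_coz_vid_data(b64_data):
        # keys consumed by the extractor: mediaUrl, nativePlatform, nativeVideoId,
        # sourcePriority, title, description, videoLength, videoWidth, videoHeight
        return json.loads(base64.b64decode(b64_data).decode())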
+ # TODO: Investigate if site still does this (there are no tests for it) + return self.url_result(url, 'Generic') return { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, - 'description': description, + **traverse_obj(info, { + 'title': 'title', + 'description': ('description', {clean_html}), + 'duration': ('videoLength', {int_or_none}), + 'width': ('videoWidth', {int_or_none}), + 'height': ('videoHeight', {int_or_none}), + }), } diff --git a/hypervideo_dl/extractor/cinetecamilano.py b/hypervideo_dl/extractor/cinetecamilano.py index 5e770eb..9cffa11 100644 --- a/hypervideo_dl/extractor/cinetecamilano.py +++ b/hypervideo_dl/extractor/cinetecamilano.py @@ -1,6 +1,6 @@ import json -import urllib.error from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -40,7 +40,7 @@ class CinetecaMilanoIE(InfoExtractor): 'Authorization': try_get(self._get_cookies('https://www.cinetecamilano.it'), lambda x: f'Bearer {x["cnt-token"].value}') or '' }) except ExtractorError as e: - if ((isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 500) + if ((isinstance(e.cause, HTTPError) and e.cause.status == 500) or isinstance(e.cause, json.JSONDecodeError)): self.raise_login_required(method='cookies') raise diff --git a/hypervideo_dl/extractor/ciscowebex.py b/hypervideo_dl/extractor/ciscowebex.py index 44595d8..85585df 100644 --- a/hypervideo_dl/extractor/ciscowebex.py +++ b/hypervideo_dl/extractor/ciscowebex.py @@ -1,5 +1,6 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, try_get, unified_timestamp, @@ -32,17 +33,36 @@ class CiscoWebexIE(InfoExtractor): if rcid: webpage = self._download_webpage(url, None, note='Getting video ID') url = self._search_regex(self._VALID_URL, webpage, 'redirection url', group='url') - url = self._request_webpage(url, None, note='Resolving final URL').geturl() + url = self._request_webpage(url, None, note='Resolving final URL').url mobj = self._match_valid_url(url) subdomain = mobj.group('subdomain') siteurl = mobj.group('siteurl_1') or mobj.group('siteurl_2') video_id = mobj.group('id') - stream = self._download_json( + password = self.get_param('videopassword') + + headers = {'Accept': 'application/json'} + if password: + headers['accessPwd'] = password + + stream, urlh = self._download_json_handle( 'https://%s.webex.com/webappng/api/v1/recordings/%s/stream' % (subdomain, video_id), - video_id, fatal=False, query={'siteurl': siteurl}) - if not stream: - self.raise_login_required(method='cookies') + video_id, headers=headers, query={'siteurl': siteurl}, expected_status=(403, 429)) + + if urlh.status == 403: + if stream['code'] == 53004: + self.raise_login_required() + if stream['code'] == 53005: + if password: + raise ExtractorError('Wrong password', expected=True) + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', expected=True) + raise ExtractorError(f'{self.IE_NAME} said: {stream["code"]} - {stream["message"]}', expected=True) + + if urlh.status == 429: + self.raise_login_required( + f'{self.IE_NAME} asks you to solve a CAPTCHA. 
Solve CAPTCHA in browser and', + method='cookies') video_id = stream.get('recordUUID') or video_id @@ -78,7 +98,7 @@ class CiscoWebexIE(InfoExtractor): 'title': stream['recordName'], 'description': stream.get('description'), 'uploader': stream.get('ownerDisplayName'), - 'uploader_id': stream.get('ownerUserName') or stream.get('ownerId'), # mail or id + 'uploader_id': stream.get('ownerUserName') or stream.get('ownerId'), 'timestamp': unified_timestamp(stream.get('createTime')), 'duration': int_or_none(stream.get('duration'), 1000), 'webpage_url': 'https://%s.webex.com/recordingservice/sites/%s/recording/playback/%s' % (subdomain, siteurl, video_id), diff --git a/hypervideo_dl/extractor/clipchamp.py b/hypervideo_dl/extractor/clipchamp.py new file mode 100644 index 0000000..a8bdf7e --- /dev/null +++ b/hypervideo_dl/extractor/clipchamp.py @@ -0,0 +1,61 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class ClipchampIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU', + 'info_dict': { + 'id': 'gRXZ4ZhdDaU', + 'ext': 'mp4', + 'title': 'Untitled video', + 'uploader': 'Alexander Schwartz', + 'timestamp': 1680805580, + 'upload_date': '20230406', + 'thumbnail': r're:^https?://.+\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s' + _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video'] + + storage_location = data.get('storage_location') + if storage_location != 'cf_stream': + raise ExtractorError(f'Unsupported clip storage location "{storage_location}"') + + path = data['download_url'] + iframe = self._download_webpage( + f'https://iframe.cloudflarestream.com/{path}', video_id, 'Downloading player iframe') + subdomain = self._search_regex( + r'\bcustomer-domain-prefix=["\']([\w-]+)["\']', iframe, + 'subdomain', fatal=False) or 'customer-2ut9yn3y6fta1yxe' + + formats = self._extract_mpd_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id, + query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash') + formats.extend(self._extract_m3u8_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4', + query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls')) + + return { + 'id': video_id, + 'formats': formats, + 'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), {str}))) or None, + **traverse_obj(data, { + 'title': ('project', 'project_name', {str}), + 'timestamp': ('created_at', {unified_timestamp}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + }), + } diff --git a/hypervideo_dl/extractor/clyp.py b/hypervideo_dl/extractor/clyp.py index 0aaf73d..273d002 100644 --- a/hypervideo_dl/extractor/clyp.py +++ b/hypervideo_dl/extractor/clyp.py @@ -9,22 +9,22 @@ from ..utils import ( class ClypIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)' _TESTS = [{ - 'url': 'https://clyp.it/ojz2wfah', - 'md5': '1d4961036c41247ecfdcc439c0cddcbb', + 'url': 'https://clyp.it/iynkjk4b', + 'md5': '4bc6371c65210e7b372097fce4d92441', 'info_dict': { - 'id': 'ojz2wfah', - 'ext': 'mp3', - 'title': 'Krisson80 - bits wip wip', - 'description': 
'#Krisson80BitsWipWip #chiptune\n#wip', - 'duration': 263.21, - 'timestamp': 1443515251, - 'upload_date': '20150929', + 'id': 'iynkjk4b', + 'ext': 'ogg', + 'title': 'research', + 'description': '#Research', + 'duration': 51.278, + 'timestamp': 1435524981, + 'upload_date': '20150628', }, }, { 'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d', 'info_dict': { 'id': 'b04p1odi', - 'ext': 'mp3', + 'ext': 'ogg', 'title': 'GJ! (Reward Edit)', 'description': 'Metal Resistance (THE ONE edition)', 'duration': 177.789, @@ -34,6 +34,17 @@ class ClypIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://clyp.it/v42214lc', + 'md5': '4aca4dfc3236fb6d6ddc4ea08314f33f', + 'info_dict': { + 'id': 'v42214lc', + 'ext': 'wav', + 'title': 'i dont wanna go (old version)', + 'duration': 113.528, + 'timestamp': 1607348505, + 'upload_date': '20201207', + }, }] def _real_extract(self, url): @@ -59,8 +70,20 @@ class ClypIE(InfoExtractor): 'url': format_url, 'format_id': format_id, 'vcodec': 'none', + 'acodec': ext.lower(), }) + page = self._download_webpage(url, video_id=audio_id) + wav_url = self._html_search_regex( + r'var\s*wavStreamUrl\s*=\s*["\'](?P<url>https?://[^\'"]+)', page, 'url', default=None) + if wav_url: + formats.append({ + 'url': wav_url, + 'format_id': 'wavStreamUrl', + 'vcodec': 'none', + 'acodec': 'wav', + }) + title = metadata['Title'] description = metadata.get('Description') duration = float_or_none(metadata.get('Duration')) diff --git a/hypervideo_dl/extractor/comedycentral.py b/hypervideo_dl/extractor/comedycentral.py index 05fc9f2..27d295b 100644 --- a/hypervideo_dl/extractor/comedycentral.py +++ b/hypervideo_dl/extractor/comedycentral.py @@ -2,7 +2,7 @@ from .mtv import MTVServicesInfoExtractor class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist)/(?P<id>[0-9a-z]{6})' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?|collection-playlist|movies)/(?P<id>[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ @@ -25,6 +25,9 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }, { 'url': 'https://www.cc.com/collection-playlist/cosnej/stand-up-specials/t6vtjb', 'only_matching': True, + }, { + 'url': 'https://www.cc.com/movies/tkp406/a-cluesterfuenke-christmas', + 'only_matching': True, }] diff --git a/hypervideo_dl/extractor/common.py b/hypervideo_dl/extractor/common.py index 4b56307..5a561a2 100644 --- a/hypervideo_dl/extractor/common.py +++ b/hypervideo_dl/extractor/common.py @@ -13,6 +13,7 @@ import netrc import os import random import re +import subprocess import sys import time import types @@ -21,9 +22,21 @@ import urllib.request import xml.etree.ElementTree from ..compat import functools # isort: split -from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name +from ..compat import ( + compat_etree_fromstring, + compat_expanduser, + compat_os_name, + urllib_req_to_req, +) from ..cookies import LenientSimpleCookie from ..downloader.f4m import get_base_url, remove_encrypted_media +from ..downloader.hls import HlsFD +from ..networking import HEADRequest, Request +from ..networking.exceptions import ( + HTTPError, + IncompleteRead, + network_exceptions, +) from ..utils import ( IDENTITY, JSON_LD_RE, @@ -33,6 +46,7 @@ from ..utils import ( GeoRestrictedError, GeoUtils, LenientJSONDecoder, + Popen, RegexNotFoundError, RetryManager, UnsupportedError, @@ -55,7 +69,7 @@ from ..utils import ( 
join_nonempty, js_to_json, mimetype2ext, - network_exceptions, + netrc_from_content, orderedSet, parse_bitrate, parse_codecs, @@ -65,21 +79,20 @@ from ..utils import ( parse_resolution, sanitize_filename, sanitize_url, - sanitized_Request, smuggle_url, str_or_none, str_to_int, strip_or_none, traverse_obj, + truncate_string, try_call, try_get, unescapeHTML, unified_strdate, unified_timestamp, - update_Request, - update_url_query, url_basename, url_or_none, + urlhandle_detect_ext, urljoin, variadic, xpath_element, @@ -129,6 +142,7 @@ class InfoExtractor: is parsed from a string (in case of fragmented media) for MSS - URL of the ISM manifest. + * request_data Data to send in POST request to the URL * manifest_url The URL of the manifest file in case of fragmented media: @@ -216,7 +230,19 @@ class InfoExtractor: width : height ratio as float. * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. - * has_drm The format has DRM and cannot be downloaded. Boolean + * has_drm True if the format has DRM and cannot be downloaded. + 'maybe' if the format may have DRM and has to be tested before download. + * extra_param_to_segment_url A query string to append to each + fragment's URL, or to update each existing query string + with. Only applied by the native HLS/DASH downloaders. + * hls_aes A dictionary of HLS AES-128 decryption information + used by the native HLS downloader to override the + values in the media playlist when an '#EXT-X-KEY' tag + is present in the playlist: + * uri The URI from which the key will be downloaded + * key The key (as hex) used to decrypt fragments. + If `key` is given, any key URI will be ignored + * iv The IV (as hex) used to decrypt fragments * downloader_options A dictionary of downloader options (For internal use only) * http_chunk_size Chunk size for HTTP downloads @@ -271,6 +297,7 @@ class InfoExtractor: channel_id: Id of the channel. channel_url: Full URL to a channel webpage. channel_follower_count: Number of followers of the channel. + channel_is_verified: Whether the channel is verified on the platform. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and @@ -299,6 +326,11 @@ class InfoExtractor: * "author" - human-readable name of the comment author * "author_id" - user ID of the comment author * "author_thumbnail" - The thumbnail of the comment author + * "author_url" - The url to the comment author's page + * "author_is_verified" - Whether the author is verified + on the platform + * "author_is_uploader" - Whether the comment is made by + the video uploader * "id" - Comment ID * "html" - Comment as HTML * "text" - Plain text of the comment @@ -310,8 +342,8 @@ class InfoExtractor: * "dislike_count" - Number of negative ratings of the comment * "is_favorited" - Whether the comment is marked as favorite by the video uploader - * "author_is_uploader" - Whether the comment is made by - the video uploader + * "is_pinned" - Whether the comment is pinned to + the top of the comments age_limit: Age restriction for the video, as an integer (years) webpage_url: The URL to the video webpage, if given to hypervideo it should allow to get the same result again. 
(It will be set @@ -335,6 +367,10 @@ class InfoExtractor: * "start_time" - The start time of the chapter in seconds * "end_time" - The end time of the chapter in seconds * "title" (optional, string) + heatmap: A list of dictionaries, with the following entries: + * "start_time" - The start time of the data point in seconds + * "end_time" - The end time of the data point in seconds + * "value" - The normalized value of the data point (float between 0 and 1) playable_in_embed: Whether this video is allowed to play in embedded players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string @@ -446,8 +482,8 @@ class InfoExtractor: Subclasses of this should also be added to the list of extractors and - should define a _VALID_URL regexp and, re-define the _real_extract() and - (optionally) _real_initialize() methods. + should define _VALID_URL as a regexp or a Sequence of regexps, and + re-define the _real_extract() and (optionally) _real_initialize() methods. Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs @@ -510,7 +546,7 @@ class InfoExtractor: _EMBED_REGEX = [] def _login_hint(self, method=NO_DEFAULT, netrc=None): - password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' + password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' return { None: '', 'any': f'Use --cookies, --cookies-from-browser, {password_hint}', @@ -537,8 +573,8 @@ class InfoExtractor: # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) + cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL))) + return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None) @classmethod def suitable(cls, url): @@ -674,7 +710,8 @@ class InfoExtractor: for _ in range(2): try: self.initialize() - self.write_debug('Extracting URL: %s' % url) + self.to_screen('Extracting URL: %s' % ( + url if self.get_param('verbose') else truncate_string(url, 100, 20))) ie_result = self._real_extract(url) if ie_result is None: return None @@ -692,11 +729,11 @@ class InfoExtractor: except UnsupportedError: raise except ExtractorError as e: - e.video_id = e.video_id or self.get_temp_id(url), + e.video_id = e.video_id or self.get_temp_id(url) e.ie = e.ie or self.IE_NAME, e.traceback = e.traceback or sys.exc_info()[2] raise - except http.client.IncompleteRead as e: + except IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url)) @@ -755,20 +792,25 @@ class InfoExtractor: @staticmethod def __can_accept_status_code(err, expected_status): - assert isinstance(err, urllib.error.HTTPError) + assert isinstance(err, HTTPError) if expected_status is None: return False elif callable(expected_status): - return expected_status(err.code) is True + return expected_status(err.status) is True else: - return err.code in variadic(expected_status) + return err.status in variadic(expected_status) def _create_request(self, url_or_request, data=None, headers=None, query=None): if 
isinstance(url_or_request, urllib.request.Request): - return update_Request(url_or_request, data=data, headers=headers, query=query) - if query: - url_or_request = update_url_query(url_or_request, query) - return sanitized_Request(url_or_request, data, headers or {}) + self._downloader.deprecation_warning( + 'Passing a urllib.request.Request to _create_request() is deprecated. ' + 'Use hypervideo_dl.networking.common.Request instead.') + url_or_request = urllib_req_to_req(url_or_request) + elif not isinstance(url_or_request, Request): + url_or_request = Request(url_or_request) + + url_or_request.update(data=data, headers=headers, query=query) + return url_or_request def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None): """ @@ -804,14 +846,9 @@ class InfoExtractor: try: return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) except network_exceptions as err: - if isinstance(err, urllib.error.HTTPError): + if isinstance(err, HTTPError): if self.__can_accept_status_code(err, expected_status): - # Retain reference to error to prevent file object from - # being closed before it can be read. Works around the - # effects of <https://bugs.python.org/issue15002> - # introduced in Python 3.4.1. - err.fp._error = err - return err.fp + return err.response if errnote is False: return False @@ -943,11 +980,11 @@ class InfoExtractor: if prefix is not None: webpage_bytes = prefix + webpage_bytes if self.get_param('dump_intermediate_pages', False): - self.to_screen('Dumping request to ' + urlh.geturl()) + self.to_screen('Dumping request to ' + urlh.url) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self.get_param('write_pages'): - filename = self._request_dump_filename(urlh.geturl(), video_id) + filename = self._request_dump_filename(urlh.url, video_id) self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) @@ -1005,7 +1042,7 @@ class InfoExtractor: fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): if self.get_param('load_pages'): url_or_request = self._create_request(url_or_request, data, headers, query) - filename = self._request_dump_filename(url_or_request.full_url, video_id) + filename = self._request_dump_filename(url_or_request.url, video_id) self.to_screen(f'Loading request from {filename}') try: with open(filename, 'rb') as dumpf: @@ -1079,7 +1116,7 @@ class InfoExtractor: while True: try: return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs) - except http.client.IncompleteRead as e: + except IncompleteRead as e: try_count += 1 if try_count >= tries: raise e @@ -1260,51 +1297,53 @@ class InfoExtractor: Like _search_regex, but strips HTML tags and unescapes entities. 
""" res = self._search_regex(pattern, string, name, default, fatal, flags, group) - if res: - return clean_html(res).strip() - else: - return res + if isinstance(res, tuple): + return tuple(map(clean_html, res)) + return clean_html(res) def _get_netrc_login_info(self, netrc_machine=None): - username = None - password = None netrc_machine = netrc_machine or self._NETRC_MACHINE - if self.get_param('usenetrc', False): - try: - netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') - if os.path.isdir(netrc_file): - netrc_file = os.path.join(netrc_file, '.netrc') - info = netrc.netrc(file=netrc_file).authenticators(netrc_machine) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError( - 'No authenticators for %s' % netrc_machine) - except (OSError, netrc.NetrcParseError) as err: - self.report_warning( - 'parsing .netrc: %s' % error_to_compat_str(err)) + cmd = self.get_param('netrc_cmd') + if cmd: + cmd = cmd.replace('{}', netrc_machine) + self.to_screen(f'Executing command: {cmd}') + stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE) + if ret != 0: + raise OSError(f'Command returned error code {ret}') + info = netrc_from_content(stdout).authenticators(netrc_machine) + + elif self.get_param('usenetrc', False): + netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') + if os.path.isdir(netrc_file): + netrc_file = os.path.join(netrc_file, '.netrc') + info = netrc.netrc(netrc_file).authenticators(netrc_machine) - return username, password + else: + return None, None + if not info: + raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}') + return info[0], info[2] def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): """ Get the login info as (username, password) First look for the manually specified credentials using username_option and password_option as keys in params dictionary. If no such credentials - available look in the netrc file using the netrc_machine or _NETRC_MACHINE - value. + are available try the netrc_cmd if it is defined or look in the + netrc file using the netrc_machine or _NETRC_MACHINE value. If there's no info available, return (None, None) """ - # Attempt to use provided username and password or .netrc data username = self.get_param(username_option) if username is not None: password = self.get_param(password_option) else: - username, password = self._get_netrc_login_info(netrc_machine) - + try: + username, password = self._get_netrc_login_info(netrc_machine) + except (OSError, netrc.NetrcParseError) as err: + self.report_warning(f'Failed to parse .netrc: {err}') + return None, None return username, password def _get_tfa_info(self, note='two-factor verification code'): @@ -1324,7 +1363,7 @@ class InfoExtractor: # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))' property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r'<meta[^>]+?%s[^>]+?%s' @@ -1394,10 +1433,16 @@ class InfoExtractor: # And then there are the jokers who advertise that they use RTA, but actually don't. 
AGE_LIMIT_MARKERS = [ r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', + r'>[^<]*you acknowledge you are at least (\d+) years old', + r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b', ] - if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS): - return 18 - return 0 + + age_limit = 0 + for marker in AGE_LIMIT_MARKERS: + mobj = re.search(marker, html) + if mobj: + age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18))) + return age_limit def _media_rating_search(self, html): # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ @@ -1650,11 +1695,8 @@ class InfoExtractor: if js is None: return {} - args = dict(zip(arg_keys.split(','), arg_vals.split(','))) - - for key, val in args.items(): - if val in ('undefined', 'void 0'): - args[key] = 'null' + args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json( + f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ()))) ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) return traverse_obj(ret, traverse) or {} @@ -1757,6 +1799,9 @@ class InfoExtractor: def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), fatal=True, m3u8_id=None, data=None, headers={}, query={}): + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_xml_handle( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', @@ -1768,7 +1813,7 @@ class InfoExtractor: return [] manifest, urlh = res - manifest_url = urlh.geturl() + manifest_url = urlh.url return self._parse_f4m_formats( manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id, @@ -1906,6 +1951,17 @@ class InfoExtractor: errnote=None, fatal=True, live=False, data=None, headers={}, query={}): + if self.get_param('ignore_no_formats_error'): + fatal = False + + if not m3u8_url: + if errnote is not False: + errnote = errnote or 'Failed to obtain m3u8 URL' + if fatal: + raise ExtractorError(errnote, video_id=video_id) + self.report_warning(f'{errnote}{bug_reports_message()}') + return [], {} + res = self._download_webpage_handle( m3u8_url, video_id, note='Downloading m3u8 information' if note is None else note, @@ -1916,7 +1972,7 @@ class InfoExtractor: return [], {} m3u8_doc, urlh = res - m3u8_url = urlh.geturl() + m3u8_url = urlh.url return self._parse_m3u8_formats_and_subtitles( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, @@ -1930,11 +1986,7 @@ class InfoExtractor: errnote=None, fatal=True, data=None, headers={}, query={}, video_id=None): formats, subtitles = [], {} - - has_drm = re.search('|'.join([ - r'#EXT-X-FAXS-CM:', # Adobe Flash Access - r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay - ]), m3u8_doc) + has_drm = HlsFD._has_drm(m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) @@ -2032,6 +2084,7 @@ class InfoExtractor: 'protocol': entry_protocol, 'preference': preference, 'quality': quality, + 'has_drm': has_drm, 'vcodec': 'none' if media_type == 'AUDIO' else None, } for idx in _extract_m3u8_playlist_indices(manifest_url)) @@ -2091,6 +2144,7 @@ class InfoExtractor: 'protocol': entry_protocol, 'preference': preference, 'quality': quality, + 'has_drm': has_drm, } resolution = last_stream_inf.get('RESOLUTION') if resolution: @@ -2157,13 
+2211,23 @@ class InfoExtractor: return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id) def _parse_m3u8_vod_duration(self, m3u8_vod, video_id): - if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod: + if '#EXT-X-ENDLIST' not in m3u8_vod: return None return int(sum( float(line[len('#EXTINF:'):].split(',')[0]) for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None + def _extract_mpd_vod_duration( + self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}): + + mpd_doc = self._download_xml( + mpd_url, video_id, + note='Downloading MPD VOD manifest' if note is None else note, + errnote='Failed to download VOD manifest' if errnote is None else errnote, + fatal=False, data=data, headers=headers, query=query) or {} + return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration'))) + @staticmethod def _xpath_ns(path, namespace=None): if not namespace: @@ -2177,22 +2241,17 @@ class InfoExtractor: return '/'.join(out) def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) if res is False: assert not fatal return [], {} - smil, urlh = res - smil_url = urlh.geturl() - namespace = self._parse_smil_namespace(smil) - - fmts = self._parse_smil_formats( - smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subs = self._parse_smil_subtitles( - smil, namespace=namespace) - - return fmts, subs + return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params, + namespace=self._parse_smil_namespace(smil)) def _extract_smil_formats(self, *args, **kwargs): fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs) @@ -2206,7 +2265,7 @@ class InfoExtractor: return {} smil, urlh = res - smil_url = urlh.geturl() + smil_url = urlh.url return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) @@ -2218,9 +2277,8 @@ class InfoExtractor: def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) - formats = self._parse_smil_formats( + formats, subtitles = self._parse_smil_formats_and_subtitles( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subtitles = self._parse_smil_subtitles(smil, namespace=namespace) video_id = os.path.splitext(url_basename(smil_url))[0] title = None @@ -2259,7 +2317,14 @@ class InfoExtractor: return self._search_regex( r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats(self, *args, **kwargs): + fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('SMIL') + return fmts + + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): b = meta.get('base') or meta.get('httpBase') @@ -2267,7 +2332,7 @@ class InfoExtractor: base = b break - formats = [] + formats, subtitles = [], {} rtmp_count = 0 http_count = 0 m3u8_count = 0 @@ -2287,7 +2352,8 @@ class InfoExtractor: height = int_or_none(medium.get('height')) proto = medium.get('proto') ext = medium.get('ext') - src_ext = determine_ext(src) + src_ext = 
determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext( + self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False)) streamer = medium.get('streamer') or base if proto == 'rtmp' or streamer.startswith('rtmp'): @@ -2314,8 +2380,9 @@ class InfoExtractor: src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + self._merge_subtitles(m3u8_subs, target=subtitles) if len(m3u8_formats) == 1: m3u8_count += 1 m3u8_formats[0].update({ @@ -2336,11 +2403,15 @@ class InfoExtractor: f4m_url += urllib.parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) elif src_ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src_url, video_id, mpd_id='dash', fatal=False)) + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + formats.extend(mpd_formats) + self._merge_subtitles(mpd_subs, target=subtitles) elif re.search(r'\.ism/[Mm]anifest', src_url): - formats.extend(self._extract_ism_formats( - src_url, video_id, ism_id='mss', fatal=False)) + ism_formats, ism_subs = self._extract_ism_formats_and_subtitles( + src_url, video_id, ism_id='mss', fatal=False) + formats.extend(ism_formats) + self._merge_subtitles(ism_subs, target=subtitles) elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ @@ -2371,7 +2442,10 @@ class InfoExtractor: 'format_note': 'SMIL storyboards', }) - return formats + smil_subs = self._parse_smil_subtitles(smil, namespace=namespace) + self._merge_subtitles(smil_subs, target=subtitles) + + return formats, subtitles def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): urls = [] @@ -2397,7 +2471,7 @@ class InfoExtractor: return [] xspf, urlh = res - xspf_url = urlh.geturl() + xspf_url = urlh.url return self._parse_xspf( xspf, playlist_id, xspf_url=xspf_url, @@ -2452,6 +2526,10 @@ class InfoExtractor: def _extract_mpd_formats_and_subtitles( self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_xml_handle( mpd_url, video_id, note='Downloading MPD manifest' if note is None else note, @@ -2464,7 +2542,7 @@ class InfoExtractor: return [], {} # We could have been redirected to a new url when we retrieved our mpd file. 
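Note the `_parse_m3u8_vod_duration` change a little further up: the gate moves from `#EXT-X-PLAYLIST-TYPE:VOD` to `#EXT-X-ENDLIST`, presumably because the PLAYLIST-TYPE tag is optional and often omitted, while a finished playlist always carries ENDLIST. Summing the `#EXTINF` segment durations then gives the total. A self-contained sketch of the same logic:

def m3u8_vod_duration(m3u8_doc):
    # EXT-X-ENDLIST means no more segments will be added, so the sum of
    # segment durations is the complete duration of the video
    if '#EXT-X-ENDLIST' not in m3u8_doc:
        return None
    # Each media segment is preceded by '#EXTINF:<duration>,[<title>]'
    return int(sum(
        float(line[len('#EXTINF:'):].split(',')[0])
        for line in m3u8_doc.splitlines()
        if line.startswith('#EXTINF:'))) or None

m3u8_vod_duration('#EXTM3U\n#EXTINF:4.0,\nseg0.ts\n#EXTINF:2.5,\nseg1.ts\n#EXT-X-ENDLIST')  # -> 6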
- mpd_url = urlh.geturl() + mpd_url = urlh.url mpd_base_url = base_url(mpd_url) return self._parse_mpd_formats_and_subtitles( @@ -2821,6 +2899,9 @@ class InfoExtractor: return fmts def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_xml_handle( ism_url, video_id, note='Downloading ISM manifest' if note is None else note, @@ -2832,7 +2913,7 @@ class InfoExtractor: if ism_doc is None: return [], {} - return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id) + return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id) def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): """ @@ -2928,6 +3009,8 @@ class InfoExtractor: 'protocol': 'ism', 'fragments': fragments, 'has_drm': ism_doc.find('Protection') is not None, + 'language': stream_language, + 'audio_channels': int_or_none(track.get('Channels')), '_download_params': { 'stream_type': stream_type, 'duration': duration, @@ -3190,7 +3273,7 @@ class InfoExtractor: def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): mobj = re.search( - r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', + r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''', webpage) if mobj: try: @@ -3211,19 +3294,20 @@ class InfoExtractor: def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - entries = [] + if not isinstance(jwplayer_data, dict): + return entries - # JWPlayer backward compatibility: single playlist item + playlist_items = jwplayer_data.get('playlist') + # JWPlayer backward compatibility: single playlist item/flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 + if not isinstance(playlist_items, list): + playlist_items = (playlist_items or jwplayer_data, ) - for video_data in jwplayer_data['playlist']: + for video_data in playlist_items: + if not isinstance(video_data, dict): + continue # JWPlayer backward compatibility: flattened sources # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 if 'sources' not in video_data: @@ -3261,6 +3345,13 @@ class InfoExtractor: 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, + 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ... 
+ 'genre': clean_html(video_data.get('genre')), + 'channel': clean_html(dict_get(video_data, ('category', 'channel'))), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'release_year': int_or_none(video_data.get('releasedate')), + 'age_limit': int_or_none(video_data.get('age_restriction')), } # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): @@ -3278,7 +3369,7 @@ class InfoExtractor: def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - urls = [] + urls = set() formats = [] for source in jwplayer_sources_data: if not isinstance(source, dict): @@ -3287,14 +3378,14 @@ class InfoExtractor: base_url, self._proto_relative_url(source.get('file'))) if not source_url or source_url in urls: continue - urls.append(source_url) + urls.add(source_url) source_type = source.get('type') or '' ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': + if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif source_type == 'dash' or ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url: formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil': @@ -3309,13 +3400,12 @@ class InfoExtractor: 'ext': ext, }) else: + format_id = str_or_none(source.get('label')) height = int_or_none(source.get('height')) - if height is None: + if height is None and format_id: # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. 
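The `_parse_jwplayer_formats` changes above switch URL de-duplication to a set and add query-string fallbacks for sources whose declared type and file extension are both unhelpful; `format=m3u8-aapl` and `format=mpd-time-csf` are the manifest markers that Azure-Media-Services-style streaming endpoints put in their URLs. A small sketch of that detection order, with illustrative function name and return labels:

from urllib.parse import urlparse

def classify_jwplayer_source(source_type, source_url):
    # Declared type wins, then the file extension, then manifest hints
    # in the URL query string
    path = urlparse(source_url).path
    ext = path.rpartition('.')[2].lower() if '.' in path else ''
    if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
        return 'hls'
    if source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
        return 'dash'
    return 'progressive'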
- height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''), - 'height', default=None)) + height = parse_resolution(format_id).get('height') a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), @@ -3323,6 +3413,7 @@ class InfoExtractor: 'tbr': int_or_none(source.get('bitrate'), scale=1000), 'filesize': int_or_none(source.get('filesize')), 'ext': ext, + 'format_id': format_id } if source_url.startswith('rtmp'): a_format['ext'] = 'flv' @@ -3375,7 +3466,7 @@ class InfoExtractor: def _get_cookies(self, url): """ Return a http.cookies.SimpleCookie with the cookies for the url """ - return LenientSimpleCookie(self._downloader._calc_cookies(url)) + return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ @@ -3416,13 +3507,17 @@ class InfoExtractor: continue t['name'] = cls.ie_key() yield t + if getattr(cls, '__wrapped__', None): + yield from cls.__wrapped__.get_testcases(include_onlymatching) @classmethod def get_webpage_testcases(cls): tests = vars(cls).get('_WEBPAGE_TESTS', []) for t in tests: t['name'] = cls.ie_key() - return tests + yield t + if getattr(cls, '__wrapped__', None): + yield from cls.__wrapped__.get_webpage_testcases() @classproperty(cache=True) def age_limit(cls): @@ -3446,8 +3541,8 @@ class InfoExtractor: @classmethod def is_single_video(cls, url): """Returns whether the URL is of a single video, None if unknown""" - assert cls.suitable(url), 'The URL must be suitable for the extractor' - return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) + if cls.suitable(url): + return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) @classmethod def is_suitable(cls, age_limit): @@ -3460,7 +3555,7 @@ class InfoExtractor: desc = '' if cls._NETRC_MACHINE: if markdown: - desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]' + desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")' else: desc += f' [{cls._NETRC_MACHINE}]' if cls.IE_DESC is False: @@ -3468,7 +3563,7 @@ class InfoExtractor: elif cls.IE_DESC: desc += f' {cls.IE_DESC}' if cls.SEARCH_KEY: - desc += f'; "{cls.SEARCH_KEY}:" prefix' + desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix' if search_examples: _COUNTS = ('', '5', '10', 'all') desc += f' (e.g. 
"{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")' @@ -3582,6 +3677,42 @@ class InfoExtractor: or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) or default) + def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True): + if not duration: + return + chapter_list = [{ + 'start_time': start_function(chapter), + 'title': title_function(chapter), + } for chapter in chapter_list or []] + if strict: + warn = self.report_warning + else: + warn = self.write_debug + chapter_list.sort(key=lambda c: c['start_time'] or 0) + + chapters = [{'start_time': 0}] + for idx, chapter in enumerate(chapter_list): + if chapter['start_time'] is None: + warn(f'Incomplete chapter {idx}') + elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: + chapters.append(chapter) + elif chapter not in chapters: + issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration + else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}') + warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"') + return chapters[1:] + + def _extract_chapters_from_description(self, description, duration): + duration_re = r'(?:\d+:)?\d{1,2}:\d{2}' + sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$' + return self._extract_chapters_helper( + re.findall(sep_re % (duration_re, r'.+?'), description or ''), + start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1], + duration=duration, strict=False) or self._extract_chapters_helper( + re.findall(sep_re % (r'.+?', duration_re), description or ''), + start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0], + duration=duration, strict=False) + @staticmethod def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): all_known = all(map( @@ -3684,10 +3815,12 @@ class InfoExtractor: if plugin_name: mro = inspect.getmro(cls) super_class = cls.__wrapped__ = mro[mro.index(cls) + 1] - cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key + cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key + cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}' while getattr(super_class, '__wrapped__', None): super_class = super_class.__wrapped__ setattr(sys.modules[super_class.__module__], super_class.__name__, cls) + _PLUGIN_OVERRIDES[super_class].append(cls) return super().__init_subclass__(**kwargs) @@ -3744,3 +3877,6 @@ class UnsupportedURLIE(InfoExtractor): def _real_extract(self, url): raise UnsupportedError(url) + + +_PLUGIN_OVERRIDES = collections.defaultdict(list) diff --git a/hypervideo_dl/extractor/crackle.py b/hypervideo_dl/extractor/crackle.py index 4610015..1ef90b5 100644 --- a/hypervideo_dl/extractor/crackle.py +++ b/hypervideo_dl/extractor/crackle.py @@ -4,7 +4,7 @@ import re import time from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, float_or_none, @@ -113,7 +113,7 @@ class CrackleIE(InfoExtractor): errnote='Unable to download media JSON') except ExtractorError as e: # 401 means geo restriction, trying next country - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: continue raise diff --git a/hypervideo_dl/extractor/crtvg.py b/hypervideo_dl/extractor/crtvg.py new file mode 100644 index 0000000..1aa8d77 --- /dev/null +++ b/hypervideo_dl/extractor/crtvg.py @@ 
-0,0 +1,34 @@ +from .common import InfoExtractor +from ..utils import remove_end + + +class CrtvgIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?crtvg\.es/tvg/a-carta/[^/#?]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.crtvg.es/tvg/a-carta/os-caimans-do-tea-5839623', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': '5839623', + 'title': 'Os caimáns do Tea', + 'ext': 'mp4', + 'description': 'md5:f71cfba21ae564f0a6f415b31de1f842', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_url = self._search_regex(r'var\s+url\s*=\s*["\']([^"\']+)', webpage, 'video url') + formats = self._extract_m3u8_formats(video_url + '/playlist.m3u8', video_id, fatal=False) + formats.extend(self._extract_mpd_formats(video_url + '/manifest.mpd', video_id, fatal=False)) + + return { + 'id': video_id, + 'formats': formats, + 'title': remove_end(self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None), ' | CRTVG'), + 'description': self._html_search_meta('description', webpage, 'description', default=None), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage, 'thumbnail', default=None), + } diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py index d226050..241da11 100644 --- a/hypervideo_dl/extractor/crunchyroll.py +++ b/hypervideo_dl/extractor/crunchyroll.py @@ -1,27 +1,53 @@ import base64 -import urllib.parse from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, format_field, + int_or_none, join_nonempty, + parse_age_limit, + parse_count, parse_iso8601, qualities, + remove_start, + time_seconds, traverse_obj, - try_get, + url_or_none, + urlencode_postdata, ) class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' + _BASE_URL = 'https://www.crunchyroll.com' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' - params = None + _AUTH_HEADERS = None + _API_ENDPOINT = None + _BASIC_AUTH = None + _CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q') + _LOCALE_LOOKUP = { + 'ar': 'ar-SA', + 'de': 'de-DE', + '': 'en-US', + 'es': 'es-419', + 'es-es': 'es-ES', + 'fr': 'fr-FR', + 'it': 'it-IT', + 'pt-br': 'pt-BR', + 'pt-pt': 'pt-PT', + 'ru': 'ru-RU', + 'hi': 'hi-IN', + } + + @property + def is_logged_in(self): + return bool(self._get_cookies(self._BASE_URL).get('etp_rt')) def _perform_login(self, username, password): - if self._get_cookies(self._LOGIN_URL).get('etp_rt'): + if self.is_logged_in: return upsell_response = self._download_json( @@ -31,7 +57,7 @@ class CrunchyrollBaseIE(InfoExtractor): 'device_id': 'whatvalueshouldbeforweb', 'device_type': 'com.crunchyroll.static', 'access_token': 'giKq5eY27ny3cqz', - 'referer': self._LOGIN_URL + 'referer': f'{self._BASE_URL}/welcome/login' }) if upsell_response['code'] != 'ok': raise ExtractorError('Could not get session id') @@ -39,66 +65,164 @@ class CrunchyrollBaseIE(InfoExtractor): login_response = self._download_json( f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=urllib.parse.urlencode({ + data=urlencode_postdata({ 'account': username, 'password': password, 'session_id': session_id - }).encode('ascii')) + })) if login_response['code'] != 'ok': raise ExtractorError('Login failed. 
Server message: %s' % login_response['message'], expected=True) - if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + if not self.is_logged_in: raise ExtractorError('Login succeeded but did not set etp_rt cookie') - def _get_embedded_json(self, webpage, display_id): - initial_state = self._parse_json(self._search_regex( - r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) - app_config = self._parse_json(self._search_regex( - r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) - return initial_state, app_config - - def _get_params(self, lang): - if not CrunchyrollBaseIE.params: - if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'): - grant_type, key = 'etp_rt_cookie', 'accountAuthClientId' - else: - grant_type, key = 'client_id', 'anonClientId' + def _update_auth(self): + if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds(): + return - initial_state, app_config = self._get_embedded_json(self._download_webpage( - f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) - api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com') + if not CrunchyrollBaseIE._BASIC_AUTH: + cx_api_param = self._CLIENT_ID[self.is_logged_in] + self.write_debug(f'Using cxApiParam={cx_api_param}') + CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode() + grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id' + try: auth_response = self._download_json( - f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', - headers={ - 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii') - }, data=f'grant_type={grant_type}'.encode('ascii')) - policy_response = self._download_json( - f'{api_domain}/index/v2', None, note='Retrieving signed policy', - headers={ - 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] - }) - cms = policy_response.get('cms_web') - bucket = cms['bucket'] - params = { - 'Policy': cms['policy'], - 'Signature': cms['signature'], - 'Key-Pair-Id': cms['key_pair_id'] + f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', + headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode()) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 403: + raise ExtractorError( + 'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, ' + 'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' + 'and your browser\'s User-Agent (with --user-agent)', expected=True) + raise + + CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']} + CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10) + + def _locale_from_language(self, language): + config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True) + return config_locale[0] if config_locale else self._LOCALE_LOOKUP.get(language) + + def _call_base_api(self, endpoint, internal_id, lang, note=None, query={}): + self._update_auth() + + if not endpoint.startswith('/'): + endpoint = f'/{endpoint}' + + query = query.copy() + locale = self._locale_from_language(lang) + if locale: + 
query['locale'] = locale + + return self._download_json( + f'{self._BASE_URL}{endpoint}', internal_id, note or f'Calling API: {endpoint}', + headers=CrunchyrollBaseIE._AUTH_HEADERS, query=query) + + def _call_api(self, path, internal_id, lang, note='api', query={}): + if not path.startswith(f'/content/v2/{self._API_ENDPOINT}/'): + path = f'/content/v2/{self._API_ENDPOINT}/{path}' + + try: + result = self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON ({self._API_ENDPOINT})', query=query) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 404: + return None + raise + + if not result: + raise ExtractorError(f'Unexpected response when downloading {note} JSON') + return result + + def _extract_formats(self, stream_response, display_id=None): + requested_formats = self._configuration_arg('format') or ['adaptive_hls'] + available_formats = {} + for stream_type, streams in traverse_obj( + stream_response, (('streams', ('data', 0)), {dict.items}, ...)): + if stream_type not in requested_formats: + continue + for stream in traverse_obj(streams, lambda _, v: v['url']): + hardsub_lang = stream.get('hardsub_locale') or '' + format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) + available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] + if '' in available_formats and 'all' not in requested_hardsubs: + full_format_langs = set(requested_hardsubs) + self.to_screen( + 'To get all formats of a hardsub language, use ' + '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". ' + 'See https://github.com/hypervideo/hypervideo#crunchyrollbeta-crunchyroll for more info', + only_once=True) + else: + full_format_langs = set(map(str.lower, available_formats)) + + audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False) + hardsub_preference = qualities(requested_hardsubs[::-1]) + formats = [] + for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): + if stream_type.endswith('hls'): + if hardsub_lang.lower() in full_format_langs: + adaptive_formats = self._extract_m3u8_formats( + stream_url, display_id, 'mp4', m3u8_id=format_id, + fatal=False, note=f'Downloading {format_id} HLS manifest') + else: + adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) + elif stream_type.endswith('dash'): + adaptive_formats = self._extract_mpd_formats( + stream_url, display_id, mpd_id=format_id, + fatal=False, note=f'Downloading {format_id} MPD manifest') + else: + self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) + continue + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_locale + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) + + return formats + + def _extract_subtitles(self, data): + subtitles = {} + + for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)): + subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})] + + return subtitles + + +class CrunchyrollCmsBaseIE(CrunchyrollBaseIE): + _API_ENDPOINT = 'cms' + _CMS_EXPIRY = None + + def _call_cms_api_signed(self, path, internal_id, lang, note='api'): + if not CrunchyrollCmsBaseIE._CMS_EXPIRY or CrunchyrollCmsBaseIE._CMS_EXPIRY 
<= time_seconds(): + response = self._call_base_api('index/v2', None, lang, 'Retrieving signed policy')['cms_web'] + CrunchyrollCmsBaseIE._CMS_QUERY = { + 'Policy': response['policy'], + 'Signature': response['signature'], + 'Key-Pair-Id': response['key_pair_id'], } - locale = traverse_obj(initial_state, ('localization', 'locale')) - if locale: - params['locale'] = locale - CrunchyrollBaseIE.params = (api_domain, bucket, params) - return CrunchyrollBaseIE.params + CrunchyrollCmsBaseIE._CMS_BUCKET = response['bucket'] + CrunchyrollCmsBaseIE._CMS_EXPIRY = parse_iso8601(response['expires']) - 10 + + if not path.startswith('/cms/v2'): + path = f'/cms/v2{CrunchyrollCmsBaseIE._CMS_BUCKET}/{path}' + + return self._call_base_api( + path, internal_id, lang, f'Downloading {note} JSON (signed cms)', query=CrunchyrollCmsBaseIE._CMS_QUERY) -class CrunchyrollBetaIE(CrunchyrollBaseIE): +class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): IE_NAME = 'crunchyroll' _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ - (?P<lang>(?:\w{2}(?:-\w{2})?/)?) - watch/(?P<id>\w+) - (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' + https?://(?:beta\.|www\.)?crunchyroll\.com/ + (?:(?P<lang>\w{2}(?:-\w{2})?)/)? + watch/(?!concert|musicvideo)(?P<id>\w+)''' _TESTS = [{ + # Premium only 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { 'id': 'GY2P1Q98Y', @@ -115,10 +239,15 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'season_number': 1, 'episode': 'To the Future', 'episode_number': 73, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'chapters': 'count:2', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, }, 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, }, { + # Premium only 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', 'info_dict': { 'id': 'GYE5WKQGR', @@ -126,7 +255,7 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'duration': 366.459, 'timestamp': 1476788400, 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', - 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation', + 'title': 'SHELTER – Porter Robinson presents Shelter the Animation', 'upload_date': '20161018', 'series': 'SHELTER', 'series_id': 'GYGG09WWY', @@ -135,121 +264,206 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'season_number': 1, 'episode': 'Porter Robinson presents Shelter the Animation', 'episode_number': 0, - 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, }, 'params': {'skip_download': True}, - 'skip': 'Video is Premium only', }, { - 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', + 'url': 'https://www.crunchyroll.com/watch/GJWU2VKK3/cherry-blossom-meeting-and-a-coming-blizzard', + 'info_dict': { + 'id': 'GJWU2VKK3', + 'ext': 'mp4', + 'duration': 1420.054, + 'description': 'md5:2d1c67c0ec6ae514d9c30b0b99a625cd', + 'title': 'The Ice Guy and His Cool Female Colleague Episode 1 – Cherry Blossom Meeting and a Coming Blizzard', + 'series': 'The Ice Guy and His Cool Female Colleague', + 'series_id': 'GW4HM75NP', + 'season': 'The Ice Guy and His Cool Female Colleague', + 'season_id': 'GY9PC21VE', + 'season_number': 1, + 'episode': 'Cherry Blossom Meeting and a Coming Blizzard', + 'episode_number': 1, + 'chapters': 'count:2', + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 
'timestamp': 1672839000, + 'upload_date': '20230104', + 'age_limit': 14, + 'like_count': int, + 'dislike_count': int, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/GM8F313NQ', + 'info_dict': { + 'id': 'GM8F313NQ', + 'ext': 'mp4', + 'title': 'Garakowa -Restore the World-', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'duration': 3996.104, + 'age_limit': 13, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6', + 'info_dict': { + 'id': 'G62PEZ2E6', + 'description': 'md5:8d2f8b6b9dd77d87810882e7d2ee5608', + 'age_limit': 13, + 'duration': 65.138, + 'title': 'Garakowa -Restore the World-', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://www.crunchyroll.com/de/watch/GY2P1Q98Y', 'only_matching': True, }, { 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', 'only_matching': True, }] + # We want to support lazy playlist filtering and movie listings cannot be inside a playlist + _RETURN_TYPE = 'video' def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) + lang, internal_id = self._match_valid_url(url).group('lang', 'id') - episode_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, - note='Retrieving episode metadata', query=params) - if episode_response.get('is_premium_only') and not episode_response.get('playback'): - raise ExtractorError('This video is for premium members only.', expected=True) + # We need to use unsigned API call to allow ratings query string + response = traverse_obj(self._call_api( + f'objects/{internal_id}', internal_id, lang, 'object info', {'ratings': 'true'}), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) - stream_response = self._download_json( - f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id, - note='Retrieving stream info', query=params) - get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items() + object_type = response.get('type') + if object_type == 'episode': + result = self._transform_episode_response(response) - requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] - hardsub_preference = qualities(requested_hardsubs[::-1]) - requested_formats = self._configuration_arg('format') or ['adaptive_hls'] + elif object_type == 'movie': + result = self._transform_movie_response(response) - available_formats = {} - for stream_type, streams in get_streams('streams'): - if stream_type not in requested_formats: - continue - for stream in streams.values(): - if not stream.get('url'): - continue - hardsub_lang = stream.get('hardsub_locale') or '' - format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) - available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + elif object_type == 'movie_listing': + first_movie_id = traverse_obj(response, ('movie_listing_metadata', 'first_movie_id')) + if not self._yes_playlist(internal_id, first_movie_id): + return self.url_result(f'{self._BASE_URL}/{lang}watch/{first_movie_id}', CrunchyrollBetaIE, first_movie_id) + + def entries(): + movies = 
self._call_api(f'movie_listings/{internal_id}/movies', internal_id, lang, 'movie list') + for movie_response in traverse_obj(movies, ('data', ...)): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{movie_response["id"]}', + CrunchyrollBetaIE, **self._transform_movie_response(movie_response)) + + return self.playlist_result(entries(), **self._transform_movie_response(response)) - if '' in available_formats and 'all' not in requested_hardsubs: - full_format_langs = set(requested_hardsubs) - self.to_screen( - 'To get all formats of a hardsub language, use ' - '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". ' - 'See https://github.com/hypervideo/hypervideo#crunchyrollbeta for more info', - only_once=True) else: - full_format_langs = set(map(str.lower, available_formats)) + raise ExtractorError(f'Unknown object type {object_type}') - formats = [] - for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): - if stream_type.endswith('hls'): - if hardsub_lang.lower() in full_format_langs: - adaptive_formats = self._extract_m3u8_formats( - stream_url, display_id, 'mp4', m3u8_id=format_id, - fatal=False, note=f'Downloading {format_id} HLS manifest') - else: - adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) - elif stream_type.endswith('dash'): - adaptive_formats = self._extract_mpd_formats( - stream_url, display_id, mpd_id=format_id, - fatal=False, note=f'Downloading {format_id} MPD manifest') - else: - self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) - continue - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = stream_response.get('audio_locale') - f['quality'] = hardsub_preference(hardsub_lang.lower()) - formats.extend(adaptive_formats) + # There might be multiple audio languages for one object (`<object>_metadata.versions`), + # so we need to get the id from `streams_link` instead or we dont know which language to choose + streams_link = response.get('streams_link') + if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): + message = f'This {object_type} is for premium members only' + if self.is_logged_in: + raise ExtractorError(message, expected=True) + self.raise_login_required(message) + + # We need go from unsigned to signed api to avoid getting soft banned + stream_response = self._call_cms_api_signed(remove_start( + streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info') + result['formats'] = self._extract_formats(stream_response, internal_id) + result['subtitles'] = self._extract_subtitles(stream_response) + + # if no intro chapter is available, a 403 without usable data is returned + intro_chapter = self._download_json( + f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', + internal_id, note='Downloading chapter info', fatal=False, errnote=False) + if isinstance(intro_chapter, dict): + result['chapters'] = [{ + 'title': 'Intro', + 'start_time': float_or_none(intro_chapter.get('startTime')), + 'end_time': float_or_none(intro_chapter.get('endTime')), + }] + + def calculate_count(item): + return parse_count(''.join((item['displayed'], item.get('unit') or ''))) + + result.update(traverse_obj(response, ('rating', { + 'like_count': ('up', {calculate_count}), + 'dislike_count': ('down', {calculate_count}), + }))) + + return result + @staticmethod + def _transform_episode_response(data): + metadata = traverse_obj(data, (('episode_metadata', None), 
{dict}), get_all=False) or {} return { - 'id': internal_id, - 'title': '%s Episode %s – %s' % ( - episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), - 'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode_response.get('duration_ms'), 1000), - 'timestamp': parse_iso8601(episode_response.get('upload_date')), - 'series': episode_response.get('series_title'), - 'series_id': episode_response.get('series_id'), - 'season': episode_response.get('season_title'), - 'season_id': episode_response.get('season_id'), - 'season_number': episode_response.get('season_number'), - 'episode': episode_response.get('title'), - 'episode_number': episode_response.get('sequence_number'), - 'formats': formats, - 'thumbnails': [{ - 'url': thumb.get('source'), - 'width': thumb.get('width'), - 'height': thumb.get('height'), - } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []], - 'subtitles': { - lang: [{ - 'url': subtitle_data.get('url'), - 'ext': subtitle_data.get('format') - }] for lang, subtitle_data in get_streams('subtitles') - }, + 'id': data['id'], + 'title': ' \u2013 '.join(( + ('%s%s' % ( + format_field(metadata, 'season_title'), + format_field(metadata, 'episode', ' Episode %s'))), + format_field(data, 'title'))), + **traverse_obj(data, { + 'episode': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'timestamp': ('upload_date', {parse_iso8601}), + 'series': ('series_title', {str}), + 'series_id': ('series_id', {str}), + 'season': ('season_title', {str}), + 'season_id': ('season_id', {str}), + 'season_number': ('season_number', ({int}, {float_or_none})), + 'episode_number': ('sequence_number', ({int}, {float_or_none})), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'language': ('audio_locale', {str}), + }, get_all=False), + } + + @staticmethod + def _transform_movie_response(data): + metadata = traverse_obj(data, (('movie_metadata', 'movie_listing_metadata', None), {dict}), get_all=False) or {} + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', 'thumbnail', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + **traverse_obj(metadata, { + 'duration': ('duration_ms', {lambda x: float_or_none(x, 1000)}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), } -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): +class CrunchyrollBetaShowIE(CrunchyrollCmsBaseIE): IE_NAME = 'crunchyroll:playlist' _VALID_URL = r'''(?x) - https?://(?:beta|www)\.crunchyroll\.com/ + https?://(?:beta\.|www\.)?crunchyroll\.com/ (?P<lang>(?:\w{2}(?:-\w{2})?/)?) 
- series/(?P<id>\w+) - (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' + series/(?P<id>\w+)''' _TESTS = [{ 'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { 'id': 'GY19NQ2QR', 'title': 'Girl Friend BETA', + 'description': 'md5:99c1b22ee30a74b536a8277ced8eb750', + # XXX: `thumbnail` does not get set from `thumbnails` in playlist + # 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'age_limit': 14, }, 'playlist_mincount': 10, }, { @@ -258,40 +472,179 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - api_domain, bucket, params = self._get_params(lang) + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + + def entries(): + seasons_response = self._call_cms_api_signed(f'seasons?series_id={internal_id}', internal_id, lang, 'seasons') + for season in traverse_obj(seasons_response, ('items', ..., {dict})): + episodes_response = self._call_cms_api_signed( + f'episodes?season_id={season["id"]}', season["id"], lang, 'episode list') + for episode_response in traverse_obj(episodes_response, ('items', ..., {dict})): + yield self.url_result( + f'{self._BASE_URL}/{lang}watch/{episode_response["id"]}', + CrunchyrollBetaIE, **CrunchyrollBetaIE._transform_episode_response(episode_response)) - series_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, - note='Retrieving series metadata', query=params) + return self.playlist_result( + entries(), internal_id, + **traverse_obj(self._call_api(f'series/{internal_id}', internal_id, lang, 'series'), ('data', 0, { + 'title': ('title', {str}), + 'description': ('description', {lambda x: x.replace(r'\r\n', '\n')}), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + 'thumbnails': ('images', ..., ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }) + }))) - seasons_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, - note='Retrieving season list', query=params) + +class CrunchyrollMusicIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:music' + _VALID_URL = r'''(?x) + https?://(?:www\.)?crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) 
+ watch/(?P<type>concert|musicvideo)/(?P<id>\w+)''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV5B02C79', + 'display_id': 'egaono-hana', + 'title': 'Egaono Hana', + 'track': 'Egaono Hana', + 'artist': 'Goose house', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genre': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV88BB7F2C', + 'display_id': 'crossing-field', + 'title': 'Crossing Field', + 'track': 'Crossing Field', + 'artist': 'LiSA', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genre': ['Anime'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MC2E2AC135', + 'display_id': 'live-is-smile-always-364joker-at-yokohama-arena', + 'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', + 'artist': 'LiSA', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'description': 'md5:747444e7e6300907b7a43f0a0503072e', + 'genre': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79/egaono-hana', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135/live-is-smile-always-364joker-at-yokohama-arena', + 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id, object_type = self._match_valid_url(url).group('lang', 'id', 'type') + path, name = { + 'concert': ('concerts', 'concert info'), + 'musicvideo': ('music_videos', 'music video info'), + }[object_type] + response = traverse_obj(self._call_api(f'{path}/{internal_id}', internal_id, lang, name), ('data', 0, {dict})) + if not response: + raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) + + streams_link = response.get('streams_link') + if not streams_link and response.get('isPremiumOnly'): + message = f'This {response.get("type") or "media"} is for premium members only' + if self.is_logged_in: + raise ExtractorError(message, expected=True) + self.raise_login_required(message) + + result = self._transform_music_response(response) + stream_response = self._call_api(streams_link, internal_id, lang, 'stream info') + result['formats'] = self._extract_formats(stream_response, internal_id) + + return result + + @staticmethod + def _transform_music_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'display_id': 'slug', + 'title': 'title', + 'track': 'title', + 'artist': ('artist', 'name'), + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genre': ('genres', ..., 'displayValue'), + 'age_limit': ('maturity_ratings', -1, {parse_age_limit}), + }), + } + + +class CrunchyrollArtistIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:artist' + _VALID_URL = r'''(?x) + https?://(?:www\.)?crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) 
+ artist/(?P<id>\w{10})''' + _TESTS = [{ + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D', + 'info_dict': { + 'id': 'MA179CB50D', + 'title': 'LiSA', + 'genre': ['J-Pop', 'Anime', 'Rock'], + 'description': 'md5:16d87de61a55c3f7d6c454b73285938e', + }, + 'playlist_mincount': 83, + }, { + 'url': 'https://www.crunchyroll.com/artist/MA179CB50D/lisa', + 'only_matching': True, + }] + _API_ENDPOINT = 'music' + + def _real_extract(self, url): + lang, internal_id = self._match_valid_url(url).group('lang', 'id') + response = traverse_obj(self._call_api( + f'artists/{internal_id}', internal_id, lang, 'artist info'), ('data', 0)) def entries(): - for season in seasons_response['items']: - episodes_response = self._download_json( - f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, - note=f'Retrieving episode list for {season.get("slug_title")}', query=params) - for episode in episodes_response['items']: - episode_id = episode['id'] - episode_display_id = episode['slug_title'] - yield { - '_type': 'url', - 'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', - 'ie_key': CrunchyrollBetaIE.ie_key(), - 'id': episode_id, - 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), - 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), - 'duration': float_or_none(episode.get('duration_ms'), 1000), - 'series': episode.get('series_title'), - 'series_id': episode.get('series_id'), - 'season': episode.get('season_title'), - 'season_id': episode.get('season_id'), - 'season_number': episode.get('season_number'), - 'episode': episode.get('title'), - 'episode_number': episode.get('sequence_number') - } - - return self.playlist_result(entries(), internal_id, series_response.get('title')) + for attribute, path in [('concerts', 'concert'), ('videos', 'musicvideo')]: + for internal_id in traverse_obj(response, (attribute, ...)): + yield self.url_result(f'{self._BASE_URL}/watch/{path}/{internal_id}', CrunchyrollMusicIE, internal_id) + + return self.playlist_result(entries(), **self._transform_artist_response(response)) + + @staticmethod + def _transform_artist_response(data): + return { + 'id': data['id'], + **traverse_obj(data, { + 'title': 'name', + 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n')}), + 'thumbnails': ('images', ..., ..., { + 'url': ('source', {url_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + 'genre': ('genres', ..., 'displayValue'), + }), + } diff --git a/hypervideo_dl/extractor/cultureunplugged.py b/hypervideo_dl/extractor/cultureunplugged.py index 2fb2280..9c8509f 100644 --- a/hypervideo_dl/extractor/cultureunplugged.py +++ b/hypervideo_dl/extractor/cultureunplugged.py @@ -1,10 +1,8 @@ import time from .common import InfoExtractor -from ..utils import ( - int_or_none, - HEADRequest, -) +from ..networking import HEADRequest +from ..utils import int_or_none class CultureUnpluggedIE(InfoExtractor): diff --git a/hypervideo_dl/extractor/curiositystream.py b/hypervideo_dl/extractor/curiositystream.py index 26cf24f..941cf4e 100644 --- a/hypervideo_dl/extractor/curiositystream.py +++ b/hypervideo_dl/extractor/curiositystream.py @@ -1,4 +1,5 @@ import re +import urllib.parse from .common import InfoExtractor from ..compat import compat_str @@ -23,7 +24,7 @@ class CuriosityStreamBaseIE(InfoExtractor): auth_cookie = self._get_cookies('https://curiositystream.com').get('auth_token') if 
auth_cookie: self.write_debug('Obtained auth_token cookie') - self._auth_token = auth_cookie.value + self._auth_token = urllib.parse.unquote(auth_cookie.value) if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( @@ -54,8 +55,11 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', 'channel': 'Curiosity Stream', 'categories': ['Technology', 'Interview'], - 'average_rating': 96.79, + 'average_rating': float, 'series_id': '2', + 'thumbnail': r're:https://img.curiositystream.com/.+\.jpg', + 'tags': [], + 'duration': 158 }, 'params': { # m3u8 download diff --git a/hypervideo_dl/extractor/dacast.py b/hypervideo_dl/extractor/dacast.py new file mode 100644 index 0000000..4e81aa4 --- /dev/null +++ b/hypervideo_dl/extractor/dacast.py @@ -0,0 +1,158 @@ +import hashlib +import re +import time + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + classproperty, + float_or_none, + traverse_obj, + url_or_none, +) + + +class DacastBaseIE(InfoExtractor): + _URL_TYPE = None + + @classproperty + def _VALID_URL(cls): + return fr'https?://iframe\.dacast\.com/{cls._URL_TYPE}/(?P<user_id>[\w-]+)/(?P<id>[\w-]+)' + + @classproperty + def _EMBED_REGEX(cls): + return [rf'<iframe[^>]+\bsrc=["\'](?P<url>{cls._VALID_URL})'] + + _API_INFO_URL = 'https://playback.dacast.com/content/info' + + @classmethod + def _get_url_from_id(cls, content_id): + user_id, media_id = content_id.split(f'-{cls._URL_TYPE}-') + return f'https://iframe.dacast.com/{cls._URL_TYPE}/{user_id}/{media_id}' + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for content_id in re.findall( + rf'<script[^>]+\bsrc=["\']https://player\.dacast\.com/js/player\.js\?contentId=([\w-]+-{cls._URL_TYPE}-[\w-]+)["\']', webpage): + yield cls._get_url_from_id(content_id) + + +class DacastVODIE(DacastBaseIE): + _URL_TYPE = 'vod' + _TESTS = [{ + 'url': 'https://iframe.dacast.com/vod/acae82153ef4d7a7344ae4eaa86af534/1c6143e3-5a06-371d-8695-19b96ea49090', + 'info_dict': { + 'id': '1c6143e3-5a06-371d-8695-19b96ea49090', + 'ext': 'mp4', + 'uploader_id': 'acae82153ef4d7a7344ae4eaa86af534', + 'title': '2_4||Adnexal mass characterisation: O-RADS US and MRI||N. 
Bharwani, London/UK', + 'thumbnail': 'https://universe-files.dacast.com/26137208-5858-65c1-5e9a-9d6b6bd2b6c2', + }, + 'params': {'skip_download': 'm3u8'}, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.dacast.com/support/knowledgebase/how-can-i-embed-a-video-on-my-website/', + 'info_dict': { + 'id': 'b6674869-f08a-23c5-1d7b-81f5309e1a90', + 'ext': 'mp4', + 'title': '4-HowToEmbedVideo.mp4', + 'uploader_id': '3b67c4a9-3886-4eb1-d0eb-39b23b14bef3', + 'thumbnail': 'https://universe-files.dacast.com/d26ab48f-a52a-8783-c42e-a90290ba06b6.png', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://gist.githubusercontent.com/bashonly/4ad249ef2910346fbdf3809b220f11ee/raw/87349778d4af1a80b1fcc3beb9c88108de5858f5/dacast_embeds.html', + 'info_dict': { + 'id': 'e7df418e-a83b-7a7f-7b5e-1a667981e8fa', + 'ext': 'mp4', + 'title': 'Evening Service 2-5-23', + 'uploader_id': '943bb1ab3c03695ba85330d92d6d226e', + 'thumbnail': 'https://universe-files.dacast.com/337472b3-e92c-2ea4-7eb7-5700da477f67', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + user_id, video_id = self._match_valid_url(url).group('user_id', 'id') + query = {'contentId': f'{user_id}-vod-{video_id}', 'provider': 'universe'} + info = self._download_json(self._API_INFO_URL, video_id, query=query, fatal=False) + access = self._download_json( + 'https://playback.dacast.com/content/access', video_id, + note='Downloading access JSON', query=query, expected_status=403) + + error = access.get('error') + if error in ('Broadcaster has been blocked', 'Content is offline'): + raise ExtractorError(error, expected=True) + elif error: + raise ExtractorError(f'Dacast API says "{error}"') + + hls_url = access['hls'] + hls_aes = {} + + if 'DRM_EXT' in hls_url: + self.report_drm(video_id) + elif '/uspaes/' in hls_url: + # From https://player.dacast.com/js/player.js + ts = int(time.time()) + signature = hashlib.sha1( + f'{10413792000 - ts}{ts}YfaKtquEEpDeusCKbvYszIEZnWmBcSvw').digest().hex() + hls_aes['uri'] = f'https://keys.dacast.com/uspaes/{video_id}.key?s={signature}&ts={ts}' + + for retry in self.RetryManager(): + try: + formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls') + except ExtractorError as e: + # CDN will randomly respond with 403 + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + retry.error = e + continue + raise + + return { + 'id': video_id, + 'uploader_id': user_id, + 'formats': formats, + 'hls_aes': hls_aes or None, + **traverse_obj(info, ('contentInfo', { + 'title': 'title', + 'duration': ('duration', {float_or_none}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + })), + } + + +class DacastPlaylistIE(DacastBaseIE): + _URL_TYPE = 'playlist' + _TESTS = [{ + 'url': 'https://iframe.dacast.com/playlist/943bb1ab3c03695ba85330d92d6d226e/b632eb053cac17a9c9a02bcfc827f2d8', + 'playlist_mincount': 28, + 'info_dict': { + 'id': 'b632eb053cac17a9c9a02bcfc827f2d8', + 'title': 'Archive Sermons', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://gist.githubusercontent.com/bashonly/7efb606f49f3c6e07ea0327de5a661d1/raw/05a16eac830245ea301fb0a585023bec71e6093c/dacast_playlist_embed.html', + 'playlist_mincount': 28, + 'info_dict': { + 'id': 'b632eb053cac17a9c9a02bcfc827f2d8', + 'title': 'Archive Sermons', + }, + }] + + def _real_extract(self, url): + user_id, playlist_id = self._match_valid_url(url).group('user_id', 'id') + info = self._download_json( + self._API_INFO_URL, playlist_id, note='Downloading playlist JSON', query={ + 'contentId': 
f'{user_id}-playlist-{playlist_id}', + 'provider': 'universe', + })['contentInfo'] + + def entries(info): + for video in traverse_obj(info, ('features', 'playlist', 'contents', lambda _, v: v['id'])): + yield self.url_result( + DacastVODIE._get_url_from_id(video['id']), DacastVODIE, video['id'], video.get('title')) + + return self.playlist_result(entries(info), playlist_id, info.get('title')) diff --git a/hypervideo_dl/extractor/daftsex.py b/hypervideo_dl/extractor/daftsex.py index 551d5e3..92510c7 100644 --- a/hypervideo_dl/extractor/daftsex.py +++ b/hypervideo_dl/extractor/daftsex.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( + ExtractorError, int_or_none, js_to_json, parse_count, @@ -12,21 +13,24 @@ from ..utils import ( class DaftsexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P<id>-?\d+_\d+)' + _VALID_URL = r'https?://(?:www\.)?daft\.sex/watch/(?P<id>-?\d+_\d+)' _TESTS = [{ - 'url': 'https://daftsex.com/watch/-35370899_456246186', - 'md5': 'd95135e6cea2d905bea20dbe82cda64a', + 'url': 'https://daft.sex/watch/-35370899_456246186', + 'md5': '64c04ef7b4c7b04b308f3b0c78efe7cd', 'info_dict': { 'id': '-35370899_456246186', 'ext': 'mp4', 'title': 'just relaxing', - 'description': 'just relaxing - Watch video Watch video in high quality', + 'description': 'just relaxing – Watch video Watch video in high quality', 'upload_date': '20201113', 'timestamp': 1605261911, - 'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + 'duration': 15.0, + 'view_count': int }, }, { - 'url': 'https://daftsex.com/watch/-156601359_456242791', + 'url': 'https://daft.sex/watch/-156601359_456242791', 'info_dict': { 'id': '-156601359_456242791', 'ext': 'mp4', @@ -36,6 +40,7 @@ class DaftsexIE(InfoExtractor): 'timestamp': 1600250735, 'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ', }, + 'skip': 'deleted / private' }] def _real_extract(self, url): @@ -60,7 +65,7 @@ class DaftsexIE(InfoExtractor): webpage, 'player color', fatal=False) or '' embed_page = self._download_webpage( - 'https://daxab.com/player/%s?color=%s' % (player_hash, player_color), + 'https://dxb.to/player/%s?color=%s' % (player_hash, player_color), video_id, headers={'Referer': url}) video_params = self._parse_json( self._search_regex( @@ -94,15 +99,19 @@ class DaftsexIE(InfoExtractor): 'age_limit': 18, } - item = self._download_json( + items = self._download_json( f'{server_domain}/method/video.get/{video_id}', video_id, headers={'Referer': url}, query={ 'token': video_params['video']['access_token'], 'videos': video_id, 'ckey': video_params['c_key'], 'credentials': video_params['video']['credentials'], - })['response']['items'][0] + })['response']['items'] + + if not items: + raise ExtractorError('Video is not available', video_id=video_id, expected=True) + item = items[0] formats = [] for f_id, f_url in item.get('files', {}).items(): if f_id == 'external': diff --git a/hypervideo_dl/extractor/dailymotion.py b/hypervideo_dl/extractor/dailymotion.py index 2a44718..21263d4 100644 --- a/hypervideo_dl/extractor/dailymotion.py +++ b/hypervideo_dl/extractor/dailymotion.py @@ -3,7 +3,7 @@ import json import re from .common import InfoExtractor -from ..compat import compat_HTTPError 
+from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, @@ -68,9 +68,9 @@ class DailymotionBaseInfoExtractor(InfoExtractor): None, 'Downloading Access Token', data=urlencode_postdata(data))['access_token'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: raise ExtractorError(self._parse_json( - e.cause.read().decode(), xid)['error_description'], expected=True) + e.cause.response.read().decode(), xid)['error_description'], expected=True) raise self._set_dailymotion_cookie('access_token' if username else 'client_token', token) self._HEADERS['Authorization'] = 'Bearer ' + token diff --git a/hypervideo_dl/extractor/digitalconcerthall.py b/hypervideo_dl/extractor/digitalconcerthall.py index 3461e36..c11cd79 100644 --- a/hypervideo_dl/extractor/digitalconcerthall.py +++ b/hypervideo_dl/extractor/digitalconcerthall.py @@ -11,7 +11,7 @@ from ..utils import ( class DigitalConcertHallIE(InfoExtractor): IE_DESC = 'DigitalConcertHall extractor' - _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/concert/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/(?P<type>film|concert)/(?P<id>[0-9]+)' _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token' _ACCESS_TOKEN = None _NETRC_MACHINE = 'digitalconcerthall' @@ -40,6 +40,19 @@ class DigitalConcertHallIE(InfoExtractor): }, 'params': {'skip_download': 'm3u8'}, 'playlist_count': 3, + }, { + 'url': 'https://www.digitalconcerthall.com/en/film/388', + 'info_dict': { + 'id': '388', + 'ext': 'mp4', + 'title': 'The Berliner Philharmoniker and Frank Peter Zimmermann', + 'description': 'md5:cfe25a7044fa4be13743e5089b5b5eb2', + 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$', + 'upload_date': '20220714', + 'timestamp': 1657785600, + 'album_artist': 'Frank Peter Zimmermann / Benedikt von Bernstorff / Jakob von Bernstorff', + }, + 'params': {'skip_download': 'm3u8'}, }] def _perform_login(self, username, password): @@ -75,7 +88,7 @@ class DigitalConcertHallIE(InfoExtractor): if not self._ACCESS_TOKEN: self.raise_login_required(method='password') - def _entries(self, items, language, **kwargs): + def _entries(self, items, language, type_, **kwargs): for item in items: video_id = item['id'] stream_info = self._download_json( @@ -103,11 +116,11 @@ class DigitalConcertHallIE(InfoExtractor): 'start_time': chapter.get('time'), 'end_time': try_get(chapter, lambda x: x['time'] + x['duration']), 'title': chapter.get('text'), - } for chapter in item['cuepoints']] if item.get('cuepoints') else None, + } for chapter in item['cuepoints']] if item.get('cuepoints') and type_ == 'concert' else None, } def _real_extract(self, url): - language, video_id = self._match_valid_url(url).group('language', 'id') + language, type_, video_id = self._match_valid_url(url).group('language', 'type', 'id') if not language: language = 'en' @@ -120,18 +133,18 @@ class DigitalConcertHallIE(InfoExtractor): }] vid_info = self._download_json( - f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={ + f'https://api.digitalconcerthall.com/v2/{type_}/{video_id}', video_id, headers={ 'Accept': 'application/json', 'Accept-Language': language }) album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '') + videos = [vid_info] if type_ == 'film' else traverse_obj(vid_info, 
('_embedded', ..., ...)) return { '_type': 'playlist', 'id': video_id, 'title': vid_info.get('title'), - 'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language, - thumbnails=thumbnails, album_artist=album_artist), + 'entries': self._entries(videos, language, thumbnails=thumbnails, album_artist=album_artist, type_=type_), 'thumbnails': thumbnails, 'album_artist': album_artist, } diff --git a/hypervideo_dl/extractor/discogs.py b/hypervideo_dl/extractor/discogs.py new file mode 100644 index 0000000..048c622 --- /dev/null +++ b/hypervideo_dl/extractor/discogs.py @@ -0,0 +1,35 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import traverse_obj + + +class DiscogsReleasePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?discogs\.com/(?P<type>release|master)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.discogs.com/release/1-The-Persuader-Stockholm', + 'info_dict': { + 'id': 'release1', + 'title': 'Stockholm', + }, + 'playlist_mincount': 7, + }, { + 'url': 'https://www.discogs.com/master/113-Vince-Watson-Moments-In-Time', + 'info_dict': { + 'id': 'master113', + 'title': 'Moments In Time', + }, + 'playlist_mincount': 53, + }] + + def _real_extract(self, url): + playlist_id, playlist_type = self._match_valid_url(url).group('id', 'type') + + display_id = f'{playlist_type}{playlist_id}' + response = self._download_json( + f'https://api.discogs.com/{playlist_type}s/{playlist_id}', display_id) + + entries = [ + self.url_result(video['uri'], YoutubeIE, video_title=video.get('title')) + for video in traverse_obj(response, ('videos', lambda _, v: YoutubeIE.suitable(v['uri'])))] + + return self.playlist_result(entries, display_id, response.get('title')) diff --git a/hypervideo_dl/extractor/discovery.py b/hypervideo_dl/extractor/discovery.py index fd3fc8f..75b4643 100644 --- a/hypervideo_dl/extractor/discovery.py +++ b/hypervideo_dl/extractor/discovery.py @@ -3,8 +3,8 @@ import string from .discoverygo import DiscoveryGoBaseIE from ..compat import compat_urllib_parse_unquote +from ..networking.exceptions import HTTPError from ..utils import ExtractorError -from ..compat import compat_HTTPError class DiscoveryIE(DiscoveryGoBaseIE): @@ -78,7 +78,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): 'Downloading token JSON metadata', query={ 'authRel': 'authorization', 'client_id': '3020a40c2356a645b4b4', - 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'nonce': ''.join(random.choices(string.ascii_letters, k=32)), 'redirectUri': 'https://www.discovery.com/', })['access_token'] @@ -100,9 +100,9 @@ class DiscoveryIE(DiscoveryGoBaseIE): self._API_BASE_URL + 'streaming/video/' + video_id, display_id, 'Downloading streaming JSON metadata', headers=headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403): e_description = self._parse_json( - e.cause.read().decode(), display_id)['description'] + e.cause.response.read().decode(), display_id)['description'] if 'resource not available for country' in e_description: self.raise_geo_restricted(countries=self._GEO_COUNTRIES) if 'Authorized Networks' in e_description: diff --git a/hypervideo_dl/extractor/dlf.py b/hypervideo_dl/extractor/dlf.py new file mode 100644 index 0000000..88a4149 --- /dev/null +++ b/hypervideo_dl/extractor/dlf.py @@ -0,0 +1,192 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + 
extract_attributes, + int_or_none, + traverse_obj, + url_or_none, +) + + +class DLFBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/' + _BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)' + + def _parse_button_attrs(self, button, audio_id=None): + attrs = extract_attributes(button) + audio_id = audio_id or attrs['data-audio-diraid'] + + url = traverse_obj( + attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference', + 'data-audio-src', expected_type=url_or_none) + ext = determine_ext(url) + + return { + 'id': audio_id, + 'extractor_key': DLFIE.ie_key(), + 'extractor': DLFIE.IE_NAME, + **traverse_obj(attrs, { + 'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), {str}), + 'duration': (('data-audioduration', 'data-audio-duration'), {int_or_none}), + 'thumbnail': ('data-audioimage', {url_or_none}), + 'uploader': 'data-audio-producer', + 'series': 'data-audio-series', + 'channel': 'data-audio-origin-site-name', + 'webpage_url': ('data-audio-download-tracking-path', {url_or_none}), + }, get_all=False), + 'formats': (self._extract_m3u8_formats(url, audio_id, fatal=False) + if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}]) + } + + +class DLFIE(DLFBaseIE): + IE_NAME = 'dlf' + _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html' + _TESTS = [ + # Audio as an HLS stream + { + 'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html', + 'info_dict': { + 'id': '03a3eb19', + 'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien', + 'ext': 'm4a', + 'duration': 3298, + 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673', + 'uploader': 'Deutschlandfunk', + 'series': 'On Stage', + 'channel': 'deutschlandfunk' + }, + 'params': { + 'skip_download': 'm3u8' + }, + 'skip': 'This webpage no longer exists' + }, { + 'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html', + 'info_dict': { + 'id': 'd9cc1856', + 'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner', + 'ext': 'mp3', + 'duration': 291, + 'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673', + 'uploader': 'Deutschlandfunk', + 'series': 'Kommentare und Themen der Woche', + 'channel': 'deutschlandfunk' + } + }, + ] + + def _real_extract(self, url): + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id) + + return self._parse_button_attrs( + self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id) + + +class DLFCorpusIE(DLFBaseIE): + IE_NAME = 'dlf:corpus' + IE_DESC = 'DLF Multi-feed Archives' + _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html' + _TESTS = [ + # Recorded news broadcast with referrals to related broadcasts + { + 'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html', + 'info_dict': { + 'id': 'fechten-russland-belarus-ukraine-protest-100', + 'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet', + 'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad' + }, + 'playlist_mincount': 5, + 'playlist': [{ + 'info_dict': { + 'id': '1fc5d64a', + 'title': r're:Wiederzulassung als neutrale 
Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet', + 'ext': 'mp3', + 'duration': 252, + 'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '2ada145f', + 'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten', + 'ext': 'mp3', + 'duration': 336, + 'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005', + 'uploader': 'Deutschlandfunk', + 'series': 'Deutschlandfunk Nova', + 'channel': 'deutschlandfunk-nova' + } + }, { + 'info_dict': { + 'id': '5e55e8c9', + 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis', + 'ext': 'mp3', + 'duration': 187, + 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '47e1a096', + 'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"', + 'ext': 'mp3', + 'duration': 602, + 'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }, { + 'info_dict': { + 'id': '5e55e8c9', + 'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis', + 'ext': 'mp3', + 'duration': 187, + 'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412', + 'uploader': 'Deutschlandfunk', + 'series': 'Sport am Samstag', + 'channel': 'deutschlandfunk' + } + }] + }, + # Podcast feed with tag buttons, playlist count fluctuates + { + 'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html', + 'info_dict': { + 'id': 'kommentare-und-themen-der-woche-100', + 'title': 'Meinung - Kommentare und Themen der Woche', + 'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5', + }, + 'playlist_mincount': 10, + }, + # Podcast feed with no description + { + 'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html', + 'info_dict': { + 'id': 'podcast-tolle-idee-100', + 'title': 'Wissenschaftspodcast - Tolle Idee! 
- Was wurde daraus?', + }, + 'playlist_mincount': 11, + }, + ] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'description': self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage, default=None), + 'title': self._html_search_meta( + ['og:title', 'twitter:title'], webpage, default=None), + 'entries': map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)), + } diff --git a/hypervideo_dl/extractor/douyutv.py b/hypervideo_dl/extractor/douyutv.py index 477f468..fa40844 100644 --- a/hypervideo_dl/extractor/douyutv.py +++ b/hypervideo_dl/extractor/douyutv.py @@ -1,6 +1,7 @@ import time import hashlib import re +import urllib from .common import InfoExtractor from ..utils import ( @@ -13,7 +14,7 @@ from ..utils import ( class DouyuTVIE(InfoExtractor): IE_DESC = '斗鱼' - _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?:[^/]+/)*(?P<id>[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { @@ -22,7 +23,7 @@ class DouyuTVIE(InfoExtractor): 'ext': 'flv', 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.png', 'uploader': '7师傅', 'is_live': True, }, @@ -37,7 +38,7 @@ class DouyuTVIE(InfoExtractor): 'ext': 'flv', 'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'md5:746a2f7a253966a06755a912f0acc0d2', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.png', 'uploader': 'douyu小漠', 'is_live': True, }, @@ -53,7 +54,7 @@ class DouyuTVIE(InfoExtractor): 'ext': 'flv', 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.png', 'uploader': '7师傅', 'is_live': True, }, @@ -61,6 +62,21 @@ class DouyuTVIE(InfoExtractor): 'skip_download': True, }, }, { + 'url': 'https://www.douyu.com/topic/ydxc?rid=6560603', + 'info_dict': { + 'id': '6560603', + 'display_id': '6560603', + 'ext': 'flv', + 'title': 're:^阿余:新年快乐恭喜发财! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 're:.*直播时间.*', + 'thumbnail': r're:^https?://.*\.png', + 'uploader': '阿涛皎月Carry', + 'live_status': 'is_live', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://www.douyu.com/xiaocang', 'only_matching': True, }, { @@ -79,28 +95,24 @@ class DouyuTVIE(InfoExtractor): room_id = self._html_search_regex( r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') - # Grab metadata from mobile API + # Grab metadata from API + params = { + 'aid': 'wp', + 'client_sys': 'wp', + 'time': int(time.time()), + } + params['auth'] = hashlib.md5( + f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest() room = self._download_json( - 'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id, - note='Downloading room info')['data'] + f'http://www.douyutv.com/api/v1/room/{room_id}', video_id, + note='Downloading room info', query=params)['data'] # 1 = live, 2 = offline if room.get('show_status') == '2': raise ExtractorError('Live stream is offline', expected=True) - # Grab the URL from PC client API - # The m3u8 url from mobile API requires re-authentication every 5 minutes - tt = int(time.time()) - signContent = 'lapi/live/thirdPart/getPlay/%s?aid=pcclient&rate=0&time=%d9TUk5fjjUjg9qIMH3sdnh' % (room_id, tt) - sign = hashlib.md5(signContent.encode('ascii')).hexdigest() - video_url = self._download_json( - 'http://coapi.douyucdn.cn/lapi/live/thirdPart/getPlay/' + room_id, - video_id, note='Downloading video URL info', - query={'rate': 0}, headers={ - 'auth': sign, - 'time': str(tt), - 'aid': 'pcclient' - })['data']['live_url'] + video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL')) + formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id) title = unescapeHTML(room['room_name']) description = room.get('show_details') @@ -110,12 +122,13 @@ class DouyuTVIE(InfoExtractor): return { 'id': room_id, 'display_id': video_id, - 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, 'is_live': True, + 'subtitles': subs, + 'formats': formats, } diff --git a/hypervideo_dl/extractor/dplay.py b/hypervideo_dl/extractor/dplay.py index 8eb4d8f..363b4be 100644 --- a/hypervideo_dl/extractor/dplay.py +++ b/hypervideo_dl/extractor/dplay.py @@ -2,7 +2,7 @@ import json import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -39,7 +39,7 @@ class DPlayBaseIE(InfoExtractor): return f'Bearer {token}' def _process_errors(self, e, geo_countries): - info = self._parse_json(e.cause.read().decode('utf-8'), None) + info = self._parse_json(e.cause.response.read().decode('utf-8'), None) error = info['errors'][0] error_code = error.get('code') if error_code == 'access.denied.geoblocked': @@ -65,6 +65,7 @@ class DPlayBaseIE(InfoExtractor): return streaming_list def _get_disco_api_info(self, url, display_id, disco_host, realm, country, domain=''): + country = self.get_param('geo_bypass_country') or country geo_countries = [country.upper()] self._initialize_geo_bypass({ 'countries': geo_countries, @@ -86,7 +87,7 @@ class DPlayBaseIE(InfoExtractor): 'include': 'images,primaryChannel,show,tags' }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: self._process_errors(e, 
geo_countries) raise video_id = video['data']['id'] @@ -98,7 +99,7 @@ class DPlayBaseIE(InfoExtractor): streaming = self._download_video_playback_info( disco_base, video_id, headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: self._process_errors(e, geo_countries) raise for format_dict in streaming: @@ -745,7 +746,7 @@ class MotorTrendIE(DiscoveryPlusBaseIE): class MotorTrendOnDemandIE(DiscoveryPlusBaseIE): - _VALID_URL = r'https?://(?:www\.)?motortrendondemand\.com/detail' + DPlayBaseIE._PATH_REGEX + _VALID_URL = r'https?://(?:www\.)?motortrend(?:ondemand\.com|\.com/plus)/detail' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.motortrendondemand.com/detail/wheelstanding-dump-truck-stubby-bobs-comeback/37699/784', 'info_dict': { @@ -766,6 +767,25 @@ class MotorTrendOnDemandIE(DiscoveryPlusBaseIE): 'upload_date': '20140101', 'tags': [], }, + }, { + 'url': 'https://www.motortrend.com/plus/detail/roadworthy-rescues-teaser-trailer/4922860/', + 'info_dict': { + 'id': '4922860', + 'ext': 'mp4', + 'title': 'Roadworthy Rescues | Teaser Trailer', + 'description': 'Derek Bieri helps Freiburger and Finnegan with their \'68 big-block Dart.', + 'display_id': 'roadworthy-rescues-teaser-trailer/4922860', + 'creator': 'Originals', + 'series': 'Roadworthy Rescues', + 'thumbnail': r're:^https?://.+\.jpe?g$', + 'upload_date': '20220907', + 'timestamp': 1662523200, + 'duration': 1066.356, + 'tags': [], + }, + }, { + 'url': 'https://www.motortrend.com/plus/detail/ugly-duckling/2450033/12439', + 'only_matching': True, }] _PRODUCT = 'MTOD' @@ -1001,3 +1021,39 @@ class DiscoveryPlusIndiaShowIE(DiscoveryPlusShowBaseIE): _SHOW_STR = 'show' _INDEX = 4 _VIDEO_IE = DiscoveryPlusIndiaIE + + +class GlobalCyclingNetworkPlusIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://plus\.globalcyclingnetwork\.com/watch/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://plus.globalcyclingnetwork.com/watch/1397691', + 'info_dict': { + 'id': '1397691', + 'ext': 'mp4', + 'title': 'The Athertons: Mountain Biking\'s Fastest Family', + 'description': 'md5:75a81937fcd8b989eec6083a709cd837', + 'thumbnail': 'https://us1-prod-images.disco-api.com/2021/03/04/eb9e3026-4849-3001-8281-9356466f0557.png', + 'series': 'gcn', + 'creator': 'Gcn', + 'upload_date': '20210309', + 'timestamp': 1615248000, + 'duration': 2531.0, + 'tags': [], + }, + 'skip': 'Subscription required', + 'params': {'skip_download': 'm3u8'}, + }] + + _PRODUCT = 'web' + _DISCO_API_PARAMS = { + 'disco_host': 'disco-api-prod.globalcyclingnetwork.com', + 'realm': 'gcn', + 'country': 'us', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': f'realm={realm}', + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:27.3.2', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) diff --git a/hypervideo_dl/extractor/dropbox.py b/hypervideo_dl/extractor/dropbox.py index 214b309..bc2efce 100644 --- a/hypervideo_dl/extractor/dropbox.py +++ b/hypervideo_dl/extractor/dropbox.py @@ -1,3 +1,4 @@ +import base64 import os.path import re @@ -5,14 +6,13 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, - traverse_obj, - try_get, + update_url_query, url_basename, ) class DropboxIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/sh?/(?P<id>[a-zA-Z0-9]{15})/.*' + _VALID_URL = 
r'https?://(?:www\.)?dropbox\.com/(?:(?:e/)?scl/fi|sh?)/(?P<id>\w+)' _TESTS = [ { 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0', @@ -22,7 +22,16 @@ class DropboxIE(InfoExtractor): 'title': 'youtube-dl test video \'ä"BaW_jenozKc' } }, { - 'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v', + 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh', + 'only_matching': True, + }, { + 'url': 'https://www.dropbox.com/sh/2mgpiuq7kv8nqdf/AABy-fW4dkydT4GmWi2mdOUDa?dl=0&preview=Drone+Shot.mp4', + 'only_matching': True, + }, { + 'url': 'https://www.dropbox.com/scl/fi/r2kd2skcy5ylbbta5y1pz/DJI_0003.MP4?dl=0&rlkey=wcdgqangn7t3lnmmv6li9mu9h', + 'only_matching': True, + }, { + 'url': 'https://www.dropbox.com/e/scl/fi/r2kd2skcy5ylbbta5y1pz/DJI_0003.MP4?dl=0&rlkey=wcdgqangn7t3lnmmv6li9mu9h', 'only_matching': True, }, ] @@ -53,16 +62,25 @@ class DropboxIE(InfoExtractor): else: raise ExtractorError('Password protected video, use --video-password <password>', expected=True) - info_json = self._search_json(r'InitReact\.mountComponent\(.*?,', webpage, 'mountComponent', video_id, - contains_pattern=r'{.+?"preview".+?}', end_pattern=r'\)')['props'] - transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False) - formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id) + formats, subtitles, has_anonymous_download = [], {}, False + for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)): + decoded = base64.b64decode(encoded).decode('utf-8', 'ignore') + transcode_url = self._search_regex( + r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded, 'transcode url', default=None) + if not transcode_url: + continue + formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id, 'mp4') + has_anonymous_download = self._search_regex(r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False) + break # downloads enabled we can get the original file - if 'anonymous' in (try_get(info_json, lambda x: x['sharePermission']['canDownloadRoles']) or []): - video_url = re.sub(r'[?&]dl=0', '', url) - video_url += ('?' if '?' 
not in video_url else '&') + 'dl=1' - formats.append({'url': video_url, 'format_id': 'original', 'format_note': 'Original', 'quality': 1}) + if has_anonymous_download: + formats.append({ + 'url': update_url_query(url, {'dl': '1'}), + 'format_id': 'original', + 'format_note': 'Original', + 'quality': 1 + }) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dropout.py b/hypervideo_dl/extractor/dropout.py index e280b1c..80ae6c1 100644 --- a/hypervideo_dl/extractor/dropout.py +++ b/hypervideo_dl/extractor/dropout.py @@ -1,13 +1,17 @@ +import functools + from .common import InfoExtractor from .vimeo import VHXEmbedIE from ..utils import ( ExtractorError, + OnDemandPagedList, clean_html, + extract_attributes, get_element_by_class, get_element_by_id, - get_elements_by_class, + get_elements_html_by_class, int_or_none, - join_nonempty, + traverse_obj, unified_strdate, urlencode_postdata, ) @@ -162,12 +166,13 @@ class DropoutIE(InfoExtractor): class DropoutSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P<id>[^\/$&?#]+)(?:/?$|/season:[0-9]+/?$)' + _PAGE_SIZE = 24 + _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?P<id>[^\/$&?#]+)(?:/?$|/season:(?P<season>[0-9]+)/?$)' _TESTS = [ { 'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1', 'note': 'Multi-season series with the season in the url', - 'playlist_count': 17, + 'playlist_count': 24, 'info_dict': { 'id': 'dimension-20-fantasy-high-season-1', 'title': 'Dimension 20 Fantasy High - Season 1' @@ -176,7 +181,7 @@ class DropoutSeasonIE(InfoExtractor): { 'url': 'https://www.dropout.tv/dimension-20-fantasy-high', 'note': 'Multi-season series with the season not in the url', - 'playlist_count': 17, + 'playlist_count': 24, 'info_dict': { 'id': 'dimension-20-fantasy-high-season-1', 'title': 'Dimension 20 Fantasy High - Season 1' @@ -190,29 +195,30 @@ class DropoutSeasonIE(InfoExtractor): 'id': 'dimension-20-shriek-week-season-1', 'title': 'Dimension 20 Shriek Week - Season 1' } + }, + { + 'url': 'https://www.dropout.tv/breaking-news-no-laugh-newsroom/season:3', + 'note': 'Multi-season series with season in the url that requires pagination', + 'playlist_count': 25, + 'info_dict': { + 'id': 'breaking-news-no-laugh-newsroom-season-3', + 'title': 'Breaking News No Laugh Newsroom - Season 3' + } } ] + def _fetch_page(self, url, season_id, page): + page += 1 + webpage = self._download_webpage( + f'{url}?page={page}', season_id, note=f'Downloading page {page}', expected_status={400}) + yield from [self.url_result(item_url, DropoutIE) for item_url in traverse_obj( + get_elements_html_by_class('browse-item-link', webpage), (..., {extract_attributes}, 'href'))] + def _real_extract(self, url): season_id = self._match_id(url) + season_num = self._match_valid_url(url).group('season') or 1 season_title = season_id.replace('-', ' ').title() - webpage = self._download_webpage(url, season_id) - - entries = [ - self.url_result( - url=self._search_regex(r'<a href=["\'](.+?)["\'] class=["\']browse-item-link["\']', - item, 'item_url'), - ie=DropoutIE.ie_key() - ) for item in get_elements_by_class('js-collection-item', webpage) - ] - - seasons = (get_element_by_class('select-dropdown-wrapper', webpage) or '').strip().replace('\n', '') - current_season = self._search_regex(r'<option[^>]+selected>([^<]+)</option>', - seasons, 'current_season', default='').strip() - return { - '_type': 'playlist', - 'id': join_nonempty(season_id, current_season.lower().replace(' ', '-')), - 'title': join_nonempty(season_title, current_season, 
delim=' - '), - 'entries': entries - } + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, url, season_id), self._PAGE_SIZE), + f'{season_id}-season-{season_num}', f'{season_title} - Season {season_num}') diff --git a/hypervideo_dl/extractor/drtv.py b/hypervideo_dl/extractor/drtv.py index 128f439..6c381aa 100644 --- a/hypervideo_dl/extractor/drtv.py +++ b/hypervideo_dl/extractor/drtv.py @@ -2,28 +2,29 @@ import binascii import hashlib import re - from .common import InfoExtractor from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, - int_or_none, float_or_none, + int_or_none, mimetype2ext, str_or_none, - try_get, + traverse_obj, unified_timestamp, update_url_query, url_or_none, ) +SERIES_API = 'https://production-cdn.dr-massive.com/api/page?device=web_browser&item_detail_expand=all&lang=da&max_list_prefetch=3&path=%s' + class DRTVIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?:radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?P<radio>radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*| (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P<id>[\da-z_-]+) @@ -78,7 +79,7 @@ class DRTVIE(InfoExtractor): 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', 'timestamp': 1546628400, 'upload_date': '20190104', - 'duration': 3504.618, + 'duration': 3504.619, 'formats': 'mincount:20', 'release_year': 2017, 'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35', @@ -99,14 +100,16 @@ class DRTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Bonderøven 2019 (1:8)', 'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd', - 'timestamp': 1603188600, - 'upload_date': '20201020', + 'timestamp': 1654856100, + 'upload_date': '20220610', 'duration': 2576.6, 'season': 'Bonderøven 2019', 'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5', 'release_year': 2019, 'season_number': 2019, - 'series': 'Frank & Kastaniegaarden' + 'series': 'Frank & Kastaniegaarden', + 'episode_number': 1, + 'episode': 'Episode 1', }, 'params': { 'skip_download': True, @@ -138,16 +141,32 @@ class DRTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'this video has been removed', + }, { + 'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/regionale-nyheder-2023-03-14-10-30-9', + 'info_dict': { + 'ext': 'mp4', + 'id': '14802310112', + 'timestamp': 1678786200, + 'duration': 120.043, + 'season_id': 'urn:dr:mu:bundle:63a4f7c87140143504b6710f', + 'series': 'P4 København regionale nyheder', + 'upload_date': '20230314', + 'release_year': 0, + 'description': 'Hør seneste regionale nyheder fra P4 København.', + 'season': 'Regionale nyheder', + 'title': 'Regionale nyheder', + }, }] def _real_extract(self, url): - video_id = self._match_id(url) + raw_video_id, is_radio_url = self._match_valid_url(url).group('id', 'radio') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, raw_video_id) if '>Programmet er ikke længere tilgængeligt' in webpage: raise ExtractorError( - 'Video %s is not available' % video_id, expected=True) + 'Video %s is not available' % raw_video_id, expected=True) video_id = self._search_regex( (r'data-(?:material-identifier|episode-slug)="([^"]+)"', @@ -168,20 +187,27 @@ class DRTVIE(InfoExtractor): programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id) else: programcard_url = _PROGRAMCARD_BASE - page = self._parse_json( - self._search_regex( - 
r'data\s*=\s*({.+?})\s*(?:;|</script)', webpage, - 'data'), '1')['cache']['page'] - page = page[list(page.keys())[0]] - item = try_get( - page, (lambda x: x['item'], lambda x: x['entries'][0]['item']), - dict) - video_id = item['customId'].split(':')[-1] + if is_radio_url: + video_id = self._search_nextjs_data( + webpage, raw_video_id)['props']['pageProps']['episode']['productionNumber'] + else: + json_data = self._search_json( + r'window\.__data\s*=', webpage, 'data', raw_video_id) + video_id = traverse_obj(json_data, ( + 'cache', 'page', ..., (None, ('entries', 0)), 'item', 'customId', + {lambda x: x.split(':')[-1]}), get_all=False) + if not video_id: + raise ExtractorError('Unable to extract video id') query['productionnumber'] = video_id data = self._download_json( programcard_url, video_id, 'Downloading video JSON', query=query) + supplementary_data = {} + if re.search(r'_\d+$', raw_video_id): + supplementary_data = self._download_json( + SERIES_API % f'/episode/{raw_video_id}', raw_video_id, fatal=False) or {} + title = str_or_none(data.get('Title')) or re.sub( r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '', self._og_search_title(webpage)) @@ -262,10 +288,11 @@ class DRTVIE(InfoExtractor): f['vcodec'] = 'none' formats.extend(f4m_formats) elif target == 'HLS': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( uri, video_id, 'mp4', entry_protocol='m3u8_native', - quality=preference, m3u8_id=format_id, - fatal=False)) + quality=preference, m3u8_id=format_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: bitrate = link.get('Bitrate') if bitrate: @@ -313,8 +340,8 @@ class DRTVIE(InfoExtractor): 'season': str_or_none(data.get('SeasonTitle')), 'season_number': int_or_none(data.get('SeasonNumber')), 'season_id': str_or_none(data.get('SeasonUrn')), - 'episode': str_or_none(data.get('EpisodeTitle')), - 'episode_number': int_or_none(data.get('EpisodeNumber')), + 'episode': traverse_obj(supplementary_data, ('entries', 0, 'item', 'contextualTitle')) or str_or_none(data.get('EpisodeTitle')), + 'episode_number': traverse_obj(supplementary_data, ('entries', 0, 'item', 'episodeNumber')) or int_or_none(data.get('EpisodeNumber')), 'release_year': int_or_none(data.get('ProductionYear')), } @@ -372,3 +399,92 @@ class DRTVLiveIE(InfoExtractor): 'formats': formats, 'is_live': True, } + + +class DRTVSeasonIE(InfoExtractor): + IE_NAME = 'drtv:season' + _VALID_URL = r'https?://(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/saeson/(?P<display_id>[\w-]+)_(?P<id>\d+)' + _GEO_COUNTRIES = ['DK'] + _TESTS = [{ + 'url': 'https://www.dr.dk/drtv/saeson/frank-and-kastaniegaarden_9008', + 'info_dict': { + 'id': '9008', + 'display_id': 'frank-and-kastaniegaarden', + 'title': 'Frank & Kastaniegaarden', + 'series': 'Frank & Kastaniegaarden', + }, + 'playlist_mincount': 8 + }, { + 'url': 'https://www.dr.dk/drtv/saeson/frank-and-kastaniegaarden_8761', + 'info_dict': { + 'id': '8761', + 'display_id': 'frank-and-kastaniegaarden', + 'title': 'Frank & Kastaniegaarden', + 'series': 'Frank & Kastaniegaarden', + }, + 'playlist_mincount': 19 + }] + + def _real_extract(self, url): + display_id, season_id = self._match_valid_url(url).group('display_id', 'id') + data = self._download_json(SERIES_API % f'/saeson/{display_id}_{season_id}', display_id) + + entries = [{ + '_type': 'url', + 'url': f'https://www.dr.dk/drtv{episode["path"]}', + 'ie_key': DRTVIE.ie_key(), + 'title': episode.get('title'), + 'episode': episode.get('episodeName'), + 'description': 
episode.get('shortDescription'), + 'series': traverse_obj(data, ('entries', 0, 'item', 'title')), + 'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber')), + 'episode_number': episode.get('episodeNumber'), + } for episode in traverse_obj(data, ('entries', 0, 'item', 'episodes', 'items'))] + + return { + '_type': 'playlist', + 'id': season_id, + 'display_id': display_id, + 'title': traverse_obj(data, ('entries', 0, 'item', 'title')), + 'series': traverse_obj(data, ('entries', 0, 'item', 'title')), + 'entries': entries, + 'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber')) + } + + +class DRTVSeriesIE(InfoExtractor): + IE_NAME = 'drtv:series' + _VALID_URL = r'https?://(?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/serie/(?P<display_id>[\w-]+)_(?P<id>\d+)' + _GEO_COUNTRIES = ['DK'] + _TESTS = [{ + 'url': 'https://www.dr.dk/drtv/serie/frank-and-kastaniegaarden_6954', + 'info_dict': { + 'id': '6954', + 'display_id': 'frank-and-kastaniegaarden', + 'title': 'Frank & Kastaniegaarden', + 'series': 'Frank & Kastaniegaarden', + }, + 'playlist_mincount': 15 + }] + + def _real_extract(self, url): + display_id, series_id = self._match_valid_url(url).group('display_id', 'id') + data = self._download_json(SERIES_API % f'/serie/{display_id}_{series_id}', display_id) + + entries = [{ + '_type': 'url', + 'url': f'https://www.dr.dk/drtv{season.get("path")}', + 'ie_key': DRTVSeasonIE.ie_key(), + 'title': season.get('title'), + 'series': traverse_obj(data, ('entries', 0, 'item', 'title')), + 'season_number': traverse_obj(data, ('entries', 0, 'item', 'seasonNumber')) + } for season in traverse_obj(data, ('entries', 0, 'item', 'show', 'seasons', 'items'))] + + return { + '_type': 'playlist', + 'id': series_id, + 'display_id': display_id, + 'title': traverse_obj(data, ('entries', 0, 'item', 'title')), + 'series': traverse_obj(data, ('entries', 0, 'item', 'title')), + 'entries': entries + } diff --git a/hypervideo_dl/extractor/dumpert.py b/hypervideo_dl/extractor/dumpert.py index 010c2d0..0cf8426 100644 --- a/hypervideo_dl/extractor/dumpert.py +++ b/hypervideo_dl/extractor/dumpert.py @@ -1,12 +1,17 @@ from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, qualities, ) class DumpertIE(InfoExtractor): - _VALID_URL = r'(?P<protocol>https?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?:mediabase|embed|item)/(?P<id>[0-9]+[/_][0-9a-zA-Z]+)' + _VALID_URL = r'''(?x) + (?P<protocol>https?)://(?:(?:www|legacy)\.)?dumpert\.nl(?: + /(?:mediabase|embed|item)/| + (?:/toppers|/latest|/?)\?selectedId= + )(?P<id>[0-9]+[/_][0-9a-zA-Z]+)''' _TESTS = [{ 'url': 'https://www.dumpert.nl/item/6646981_951bc60f', 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', @@ -16,6 +21,9 @@ class DumpertIE(InfoExtractor): 'title': 'Ik heb nieuws voor je', 'description': 'Niet schrikken hoor', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 9, + 'view_count': int, + 'like_count': int, } }, { 'url': 'https://www.dumpert.nl/embed/6675421_dc440fe7', @@ -26,6 +34,28 @@ class DumpertIE(InfoExtractor): }, { 'url': 'http://legacy.dumpert.nl/embed/6675421/dc440fe7', 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/item/100031688_b317a185', + 'info_dict': { + 'id': '100031688/b317a185', + 'ext': 'mp4', + 'title': 'Epic schijnbeweging', + 'description': '<p>Die zag je niet eh</p>', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'duration': 12, + 'view_count': int, + 'like_count': int, + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 
'https://www.dumpert.nl/toppers?selectedId=100031688_b317a185', + 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/latest?selectedId=100031688_b317a185', + 'only_matching': True, + }, { + 'url': 'https://www.dumpert.nl/?selectedId=100031688_b317a185', + 'only_matching': True, }] def _real_extract(self, url): @@ -36,18 +66,23 @@ class DumpertIE(InfoExtractor): title = item['title'] media = next(m for m in item['media'] if m.get('mediatype') == 'VIDEO') - quality = qualities(['flv', 'mobile', 'tablet', '720p']) + quality = qualities(['flv', 'mobile', 'tablet', '720p', '1080p']) formats = [] for variant in media.get('variants', []): uri = variant.get('uri') if not uri: continue version = variant.get('version') - formats.append({ - 'url': uri, - 'format_id': version, - 'quality': quality(version), - }) + preference = quality(version) + if determine_ext(uri) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', m3u8_id=version, quality=preference)) + else: + formats.append({ + 'url': uri, + 'format_id': version, + 'quality': preference, + }) thumbnails = [] stills = item.get('stills') or {} diff --git a/hypervideo_dl/extractor/eagleplatform.py b/hypervideo_dl/extractor/eagleplatform.py index 9ebd24d..739d179 100644 --- a/hypervideo_dl/extractor/eagleplatform.py +++ b/hypervideo_dl/extractor/eagleplatform.py @@ -2,7 +2,7 @@ import functools import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -111,8 +111,8 @@ class EaglePlatformIE(InfoExtractor): response = super(EaglePlatformIE, self)._download_json( url_or_request, video_id, *args, **kwargs) except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError): - response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) + if isinstance(ee.cause, HTTPError): + response = self._parse_json(ee.cause.response.read().decode('utf-8'), video_id) self._handle_error(response) raise return response diff --git a/hypervideo_dl/extractor/ebay.py b/hypervideo_dl/extractor/ebay.py new file mode 100644 index 0000000..d0eb9fc --- /dev/null +++ b/hypervideo_dl/extractor/ebay.py @@ -0,0 +1,36 @@ +from .common import InfoExtractor +from ..utils import remove_end + + +class EbayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ebay\.com/itm/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.ebay.com/itm/194509326719', + 'info_dict': { + 'id': '194509326719', + 'ext': 'mp4', + 'title': 'WiFi internal antenna adhesive for wifi 2.4GHz wifi 5 wifi 6 wifi 6E full bands', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_json = self._search_json(r'"video":', webpage, 'video json', video_id) + + formats = [] + for key, url in video_json['playlistMap'].items(): + if key == 'HLS': + formats.extend(self._extract_m3u8_formats(url, video_id, fatal=False)) + elif key == 'DASH': + formats.extend(self._extract_mpd_formats(url, video_id, fatal=False)) + else: + self.report_warning(f'Unsupported format {key}', video_id) + + return { + 'id': video_id, + 'title': remove_end(self._html_extract_title(webpage), ' | eBay'), + 'formats': formats + } diff --git a/hypervideo_dl/extractor/eitb.py b/hypervideo_dl/extractor/eitb.py index bd027da..66afbb6 100644 --- a/hypervideo_dl/extractor/eitb.py +++ b/hypervideo_dl/extractor/eitb.py @@ -1,10 +1,6 @@ from .common import 
InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - parse_iso8601, - sanitized_Request, -) +from ..networking import Request +from ..utils import float_or_none, int_or_none, parse_iso8601 class EitbIE(InfoExtractor): @@ -54,7 +50,7 @@ class EitbIE(InfoExtractor): hls_url = media.get('HLS_SURL') if hls_url: - request = sanitized_Request( + request = Request( 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', headers={'Referer': url}) token_data = self._download_json( diff --git a/hypervideo_dl/extractor/elevensports.py b/hypervideo_dl/extractor/elevensports.py new file mode 100644 index 0000000..99c52b3 --- /dev/null +++ b/hypervideo_dl/extractor/elevensports.py @@ -0,0 +1,59 @@ +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class ElevenSportsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?elevensports\.com/view/event/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://elevensports.com/view/event/clf46yr3kenn80jgrqsjmwefk', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': 'clf46yr3kenn80jgrqsjmwefk', + 'title': 'Cleveland SC vs Lionsbridge FC', + 'ext': 'mp4', + 'description': 'md5:03b5238d6549f4ea1fddadf69b5e0b58', + 'upload_date': '20230323', + 'timestamp': 1679612400, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://elevensports.com/view/event/clhpyd53b06160jez74qhgkmf', + 'md5': 'c0958d9ff90e4503a75544358758921d', + 'info_dict': { + 'id': 'clhpyd53b06160jez74qhgkmf', + 'title': 'AJNLF vs ARRAF', + 'ext': 'mp4', + 'description': 'md5:c8c5e75c78f37c6d15cd6c475e43a8c1', + 'upload_date': '20230521', + 'timestamp': 1684684800, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'params': {'skip_download': 'm3u8'} + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + event_id = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['event']['mclsEventId'] + event_data = self._download_json( + f'https://mcls-api.mycujoo.tv/bff/events/v1beta1/{event_id}', video_id, + headers={'Authorization': 'Bearer FBVKACGN37JQC5SFA0OVK8KKSIOP153G'}) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + event_data['streams'][0]['full_url'], video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(event_data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'timestamp': ('start_time', {parse_iso8601}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + }), + } diff --git a/hypervideo_dl/extractor/embedly.py b/hypervideo_dl/extractor/embedly.py index 483d018..458aaa0 100644 --- a/hypervideo_dl/extractor/embedly.py +++ b/hypervideo_dl/extractor/embedly.py @@ -1,24 +1,109 @@ import re import urllib.parse + from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from .youtube import YoutubeTabIE +from ..utils import parse_qs, smuggle_url, traverse_obj class EmbedlyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?url=(?P<id>[^#&]+)' + _VALID_URL = r'https?://(?:www|cdn\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?(?:src|url)=(?:[^#&]+)' _TESTS = [{ 'url': 
'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1', + 'info_dict': { + 'id': 'UUGLim4T2loE5rwCMdpCIPVg', + 'modified_date': '20221225', + 'view_count': int, + 'uploader_url': 'https://www.youtube.com/@TraciHinesMusic', + 'channel_id': 'UCGLim4T2loE5rwCMdpCIPVg', + 'uploader': 'TraciJHines', + 'channel_url': 'https://www.youtube.com/@TraciHinesMusic', + 'channel': 'TraciJHines', + 'availability': 'public', + 'uploader_id': 'UCGLim4T2loE5rwCMdpCIPVg', + 'description': '', + 'tags': [], + 'title': 'Uploads from TraciJHines', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1', + 'params': {'noplaylist': True}, + 'info_dict': { + 'id': 'SU4fj_aEMVw', + 'ext': 'mp4', + 'title': 'I\'m on Patreon!', + 'age_limit': 0, + 'categories': ['Entertainment'], + 'thumbnail': 'https://i.ytimg.com/vi_webp/SU4fj_aEMVw/maxresdefault.webp', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel': 'TraciJHines', + 'uploader_id': 'TraciJHines', + 'channel_url': 'https://www.youtube.com/channel/UCGLim4T2loE5rwCMdpCIPVg', + 'uploader_url': 'http://www.youtube.com/user/TraciJHines', + 'upload_date': '20150211', + 'duration': 282, + 'availability': 'public', + 'channel_follower_count': int, + 'tags': 'count:39', + 'view_count': int, + 'comment_count': int, + 'channel_id': 'UCGLim4T2loE5rwCMdpCIPVg', + 'like_count': int, + 'uploader': 'TraciJHines', + 'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364', + 'chapters': list, + + }, + }, { + 'url': 'https://cdn.embedly.com/widgets/media.html?src=https://player.vimeo.com/video/1234567?h=abcdefgh', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + 'url': 'http://www.permacultureetc.com/2022/12/comment-greffer-facilement-les-arbres-fruitiers.html', + 'info_dict': { + 'id': 'pfUK_ADTvgY', + 'ext': 'mp4', + 'title': 'Comment greffer facilement les arbres fruitiers ? 
(mois par mois)', + 'description': 'md5:d3a876995e522f138aabb48e040bfb4c', + 'view_count': int, + 'upload_date': '20221210', + 'comment_count': int, + 'live_status': 'not_live', + 'channel_id': 'UCsM4_jihNFYe4CtSkXvDR-Q', + 'channel_follower_count': int, + 'tags': ['permaculture', 'jardinage', 'dekarz', 'autonomie', 'greffe', 'fruitiers', 'arbres', 'jardin forêt', 'forêt comestible', 'damien'], + 'playable_in_embed': True, + 'uploader': 'permaculture agroécologie etc...', + 'channel': 'permaculture agroécologie etc...', + 'thumbnail': 'https://i.ytimg.com/vi/pfUK_ADTvgY/sddefault.jpg', + 'duration': 1526, + 'channel_url': 'https://www.youtube.com/channel/UCsM4_jihNFYe4CtSkXvDR-Q', + 'age_limit': 0, + 'uploader_id': 'permacultureetc', + 'like_count': int, + 'uploader_url': 'http://www.youtube.com/user/permacultureetc', + 'categories': ['Education'], + 'availability': 'public', + }, + }] + @classmethod - def _extract_embed_urls(cls, url, webpage): - # Bypass suitable check + def _extract_from_webpage(cls, url, webpage): + # Bypass "ie=cls" and suitable check for mobj in re.finditer(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage): - yield mobj.group('url') + yield cls.url_result(mobj.group('url')) for mobj in re.finditer(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage): - yield urllib.parse.unquote(mobj.group('url')) + yield cls.url_result(urllib.parse.unquote(mobj.group('url'))) def _real_extract(self, url): - return self.url_result(compat_urllib_parse_unquote(self._match_id(url))) + qs = parse_qs(url) + src = urllib.parse.unquote(traverse_obj(qs, ('url', 0)) or '') + if src and YoutubeTabIE.suitable(src): + return self.url_result(src, YoutubeTabIE) + return self.url_result(smuggle_url( + urllib.parse.unquote(traverse_obj(qs, ('src', 0), ('url', 0))), + {'http_headers': {'Referer': url}})) diff --git a/hypervideo_dl/extractor/eporner.py b/hypervideo_dl/extractor/eporner.py index a233797..aee2dee 100644 --- a/hypervideo_dl/extractor/eporner.py +++ b/hypervideo_dl/extractor/eporner.py @@ -52,7 +52,7 @@ class EpornerIE(InfoExtractor): webpage, urlh = self._download_webpage_handle(url, display_id) - video_id = self._match_id(urlh.geturl()) + video_id = self._match_id(urlh.url) hash = self._search_regex( r'hash\s*[:=]\s*["\']([\da-f]{32})', webpage, 'hash') diff --git a/hypervideo_dl/extractor/espn.py b/hypervideo_dl/extractor/espn.py index f4b0134..7ed824c 100644 --- a/hypervideo_dl/extractor/espn.py +++ b/hypervideo_dl/extractor/espn.py @@ -240,7 +240,7 @@ class FiveThirtyEightIE(InfoExtractor): class ESPNCricInfoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/video/[^#$&?/]+-(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/(?:cricket-)?videos?/[^#$&?/]+-(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135', 'info_dict': { @@ -252,6 +252,17 @@ class ESPNCricInfoIE(InfoExtractor): 'duration': 96, }, 'params': {'skip_download': True} + }, { + 'url': 'https://www.espncricinfo.com/cricket-videos/daryl-mitchell-mitchell-santner-is-one-of-the-best-white-ball-spinners-india-vs-new-zealand-1356225', + 'info_dict': { + 'id': '1356225', + 'ext': 'mp4', + 'description': '"Santner has done it for a long time for New Zealand - we\'re lucky to have him"', + 'upload_date': '20230128', + 'title': 'Mitchell: \'Santner is one of the best white-ball spinners at the moment\'', + 'duration': 87, + }, + 'params': {'skip_download': 'm3u8'}, }] 
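The relaxed ESPNcricinfo pattern in the hunk above accepts both the legacy /video/ paths and the newer /cricket-videos/ paths. A minimal standalone sanity check of that behaviour (a sketch only: the regex is copied from the hunk and the URLs are its two test URLs; everything else is illustrative):

    import re

    # _VALID_URL as changed in the espn.py hunk above
    _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/(?:cricket-)?videos?/[^#$&?/]+-(?P<id>\d+)'

    for test_url in (
        'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135',
        'https://www.espncricinfo.com/cricket-videos/daryl-mitchell-mitchell-santner-is-one-of-the-best-white-ball-spinners-india-vs-new-zealand-1356225',
    ):
        # re.match anchors at the start of the URL, roughly mirroring how
        # the extractor framework applies _VALID_URL
        print(re.match(_VALID_URL, test_url).group('id'))
    # -> 1289135, then 1356225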
def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/ettutv.py b/hypervideo_dl/extractor/ettutv.py new file mode 100644 index 0000000..133b525 --- /dev/null +++ b/hypervideo_dl/extractor/ettutv.py @@ -0,0 +1,60 @@ +from .common import InfoExtractor +from ..utils import bool_or_none, traverse_obj, unified_timestamp, url_or_none + + +class EttuTvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ettu\.tv/[^?#]+/playerpage/(?P<id>[0-9]+)' + + _TESTS = [{ + 'url': 'https://www.ettu.tv/en-int/playerpage/1573849', + 'md5': '5874b7639a2aa866d1f6c3a4037c7c09', + 'info_dict': { + 'id': '1573849', + 'title': 'Ni Xia Lian - Shao Jieni', + 'description': 'ITTF Europe Top 16 Cup', + 'timestamp': 1677348600, + 'upload_date': '20230225', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.ettu.tv/en-int/playerpage/1573753', + 'md5': '1fc094bf96cf2d5ec0f434d3a6dec9aa', + 'info_dict': { + 'id': '1573753', + 'title': 'Qiu Dang - Jorgic Darko', + 'description': 'ITTF Europe Top 16 Cup', + 'timestamp': 1677423600, + 'upload_date': '20230226', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_settings = self._download_json( + f'https://www.ettu.tv/api/v3/contents/{video_id}/player-settings', video_id, query={ + 'language': 'en', + 'showTitle': 'true', + 'device': 'desktop', + }) + + stream_response = self._download_json(player_settings['streamAccess'], video_id, data=b'') + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + stream_response['data']['stream'], video_id, 'mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(player_settings, { + 'title': 'title', + 'description': ('metaInformation', 'competition'), + 'thumbnail': ('image', {url_or_none}), + 'timestamp': ('date', {unified_timestamp}), + 'is_live': ('isLivestream', {bool_or_none}), + }) + } diff --git a/hypervideo_dl/extractor/europa.py b/hypervideo_dl/extractor/europa.py index c2b4937..f3da95f 100644 --- a/hypervideo_dl/extractor/europa.py +++ b/hypervideo_dl/extractor/europa.py @@ -3,8 +3,10 @@ from ..utils import ( int_or_none, orderedSet, parse_duration, + parse_iso8601, parse_qs, qualities, + traverse_obj, unified_strdate, xpath_text ) @@ -87,3 +89,85 @@ class EuropaIE(InfoExtractor): 'view_count': view_count, 'formats': formats } + + +class EuroParlWebstreamIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://multimedia\.europarl\.europa\.eu/[^/#?]+/ + (?:(?!video)[^/#?]+/[\w-]+_)(?P<id>[\w-]+) + ''' + _TESTS = [{ + 'url': 'https://multimedia.europarl.europa.eu/pl/webstreaming/plenary-session_20220914-0900-PLENARY', + 'info_dict': { + 'id': '62388b15-d85b-4add-99aa-ba12ccf64f0d', + 'ext': 'mp4', + 'title': 'Plenary session', + 'release_timestamp': 1663139069, + 'release_date': '20220914', + }, + 'params': { + 'skip_download': True, + } + }, { + # live webstream + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/euroscola_20221115-1000-SPECIAL-EUROSCOLA', + 'info_dict': { + 'ext': 'mp4', + 'id': '510eda7f-ba72-161b-7ee7-0e836cd2e715', + 'release_timestamp': 1668502800, + 'title': 'Euroscola 2022-11-15 19:21', + 'release_date': '20221115', + 'live_status': 'is_live', + }, + 'skip': 'not live anymore' + }, { + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-culture-and-education_20230301-1130-COMMITTEE-CULT', + 'info_dict': { + 'id': '7355662c-8eac-445e-4bb9-08db14b0ddd7', + 
'ext': 'mp4', + 'release_date': '20230301', + 'title': 'Committee on Culture and Education', + 'release_timestamp': 1677666641, + } + }, { + # live stream + 'url': 'https://multimedia.europarl.europa.eu/en/webstreaming/committee-on-environment-public-health-and-food-safety_20230524-0900-COMMITTEE-ENVI', + 'info_dict': { + 'id': 'e4255f56-10aa-4b3c-6530-08db56d5b0d9', + 'ext': 'mp4', + 'release_date': '20230524', + 'title': r're:Committee on Environment, Public Health and Food Safety \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}', + 'release_timestamp': 1684911541, + 'live_status': 'is_live', + }, + 'skip': 'Not live anymore' + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + webpage_nextjs = self._search_nextjs_data(webpage, display_id)['props']['pageProps'] + + json_info = self._download_json( + 'https://acs-api.europarl.connectedviews.eu/api/FullMeeting', display_id, + query={ + 'api-version': 1.0, + 'tenantId': 'bae646ca-1fc8-4363-80ba-2c04f06b4968', + 'externalReference': display_id + }) + + formats, subtitles = [], {} + for hls_url in traverse_obj(json_info, ((('meetingVideo'), ('meetingVideos', ...)), 'hlsUrl')): + fmt, subs = self._extract_m3u8_formats_and_subtitles(hls_url, display_id) + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': json_info['id'], + 'title': traverse_obj(webpage_nextjs, (('mediaItem', 'title'), ('title', )), get_all=False), + 'formats': formats, + 'subtitles': subtitles, + 'release_timestamp': parse_iso8601(json_info.get('startDateTime')), + 'is_live': traverse_obj(webpage_nextjs, ('mediaItem', 'mediaSubType')) == 'Live' + } diff --git a/hypervideo_dl/extractor/eurosport.py b/hypervideo_dl/extractor/eurosport.py index 654e112..6c426bb 100644 --- a/hypervideo_dl/extractor/eurosport.py +++ b/hypervideo_dl/extractor/eurosport.py @@ -3,7 +3,7 @@ from ..utils import traverse_obj class EurosportIE(InfoExtractor): - _VALID_URL = r'https?://www\.eurosport\.com/\w+/[\w-]+/\d+/[\w-]+_(?P<id>vid\d+)' + _VALID_URL = r'https?://www\.eurosport\.com/\w+/(?:[\w-]+/[\d-]+/)?[\w-]+_(?P<id>vid\d+)' _TESTS = [{ 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml', 'info_dict': { @@ -44,6 +44,32 @@ class EurosportIE(InfoExtractor): 'description': 'md5:32bbe3a773ac132c57fb1e8cca4b7c71', 'upload_date': '20220727', } + }, { + 'url': 'https://www.eurosport.com/football/champions-league/2022-2023/pep-guardiola-emotionally-destroyed-after-manchester-city-win-over-bayern-munich-in-champions-league_vid1896254/video.shtml', + 'info_dict': { + 'id': '3096477', + 'ext': 'mp4', + 'title': 'md5:82edc17370124c7a19b3cf518517583b', + 'duration': 84.0, + 'description': 'md5:b3f44ef7f5b5b95b24a273b163083feb', + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/04/12/3682873-74947393-2560-1440.jpg', + 'timestamp': 1681292028, + 'upload_date': '20230412', + 'display_id': 'vid1896254', + } + }, { + 'url': 'https://www.eurosport.com/football/last-year-s-semi-final-pain-was-still-there-pep-guardiola-after-man-city-reach-cl-final_vid1914115/video.shtml', + 'info_dict': { + 'id': '3149108', + 'ext': 'mp4', + 'title': '\'Last year\'s semi-final pain was still there\' - Pep Guardiola after Man City reach CL final', + 'description': 'md5:89ef142fe0170a66abab77fac2955d8e', + 'display_id': 'vid1914115', + 
'timestamp': 1684403618, + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2023/05/18/3707254-75435008-2560-1440.jpg', + 'duration': 105.0, + 'upload_date': '20230518', + } }] _TOKEN = None diff --git a/hypervideo_dl/extractor/extractors.py b/hypervideo_dl/extractor/extractors.py index 610e02f..baa69d2 100644 --- a/hypervideo_dl/extractor/extractors.py +++ b/hypervideo_dl/extractor/extractors.py @@ -1,10 +1,10 @@ import contextlib import os -from ..utils import load_plugins +from ..plugins import load_plugins # NB: Must be before other imports so that plugins can be correctly injected -_PLUGIN_CLASSES = load_plugins('extractor', 'IE', {}) +_PLUGIN_CLASSES = load_plugins('extractor', 'IE') _LAZY_LOADER = False if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): @@ -24,3 +24,5 @@ if not _LAZY_LOADER: globals().update(_PLUGIN_CLASSES) _ALL_CLASSES[:0] = _PLUGIN_CLASSES.values() + +from .common import _PLUGIN_OVERRIDES # noqa: F401 diff --git a/hypervideo_dl/extractor/facebook.py b/hypervideo_dl/extractor/facebook.py index a58d9c8..021c3cf 100644 --- a/hypervideo_dl/extractor/facebook.py +++ b/hypervideo_dl/extractor/facebook.py @@ -8,6 +8,8 @@ from ..compat import ( compat_str, compat_urllib_parse_unquote, ) +from ..networking import Request +from ..networking.exceptions import network_exceptions from ..utils import ( ExtractorError, clean_html, @@ -19,11 +21,10 @@ from ..utils import ( int_or_none, js_to_json, merge_dicts, - network_exceptions, parse_count, parse_qs, qualities, - sanitized_Request, + str_or_none, traverse_obj, try_get, url_or_none, @@ -90,16 +91,16 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif Nawab Butt', - 'description': 'Asif Nawab Butt', + 'title': 'Asif', + 'description': '', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', + 'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl', + 'duration': 131.03, + 'concurrent_view_count': int, }, - 'expected_warnings': [ - 'title' - ] }, { 'note': 'Video with DASH manifest', 'url': 'https://www.facebook.com/video.php?v=957955867617029', @@ -151,7 +152,7 @@ class FacebookIE(InfoExtractor): # have 1080P, but only up to 720p in swf params # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', - 'md5': '3f3798adb2b73423263e59376f1f5eb7', + 'md5': 'ca63897a90c9452efee5f8c40d080e25', 'info_dict': { 'id': '10155529876156509', 'ext': 'mp4', @@ -162,6 +163,9 @@ class FacebookIE(InfoExtractor): 'uploader': 'CNN', 'thumbnail': r're:^https?://.*', 'view_count': int, + 'uploader_id': '100059479812265', + 'concurrent_view_count': int, + 'duration': 44.478, }, }, { # bigPipe.onPageletArrive ... 
onPageletArrive pagelet_group_mall @@ -170,12 +174,16 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '1417995061575415', 'ext': 'mp4', - 'title': 'Ukrainian Scientists Worldwide | Довгоочікуване відео', + 'title': 'Довгоочікуване відео | By Yaroslav - Facebook', 'description': 'Довгоочікуване відео', - 'timestamp': 1486648771, + 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', - 'uploader_id': '100000948048708', + 'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl', + 'concurrent_view_count': int, + 'thumbnail': r're:^https?://.*', + 'view_count': int, + 'duration': 11736.446, }, 'params': { 'skip_download': True, @@ -192,9 +200,7 @@ class FacebookIE(InfoExtractor): 'uploader': 'La Guía Del Varón', 'thumbnail': r're:^https?://.*', }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Requires logging in', }, { # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', @@ -208,9 +214,7 @@ class FacebookIE(InfoExtractor): 'uploader': 'Elisabeth Ahtn', 'uploader_id': '100013949973717', }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Requires logging in', }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -252,7 +256,11 @@ class FacebookIE(InfoExtractor): 'timestamp': 1527084179, 'upload_date': '20180523', 'uploader': 'ESL One Dota 2', - 'uploader_id': '234218833769558', + 'uploader_id': '100066514874195', + 'duration': 4524.212, + 'view_count': int, + 'thumbnail': r're:^https?://.*', + 'concurrent_view_count': int, }, 'params': { 'skip_download': True, @@ -262,8 +270,17 @@ class FacebookIE(InfoExtractor): 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', 'info_dict': { 'id': '106560053808006', + 'ext': 'mp4', + 'title': 'Josef', + 'thumbnail': r're:^https?://.*', + 'concurrent_view_count': int, + 'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl', + 'timestamp': 1549275572, + 'duration': 3.413, + 'uploader': 'Josef Novak', + 'description': '', + 'upload_date': '20190204', }, - 'playlist_count': 2, }, { # data.video.story.attachments[].media 'url': 'https://www.facebook.com/watch/?v=647537299265662', @@ -276,6 +293,7 @@ class FacebookIE(InfoExtractor): 'id': '10157667649866271', }, 'playlist_count': 3, + 'skip': 'Requires logging in', }, { # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', @@ -319,7 +337,7 @@ class FacebookIE(InfoExtractor): } def _perform_login(self, username, password): - login_page_req = sanitized_Request(self._LOGIN_URL) + login_page_req = Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, note='Downloading login page', @@ -340,8 +358,8 @@ class FacebookIE(InfoExtractor): 'timezone': '-60', 'trynum': '1', } - request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') + request = Request(self._LOGIN_URL, urlencode_postdata(login_form)) + request.headers['Content-Type'] = 'application/x-www-form-urlencoded' try: login_results = self._download_webpage(request, None, note='Logging in', errnote='unable to fetch login page') @@ -367,8 +385,8 @@ class 
FacebookIE(InfoExtractor): 'h': h, 'name_action_selected': 'dont_save', } - check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) - check_req.add_header('Content-Type', 'application/x-www-form-urlencoded') + check_req = Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) + check_req.headers['Content-Type'] = 'application/x-www-form-urlencoded' check_response = self._download_webpage(check_req, None, note='Confirming login') if re.search(r'id="checkpointSubmitButton"', check_response) is not None: @@ -390,7 +408,10 @@ class FacebookIE(InfoExtractor): k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) - uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {} + uploader_data = ( + get_first(media, ('owner', {dict})) + or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name'])) + or get_first(post, ('node', 'actors', ..., {dict})) or {}) page_title = title or self._html_search_regex(( r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>', @@ -415,16 +436,17 @@ class FacebookIE(InfoExtractor): # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/ if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail): thumbnail = None - view_count = parse_count(self._search_regex( - r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count', - default=None)) info_dict = { 'description': description, 'uploader': uploader, 'uploader_id': uploader_data.get('id'), 'timestamp': timestamp, 'thumbnail': thumbnail, - 'view_count': view_count, + 'view_count': parse_count(self._search_regex( + (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)',), + webpage, 'view count', default=None)), + 'concurrent_view_count': get_first(post, ( + ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})), } info_json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -459,7 +481,8 @@ class FacebookIE(InfoExtractor): dash_manifest = video.get('dash_manifest') if dash_manifest: formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)))) + compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), + mpd_url=video.get('dash_manifest_url'))) def process_formats(info): # Downloads with browser's User-Agent are rate limited. 
Working around @@ -493,6 +516,13 @@ class FacebookIE(InfoExtractor): entries = [] def parse_graphql_video(video): + v_id = video.get('videoId') or video.get('id') or video_id + reel_info = traverse_obj( + video, ('creation_story', 'short_form_video_context', 'playback_video', {dict})) + if reel_info: + video = video['creation_story'] + video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) + video.update(reel_info) formats = [] q = qualities(['sd', 'hd']) for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), @@ -509,15 +539,15 @@ class FacebookIE(InfoExtractor): 'url': playable_url, }) extract_dash_manifest(video, formats) - v_id = video.get('videoId') or video.get('id') or video_id info = { 'id': v_id, 'formats': formats, 'thumbnail': traverse_obj( video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')), - 'uploader_id': try_get(video, lambda x: x['owner']['id']), - 'timestamp': int_or_none(video.get('publish_time')), - 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), + 'uploader_id': traverse_obj(video, ('owner', 'id', {str_or_none})), + 'timestamp': traverse_obj(video, 'publish_time', 'creation_time', expected_type=int_or_none), + 'duration': (float_or_none(video.get('playable_duration_in_ms'), 1000) + or float_or_none(video.get('length_in_second'))), } process_formats(info) description = try_get(video, lambda x: x['savable_description']['text']) @@ -778,18 +808,18 @@ class FacebookReelIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.facebook.com/reel/1195289147628387', - 'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831', + 'md5': 'f13dd37f2633595982db5ed8765474d3', 'info_dict': { 'id': '1195289147628387', 'ext': 'mp4', - 'title': 'md5:9f5b142921b2dc57004fa13f76005f87', - 'description': 'md5:24ea7ef062215d295bdde64e778f5474', - 'uploader': 'Beast Camp Training', - 'uploader_id': '1738535909799870', - 'duration': 9.536, - 'thumbnail': r're:^https?://.*', + 'title': 'md5:b05800b5b1ad56c0ca78bd3807b6a61e', + 'description': 'md5:22f03309b216ac84720183961441d8db', + 'uploader': 'md5:723e6cb3091241160f20b3c5dc282af1', + 'uploader_id': '100040874179269', + 'duration': 9.579, + 'timestamp': 1637502609, 'upload_date': '20211121', - 'timestamp': 1637502604, + 'thumbnail': r're:^https?://.*', } }] diff --git a/hypervideo_dl/extractor/fc2.py b/hypervideo_dl/extractor/fc2.py index dd5e088..ba19b6c 100644 --- a/hypervideo_dl/extractor/fc2.py +++ b/hypervideo_dl/extractor/fc2.py @@ -3,11 +3,11 @@ import re from .common import InfoExtractor from ..compat import compat_parse_qs from ..dependencies import websockets +from ..networking import Request from ..utils import ( ExtractorError, WebSocketsWrapper, js_to_json, - sanitized_Request, traverse_obj, update_url_query, urlencode_postdata, @@ -57,7 +57,7 @@ class FC2IE(InfoExtractor): } login_data = urlencode_postdata(login_form_strs) - request = sanitized_Request( + request = Request( 'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data) login_results = self._download_webpage(request, None, note='Logging in', errnote='Unable to log in') @@ -66,7 +66,7 @@ class FC2IE(InfoExtractor): return False # this is also needed - login_redir = sanitized_Request('http://id.fc2.com/?mode=redirect&login=done') + login_redir = Request('http://id.fc2.com/?mode=redirect&login=done') self._download_webpage( login_redir, None, note='Login redirect', errnote='Login redirect failed') diff --git a/hypervideo_dl/extractor/fifa.py b/hypervideo_dl/extractor/fifa.py 
index dc00edc..8b4db3a 100644 --- a/hypervideo_dl/extractor/fifa.py +++ b/hypervideo_dl/extractor/fifa.py @@ -17,8 +17,10 @@ class FifaIE(InfoExtractor): 'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b', 'ext': 'mp4', 'categories': ['FIFA Tournaments'], - 'thumbnail': 'https://digitalhub.fifa.com/transform/fa6f0b3e-a2e9-4cf7-9f32-53c57bcb7360/2006_Final_ITA_FRA', + 'thumbnail': 'https://digitalhub.fifa.com/transform/135e2656-3a51-407b-8810-6c34bec5b59b/FMR_2006_Italy_France_Final_Hero', 'duration': 8165, + 'release_timestamp': 1152403200, + 'release_date': '20060709', }, 'params': {'skip_download': 'm3u8'}, }, { @@ -54,7 +56,7 @@ class FifaIE(InfoExtractor): webpage = self._download_webpage(url, video_id) preconnect_link = self._search_regex( - r'<link[^>]+rel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link') + r'<link\b[^>]+\brel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link') video_details = self._download_json( f'{preconnect_link}/sections/videoDetails/{video_id}', video_id, 'Downloading Video Details', fatal=False) @@ -62,22 +64,9 @@ class FifaIE(InfoExtractor): preplay_parameters = self._download_json( f'{preconnect_link}/videoPlayerData/{video_id}', video_id, 'Downloading Preplay Parameters')['preplayParameters'] - cid = preplay_parameters['contentId'] content_data = self._download_json( - f'https://content.uplynk.com/preplay/{cid}/multiple.json', video_id, 'Downloading Content Data', query={ - 'v': preplay_parameters['preplayAPIVersion'], - 'tc': preplay_parameters['tokenCheckAlgorithmVersion'], - 'rn': preplay_parameters['randomNumber'], - 'exp': preplay_parameters['tokenExpirationDate'], - 'ct': preplay_parameters['contentType'], - 'cid': cid, - 'mbtracks': preplay_parameters['tracksAssetNumber'], - 'ad': preplay_parameters['adConfiguration'], - 'ad.preroll': int(preplay_parameters['adPreroll']), - 'ad.cmsid': preplay_parameters['adCMSSourceId'], - 'ad.vid': preplay_parameters['adSourceVideoID'], - 'sig': preplay_parameters['signature'], - }) + 'https://content.uplynk.com/preplay/{contentId}/multiple.json?{queryStr}&sig={signature}'.format(**preplay_parameters), + video_id, 'Downloading Content Data') formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id) diff --git a/hypervideo_dl/extractor/filmon.py b/hypervideo_dl/extractor/filmon.py index 9a93cb9..0cd18f4 100644 --- a/hypervideo_dl/extractor/filmon.py +++ b/hypervideo_dl/extractor/filmon.py @@ -1,8 +1,6 @@ from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( qualities, strip_or_none, @@ -40,8 +38,8 @@ class FilmOnIE(InfoExtractor): 'https://www.filmon.com/api/vod/movie?id=%s' % video_id, video_id)['response'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errmsg = self._parse_json(e.cause.read().decode(), video_id)['reason'] + if isinstance(e.cause, HTTPError): + errmsg = self._parse_json(e.cause.response.read().decode(), video_id)['reason'] raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) raise @@ -124,8 +122,8 @@ class FilmOnChannelIE(InfoExtractor): channel_data = self._download_json( 'http://www.filmon.com/api-v2/channel/' + channel_id, channel_id)['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errmsg = self._parse_json(e.cause.read().decode(), channel_id)['message'] + if isinstance(e.cause, 
HTTPError): + errmsg = self._parse_json(e.cause.response.read().decode(), channel_id)['message'] raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) raise diff --git a/hypervideo_dl/extractor/fox.py b/hypervideo_dl/extractor/fox.py index 15c0c48..e00e977 100644 --- a/hypervideo_dl/extractor/fox.py +++ b/hypervideo_dl/extractor/fox.py @@ -3,10 +3,10 @@ import uuid from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_str, compat_urllib_parse_unquote, ) +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -20,7 +20,7 @@ from ..utils import ( class FOXIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)' + _VALID_URL = r'https?://(?:www\.)?fox(?:sports)?\.com/(?:watch|replay)/(?P<id>[\da-fA-F]+)' _TESTS = [{ # clip 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', @@ -50,6 +50,10 @@ class FOXIE(InfoExtractor): # sports event, geo-restricted 'url': 'https://www.fox.com/watch/b057484dade738d1f373b3e46216fa2c/', 'only_matching': True, + }, { + # fox sports replay, geo-restricted + 'url': 'https://www.foxsports.com/replay/561f3e071347a24e5e877abc56b22e89', + 'only_matching': True, }] _GEO_BYPASS = False _HOME_PAGE_URL = 'https://www.fox.com/' @@ -68,9 +72,9 @@ class FOXIE(InfoExtractor): 'https://api3.fox.com/v2.0/' + path, video_id, data=data, headers=headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: entitlement_issues = self._parse_json( - e.cause.read().decode(), video_id)['entitlementIssues'] + e.cause.response.read().decode(), video_id)['entitlementIssues'] for e in entitlement_issues: if e.get('errorCode') == 1005: raise ExtractorError( @@ -123,8 +127,8 @@ class FOXIE(InfoExtractor): try: m3u8_url = self._download_json(release_url, video_id)['playURL'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error = self._parse_json(e.cause.read().decode(), video_id) + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + error = self._parse_json(e.cause.response.read().decode(), video_id) if error.get('exception') == 'GeoLocationBlocked': self.raise_geo_restricted(countries=['US']) raise ExtractorError(error['description'], expected=True) diff --git a/hypervideo_dl/extractor/foxnews.py b/hypervideo_dl/extractor/foxnews.py index 52172aa..6aa6361 100644 --- a/hypervideo_dl/extractor/foxnews.py +++ b/hypervideo_dl/extractor/foxnews.py @@ -7,9 +7,38 @@ from .common import InfoExtractor class FoxNewsIE(AMPIE): IE_NAME = 'foxnews' IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?P<host>video\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' + _VALID_URL = r'https?://video\.(?:insider\.)?fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' _TESTS = [ { + 'url': 'https://video.foxnews.com/v/6320653836112', + 'info_dict': { + 'id': '6320653836112', + 'ext': 'mp4', + 'title': 'Tucker Carlson joins \'Gutfeld!\' to discuss his new documentary', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 404, + 'upload_date': '20230217', + 'description': 'md5:858a8a36f59e9ca897d758855bcdfa02', + 'timestamp': 1676611344.0, + }, + 'params': {'skip_download': 'm3u8'}, + }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 
'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'info_dict': { + 'id': '5099377331001', + 'ext': 'mp4', + 'title': '82416_censoring', + 'description': '82416_censoring', + 'upload_date': '20160826', + 'timestamp': 1472169708.0, + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 521, + }, + 'params': {'skip_download': 'm3u8'}, + }, + { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', 'md5': '32aaded6ba3ef0d1c04e238d01031e5e', 'info_dict': { @@ -22,6 +51,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20110503', 'thumbnail': r're:^https?://.*\.jpg$', }, + 'skip': '404 page', }, { 'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips', @@ -36,10 +66,7 @@ class FoxNewsIE(AMPIE): 'upload_date': '20141204', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'skip': 'm3u8 HTTP error 400 in web browser', }, { 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', @@ -49,11 +76,6 @@ class FoxNewsIE(AMPIE): 'url': 'http://video.foxbusiness.com/v/4442309889001', 'only_matching': True, }, - { - # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words - 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', - 'only_matching': True, - }, ] @classmethod @@ -67,10 +89,10 @@ class FoxNewsIE(AMPIE): yield f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' def _real_extract(self, url): - host, video_id = self._match_valid_url(url).groups() + video_id = self._match_id(url) info = self._extract_feed_info( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + f'https://api.foxnews.com/v3/video-player/{video_id}?callback=uid_{video_id}') info['id'] = video_id return info @@ -78,6 +100,19 @@ class FoxNewsIE(AMPIE): class FoxNewsVideoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxnews\.com/video/(?P<id>\d+)' _TESTS = [{ + 'url': 'https://www.foxnews.com/video/6328632286112', + 'info_dict': { + 'id': '6328632286112', + 'ext': 'mp4', + 'title': 'Review: 2023 Toyota Prius Prime', + 'duration': 155, + 'thumbnail': r're:^https://.+\.jpg$', + 'timestamp': 1685720177.0, + 'upload_date': '20230602', + 'description': 'md5:b69aafb125b41c1402e9744f53d6edc4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'https://www.foxnews.com/video/6313058664112', 'info_dict': { 'id': '6313058664112', @@ -89,8 +124,7 @@ class FoxNewsVideoIE(InfoExtractor): 'title': 'Gutfeld! 
- Thursday, September 29', 'timestamp': 1664527538, }, - 'expected_warnings': ['Ignoring subtitle tracks'], - 'params': {'skip_download': 'm3u8'}, + 'skip': '404 page', }] def _real_extract(self, url): @@ -104,19 +138,22 @@ class FoxNewsArticleIE(InfoExtractor): _TESTS = [{ # data-video-id - 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', - 'md5': '83d44e1aff1433e7a29a7b537d1700b5', + 'url': 'https://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', + 'md5': 'd2dd6ce809cedeefa96460e964821437', 'info_dict': { 'id': '5116295019001', 'ext': 'mp4', 'title': 'Trump and Clinton asked to defend positions on Iraq War', - 'description': 'Veterans react on \'The Kelly File\'', + 'description': 'Veterans and Fox News host Dana Perino react on \'The Kelly File\' to NBC\'s presidential forum', 'timestamp': 1473301045, 'upload_date': '20160908', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 426, }, + 'params': {'skip_download': 'm3u8'}, }, { # iframe embed - 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', + 'url': 'https://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true', 'info_dict': { 'id': '5748266721001', 'ext': 'flv', @@ -127,9 +164,7 @@ class FoxNewsArticleIE(InfoExtractor): 'timestamp': 1520594670, 'upload_date': '20180309', }, - 'params': { - 'skip_download': True, - }, + 'skip': '404 page', }, { 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', 'only_matching': True, diff --git a/hypervideo_dl/extractor/foxsports.py b/hypervideo_dl/extractor/foxsports.py index f9d7fe5..8e89ccf 100644 --- a/hypervideo_dl/extractor/foxsports.py +++ b/hypervideo_dl/extractor/foxsports.py @@ -1,31 +1,52 @@ from .common import InfoExtractor +from .uplynk import UplynkPreplayIE +from ..networking import HEADRequest +from ..utils import float_or_none, make_archive_id, smuggle_url class FoxSportsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*video/(?P<id>\d+)' - - _TEST = { - 'url': 'http://www.foxsports.com/tennessee/video/432609859715', - 'md5': 'b49050e955bebe32c301972e4012ac17', + _VALID_URL = r'https?://(?:www\.)?foxsports\.com/watch/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.foxsports.com/watch/play-612168c6700004b', 'info_dict': { - 'id': '432609859715', + 'id': 'b72f5bd8658140baa5791bb676433733', 'ext': 'mp4', - 'title': 'Courtney Lee on going up 2-0 in series vs. 
Blazers', - 'description': 'Courtney Lee talks about Memphis being focused.', - # TODO: fix timestamp - 'upload_date': '19700101', # '20150423', - # 'timestamp': 1429761109, - 'uploader': 'NEWA-FNG-FOXSPORTS', + 'display_id': 'play-612168c6700004b', + 'title': 'md5:e0c4ecac3a1f25295b4fae22fb5c126a', + 'description': 'md5:371bc43609708ae2b9e1a939229762af', + 'uploader_id': '06b4a36349624051a9ba52ac3a91d268', + 'upload_date': '20221205', + 'timestamp': 1670262586, + 'duration': 31.7317, + 'thumbnail': r're:^https?://.*\.jpg$', + 'extra_param_to_segment_url': str, }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, - 'add_ie': ['ThePlatform'], - } + }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_ld = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={}) + data = self._download_json( + f'https://api3.fox.com/v2.0/vodplayer/sportsclip/{video_id}', + video_id, note='Downloading API JSON', headers={ + 'x-api-key': 'cf289e299efdfa39fb6316f259d1de93', + }) + preplay_url = self._request_webpage( + HEADRequest(data['url']), video_id, 'Fetching preplay URL').url - return self.url_result( - 'https://feed.theplatform.com/f/BKQ29B/foxsports-all?byId=' + video_id, 'ThePlatformFeed') + return { + '_type': 'url_transparent', + 'ie_key': UplynkPreplayIE.ie_key(), + 'url': smuggle_url(preplay_url, {'Origin': 'https://www.foxsports.com'}), + 'display_id': video_id, + 'title': data.get('name') or json_ld.get('title'), + 'description': data.get('description') or json_ld.get('description'), + 'duration': float_or_none(data.get('durationInSeconds')), + 'timestamp': json_ld.get('timestamp'), + 'thumbnails': json_ld.get('thumbnails'), + '_old_archive_ids': [make_archive_id(self, video_id)], + } diff --git a/hypervideo_dl/extractor/freesound.py b/hypervideo_dl/extractor/freesound.py index 8b5f227..fcde044 100644 --- a/hypervideo_dl/extractor/freesound.py +++ b/hypervideo_dl/extractor/freesound.py @@ -52,6 +52,7 @@ class FreesoundIE(InfoExtractor): tags_str = get_element_by_class('tags', webpage) tags = re.findall(r'<a[^>]+>([^<]+)', tags_str) if tags_str else None + audio_url = re.sub(r'^https?://freesound\.org(https?://)', r'\1', audio_url) audio_urls = [audio_url] LQ_FORMAT = '-lq.mp3' diff --git a/hypervideo_dl/extractor/fujitv.py b/hypervideo_dl/extractor/fujitv.py index 668bb27..77e826e 100644 --- a/hypervideo_dl/extractor/fujitv.py +++ b/hypervideo_dl/extractor/fujitv.py @@ -1,5 +1,5 @@ -from ..utils import HEADRequest from .common import InfoExtractor +from ..networking import HEADRequest class FujiTVFODPlus7IE(InfoExtractor): diff --git a/hypervideo_dl/extractor/funimation.py b/hypervideo_dl/extractor/funimation.py index 18363c1..41de85c 100644 --- a/hypervideo_dl/extractor/funimation.py +++ b/hypervideo_dl/extractor/funimation.py @@ -3,7 +3,7 @@ import re import string from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, @@ -46,8 +46,8 @@ class FunimationBaseIE(InfoExtractor): })) FunimationBaseIE._TOKEN = data['token'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read().decode(), None)['error'] + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + error = self._parse_json(e.cause.response.read().decode(), None)['error'] raise ExtractorError(error, 
expected=True) raise @@ -210,7 +210,7 @@ class FunimationIE(FunimationBaseIE): page = self._download_json( 'https://www.funimation.com/api/showexperience/%s/' % experience_id, display_id, headers=headers, expected_status=403, query={ - 'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]), + 'pinst_id': ''.join(random.choices(string.digits + string.ascii_letters, k=8)), }, note=f'Downloading {format_name} JSON') sources = page.get('items') or [] if not sources: diff --git a/hypervideo_dl/extractor/funker530.py b/hypervideo_dl/extractor/funker530.py new file mode 100644 index 0000000..ba5ab7d --- /dev/null +++ b/hypervideo_dl/extractor/funker530.py @@ -0,0 +1,79 @@ +from .common import InfoExtractor +from .rumble import RumbleEmbedIE +from .youtube import YoutubeIE +from ..utils import ExtractorError, clean_html, get_element_by_class, strip_or_none + + +class Funker530IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?funker530\.com/video/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://funker530.com/video/azov-patrol-caught-in-open-under-automatic-grenade-launcher-fire/', + 'md5': '085f50fea27523a388bbc22e123e09c8', + 'info_dict': { + 'id': 'v2qbmu4', + 'ext': 'mp4', + 'title': 'Azov Patrol Caught In Open Under Automatic Grenade Launcher Fire', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Funker530', + 'channel': 'Funker530', + 'channel_url': 'https://rumble.com/c/c-1199543', + 'width': 1280, + 'height': 720, + 'fps': 25, + 'duration': 27, + 'upload_date': '20230608', + 'timestamp': 1686241321, + 'live_status': 'not_live', + 'description': 'md5:bea2e1f458095414e04b5ac189c2f980', + } + }, { + 'url': 'https://funker530.com/video/my-friends-joined-the-russians-civdiv/', + 'md5': 'a42c2933391210662e93e867d7124b70', + 'info_dict': { + 'id': 'k-pk4bOvoac', + 'ext': 'mp4', + 'view_count': int, + 'channel': 'Civ Div', + 'comment_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/k-pk4bOvoac/maxresdefault.jpg', + 'uploader_id': '@CivDiv', + 'duration': 357, + 'channel_url': 'https://www.youtube.com/channel/UCgsCiwJ88up-YyMHo7hL5-A', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/@CivDiv', + 'channel_id': 'UCgsCiwJ88up-YyMHo7hL5-A', + 'like_count': int, + 'description': 'md5:aef75ec3f59c07a0e39400f609b24429', + 'live_status': 'not_live', + 'age_limit': 0, + 'uploader': 'Civ Div', + 'categories': ['People & Blogs'], + 'title': 'My “Friends” joined the Russians.', + 'availability': 'public', + 'upload_date': '20230608', + 'playable_in_embed': True, + 'heatmap': 'count:100', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage)) + if rumble_url: + info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()} + else: + youtube_url = list(YoutubeIE._extract_embed_urls(url, webpage)) + if youtube_url: + info = {'url': youtube_url[0], 'ie_key': YoutubeIE.ie_key()} + if not info: + raise ExtractorError('No videos found on webpage', expected=True) + + return { + **info, + '_type': 'url_transparent', + 'description': strip_or_none(self._search_regex( + r'(?s)(.+)About the Author', clean_html(get_element_by_class('video-desc-paragraph', webpage)), + 'description', default=None)) + } diff --git a/hypervideo_dl/extractor/gamejolt.py b/hypervideo_dl/extractor/gamejolt.py index 440b832..8ec046b 100644 --- a/hypervideo_dl/extractor/gamejolt.py +++ 
b/hypervideo_dl/extractor/gamejolt.py @@ -48,7 +48,7 @@ class GameJoltBaseIE(InfoExtractor): post_hash_id, note='Downloading comments list page %d' % page) if not comments_data.get('comments'): break - for comment in traverse_obj(comments_data, (('comments', 'childComments'), ...), expected_type=dict, default=[]): + for comment in traverse_obj(comments_data, (('comments', 'childComments'), ...), expected_type=dict): yield { 'id': comment['id'], 'text': self._parse_content_as_text( diff --git a/hypervideo_dl/extractor/gdcvault.py b/hypervideo_dl/extractor/gdcvault.py index 2878bbd..4265feb 100644 --- a/hypervideo_dl/extractor/gdcvault.py +++ b/hypervideo_dl/extractor/gdcvault.py @@ -2,13 +2,8 @@ import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import ( - HEADRequest, - remove_start, - sanitized_Request, - smuggle_url, - urlencode_postdata, -) +from ..networking import HEADRequest, Request +from ..utils import remove_start, smuggle_url, urlencode_postdata class GDCVaultIE(InfoExtractor): @@ -138,8 +133,8 @@ class GDCVaultIE(InfoExtractor): 'password': password, } - request = sanitized_Request(login_url, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') + request = Request(login_url, urlencode_postdata(login_form)) + request.headers['Content-Type'] = 'application/x-www-form-urlencoded' self._download_webpage(request, display_id, 'Logging in') start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page') self._download_webpage(logout_url, display_id, 'Logging out') @@ -163,7 +158,7 @@ class GDCVaultIE(InfoExtractor): video_url = 'http://www.gdcvault.com' + direct_url # resolve the url so that we can detect the correct extension video_url = self._request_webpage( - HEADRequest(video_url), video_id).geturl() + HEADRequest(video_url), video_id).url return { 'id': video_id, diff --git a/hypervideo_dl/extractor/generic.py b/hypervideo_dl/extractor/generic.py index f28a77e..77b6fb3 100644 --- a/hypervideo_dl/extractor/generic.py +++ b/hypervideo_dl/extractor/generic.py @@ -14,7 +14,9 @@ from ..utils import ( ExtractorError, UnsupportedError, determine_ext, + determine_protocol, dict_get, + extract_basic_auth, format_field, int_or_none, is_html, @@ -31,7 +33,9 @@ from ..utils import ( unescapeHTML, unified_timestamp, unsmuggle_url, + update_url_query, url_or_none, + urljoin, variadic, xpath_attr, xpath_text, @@ -864,21 +868,7 @@ class GenericIE(InfoExtractor): }, }, { - # JWPlayer config passed as variable - 'url': 'http://www.txxx.com/videos/3326530/ariele/', - 'info_dict': { - 'id': '3326530_hq', - 'ext': 'mp4', - 'title': 'ARIELE | Tube Cup', - 'uploader': 'www.txxx.com', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - } - }, - { - # Video.js embed, multiple formats + # Youtube embed, formerly: Video.js embed, multiple formats 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', 'info_dict': { 'id': 'yygqldloqIk', @@ -905,6 +895,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, # rtl.nl embed { @@ -1548,19 +1539,6 @@ class GenericIE(InfoExtractor): 'add_ie': ['WashingtonPost'], }, { - # Mediaset embed - 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', - 'info_dict': { - 'id': '720642', - 'ext': 'mp4', - 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela 
del patto di fiducia"', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Mediaset'], - }, - { # JOJ.sk embeds 'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok', 'info_dict': { @@ -1864,11 +1842,6 @@ class GenericIE(InfoExtractor): 'title': 'I AM BIO Podcast | BIO', }, 'playlist_mincount': 52, - }, - { - # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed) - 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', - 'only_matching': True, }, { # WimTv embed player 'url': 'http://www.msmotor.tv/wearefmi-pt-2-2021/', @@ -1885,11 +1858,13 @@ class GenericIE(InfoExtractor): 'display_id': 'kelis-4th-of-july', 'ext': 'mp4', 'title': 'Kelis - 4th Of July', - 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + 'description': 'Kelis - 4th Of July', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', }, 'params': { 'skip_download': True, }, + 'expected_warnings': ['Untested major version'], }, { # KVS Player 'url': 'https://www.kvs-demo.com/embed/105/', @@ -1898,35 +1873,12 @@ class GenericIE(InfoExtractor): 'display_id': 'kelis-4th-of-july', 'ext': 'mp4', 'title': 'Kelis - 4th Of July / Embed Player', - 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', }, 'params': { 'skip_download': True, }, }, { - # KVS Player - 'url': 'https://thisvid.com/videos/french-boy-pantsed/', - 'md5': '3397979512c682f6b85b3b04989df224', - 'info_dict': { - 'id': '2400174', - 'display_id': 'french-boy-pantsed', - 'ext': 'mp4', - 'title': 'French Boy Pantsed - ThisVid.com', - 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', - } - }, { - # KVS Player - 'url': 'https://thisvid.com/embed/2400174/', - 'md5': '3397979512c682f6b85b3b04989df224', - 'info_dict': { - 'id': '2400174', - 'display_id': 'french-boy-pantsed', - 'ext': 'mp4', - 'title': 'French Boy Pantsed - ThisVid.com', - 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', - } - }, { - # KVS Player 'url': 'https://youix.com/video/leningrad-zoj/', 'md5': '94f96ba95706dc3880812b27b7d8a2b8', 'info_dict': { @@ -1934,8 +1886,8 @@ class GenericIE(InfoExtractor): 'display_id': 'leningrad-zoj', 'ext': 'mp4', 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com', - 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', - } + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, }, { # KVS Player 'url': 'https://youix.com/embed/18485', @@ -1945,19 +1897,20 @@ class GenericIE(InfoExtractor): 'display_id': 'leningrad-zoj', 'ext': 'mp4', 'title': 'Ленинград - ЗОЖ', - 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', - } + 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg', + }, }, { # KVS Player 'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/', 'md5': '94166bdb26b4cb1fb9214319a629fc51', 'info_dict': { 'id': '21217', - 'display_id': '40-nochey-40-nights-2016', + 'display_id': '40-nochey-2016', 'ext': 'mp4', 'title': '40 ночей (2016) - BogMedia.org', + 'description': 'md5:4e6d7d622636eb7948275432eb256dc3', 'thumbnail': 
'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', - } + }, }, { # KVS Player (for sites that serve kt_player.js via non-https urls) @@ -1967,9 +1920,9 @@ class GenericIE(InfoExtractor): 'id': '389508', 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source', 'ext': 'mp4', - 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', - 'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg', - } + 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер', + 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg', + }, }, { # Reddit-hosted video that will redirect and be processed by RedditIE @@ -2172,7 +2125,79 @@ class GenericIE(InfoExtractor): 'age_limit': 0, 'direct': True, } - } + }, + { + 'note': 'server returns data in brotli compression by default if `accept-encoding: *` is specified.', + 'url': 'https://www.extra.cz/cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867', + 'info_dict': { + 'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867', + 'ext': 'mp4', + 'title': 'čauky lidi 70 finall', + 'description': 'čauky lidi 70 finall', + 'thumbnail': 'h', + 'upload_date': '20220606', + 'timestamp': 1654513791, + 'duration': 318.0, + 'direct': True, + 'age_limit': 0, + }, + }, + { + 'note': 'JW Player embed with unicode-escape sequences in URL', + 'url': 'https://www.medici.tv/en/concerts/lahav-shani-mozart-mahler-israel-philharmonic-abu-dhabi-classics', + 'info_dict': { + 'id': 'm', + 'ext': 'mp4', + 'title': 'Lahav Shani conducts the Israel Philharmonic\'s first-ever concert in Abu Dhabi', + 'description': 'Mahler\'s ', + 'uploader': 'www.medici.tv', + 'age_limit': 0, + 'thumbnail': r're:^https?://.+\.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/', + 'md5': 'e2f0a4c329f7986280b7328e24036d60', + 'info_dict': { + 'id': '284002', + 'display_id': 'just-out-of-the-shower-joi', + 'ext': 'mp4', + 'title': 'Just Out Of The Shower JOI - Shooshtime', + 'thumbnail': 'https://i.shoosh.co/contents/videos_screenshots/284000/284002/preview.mp4.jpg', + 'height': 720, + 'age_limit': 18, + }, + }, + { + 'note': 'Live HLS direct link', + 'url': 'https://d18j67ugtrocuq.cloudfront.net/out/v1/2767aec339144787926bd0322f72c6e9/index.m3u8', + 'info_dict': { + 'id': 'index', + 'title': r're:index', + 'ext': 'mp4', + 'live_status': 'is_live', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, + { + 'note': 'Video.js VOD HLS', + 'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html', + 'info_dict': { + 'id': 'videojs_hls_test', + 'title': 'video', + 'ext': 'mp4', + 'age_limit': 0, + 'duration': 1800, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, ] def report_following_redirect(self, new_url): @@ -2189,12 +2214,41 @@ class GenericIE(InfoExtractor): self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') - def _fragment_query(self, url): - if self._configuration_arg('fragment_query'): - query_string = urllib.parse.urlparse(url).query - if query_string: - return {'extra_param_to_segment_url': query_string} - return {} + def 
_extra_manifest_info(self, info, manifest_url): + fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0] + if fragment_query is not None: + info['extra_param_to_segment_url'] = ( + urllib.parse.urlparse(fragment_query).query or fragment_query + or urllib.parse.urlparse(manifest_url).query or None) + + hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None + info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), { + 'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}), + }) or None + + variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0] + if variant_query is not None: + query = urllib.parse.parse_qs( + urllib.parse.urlparse(variant_query).query or variant_query + or urllib.parse.urlparse(manifest_url).query) + for fmt in self._downloader._get_formats(info): + fmt['url'] = update_url_query(fmt['url'], query) + + # Attempt to detect live HLS or set VOD duration + m3u8_format = next((f for f in self._downloader._get_formats(info) + if determine_protocol(f) == 'm3u8_native'), None) + if m3u8_format: + is_live = self._configuration_arg('is_live', [None])[0] + if is_live is not None: + info['live_status'] = 'not_live' if is_live == 'false' else 'is_live' + return + headers = m3u8_format.get('http_headers') or info.get('http_headers') + duration = self._extract_m3u8_vod_duration( + m3u8_format['url'], info.get('id'), note='Checking m3u8 live status', + errnote='Failed to download m3u8 media playlist', headers=headers) + if not duration: + info['live_status'] = 'is_live' + info['duration'] = info.get('duration') or duration def _extract_rss(self, url, video_id, doc): NS_MAP = { @@ -2238,43 +2292,87 @@ class GenericIE(InfoExtractor): 'entries': entries, } - def _kvs_getrealurl(self, video_url, license_code): + @classmethod + def _kvs_get_real_url(cls, video_url, license_code): if not video_url.startswith('function/0/'): return video_url # not obfuscated - url_path, _, url_query = video_url.partition('?') - urlparts = url_path.split('/')[2:] - license = self._kvs_getlicensetoken(license_code) - newmagic = urlparts[5][:32] + parsed = urllib.parse.urlparse(video_url[len('function/0/'):]) + license = cls._kvs_get_license_token(license_code) + urlparts = parsed.path.split('/') - for o in range(len(newmagic) - 1, -1, -1): - new = '' - l = (o + sum(int(n) for n in license[o:])) % 32 + HASH_LENGTH = 32 + hash = urlparts[3][:HASH_LENGTH] + indices = list(range(HASH_LENGTH)) - for i in range(0, len(newmagic)): - if i == o: - new += newmagic[l] - elif i == l: - new += newmagic[o] - else: - new += newmagic[i] - newmagic = new + # Swap indices of hash according to the destination calculated from the license token + accum = 0 + for src in reversed(range(HASH_LENGTH)): + accum += license[src] + dest = (src + accum) % HASH_LENGTH + indices[src], indices[dest] = indices[dest], indices[src] + + urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:] + return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts))) - urlparts[5] = newmagic + urlparts[5][32:] - return '/'.join(urlparts) + '?' 
+ url_query + @staticmethod + def _kvs_get_license_token(license): + license = license.replace('$', '') + license_values = [int(char) for char in license] - def _kvs_getlicensetoken(self, license): - modlicense = license.replace('$', '').replace('0', '1') - center = int(len(modlicense) / 2) + modlicense = license.replace('0', '1') + center = len(modlicense) // 2 fronthalf = int(modlicense[:center + 1]) backhalf = int(modlicense[center:]) + modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1] + + return [ + (license_values[index + offset] + current) % 10 + for index, current in enumerate(map(int, modlicense)) + for offset in range(4) + ] + + def _extract_kvs(self, url, webpage, video_id): + flashvars = self._search_json( + r'(?s:<script\b[^>]*>.*?var\s+flashvars\s*=)', + webpage, 'flashvars', video_id, transform_source=js_to_json) + + # extract the part after the last / as the display_id from the + # canonical URL. + display_id = self._search_regex( + r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>' + r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)', + webpage, 'display_id', fatal=False) + title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title') + + thumbnail = flashvars['preview_url'] + if thumbnail.startswith('//'): + protocol, _, _ = url.partition('/') + thumbnail = protocol + thumbnail + + url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys())) + formats = [] + for key in url_keys: + if '/get_file/' not in flashvars[key]: + continue + format_id = flashvars.get(f'{key}_text', key) + formats.append({ + 'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])), + 'format_id': format_id, + 'ext': 'mp4', + **(parse_resolution(format_id) or parse_resolution(flashvars[key])), + 'http_headers': {'Referer': url}, + }) + if not formats[-1].get('height'): + formats[-1]['quality'] = 1 - modlicense = str(4 * abs(fronthalf - backhalf)) - retval = '' - for o in range(0, center + 1): - for i in range(1, 5): - retval += str((int(license[o + i]) + int(modlicense[o])) % 10) - return retval + return { + 'id': flashvars['video_id'], + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } def _real_extract(self, url): if url.startswith('//'): @@ -2330,13 +2428,12 @@ class GenericIE(InfoExtractor): # It may probably better to solve this by checking Content-Type for application/octet-stream # after a HEAD request, but not sure if we can rely on this. 
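For reference, the rewritten KVS helpers above (_kvs_get_license_token / _kvs_get_real_url) are easier to follow outside the extractor. Below is a minimal standalone sketch of the same de-obfuscation, with function names, license code and URL that are made-up placeholders for illustration only:

    import urllib.parse

    HASH_LENGTH = 32

    def get_license_token(license_code):
        # '$'-prefixed digit string, as found in flashvars['license_code']
        license_code = license_code.replace('$', '')
        digits = [int(c) for c in license_code]
        # '0' digits count as '1' when building the modified license
        modlicense = license_code.replace('0', '1')
        center = len(modlicense) // 2
        front, back = int(modlicense[:center + 1]), int(modlicense[center:])
        modlicense = str(4 * abs(front - back))[:center + 1]
        # Each modified digit expands into four token values
        return [(digits[i + off] + cur) % 10
                for i, cur in enumerate(map(int, modlicense))
                for off in range(4)]

    def get_real_url(video_url, license_code):
        if not video_url.startswith('function/0/'):
            return video_url  # not obfuscated
        parsed = urllib.parse.urlparse(video_url[len('function/0/'):])
        token = get_license_token(license_code)
        urlparts = parsed.path.split('/')
        hash, rest = urlparts[3][:HASH_LENGTH], urlparts[3][HASH_LENGTH:]
        indices = list(range(HASH_LENGTH))
        # Swap hash positions according to a running sum over the token
        accum = 0
        for src in reversed(range(HASH_LENGTH)):
            accum += token[src]
            dest = (src + accum) % HASH_LENGTH
            indices[src], indices[dest] = indices[dest], indices[src]
        urlparts[3] = ''.join(hash[i] for i in indices) + rest
        return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts)))

    # Hypothetical inputs; real values come from the page's flashvars
    print(get_real_url(
        'function/0/https://cdn.example.com/get_file/3/'
        'aaaabbbbccccddddeeeeffff00001111_x/42/42000/video.mp4?rnd=1',
        '$16313011534231'))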
full_response = self._request_webpage(url, video_id, headers={ - 'Accept-Encoding': '*', + 'Accept-Encoding': 'identity', **smuggled_data.get('http_headers', {}) }) - new_url = full_response.geturl() - if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl(): - url = new_url - elif url != new_url: + new_url = full_response.url + url = urllib.parse.urlparse(url)._replace(scheme=urllib.parse.urlparse(new_url).scheme).geturl() + if new_url != extract_basic_auth(url)[0]: self.report_following_redirect(new_url) if force_videoid: new_url = smuggle_url(new_url, {'force_videoid': force_videoid}) @@ -2355,14 +2452,13 @@ class GenericIE(InfoExtractor): self.report_detected('direct video link') headers = smuggled_data.get('http_headers', {}) format_id = str(m.group('format_id')) + ext = determine_ext(url) subtitles = {} - if format_id.endswith('mpegurl'): + if format_id.endswith('mpegurl') or ext == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) - info_dict.update(self._fragment_query(url)) - elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): + elif format_id.endswith('mpd') or format_id.endswith('dash+xml') or ext == 'mpd': formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) - info_dict.update(self._fragment_query(url)) - elif format_id == 'f4m': + elif format_id == 'f4m' or ext == 'f4m': formats = self._extract_f4m_formats(url, video_id, headers=headers) else: formats = [{ @@ -2374,8 +2470,9 @@ class GenericIE(InfoExtractor): info_dict.update({ 'formats': formats, 'subtitles': subtitles, - 'http_headers': headers, + 'http_headers': headers or None, }) + self._extra_manifest_info(info_dict, url) return info_dict if not self.get_param('test', False) and not is_intentional: @@ -2388,7 +2485,7 @@ class GenericIE(InfoExtractor): if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') - info_dict.update(self._fragment_query(url)) + self._extra_manifest_info(info_dict, url) return info_dict # Maybe it's a direct link to a video? 
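The _extra_manifest_info helper introduced above (replacing the old _fragment_query) drives four generic-extractor tunables: fragment_query and variant_query propagate a query string onto fragment and variant URLs, hls_key injects an out-of-band HLS AES-128 key/IV, and is_live overrides the m3u8 live-status probe (which otherwise marks the stream live when no VOD duration can be read from the playlist). The query-selection rule is the subtle part: an empty argument means "reuse the manifest URL's own query", while a non-empty one may be a full URL or a bare query string. A standalone sketch of just that rule (the function name is ours, not the extractor's):

    import urllib.parse

    def choose_query(arg_value, manifest_url):
        # Empty arg -> fall back to the manifest URL's own query string;
        # otherwise the arg may itself be a URL or a bare query string.
        return (urllib.parse.urlparse(arg_value).query or arg_value
                or urllib.parse.urlparse(manifest_url).query or None)

    assert choose_query('', 'https://cdn.example.com/master.m3u8?token=abc') == 'token=abc'
    assert choose_query('auth=xyz', 'https://cdn.example.com/master.m3u8') == 'auth=xyz'
    assert choose_query('https://a.example.com/x?sig=1', 'https://b.example.com/y') == 'sig=1'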
@@ -2432,14 +2529,14 @@ class GenericIE(InfoExtractor): return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, - xspf_base_url=full_response.geturl()), + xspf_base_url=full_response.url), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, - mpd_base_url=full_response.geturl().rpartition('/')[0], + mpd_base_url=full_response.url.rpartition('/')[0], mpd_url=url) - info_dict.update(self._fragment_query(url)) + self._extra_manifest_info(info_dict, url) self.report_detected('DASH manifest') return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): @@ -2465,7 +2562,7 @@ class GenericIE(InfoExtractor): self._downloader.write_debug('Looking for embeds') embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict)) if len(embeds) == 1: - return {**info_dict, **embeds[0]} + return merge_dicts(embeds[0], info_dict) elif embeds: return self.playlist_result(embeds, **info_dict) raise UnsupportedError(url) @@ -2475,7 +2572,7 @@ class GenericIE(InfoExtractor): info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url) url, smuggled_data = unsmuggle_url(url, {}) - actual_url = urlh.geturl() if urlh else url + actual_url = urlh.url if urlh else url # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448) @@ -2528,8 +2625,7 @@ class GenericIE(InfoExtractor): varname = mobj.group(1) sources = variadic(self._parse_json( mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or []) - formats = [] - subtitles = {} + formats, subtitles, src = [], {}, None for source in sources: src = source.get('src') if not src or not isinstance(src, str): @@ -2552,8 +2648,6 @@ class GenericIE(InfoExtractor): m3u8_id='hls', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - for fmt in formats: - fmt.update(self._fragment_query(src)) if not formats: formats.append({ @@ -2569,11 +2663,11 @@ class GenericIE(InfoExtractor): for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage): sub = self._parse_json( sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {} - src = str_or_none(sub.get('src')) - if not src: + sub_src = str_or_none(sub.get('src')) + if not sub_src: continue subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ - 'url': urllib.parse.urljoin(url, src), + 'url': urllib.parse.urljoin(url, sub_src), 'name': sub.get('label'), 'http_headers': { 'Referer': actual_url, @@ -2581,7 +2675,21 @@ class GenericIE(InfoExtractor): }) if formats or subtitles: self.report_detected('video.js embed') - return [{'formats': formats, 'subtitles': subtitles}] + info_dict = {'formats': formats, 'subtitles': subtitles} + if formats: + self._extra_manifest_info(info_dict, src) + return [info_dict] + + # Look for generic KVS player (before json-ld bc of some urls that break otherwise) + found = self._search_regex(( + r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:(?!\1)[^?#])+/kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>', + r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:(?!\2)[^?#])+/kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,', + ), webpage, 'KVS player', group='ver', default=False) + if found: + 
self.report_detected('KVS Player') + if found.split('.')[0] not in ('4', '5', '6'): + self.report_warning(f'Untested major version ({found}) in player engine - download may fail.') + return [self._extract_kvs(url, webpage, video_id)] # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -2626,52 +2734,6 @@ class GenericIE(InfoExtractor): if found: self.report_detected('JW Player embed') if not found: - # Look for generic KVS player - found = re.search(r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage) - if found: - self.report_detected('KWS Player') - if found.group('maj_ver') not in ['4', '5']: - self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver')) - flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage) - flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json) - - # extract the part after the last / as the display_id from the - # canonical URL. - display_id = self._search_regex( - r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>' - r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)', - webpage, 'display_id', fatal=False - ) - title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title') - - thumbnail = flashvars['preview_url'] - if thumbnail.startswith('//'): - protocol, _, _ = url.partition('/') - thumbnail = protocol + thumbnail - - url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys())) - formats = [] - for key in url_keys: - if '/get_file/' not in flashvars[key]: - continue - format_id = flashvars.get(f'{key}_text', key) - formats.append({ - 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']), - 'format_id': format_id, - 'ext': 'mp4', - **(parse_resolution(format_id) or parse_resolution(flashvars[key])) - }) - if not formats[-1].get('height'): - formats[-1]['quality'] = 1 - - return [{ - 'id': flashvars['video_id'], - 'display_id': display_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - }] - if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) if found: @@ -2751,6 +2813,7 @@ class GenericIE(InfoExtractor): entries = [] for video_url in orderedSet(found): + video_url = video_url.encode().decode('unicode-escape') video_url = unescapeHTML(video_url) video_url = video_url.replace('\\/', '/') video_url = urllib.parse.urljoin(url, video_url) @@ -2790,10 +2853,10 @@ class GenericIE(InfoExtractor): return [self._extract_xspf_playlist(video_url, video_id)] elif ext == 'm3u8': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) - entry_info_dict.update(self._fragment_query(video_url)) + self._extra_manifest_info(entry_info_dict, video_url) elif ext == 'mpd': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers) - entry_info_dict.update(self._fragment_query(video_url)) + self._extra_manifest_info(entry_info_dict, video_url) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: diff --git a/hypervideo_dl/extractor/genius.py 
b/hypervideo_dl/extractor/genius.py index 62f5a28..57c25e7 100644 --- a/hypervideo_dl/extractor/genius.py +++ b/hypervideo_dl/extractor/genius.py @@ -10,7 +10,7 @@ from ..utils import ( class GeniusIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?genius\.com/videos/(?P<id>[^?/#]+)' + _VALID_URL = r'https?://(?:www\.)?genius\.com/(?:videos|(?P<article>a))/(?P<id>[^?/#]+)' _TESTS = [{ 'url': 'https://genius.com/videos/Vince-staples-breaks-down-the-meaning-of-when-sparks-fly', 'md5': '64c2ad98cfafcfda23bfa0ad0c512f4c', @@ -41,19 +41,37 @@ class GeniusIE(InfoExtractor): 'timestamp': 1631209167, 'thumbnail': r're:^https?://.*\.jpg$', }, + }, { + 'url': 'https://genius.com/a/cordae-anderson-paak-break-down-the-meaning-of-two-tens', + 'md5': 'f98a4e03b16b0a2821bd6e52fb3cc9d7', + 'info_dict': { + 'id': '6321509903112', + 'ext': 'mp4', + 'title': 'Cordae & Anderson .Paak Breaks Down The Meaning Of “Two Tens”', + 'description': 'md5:1255f0e1161d07342ce56a8464ac339d', + 'tags': ['song id: 5457554'], + 'uploader_id': '4863540648001', + 'duration': 361.813, + 'upload_date': '20230301', + 'timestamp': 1677703908, + 'thumbnail': r're:^https?://.*\.jpg$', + }, }] def _real_extract(self, url): - display_id = self._match_id(url) + display_id, is_article = self._match_valid_url(url).group('id', 'article') webpage = self._download_webpage(url, display_id) metadata = self._search_json( - r'<meta content="', webpage, 'metadata', display_id, transform_source=unescapeHTML) - video_id = traverse_obj( - metadata, ('video', 'provider_id'), - ('dfp_kv', lambda _, x: x['name'] == 'brightcove_video_id', 'values', 0), get_all=False) + r'<meta content="', webpage, 'metadata', display_id, + end_pattern=r'"\s+itemprop="page_data"', transform_source=unescapeHTML) + video_id = traverse_obj(metadata, ( + (('article', 'media', ...), ('video', None)), + ('provider_id', ('dfp_kv', lambda _, v: v['name'] == 'brightcove_video_id', 'values', ...))), + get_all=False) if not video_id: - raise ExtractorError('Brightcove video id not found in webpage') + # Not all article pages have videos, expect the error + raise ExtractorError('Brightcove video ID not found in webpage', expected=bool(is_article)) config = self._search_json(r'var\s*APP_CONFIG\s*=', webpage, 'config', video_id, default={}) account_id = config.get('brightcove_account_id', '4863540648001') @@ -68,7 +86,7 @@ class GeniusIE(InfoExtractor): class GeniusLyricsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?genius\.com/(?P<id>[^?/#]+)-lyrics[?/#]?' 
+ _VALID_URL = r'https?://(?:www\.)?genius\.com/(?P<id>[^?/#]+)-lyrics(?:[?/#]|$)' _TESTS = [{ 'url': 'https://genius.com/Lil-baby-heyy-lyrics', 'playlist_mincount': 2, diff --git a/hypervideo_dl/extractor/globalplayer.py b/hypervideo_dl/extractor/globalplayer.py new file mode 100644 index 0000000..e0c0d58 --- /dev/null +++ b/hypervideo_dl/extractor/globalplayer.py @@ -0,0 +1,254 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + join_nonempty, + parse_duration, + str_or_none, + traverse_obj, + unified_strdate, + unified_timestamp, + urlhandle_detect_ext, +) + + +class GlobalPlayerBaseIE(InfoExtractor): + def _get_page_props(self, url, video_id): + webpage = self._download_webpage(url, video_id) + return self._search_nextjs_data(webpage, video_id)['props']['pageProps'] + + def _request_ext(self, url, video_id): + return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests + url, video_id, note='Determining source extension')) + + def _extract_audio(self, episode, series): + return { + 'vcodec': 'none', + **traverse_obj(series, { + 'series': 'title', + 'series_id': 'id', + 'thumbnail': 'imageUrl', + 'uploader': 'itunesAuthor', # podcasts only + }), + **traverse_obj(episode, { + 'id': 'id', + 'description': ('description', {clean_html}), + 'duration': ('duration', {parse_duration}), + 'thumbnail': 'imageUrl', + 'url': 'streamUrl', + 'timestamp': (('pubDate', 'startDate'), {unified_timestamp}), + 'title': 'title', + }, get_all=False) + } + + +class GlobalPlayerLiveIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/live/(?P<id>\w+)/\w+' + _TESTS = [{ + 'url': 'https://www.globalplayer.com/live/smoothchill/uk/', + 'info_dict': { + 'id': '2mx1E', + 'ext': 'aac', + 'display_id': 'smoothchill-uk', + 'title': 're:^Smooth Chill.+$', + 'thumbnail': 'https://herald.musicradio.com/media/f296ade8-50c9-4f60-911f-924e96873620.png', + 'description': 'Music To Chill To', + 'live_status': 'is_live', + }, + }, { + # national station + 'url': 'https://www.globalplayer.com/live/heart/uk/', + 'info_dict': { + 'id': '2mwx4', + 'ext': 'aac', + 'description': 'turn up the feel good!', + 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png', + 'live_status': 'is_live', + 'title': 're:^Heart UK.+$', + 'display_id': 'heart-uk', + }, + }, { + # regional variation + 'url': 'https://www.globalplayer.com/live/heart/london/', + 'info_dict': { + 'id': 'AMqg', + 'ext': 'aac', + 'thumbnail': 'https://herald.musicradio.com/media/49b9e8cb-15bf-4bf2-8c28-a4850cc6b0f3.png', + 'title': 're:^Heart London.+$', + 'live_status': 'is_live', + 'display_id': 'heart-london', + 'description': 'turn up the feel good!', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + station = self._get_page_props(url, video_id)['station'] + stream_url = station['streamUrl'] + + return { + 'id': station['id'], + 'display_id': join_nonempty('brandSlug', 'slug', from_dict=station) or station.get('legacyStationPrefix'), + 'url': stream_url, + 'ext': self._request_ext(stream_url, video_id), + 'vcodec': 'none', + 'is_live': True, + **traverse_obj(station, { + 'title': (('name', 'brandName'), {str_or_none}), + 'description': 'tagline', + 'thumbnail': 'brandLogo', + }, get_all=False), + } + + +class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/playlists/(?P<id>\w+)' + _TESTS = [{ + # "live playlist" + 'url': 'https://www.globalplayer.com/playlists/8bLk/', + 'info_dict': { + 
'id': '8bLk', + 'ext': 'aac', + 'live_status': 'is_live', + 'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d', + 'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=', + 'title': 're:^Classic FM Hall of Fame.+$' + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + station = self._get_page_props(url, video_id)['playlistData'] + stream_url = station['streamUrl'] + + return { + 'id': video_id, + 'url': stream_url, + 'ext': self._request_ext(stream_url, video_id), + 'vcodec': 'none', + 'is_live': True, + **traverse_obj(station, { + 'title': 'title', + 'description': 'description', + 'thumbnail': 'image', + }), + } + + +class GlobalPlayerAudioIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)/|catchup/\w+/\w+/)(?P<id>\w+)/?(?:$|[?#])' + _TESTS = [{ + # podcast + 'url': 'https://www.globalplayer.com/podcasts/42KuaM/', + 'playlist_mincount': 5, + 'info_dict': { + 'id': '42KuaM', + 'title': 'Filthy Ritual', + 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', + 'categories': ['Society & Culture', 'True Crime'], + 'uploader': 'Global', + 'description': 'md5:da5b918eac9ae319454a10a563afacf9', + }, + }, { + # radio catchup + 'url': 'https://www.globalplayer.com/catchup/lbc/uk/46vyD7z/', + 'playlist_mincount': 3, + 'info_dict': { + 'id': '46vyD7z', + 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.', + 'title': 'Nick Ferrari', + 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf', + }, + }] + + def _real_extract(self, url): + video_id, podcast = self._match_valid_url(url).group('id', 'podcast') + props = self._get_page_props(url, video_id) + series = props['podcastInfo'] if podcast else props['catchupInfo'] + + return { + '_type': 'playlist', + 'id': video_id, + 'entries': [self._extract_audio(ep, series) for ep in traverse_obj( + series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))], + 'categories': traverse_obj(series, ('categories', ..., 'name')) or None, + **traverse_obj(series, { + 'description': 'description', + 'thumbnail': 'imageUrl', + 'title': 'title', + 'uploader': 'itunesAuthor', # podcasts only + }), + } + + +class GlobalPlayerAudioEpisodeIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/(?:(?P<podcast>podcasts)|catchup/\w+/\w+)/episodes/(?P<id>\w+)/?(?:$|[?#])' + _TESTS = [{ + # podcast + 'url': 'https://www.globalplayer.com/podcasts/episodes/7DrfNnE/', + 'info_dict': { + 'id': '7DrfNnE', + 'ext': 'mp3', + 'title': 'Filthy Ritual - Trailer', + 'description': 'md5:1f1562fd0f01b4773b590984f94223e0', + 'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', + 'duration': 225.0, + 'timestamp': 1681254900, + 'series': 'Filthy Ritual', + 'series_id': '42KuaM', + 'upload_date': '20230411', + 'uploader': 'Global', + }, + }, { + # radio catchup + 'url': 'https://www.globalplayer.com/catchup/lbc/uk/episodes/2zGq26Vcv1fCWhddC4JAwETXWe/', + 'info_dict': { + 'id': '2zGq26Vcv1fCWhddC4JAwETXWe', + 'ext': 'm4a', + 'timestamp': 1682056800, + 'series': 'Nick Ferrari', + 'thumbnail': 'md5:4df24d8a226f5b2508efbcc6ae874ebf', + 'upload_date': '20230421', + 'series_id': '46vyD7z', + 'description': 'Nick Ferrari At Breakfast is Leading Britain\'s Conversation.', + 'title': 'Nick Ferrari', + 'duration': 10800.0, + }, + }] + + def _real_extract(self, url): + video_id, podcast = self._match_valid_url(url).group('id', 'podcast') + props = self._get_page_props(url, video_id) + episode = props['podcastEpisode'] if podcast else 
props['catchupEpisode'] + + return self._extract_audio( + episode, traverse_obj(episode, 'podcast', 'show', expected_type=dict) or {}) + + +class GlobalPlayerVideoIE(GlobalPlayerBaseIE): + _VALID_URL = r'https?://www\.globalplayer\.com/videos/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.globalplayer.com/videos/2JsSZ7Gm2uP/', + 'info_dict': { + 'id': '2JsSZ7Gm2uP', + 'ext': 'mp4', + 'description': 'md5:6a9f063c67c42f218e42eee7d0298bfd', + 'thumbnail': 'md5:d4498af48e15aae4839ce77b97d39550', + 'upload_date': '20230420', + 'title': 'Treble Malakai Bayoh sings a sublime Handel aria at Classic FM Live', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._get_page_props(url, video_id)['videoData'] + + return { + 'id': video_id, + **traverse_obj(meta, { + 'url': 'url', + 'thumbnail': ('image', 'url'), + 'title': 'title', + 'upload_date': ('publish_date', {unified_strdate}), + 'description': 'description', + }), + } diff --git a/hypervideo_dl/extractor/globo.py b/hypervideo_dl/extractor/globo.py index a7be2cb..df98f09 100644 --- a/hypervideo_dl/extractor/globo.py +++ b/hypervideo_dl/extractor/globo.py @@ -8,8 +8,8 @@ from .common import InfoExtractor from ..compat import ( compat_str, ) +from ..networking import HEADRequest from ..utils import ( - HEADRequest, ExtractorError, float_or_none, orderedSet, diff --git a/hypervideo_dl/extractor/gmanetwork.py b/hypervideo_dl/extractor/gmanetwork.py new file mode 100644 index 0000000..62fff4e --- /dev/null +++ b/hypervideo_dl/extractor/gmanetwork.py @@ -0,0 +1,83 @@ +from .common import InfoExtractor +from .dailymotion import DailymotionIE +from .youtube import YoutubeIE + + +class GMANetworkVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www)\.gmanetwork\.com/(?:\w+/){3}(?P<id>\d+)/(?P<display_id>[\w-]+)/video' + _TESTS = [{ + 'url': 'https://www.gmanetwork.com/fullepisodes/home/running_man_philippines/168677/running-man-philippines-catch-the-thief-full-chapter-2/video?section=home', + 'info_dict': { + 'id': '28BqW0AXPe0', + 'ext': 'mp4', + 'upload_date': '20220919', + 'uploader_url': 'http://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ', + 'like_count': int, + 'view_count': int, + 'uploader': 'YoüLOL', + 'channel_id': 'UChsoPNR5x-wdSO2GrOSIWqQ', + 'duration': 5313, + 'comment_count': int, + 'tags': 'count:22', + 'uploader_id': 'UChsoPNR5x-wdSO2GrOSIWqQ', + 'title': 'Running Man Philippines: Catch the Thief (FULL CHAPTER 2)', + 'channel_url': 'https://www.youtube.com/channel/UChsoPNR5x-wdSO2GrOSIWqQ', + 'thumbnail': 'https://i.ytimg.com/vi/28BqW0AXPe0/maxresdefault.jpg', + 'release_timestamp': 1663594212, + 'age_limit': 0, + 'channel_follower_count': int, + 'categories': ['Entertainment'], + 'description': 'md5:811bdcea74f9c48051824e494756e926', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel': 'YoüLOL', + 'availability': 'public', + 'release_date': '20220919', + } + }, { + 'url': 'https://www.gmanetwork.com/fullepisodes/home/more_than_words/87059/more-than-words-full-episode-80/video?section=home', + 'info_dict': { + 'id': 'yiDOExw2aSA', + 'ext': 'mp4', + 'live_status': 'not_live', + 'channel': 'GMANetwork', + 'like_count': int, + 'channel_follower_count': int, + 'description': 'md5:6d00cd658394fa1a5071200d3ed4be05', + 'duration': 1419, + 'age_limit': 0, + 'comment_count': int, + 'upload_date': '20181003', + 'thumbnail': 'https://i.ytimg.com/vi_webp/yiDOExw2aSA/maxresdefault.webp', + 'availability': 'public', + 'playable_in_embed': True, + 'channel_id': 'UCKL5hAuzgFQsyrsQKgU0Qng', 
+ 'title': 'More Than Words: Full Episode 80 (Finale)', + 'uploader_id': 'GMANETWORK', + 'categories': ['Entertainment'], + 'uploader': 'GMANetwork', + 'channel_url': 'https://www.youtube.com/channel/UCKL5hAuzgFQsyrsQKgU0Qng', + 'tags': 'count:29', + 'view_count': int, + 'uploader_url': 'http://www.youtube.com/user/GMANETWORK', + } + }] + + def _real_extract(self, url): + content_id, display_id = self._match_valid_url(url).group('id', 'display_id') + webpage = self._download_webpage(url, display_id) + # webpage route + youtube_id = self._search_regex( + r'var\s*YOUTUBE_VIDEO\s*=\s*[\'"]+(?P<yt_id>[\w-]+)', webpage, 'youtube_id', fatal=False) + if youtube_id: + return self.url_result(youtube_id, YoutubeIE, youtube_id) + + # api call route + # more info at https://aphrodite.gmanetwork.com/fullepisodes/assets/fullepisodes/js/dist/fullepisodes_video.js?v=1.1.11 + network_url = self._search_regex( + r'NETWORK_URL\s*=\s*[\'"](?P<url>[^\'"]+)', webpage, 'network_url') + json_data = self._download_json(f'{network_url}api/data/content/video/{content_id}', display_id) + if json_data.get('video_file'): + return self.url_result(json_data['video_file'], YoutubeIE, json_data['video_file']) + else: + return self.url_result(json_data['dailymotion_file'], DailymotionIE, json_data['dailymotion_file']) diff --git a/hypervideo_dl/extractor/googledrive.py b/hypervideo_dl/extractor/googledrive.py index e027ea7..2fdec20 100644 --- a/hypervideo_dl/extractor/googledrive.py +++ b/hypervideo_dl/extractor/googledrive.py @@ -3,9 +3,11 @@ import re from .common import InfoExtractor from ..compat import compat_parse_qs from ..utils import ( - determine_ext, ExtractorError, + determine_ext, + extract_attributes, get_element_by_class, + get_element_html_by_id, int_or_none, lowercase_escape, try_get, @@ -34,6 +36,7 @@ class GoogleDriveIE(InfoExtractor): 'ext': 'mp4', 'title': 'Big Buck Bunny.mp4', 'duration': 45, + 'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ', } }, { # video can't be watched anonymously due to view count limit reached, @@ -163,15 +166,13 @@ class GoogleDriveIE(InfoExtractor): video_id = self._match_id(url) video_info = compat_parse_qs(self._download_webpage( 'https://drive.google.com/get_video_info', - video_id, query={'docid': video_id})) + video_id, 'Downloading video webpage', query={'docid': video_id})) def get_value(key): return try_get(video_info, lambda x: x[key][0]) reason = get_value('reason') title = get_value('title') - if not title and reason: - raise ExtractorError(reason, expected=True) formats = [] fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') @@ -209,20 +210,25 @@ class GoogleDriveIE(InfoExtractor): 'export': 'download', }) - def request_source_file(source_url, kind): + def request_source_file(source_url, kind, data=None): return self._request_webpage( source_url, video_id, note='Requesting %s file' % kind, - errnote='Unable to request %s file' % kind, fatal=False) + errnote='Unable to request %s file' % kind, fatal=False, data=data) urlh = request_source_file(source_url, 'source') if urlh: def add_source_format(urlh): + nonlocal title + if not title: + title = self._search_regex( + r'\bfilename="([^"]+)"', urlh.headers.get('Content-Disposition'), + 'title', default=None) formats.append({ # Use redirect URLs as download URLs in order to calculate # correct cookies in _calc_cookies. # Using original URLs may result in redirect loop due to # google.com's cookies mistakenly used for googleusercontent.com # redirect URLs (see #23919). 
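The googledrive.py fallback added above pulls a title out of the Content-Disposition header when get_video_info returns none. In isolation the parsing is just the following sketch (header value illustrative):

    import re

    def filename_from_content_disposition(header):
        # e.g. 'attachment; filename="Big Buck Bunny.mp4"'
        match = re.search(r'\bfilename="([^"]+)"', header or '')
        return match.group(1) if match else None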
- 'url': urlh.geturl(), + 'url': urlh.url, 'ext': determine_ext(title, 'mp4').lower(), 'format_id': 'source', 'quality': 1, @@ -234,14 +240,10 @@ class GoogleDriveIE(InfoExtractor): urlh, url, video_id, note='Downloading confirmation page', errnote='Unable to confirm download', fatal=False) if confirmation_webpage: - confirm = self._search_regex( - r'confirm=([^&"\']+)', confirmation_webpage, - 'confirmation code', default=None) - if confirm: - confirmed_source_url = update_url_query(source_url, { - 'confirm': confirm, - }) - urlh = request_source_file(confirmed_source_url, 'confirmed source') + confirmed_source_url = extract_attributes( + get_element_html_by_id('download-form', confirmation_webpage) or '').get('action') + if confirmed_source_url: + urlh = request_source_file(confirmed_source_url, 'confirmed source', data=b'') if urlh and urlh.headers.get('Content-Disposition'): add_source_format(urlh) else: @@ -251,7 +253,10 @@ class GoogleDriveIE(InfoExtractor): or 'unable to extract confirmation code') if not formats and reason: - self.raise_no_formats(reason, expected=True) + if title: + self.raise_no_formats(reason, expected=True) + else: + raise ExtractorError(reason, expected=True) hl = get_value('hl') subtitles_id = None diff --git a/hypervideo_dl/extractor/goplay.py b/hypervideo_dl/extractor/goplay.py index 2882b49..960d7d7 100644 --- a/hypervideo_dl/extractor/goplay.py +++ b/hypervideo_dl/extractor/goplay.py @@ -76,11 +76,11 @@ class GoPlayIE(InfoExtractor): } api = self._download_json( - f'https://api.viervijfzes.be/content/{video_id}', - video_id, headers={'Authorization': self._id_token}) + f'https://api.goplay.be/web/v1/videos/long-form/{video_id}', + video_id, headers={'Authorization': 'Bearer %s' % self._id_token}) formats, subs = self._extract_m3u8_formats_and_subtitles( - api['video']['S'], video_id, ext='mp4', m3u8_id='HLS') + api['manifestUrls']['hls'], video_id, ext='mp4', m3u8_id='HLS') info_dict.update({ 'id': video_id, diff --git a/hypervideo_dl/extractor/gronkh.py b/hypervideo_dl/extractor/gronkh.py index b9370e3..1ae0a68 100644 --- a/hypervideo_dl/extractor/gronkh.py +++ b/hypervideo_dl/extractor/gronkh.py @@ -3,6 +3,7 @@ import functools from .common import InfoExtractor from ..utils import ( OnDemandPagedList, + float_or_none, traverse_obj, unified_strdate, ) @@ -19,7 +20,9 @@ class GronkhIE(InfoExtractor): 'title': 'H.O.R.D.E. 
- DAS ZWEiTE ZEiTALTER 🎲 Session 1', 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg', - 'upload_date': '20221111' + 'upload_date': '20221111', + 'chapters': 'count:3', + 'duration': 31463, }, 'params': {'skip_download': True} }, { @@ -30,7 +33,8 @@ class GronkhIE(InfoExtractor): 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv', 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg', - 'upload_date': '20211001' + 'upload_date': '20211001', + 'duration': 32058, }, 'params': {'skip_download': True} }, { @@ -56,6 +60,12 @@ class GronkhIE(InfoExtractor): 'upload_date': unified_strdate(data_json.get('created_at')), 'formats': formats, 'subtitles': subtitles, + 'duration': float_or_none(data_json.get('source_length')), + 'chapters': traverse_obj(data_json, ( + 'chapters', lambda _, v: float_or_none(v['offset']) is not None, { + 'title': 'title', + 'start_time': ('offset', {float_or_none}), + })) or None, } diff --git a/hypervideo_dl/extractor/hidive.py b/hypervideo_dl/extractor/hidive.py index 3a53f2c..df6868d 100644 --- a/hypervideo_dl/extractor/hidive.py +++ b/hypervideo_dl/extractor/hidive.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -39,15 +37,28 @@ class HiDiveIE(InfoExtractor): form = self._search_regex( r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>', webpage, 'login form', default=None) - if not form: # logged in + if not form: return data = self._hidden_inputs(form) data.update({ 'Email': username, 'Password': password, }) - self._download_webpage( + login_webpage = self._download_webpage( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) + # If the user has multiple profiles on their account, select one. For now pick the first profile. 
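+        # The login response lists profiles as <button> elements carrying
+        # data-profile-id and data-hash attributes; both are opaque tokens
+        # that the chooseprofile endpoint expects back verbatim.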
+ profile_id = self._search_regex( + r'<button [^>]+?data-profile-id="(\w+)"', login_webpage, 'profile id', default=None) + if profile_id is None: + return # If only one profile, Hidive auto-selects it + self._request_webpage( + 'https://www.hidive.com/ajax/chooseprofile', None, + data=urlencode_postdata({ + 'profileId': profile_id, + 'hash': self._search_regex( + r'\<button [^>]+?data-hash="(\w+)"', login_webpage, 'profile id hash'), + 'returnUrl': '/dashboard' + })) def _call_api(self, video_id, title, key, data={}, **kwargs): data = { @@ -60,26 +71,6 @@ class HiDiveIE(InfoExtractor): 'https://www.hidive.com/play/settings', video_id, data=urlencode_postdata(data), **kwargs) or {} - def _extract_subtitles_from_rendition(self, rendition, subtitles, parsed_urls): - for cc_file in rendition.get('ccFiles', []): - cc_url = url_or_none(try_get(cc_file, lambda x: x[2])) - # name is used since we cant distinguish subs with same language code - cc_lang = try_get(cc_file, (lambda x: x[1].replace(' ', '-').lower(), lambda x: x[0]), str) - if cc_url not in parsed_urls and cc_lang: - parsed_urls.add(cc_url) - subtitles.setdefault(cc_lang, []).append({'url': cc_url}) - - def _get_subtitles(self, url, video_id, title, key, parsed_urls): - webpage = self._download_webpage(url, video_id, fatal=False) or '' - subtitles = {} - for caption in set(re.findall(r'data-captions=\"([^\"]+)\"', webpage)): - renditions = self._call_api( - video_id, title, key, {'Captions': caption}, fatal=False, - note=f'Downloading {caption} subtitle information').get('renditions') or {} - for rendition_id, rendition in renditions.items(): - self._extract_subtitles_from_rendition(rendition, subtitles, parsed_urls) - return subtitles - def _real_extract(self, url): video_id, title, key = self._match_valid_url(url).group('id', 'title', 'key') settings = self._call_api(video_id, title, key) @@ -104,10 +95,20 @@ class HiDiveIE(InfoExtractor): f['format_note'] = f'{version}, {extra}' formats.extend(frmt) + subtitles = {} + for rendition_id, rendition in settings['renditions'].items(): + audio, version, extra = rendition_id.split('_') + for cc_file in rendition.get('ccFiles') or []: + cc_url = url_or_none(try_get(cc_file, lambda x: x[2])) + cc_lang = try_get(cc_file, (lambda x: x[1].replace(' ', '-').lower(), lambda x: x[0]), str) + if cc_url not in parsed_urls and cc_lang: + parsed_urls.add(cc_url) + subtitles.setdefault(cc_lang, []).append({'url': cc_url}) + return { 'id': video_id, 'title': video_id, - 'subtitles': self.extract_subtitles(url, video_id, title, key, parsed_urls), + 'subtitles': subtitles, 'formats': formats, 'series': title, 'season_number': int_or_none( diff --git a/hypervideo_dl/extractor/hketv.py b/hypervideo_dl/extractor/hketv.py index 1087956..e026996 100644 --- a/hypervideo_dl/extractor/hketv.py +++ b/hypervideo_dl/extractor/hketv.py @@ -126,7 +126,7 @@ class HKETVIE(InfoExtractor): # If we ever wanted to provide the final resolved URL that # does not require cookies, albeit with a shorter lifespan: # urlh = self._downloader.urlopen(file_url) - # resolved_url = urlh.geturl() + # resolved_url = urlh.url label = fmt.get('label') h = self._FORMAT_HEIGHTS.get(label) w = h * width // height if h and width and height else None diff --git a/hypervideo_dl/extractor/hollywoodreporter.py b/hypervideo_dl/extractor/hollywoodreporter.py new file mode 100644 index 0000000..1f7eb89 --- /dev/null +++ b/hypervideo_dl/extractor/hollywoodreporter.py @@ -0,0 +1,72 @@ +import functools +import re + +from .common import InfoExtractor 
+from .jwplatform import JWPlatformIE +from ..utils import ( + ExtractorError, + OnDemandPagedList, + extract_attributes, + get_element_by_class, + get_element_html_by_class, +) + + +class HollywoodReporterIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/video/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.hollywoodreporter.com/video/chris-pine-michelle-rodriguez-dungeons-dragons-cast-directors-on-what-it-took-to-make-film-sxsw-2023/', + 'info_dict': { + 'id': 'zH4jZaR5', + 'ext': 'mp4', + 'title': 'md5:a9a1c073770a32f178955997712c4bd9', + 'description': 'The cast and directors of \'Dungeons & Dragons: Honor Among Thieves\' talk about their new film.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/zH4jZaR5/poster.jpg?width=720', + 'upload_date': '20230312', + 'timestamp': 1678586423, + 'duration': 242.0, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + data = extract_attributes(get_element_html_by_class('vlanding-video-card__link', webpage) or '') + video_id = data['data-video-showcase-trigger'] + showcase_type = data['data-video-showcase-type'] + + if showcase_type == 'jwplayer': + return self.url_result(f'jwplatform:{video_id}', JWPlatformIE) + elif showcase_type == 'youtube': + return self.url_result(video_id, 'Youtube') + else: + raise ExtractorError(f'Unsupported showcase type "{showcase_type}"') + + +class HollywoodReporterPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hollywoodreporter\.com/vcategory/(?P<slug>[\w-]+)-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.hollywoodreporter.com/vcategory/heat-vision-breakdown-57822/', + 'playlist_mincount': 109, + 'info_dict': { + 'id': '57822', + 'title': 'heat-vision-breakdown', + } + }] + + def _fetch_page(self, slug, pl_id, page): + page += 1 + webpage = self._download_webpage( + f'https://www.hollywoodreporter.com/vcategory/{slug}-{pl_id}/page/{page}/', + pl_id, note=f'Downloading playlist page {page}') + section = get_element_by_class('video-playlist-river', webpage) or '' + + for url in re.findall(r'<a[^>]+href="([^"]+)"[^>]+class="c-title__link', section): + yield self.url_result(url, HollywoodReporterIE) + + def _real_extract(self, url): + slug, pl_id = self._match_valid_url(url).group('slug', 'id') + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, slug, pl_id), 15), pl_id, slug) diff --git a/hypervideo_dl/extractor/hotnewhiphop.py b/hypervideo_dl/extractor/hotnewhiphop.py index f8570cb..3007fbb 100644 --- a/hypervideo_dl/extractor/hotnewhiphop.py +++ b/hypervideo_dl/extractor/hotnewhiphop.py @@ -1,11 +1,7 @@ from .common import InfoExtractor from ..compat import compat_b64decode -from ..utils import ( - ExtractorError, - HEADRequest, - sanitized_Request, - urlencode_postdata, -) +from ..networking import HEADRequest, Request +from ..utils import ExtractorError, urlencode_postdata class HotNewHipHopIE(InfoExtractor): @@ -36,9 +32,9 @@ class HotNewHipHopIE(InfoExtractor): ('mediaType', 's'), ('mediaId', video_id), ]) - r = sanitized_Request( + r = Request( 'http://www.hotnewhiphop.com/ajax/media/getActions/', data=reqdata) - r.add_header('Content-Type', 'application/x-www-form-urlencoded') + r.headers['Content-Type'] = 'application/x-www-form-urlencoded' mkd = self._download_json( r, video_id, note='Requesting media key', errnote='Could not download media key') @@ -50,7 +46,7 @@ class HotNewHipHopIE(InfoExtractor): req = 
self._request_webpage( redirect_req, video_id, note='Resolving final URL', errnote='Could not resolve final URL') - video_url = req.geturl() + video_url = req.url if video_url.endswith('.html'): raise ExtractorError('Redirect failed') diff --git a/hypervideo_dl/extractor/hotstar.py b/hypervideo_dl/extractor/hotstar.py index 61eec7b..02183ad 100644 --- a/hypervideo_dl/extractor/hotstar.py +++ b/hypervideo_dl/extractor/hotstar.py @@ -6,7 +6,8 @@ import time import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_str +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, @@ -83,7 +84,7 @@ class HotStarIE(HotStarBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) (?: - (?P<type>movies|sports|episode|(?P<tv>tv))/ + (?P<type>movies|sports|clips|episode|(?P<tv>tv|shows))/ (?(tv)(?:[^/?#]+/){2}|[^?#]*) )? [^/?#]+/ @@ -123,6 +124,70 @@ class HotStarIE(HotStarBaseIE): 'episode_number': 8, } }, { + 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/anupama-anuj-share-a-moment/1000282843', + 'info_dict': { + 'id': '1000282843', + 'ext': 'mp4', + 'title': 'Anupama, Anuj Share a Moment', + 'season': 'Chapter 1', + 'description': 'md5:8d74ed2248423b8b06d5c8add4d7a0c0', + 'timestamp': 1678149000, + 'channel': 'StarPlus', + 'series': 'Anupama', + 'season_number': 1, + 'season_id': 7399, + 'upload_date': '20230307', + 'episode': 'Anupama, Anuj Share a Moment', + 'episode_number': 853, + 'duration': 1272, + 'channel_id': 3, + }, + 'skip': 'HTTP Error 504: Gateway Time-out', # XXX: Investigate 504 errors on some episodes + }, { + 'url': 'https://www.hotstar.com/in/shows/kana-kaanum-kaalangal/1260097087/back-to-school/1260097320', + 'info_dict': { + 'id': '1260097320', + 'ext': 'mp4', + 'title': 'Back To School', + 'season': 'Chapter 1', + 'description': 'md5:b0d6a4c8a650681491e7405496fc7e13', + 'timestamp': 1650564000, + 'channel': 'Hotstar Specials', + 'series': 'Kana Kaanum Kaalangal', + 'season_number': 1, + 'season_id': 9441, + 'upload_date': '20220421', + 'episode': 'Back To School', + 'episode_number': 1, + 'duration': 1810, + 'channel_id': 54, + }, + }, { + 'url': 'https://www.hotstar.com/in/clips/e3-sairat-kahani-pyaar-ki/1000262286', + 'info_dict': { + 'id': '1000262286', + 'ext': 'mp4', + 'title': 'E3 - SaiRat, Kahani Pyaar Ki', + 'description': 'md5:e3b4b3203bc0c5396fe7d0e4948a6385', + 'episode': 'E3 - SaiRat, Kahani Pyaar Ki', + 'upload_date': '20210606', + 'timestamp': 1622943900, + 'duration': 5395, + }, + }, { + 'url': 'https://www.hotstar.com/in/movies/premam/1000091195', + 'info_dict': { + 'id': '1000091195', + 'ext': 'mp4', + 'title': 'Premam', + 'release_year': 2015, + 'description': 'md5:d833c654e4187b5e34757eafb5b72d7f', + 'timestamp': 1462149000, + 'upload_date': '20160502', + 'episode': 'Premam', + 'duration': 8994, + }, + }, { 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', 'only_matching': True, }, { @@ -139,6 +204,8 @@ class HotStarIE(HotStarBaseIE): 'sports': 'match', 'episode': 'episode', 'tv': 'episode', + 'shows': 'episode', + 'clips': 'content', None: 'content', } @@ -148,6 +215,12 @@ class HotStarIE(HotStarBaseIE): 'dr': 'dynamic_range', } + _TAG_FIELDS = { + 'language': 'language', + 'acodec': 'audio_codec', + 'vcodec': 'video_codec', + } + @classmethod def _video_url(cls, video_id, video_type=None, *, slug='ignore_me', root=None): assert None in (video_type, root) @@ -160,8 +233,10 @@ class 
HotStarIE(HotStarBaseIE): video_type = self._TYPE.get(video_type, video_type) cookies = self._get_cookies(url) # Cookies before any request - video_data = self._call_api_v1(f'{video_type}/detail', video_id, - query={'tas': 10000, 'contentId': video_id})['body']['results']['item'] + video_data = traverse_obj( + self._call_api_v1( + f'{video_type}/detail', video_id, fatal=False, query={'tas': 10000, 'contentId': video_id}), + ('body', 'results', 'item', {dict})) or {} if not self.get_param('allow_unplayable_formats') and video_data.get('drmProtected'): self.report_drm(video_id) @@ -182,24 +257,22 @@ class HotStarIE(HotStarBaseIE): for key, prefix in self._IGNORE_MAP.items() for ignore in self._configuration_arg(key)): continue + tag_dict = dict((t.split(':', 1) + [None])[:2] for t in tags.split(';')) format_url = url_or_none(playback_set.get('playbackUrl')) if not format_url: continue format_url = re.sub(r'(?<=//staragvod)(\d)', r'web\1', format_url) - dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr') ext = determine_ext(format_url) current_formats, current_subs = [], {} try: if 'package:hls' in tags or ext == 'm3u8': current_formats, current_subs = self._extract_m3u8_formats_and_subtitles( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id=f'{dr}-hls', headers=headers) + format_url, video_id, ext='mp4', headers=headers) elif 'package:dash' in tags or ext == 'mpd': current_formats, current_subs = self._extract_mpd_formats_and_subtitles( - format_url, video_id, mpd_id=f'{dr}-dash', headers=headers) + format_url, video_id, headers=headers) elif ext == 'f4m': pass # XXX: produce broken files else: @@ -209,24 +282,36 @@ class HotStarIE(HotStarBaseIE): 'height': int_or_none(playback_set.get('height')), }] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: geo_restricted = True continue - if tags and 'encryption:plain' not in tags: + if tag_dict.get('encryption') not in ('plain', None): for f in current_formats: f['has_drm'] = True - if tags and 'language' in tags: - lang = re.search(r'language:(?P<lang>[a-z]+)', tags).group('lang') - for f in current_formats: - if not f.get('langauge'): - f['language'] = lang + for f in current_formats: + for k, v in self._TAG_FIELDS.items(): + if not f.get(k): + f[k] = tag_dict.get(v) + if f.get('vcodec') != 'none' and not f.get('dynamic_range'): + f['dynamic_range'] = tag_dict.get('dynamic_range') + if f.get('acodec') != 'none' and not f.get('audio_channels'): + f['audio_channels'] = { + 'stereo': 2, + 'dolby51': 6, + }.get(tag_dict.get('audio_channel')) + f['format_note'] = join_nonempty( + tag_dict.get('ladder'), + tag_dict.get('audio_channel') if f.get('acodec') != 'none' else None, + f.get('format_note'), + delim=', ') formats.extend(current_formats) subs = self._merge_subtitles(subs, current_subs) if not formats and geo_restricted: self.raise_geo_restricted(countries=['IN'], metadata_available=True) + self._remove_duplicate_formats(formats) for f in formats: f.setdefault('http_headers', {}).update(headers) @@ -235,7 +320,8 @@ class HotStarIE(HotStarBaseIE): 'title': video_data.get('title'), 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), - 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), + 'timestamp': int_or_none(traverse_obj(video_data, 'broadcastDate', 'startDate')), + 'release_year': 
int_or_none(video_data.get('year')), 'formats': formats, 'subtitles': subs, 'channel': video_data.get('channelName'), @@ -288,7 +374,7 @@ class HotStarPrefixIE(InfoExtractor): class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/tv(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { @@ -296,6 +382,9 @@ class HotStarPlaylistIE(HotStarBaseIE): }, 'playlist_mincount': 20, }, { + 'url': 'https://www.hotstar.com/shows/savdhaan-india/s-26/list/popular-clips/t-3_2_26', + 'only_matching': True, + }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, }, { @@ -311,7 +400,7 @@ class HotStarPlaylistIE(HotStarBaseIE): class HotStarSeasonIE(HotStarBaseIE): IE_NAME = 'hotstar:season' - _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/radhakrishn/1260000646/seasons/season-2/ss-8028', 'info_dict': { @@ -330,6 +419,9 @@ class HotStarSeasonIE(HotStarBaseIE): 'id': '8208', }, 'playlist_mincount': 19, + }, { + 'url': 'https://www.hotstar.com/in/shows/bigg-boss/14714/seasons/season-4/ss-8208/', + 'only_matching': True, }] def _real_extract(self, url): @@ -340,7 +432,7 @@ class HotStarSeasonIE(HotStarBaseIE): class HotStarSeriesIE(HotStarBaseIE): IE_NAME = 'hotstar:series' - _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))/?(?:[#?]|$)' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/(?:tv|shows)/[^/]+/(?P<id>\d+))/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646', 'info_dict': { @@ -359,6 +451,12 @@ class HotStarSeriesIE(HotStarBaseIE): 'id': '435', }, 'playlist_mincount': 267, + }, { + 'url': 'https://www.hotstar.com/in/shows/anupama/1260022017/', + 'info_dict': { + 'id': '1260022017', + }, + 'playlist_mincount': 940, }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/hrefli.py b/hypervideo_dl/extractor/hrefli.py new file mode 100644 index 0000000..77db2ea --- /dev/null +++ b/hypervideo_dl/extractor/hrefli.py @@ -0,0 +1,15 @@ +from .common import InfoExtractor + + +class HrefLiRedirectIE(InfoExtractor): + IE_NAME = 'href.li' + IE_DESC = False # Do not list + _VALID_URL = r'https?://href\.li/\?(?P<url>.+)' + + _TESTS = [{ + 'url': 'https://href.li/?https://www.reddit.com/r/cats/comments/12bluel/my_cat_helps_me_with_water/?utm_source=share&utm_medium=android_app&utm_name=androidcss&utm_term=1&utm_content=share_button', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result(self._match_valid_url(url).group('url')) diff --git a/hypervideo_dl/extractor/hrti.py b/hypervideo_dl/extractor/hrti.py index cfec80d..57b76e4 100644 --- a/hypervideo_dl/extractor/hrti.py +++ b/hypervideo_dl/extractor/hrti.py @@ -1,13 +1,13 @@ import json from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking import Request +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, ExtractorError, int_or_none, parse_age_limit, - sanitized_Request, try_get, ) @@ -42,7 +42,7 @@ class 
HRTiBaseIE(InfoExtractor): 'application_version': self._APP_VERSION } - req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8')) + req = Request(self._API_URL, data=json.dumps(app_data).encode('utf-8')) req.get_method = lambda: 'PUT' resources = self._download_json( @@ -73,8 +73,8 @@ class HRTiBaseIE(InfoExtractor): self._login_url, None, note='Logging in', errnote='Unable to log in', data=json.dumps(auth_data).encode('utf-8')) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406: - auth_info = self._parse_json(e.cause.read().encode('utf-8'), None) + if isinstance(e.cause, HTTPError) and e.cause.status == 406: + auth_info = self._parse_json(e.cause.response.read().encode('utf-8'), None) else: raise diff --git a/hypervideo_dl/extractor/hungama.py b/hypervideo_dl/extractor/hungama.py index 2e99396..cdec368 100644 --- a/hypervideo_dl/extractor/hungama.py +++ b/hypervideo_dl/extractor/hungama.py @@ -1,19 +1,32 @@ -import re - from .common import InfoExtractor from ..utils import ( int_or_none, + remove_end, + traverse_obj, try_get, + unified_timestamp, + url_or_none, urlencode_postdata, ) -class HungamaIE(InfoExtractor): +class HungamaBaseIE(InfoExtractor): + def _call_api(self, path, content_id, fatal=False): + return traverse_obj(self._download_json( + f'https://cpage.api.hungama.com/v2/page/content/{content_id}/{path}/detail', + content_id, fatal=fatal, query={ + 'device': 'web', + 'platform': 'a', + 'storeId': '1', + }), ('data', {dict})) or {} + + +class HungamaIE(HungamaBaseIE): _VALID_URL = r'''(?x) https?:// - (?:www\.)?hungama\.com/ + (?:www\.|un\.)?hungama\.com/ (?: - (?:video|movie)/[^/]+/| + (?:video|movie|short-film)/[^/]+/| tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/ ) (?P<id>\d+) @@ -25,13 +38,28 @@ class HungamaIE(InfoExtractor): 'id': '39349649', 'ext': 'mp4', 'title': 'Krishna Chants', - 'description': 'Watch Krishna Chants video now. 
You can also watch other latest videos only at Hungama', + 'description': ' ', 'upload_date': '20180829', 'duration': 264, 'timestamp': 1535500800, 'view_count': int, - 'thumbnail': 'https://images.hungama.com/c/1/0dc/2ca/39349649/39349649_700x394.jpg', - } + 'thumbnail': 'https://images1.hungama.com/tr:n-a_169_m/c/1/0dc/2ca/39349649/39349649_350x197.jpg?v=8', + 'tags': 'count:6', + }, + }, { + 'url': 'https://un.hungama.com/short-film/adira/102524179/', + 'md5': '2278463f5dc9db9054d0c02602d44666', + 'info_dict': { + 'id': '102524179', + 'ext': 'mp4', + 'title': 'Adira', + 'description': 'md5:df20cd4d41eabb33634f06de1025a4b4', + 'upload_date': '20230417', + 'timestamp': 1681689600, + 'view_count': int, + 'thumbnail': 'https://images1.hungama.com/tr:n-a_23_m/c/1/197/ac9/102524179/102524179_350x525.jpg?v=1', + 'tags': 'count:7', + }, }, { 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', 'only_matching': True, @@ -51,14 +79,19 @@ class HungamaIE(InfoExtractor): 'c': 'common', 'm': 'get_video_mdn_url', }) - formats = self._extract_m3u8_formats(video_json['stream_url'], video_id, ext='mp4', m3u8_id='hls') - - json_ld = self._search_json_ld( - self._download_webpage(url, video_id, fatal=False) or '', video_id, fatal=False) + metadata = self._call_api('movie', video_id) return { - **json_ld, + **traverse_obj(metadata, ('head', 'data', { + 'title': ('title', {str}), + 'description': ('misc', 'description', {str}), + 'duration': ('duration', {int}), # duration in JSON is incorrect if string + 'timestamp': ('releasedate', {unified_timestamp}), + 'view_count': ('misc', 'playcount', {int_or_none}), + 'thumbnail': ('image', {url_or_none}), + 'tags': ('misc', 'keywords', ..., {str}), + })), 'id': video_id, 'formats': formats, 'subtitles': { @@ -71,10 +104,10 @@ class HungamaIE(InfoExtractor): class HungamaSongIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.|un\.)?hungama\.com/song/[^/]+/(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/', - 'md5': 'd4a6a05a394ad0453a9bea3ca00e6024', + 'md5': '964f46828e8b250aa35e5fdcfdcac367', 'info_dict': { 'id': '2931166', 'ext': 'mp3', @@ -83,8 +116,22 @@ class HungamaSongIE(InfoExtractor): 'artist': 'Lucky Ali', 'album': None, 'release_year': 2000, - } - } + 'thumbnail': 'https://stat2.hungama.ind.in/assets/images/default_images/da-200x200.png', + }, + }, { + 'url': 'https://un.hungama.com/song/tum-kya-mile-from-rocky-aur-rani-kii-prem-kahaani/103553672', + 'md5': '964f46828e8b250aa35e5fdcfdcac367', + 'info_dict': { + 'id': '103553672', + 'ext': 'mp3', + 'title': 'md5:5ebeb1e10771b634ce5f700ce68ae5f4', + 'track': 'Tum Kya Mile (From "Rocky Aur Rani Kii Prem Kahaani")', + 'artist': 'Pritam Chakraborty, Arijit Singh, Shreya Ghoshal, Amitabh Bhattacharya', + 'album': 'Tum Kya Mile (From "Rocky Aur Rani Kii Prem Kahaani")', + 'release_year': 2023, + 'thumbnail': 'https://images.hungama.com/c/1/7c2/c7b/103553671/103553671_200x200.jpg', + }, + }] def _real_extract(self, url): audio_id = self._match_id(url) @@ -122,8 +169,8 @@ class HungamaSongIE(InfoExtractor): } -class HungamaAlbumPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hungama\.com/(?:playlists|album)/[^/]+/(?P<id>\d+)' +class HungamaAlbumPlaylistIE(HungamaBaseIE): + _VALID_URL = r'https?://(?:www\.|un\.)?hungama\.com/(?P<path>playlists|album)/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.hungama.com/album/bhuj-the-pride-of-india/69481490/', 
'playlist_mincount': 7, @@ -132,16 +179,24 @@ class HungamaAlbumPlaylistIE(InfoExtractor): }, }, { 'url': 'https://www.hungama.com/playlists/hindi-jan-to-june-2021/123063/', - 'playlist_mincount': 50, + 'playlist_mincount': 33, 'info_dict': { 'id': '123063', }, + }, { + 'url': 'https://un.hungama.com/album/what-jhumka-%3F-from-rocky-aur-rani-kii-prem-kahaani/103891805/', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '103891805', + }, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - ptrn = r'<meta[^>]+?property=[\"\']?music:song:url[\"\']?[^>]+?content=[\"\']?([^\"\']+)' - items = re.findall(ptrn, webpage) - entries = [self.url_result(item, ie=HungamaSongIE.ie_key()) for item in items] - return self.playlist_result(entries, video_id) + playlist_id, path = self._match_valid_url(url).group('id', 'path') + data = self._call_api(remove_end(path, 's'), playlist_id, fatal=True) + + def entries(): + for song_url in traverse_obj(data, ('body', 'rows', ..., 'data', 'misc', 'share', {url_or_none})): + yield self.url_result(song_url, HungamaSongIE) + + return self.playlist_result(entries(), playlist_id) diff --git a/hypervideo_dl/extractor/huya.py b/hypervideo_dl/extractor/huya.py index b6e9eec..c4965f9 100644 --- a/hypervideo_dl/extractor/huya.py +++ b/hypervideo_dl/extractor/huya.py @@ -1,5 +1,6 @@ import hashlib import random +import re from ..compat import compat_urlparse, compat_b64decode @@ -37,7 +38,7 @@ class HuyaLiveIE(InfoExtractor): }] _RESOLUTION = { - '蓝光4M': { + '蓝光': { 'width': 1920, 'height': 1080, }, @@ -76,11 +77,15 @@ class HuyaLiveIE(InfoExtractor): if re_secret: fm, ss = self.encrypt(params, stream_info, stream_name) for si in stream_data.get('vMultiStreamInfo'): + display_name, bitrate = re.fullmatch( + r'(.+?)(?:(\d+)M)?', si.get('sDisplayName')).groups() rate = si.get('iBitRate') if rate: params['ratio'] = rate else: params.pop('ratio', None) + if bitrate: + rate = int(bitrate) * 1000 if re_secret: params['wsSecret'] = hashlib.md5( '_'.join([fm, params['u'], stream_name, ss, params['wsTime']])) @@ -90,7 +95,7 @@ class HuyaLiveIE(InfoExtractor): 'tbr': rate, 'url': update_url_query(f'{stream_url}/{stream_name}.{stream_info.get("sFlvUrlSuffix")}', query=params), - **self._RESOLUTION.get(si.get('sDisplayName'), {}), + **self._RESOLUTION.get(display_name, {}), }) return { diff --git a/hypervideo_dl/extractor/hypergryph.py b/hypervideo_dl/extractor/hypergryph.py new file mode 100644 index 0000000..9ca6cae --- /dev/null +++ b/hypervideo_dl/extractor/hypergryph.py @@ -0,0 +1,32 @@ +from .common import InfoExtractor +from ..utils import js_to_json, traverse_obj + + +class MonsterSirenHypergryphMusicIE(InfoExtractor): + _VALID_URL = r'https?://monster-siren\.hypergryph\.com/music/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://monster-siren.hypergryph.com/music/514562', + 'info_dict': { + 'id': '514562', + 'ext': 'wav', + 'artist': ['塞壬唱片-MSR'], + 'album': 'Flame Shadow', + 'title': 'Flame Shadow', + } + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id) + json_data = self._search_json( + r'window\.g_initialProps\s*=', webpage, 'data', audio_id, transform_source=js_to_json) + + return { + 'id': audio_id, + 'title': traverse_obj(json_data, ('player', 'songDetail', 'name')), + 'url': traverse_obj(json_data, ('player', 'songDetail', 'sourceUrl')), + 'ext': 'wav', + 'vcodec': 'none', + 'artist': traverse_obj(json_data, ('player', 'songDetail', 
'artists')), + 'album': traverse_obj(json_data, ('musicPlay', 'albumDetail', 'name')) + } diff --git a/hypervideo_dl/extractor/idolplus.py b/hypervideo_dl/extractor/idolplus.py new file mode 100644 index 0000000..3c905b0 --- /dev/null +++ b/hypervideo_dl/extractor/idolplus.py @@ -0,0 +1,115 @@ +from .common import InfoExtractor +from ..utils import traverse_obj, try_call, url_or_none + + +class IdolPlusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?idolplus\.com/z[us]/(?:concert/|contents/?\?(?:[^#]+&)?albumId=)(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://idolplus.com/zs/contents?albumId=M012077298PPV00', + 'md5': '2ace3f4661c943a2f7e79f0b88cea1e7', + 'info_dict': { + 'id': 'M012077298PPV00', + 'ext': 'mp4', + 'title': '[MultiCam] Aegyo on Top of Aegyo (IZ*ONE EATING TRIP)', + 'release_date': '20200707', + 'formats': 'count:65', + }, + 'params': {'format': '532-KIM_MINJU'}, + }, { + 'url': 'https://idolplus.com/zs/contents?albumId=M01232H058PPV00&catId=E9TX5', + 'info_dict': { + 'id': 'M01232H058PPV00', + 'ext': 'mp4', + 'title': 'YENA (CIRCLE CHART MUSIC AWARDS 2022 RED CARPET)', + 'release_date': '20230218', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # live stream + 'url': 'https://idolplus.com/zu/contents?albumId=M012323174PPV00', + 'info_dict': { + 'id': 'M012323174PPV00', + 'ext': 'mp4', + 'title': 'Hanteo Music Awards 2022 DAY2', + 'release_date': '20230211', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://idolplus.com/zs/concert/M012323039PPV00', + 'info_dict': { + 'id': 'M012323039PPV00', + 'ext': 'mp4', + 'title': 'CIRCLE CHART MUSIC AWARDS 2022', + 'release_date': '20230218', + 'formats': 'count:5', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data_list = traverse_obj(self._download_json( + 'https://idolplus.com/api/zs/viewdata/ruleset/build', video_id, + headers={'App_type': 'web', 'Country_Code': 'KR'}, query={ + 'rulesetId': 'contents', + 'albumId': video_id, + 'distribute': 'PRD', + 'loggedIn': 'false', + 'region': 'zs', + 'countryGroup': '00010', + 'lang': 'en', + 'saId': '999999999998', + }), ('data', 'viewData', ...)) + + player_data = {} + while data_list: + player_data = data_list.pop() + if traverse_obj(player_data, 'type') == 'player': + break + elif traverse_obj(player_data, ('dataList', ...)): + data_list += player_data['dataList'] + + formats = self._extract_m3u8_formats(traverse_obj(player_data, ( + 'vodPlayerList', 'vodProfile', 0, 'vodServer', 0, 'video_url', {url_or_none})), video_id) + + subtitles = {} + for caption in traverse_obj(player_data, ('vodPlayerList', 'caption')) or []: + subtitles.setdefault(caption.get('lang') or 'und', []).append({ + 'url': caption.get('smi_url'), + 'ext': 'vtt', + }) + + # Add member multicams as alternative formats + if (traverse_obj(player_data, ('detail', 'has_cuesheet')) == 'Y' + and traverse_obj(player_data, ('detail', 'is_omni_member')) == 'Y'): + cuesheet = traverse_obj(self._download_json( + 'https://idolplus.com/gapi/contents/v1.0/content/cuesheet', video_id, + 'Downloading JSON metadata for member multicams', + headers={'App_type': 'web', 'Country_Code': 'KR'}, query={ + 'ALBUM_ID': video_id, + 'COUNTRY_GRP': '00010', + 'LANG': 'en', + 'SA_ID': '999999999998', + 'COUNTRY_CODE': 'KR', + }), ('data', 'cuesheet_item', 0)) + + for member in traverse_obj(cuesheet, ('members', ...)): + index = try_call(lambda: int(member['omni_view_index']) - 1) + member_video_url = 
traverse_obj(cuesheet, ('omni_view', index, 'cdn_url', 0, 'url', {url_or_none})) + if not member_video_url: + continue + member_formats = self._extract_m3u8_formats( + member_video_url, video_id, note=f'Downloading m3u8 for multicam {member["name"]}') + for mf in member_formats: + mf['format_id'] = f'{mf["format_id"]}-{member["name"].replace(" ", "_")}' + formats.extend(member_formats) + + return { + 'id': video_id, + 'title': traverse_obj(player_data, ('detail', 'albumName')), + 'formats': formats, + 'subtitles': subtitles, + 'release_date': traverse_obj(player_data, ('detail', 'broadcastDate')), + } diff --git a/hypervideo_dl/extractor/ign.py b/hypervideo_dl/extractor/ign.py index d4797d3..64875f8 100644 --- a/hypervideo_dl/extractor/ign.py +++ b/hypervideo_dl/extractor/ign.py @@ -1,17 +1,21 @@ import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, -) +from ..compat import compat_parse_qs +from ..networking.exceptions import HTTPError from ..utils import ( - HEADRequest, + ExtractorError, determine_ext, + error_to_compat_str, + extract_attributes, int_or_none, + merge_dicts, parse_iso8601, strip_or_none, - try_get, + traverse_obj, + url_or_none, + urljoin, ) @@ -20,69 +24,37 @@ class IGNBaseIE(InfoExtractor): return self._download_json( 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: + e.cause.args = e.cause.args or [ + e.cause.response.url, e.cause.status, e.cause.reason] + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + raise -class IGNIE(IGNBaseIE): - """ - Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. 
- Some videos of it.ign.com are also supported - """ - - _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)' - IE_NAME = 'ign.com' - _PAGE_TYPE = 'video' - - _TESTS = [{ - 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'd2e1586d9987d40fad7867bf96a018ea', - 'info_dict': { - 'id': '8f862beef863986b2785559b9e1aa599', - 'ext': 'mp4', - 'title': 'The Last of Us Review', - 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', - 'timestamp': 1370440800, - 'upload_date': '20130605', - 'tags': 'count:9', - } - }, { - 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', - 'md5': 'f1581a6fe8c5121be5b807684aeac3f6', - 'info_dict': { - 'id': 'ee10d774b508c9b8ec07e763b9125b91', - 'ext': 'mp4', - 'title': 'What\'s New Now: Is GoGo Snooping on Your Data?', - 'description': 'md5:817a20299de610bd56f13175386da6fa', - 'timestamp': 1420571160, - 'upload_date': '20150106', - 'tags': 'count:4', - } - }, { - 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - video = self._call_api(display_id) + def _extract_video_info(self, video, fatal=True): video_id = video['videoId'] - metadata = video['metadata'] - title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] formats = [] - refs = video.get('refs') or {} + refs = traverse_obj(video, 'refs', expected_type=dict) or {} - m3u8_url = refs.get('m3uUrl') + m3u8_url = url_or_none(refs.get('m3uUrl')) if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - f4m_url = refs.get('f4mUrl') + f4m_url = url_or_none(refs.get('f4mUrl')) if f4m_url: formats.extend(self._extract_f4m_formats( f4m_url, video_id, f4m_id='hds', fatal=False)) for asset in (video.get('assets') or []): - asset_url = asset.get('url') + asset_url = url_or_none(asset.get('url')) if not asset_url: continue formats.append({ @@ -93,7 +65,8 @@ class IGNIE(IGNBaseIE): 'width': int_or_none(asset.get('width')), }) - mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) + mezzanine_url = traverse_obj( + video, ('system', 'mezzanineUrl'), expected_type=url_or_none) if mezzanine_url: formats.append({ 'ext': determine_ext(mezzanine_url, 'mp4'), @@ -102,21 +75,16 @@ class IGNIE(IGNBaseIE): 'url': mezzanine_url, }) - thumbnails = [] - for thumbnail in (video.get('thumbnails') or []): - thumbnail_url = thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - }) + thumbnails = traverse_obj( + video, ('thumbnails', ..., {'url': 'url'}), expected_type=url_or_none) + tags = traverse_obj( + video, ('tags', ..., 'displayName'), + expected_type=lambda x: x.strip() or None) - tags = [] - for tag in (video.get('tags') or []): - display_name = tag.get('displayName') - if not display_name: - continue - tags.append(display_name) + metadata = traverse_obj(video, 'metadata', expected_type=dict) or {} + title = traverse_obj( + metadata, 'longTitle', 'title', 'name', + expected_type=lambda x: x.strip() or None) return { 'id': video_id, @@ -124,14 +92,96 @@ class IGNIE(IGNBaseIE): 'description': strip_or_none(metadata.get('description')), 'timestamp': parse_iso8601(metadata.get('publishDate')), 'duration': int_or_none(metadata.get('duration')), - 'display_id': display_id, 'thumbnails': thumbnails, 'formats': formats, 
'tags': tags, } -class IGNVideoIE(InfoExtractor): +class IGNIE(IGNBaseIE): + """ + Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. + Some videos of it.ign.com are also supported + """ + _VIDEO_PATH_RE = r'/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>.+?)' + _PLAYLIST_PATH_RE = r'(?:/?\?(?P<filt>[^&#]+))?' + _VALID_URL = ( + r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos(?:%s)' + % '|'.join((_VIDEO_PATH_RE + r'(?:[/?&#]|$)', _PLAYLIST_PATH_RE))) + IE_NAME = 'ign.com' + _PAGE_TYPE = 'video' + + _TESTS = [{ + 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', + 'md5': 'd2e1586d9987d40fad7867bf96a018ea', + 'info_dict': { + 'id': '8f862beef863986b2785559b9e1aa599', + 'ext': 'mp4', + 'title': 'The Last of Us Review', + 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', + 'timestamp': 1370440800, + 'upload_date': '20130605', + 'tags': 'count:9', + 'display_id': 'the-last-of-us-review', + 'thumbnail': 'https://assets1.ignimgs.com/vid/thumbnails/user/2014/03/26/lastofusreviewmimig2.jpg', + 'duration': 440, + }, + 'params': { + 'nocheckcertificate': True, + }, + }, { + 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', + 'md5': 'f1581a6fe8c5121be5b807684aeac3f6', + 'info_dict': { + 'id': 'ee10d774b508c9b8ec07e763b9125b91', + 'ext': 'mp4', + 'title': 'What\'s New Now: Is GoGo Snooping on Your Data?', + 'description': 'md5:817a20299de610bd56f13175386da6fa', + 'timestamp': 1420571160, + 'upload_date': '20150106', + 'tags': 'count:4', + }, + 'skip': '404 Not Found', + }, { + 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix', + 'only_matching': True, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + grids = re.findall( + r'''(?s)<section\b[^>]+\bclass\s*=\s*['"](?:[\w-]+\s+)*?content-feed-grid(?!\B|-)[^>]+>(.+?)</section[^>]*>''', + webpage) + return filter(None, + (urljoin(url, m.group('path')) for m in re.finditer( + r'''<a\b[^>]+\bhref\s*=\s*('|")(?P<path>/videos%s)\1''' + % cls._VIDEO_PATH_RE, grids[0] if grids else ''))) + + def _real_extract(self, url): + display_id, filt = self._match_valid_url(url).group('id', 'filt') + if display_id: + return self._extract_video(url, display_id) + return self._extract_playlist(url, filt or 'all') + + def _extract_playlist(self, url, display_id): + webpage = self._download_webpage(url, display_id) + + return self.playlist_result( + (self.url_result(u, self.ie_key()) + for u in self._extract_embed_urls(url, webpage)), + playlist_id=display_id) + + def _extract_video(self, url, display_id): + video = self._checked_call_api(display_id) + + info = self._extract_video_info(video) + + return merge_dicts({ + 'display_id': display_id, + }, info) + + +class IGNVideoIE(IGNBaseIE): _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/' _TESTS = [{ 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', @@ -143,7 +193,16 @@ class IGNVideoIE(InfoExtractor): 'description': 'Taking out assassination targets in Hitman has never been more stylish.', 'timestamp': 1444665600, 'upload_date': '20151012', - } + 'display_id': '112203', + 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', + 'duration': 298, + 'tags': 'count:13', + 'display_id': '112203', + 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', + 'duration': 298, + 
'tags': 'count:13', + }, + 'expected_warnings': ['HTTP Error 400: Bad Request'], }, { 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', 'only_matching': True, @@ -163,22 +222,38 @@ class IGNVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') - url = self._request_webpage(req, video_id).geturl() + parsed_url = urllib.parse.urlparse(url) + embed_url = urllib.parse.urlunparse( + parsed_url._replace(path=parsed_url.path.rsplit('/', 1)[0] + '/embed')) + + webpage, urlh = self._download_webpage_handle(embed_url, video_id) + new_url = urlh.url ign_url = compat_parse_qs( - compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + urllib.parse.urlparse(new_url).query).get('url', [None])[-1] if ign_url: return self.url_result(ign_url, IGNIE.ie_key()) - return self.url_result(url) + video = self._search_regex(r'(<div\b[^>]+\bdata-video-id\s*=\s*[^>]+>)', webpage, 'video element', fatal=False) + if not video: + if new_url == url: + raise ExtractorError('Redirect loop: ' + url) + return self.url_result(new_url) + video = extract_attributes(video) + video_data = video.get('data-settings') or '{}' + video_data = self._parse_json(video_data, video_id)['video'] + info = self._extract_video_info(video_data) + + return merge_dicts({ + 'display_id': video_id, + }, info) class IGNArticleIE(IGNBaseIE): - _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?(?:[\w-]+/)*?feature/\d+)/(?P<id>[^/?&#]+)' _PAGE_TYPE = 'article' _TESTS = [{ 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': '524497489e4e8ff5848ece34', + 'id': '72113', 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', }, 'playlist': [ @@ -186,34 +261,43 @@ class IGNArticleIE(IGNBaseIE): 'info_dict': { 'id': '5ebbd138523268b93c9141af17bec937', 'ext': 'mp4', - 'title': 'GTA 5 Video Review', + 'title': 'Grand Theft Auto V Video Review', 'description': 'Rockstar drops the mic on this generation of games. 
Watch our review of the masterly Grand Theft Auto V.', 'timestamp': 1379339880, 'upload_date': '20130916', + 'tags': 'count:12', + 'thumbnail': 'https://assets1.ignimgs.com/thumbs/userUploaded/2021/8/16/gta-v-heistsjpg-e94705-1629138553533.jpeg', + 'display_id': 'grand-theft-auto-v-video-review', + 'duration': 501, }, }, { 'info_dict': { 'id': '638672ee848ae4ff108df2a296418ee2', 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'title': 'GTA 5 In Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', 'timestamp': 1386878820, 'upload_date': '20131212', + 'duration': 202, + 'tags': 'count:25', + 'display_id': 'gta-5-in-slow-motion', + 'thumbnail': 'https://assets1.ignimgs.com/vid/thumbnails/user/2013/11/03/GTA-SLO-MO-1.jpg', }, }, ], 'params': { - 'playlist_items': '2-3', 'skip_download': True, }, + 'expected_warnings': ['Backend fetch failed'], }, { 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { 'id': '53ee806780a81ec46e0790f8', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', }, - 'playlist_count': 2, + 'playlist_count': 1, + 'expected_warnings': ['Backend fetch failed'], }, { # videoId pattern 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', @@ -236,18 +320,84 @@ class IGNArticleIE(IGNBaseIE): 'only_matching': True, }] + def _checked_call_api(self, slug): + try: + return self._call_api(slug) + except ExtractorError as e: + if isinstance(e.cause, HTTPError): + e.cause.args = e.cause.args or [ + e.cause.response.url, e.cause.status, e.cause.reason] + if e.cause.status == 404: + raise ExtractorError( + 'Content not found: expired?', cause=e.cause, + expected=True) + elif e.cause.status == 503: + self.report_warning(error_to_compat_str(e.cause)) + return + raise + def _real_extract(self, url): display_id = self._match_id(url) - article = self._call_api(display_id) + article = self._checked_call_api(display_id) + + if article: + # obsolete ? 
+ def entries(): + media_url = traverse_obj( + article, ('mediaRelations', 0, 'media', 'metadata', 'url'), + expected_type=url_or_none) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content): + if url_or_none(video_url): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + traverse_obj( + article, ('metadata', 'headline'), + expected_type=lambda x: x.strip() or None)) + + webpage = self._download_webpage(url, display_id) + + playlist_id = self._html_search_meta('dable:item_id', webpage, default=None) + if playlist_id: + + def entries(): + for m in re.finditer( + r'''(?s)<object\b[^>]+\bclass\s*=\s*("|')ign-videoplayer\1[^>]*>(?P<params>.+?)</object''', + webpage): + flashvars = self._search_regex( + r'''(<param\b[^>]+\bname\s*=\s*("|')flashvars\2[^>]*>)''', + m.group('params'), 'flashvars', default='') + flashvars = compat_parse_qs(extract_attributes(flashvars).get('value') or '') + v_url = url_or_none((flashvars.get('url') or [None])[-1]) + if v_url: + yield self.url_result(v_url) + else: + playlist_id = self._search_regex( + r'''\bdata-post-id\s*=\s*("|')(?P<id>[\da-f]+)\1''', + webpage, 'id', group='id', default=None) + + nextjs_data = self._search_nextjs_data(webpage, display_id) - def entries(): - media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) - if media_url: - yield self.url_result(media_url, IGNIE.ie_key()) - for content in (article.get('content') or []): - for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content): - yield self.url_result(video_url) + def entries(): + for player in traverse_obj( + nextjs_data, + ('props', 'apolloState', 'ROOT_QUERY', lambda k, _: k.startswith('videoPlayerProps('), '__ref')): + # skip promo links (which may not always be served, eg GH CI servers) + if traverse_obj(nextjs_data, + ('props', 'apolloState', player.replace('PlayerProps', 'ModernContent')), + expected_type=dict): + continue + video = traverse_obj(nextjs_data, ('props', 'apolloState', player), expected_type=dict) or {} + info = self._extract_video_info(video, fatal=False) + if info: + yield merge_dicts({ + 'display_id': display_id, + }, info) return self.playlist_result( - entries(), article.get('articleId'), - strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) + entries(), playlist_id or display_id, + re.sub(r'\s+-\s+IGN\s*$', '', self._og_search_title(webpage, default='')) or None) diff --git a/hypervideo_dl/extractor/imggaming.py b/hypervideo_dl/extractor/imggaming.py index 8e220fd..a40aa21 100644 --- a/hypervideo_dl/extractor/imggaming.py +++ b/hypervideo_dl/extractor/imggaming.py @@ -1,7 +1,7 @@ import json from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -52,9 +52,9 @@ class ImgGamingBaseIE(InfoExtractor): return self._call_api( stream_path, media_id)['playerUrlCallback'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: raise ExtractorError( - self._parse_json(e.cause.read().decode(), media_id)['messages'][0], + self._parse_json(e.cause.response.read().decode(), media_id)['messages'][0], expected=True) raise diff 
--git a/hypervideo_dl/extractor/instagram.py b/hypervideo_dl/extractor/instagram.py index 0233513..bfc4b7b 100644 --- a/hypervideo_dl/extractor/instagram.py +++ b/hypervideo_dl/extractor/instagram.py @@ -3,9 +3,9 @@ import itertools import json import re import time -import urllib.error from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, decode_base_n, @@ -442,7 +442,7 @@ class InstagramIE(InstagramBaseIE): shared_data = self._search_json( r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) or {} - if shared_data and self._LOGIN_URL not in urlh.geturl(): + if shared_data and self._LOGIN_URL not in urlh.url: media.update(traverse_obj( shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {}) @@ -589,7 +589,7 @@ class InstagramPlaylistBaseIE(InstagramBaseIE): except ExtractorError as e: # if it's an error caused by a bad query, and there are # more GIS templates to try, ignore it and keep trying - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: if gis_tmpl != gis_tmpls[-1]: continue raise diff --git a/hypervideo_dl/extractor/iprima.py b/hypervideo_dl/extractor/iprima.py index 1818205..6dec151 100644 --- a/hypervideo_dl/extractor/iprima.py +++ b/hypervideo_dl/extractor/iprima.py @@ -7,7 +7,8 @@ from ..utils import ( js_to_json, urlencode_postdata, ExtractorError, - parse_qs + parse_qs, + traverse_obj ) @@ -15,8 +16,7 @@ class IPrimaIE(InfoExtractor): _VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' _GEO_BYPASS = False _NETRC_MACHINE = 'iprima' - _LOGIN_URL = 'https://auth.iprima.cz/oauth2/login' - _TOKEN_URL = 'https://auth.iprima.cz/oauth2/token' + _AUTH_ROOT = 'https://auth.iprima.cz' access_token = None _TESTS = [{ @@ -67,7 +67,7 @@ class IPrimaIE(InfoExtractor): return login_page = self._download_webpage( - self._LOGIN_URL, None, note='Downloading login page', + f'{self._AUTH_ROOT}/oauth2/login', None, note='Downloading login page', errnote='Downloading login page failed') login_form = self._hidden_inputs(login_page) @@ -76,11 +76,20 @@ class IPrimaIE(InfoExtractor): '_email': username, '_password': password}) - _, login_handle = self._download_webpage_handle( - self._LOGIN_URL, None, data=urlencode_postdata(login_form), + profile_select_html, login_handle = self._download_webpage_handle( + f'{self._AUTH_ROOT}/oauth2/login', None, data=urlencode_postdata(login_form), note='Logging in') - code = parse_qs(login_handle.geturl()).get('code')[0] + # a profile may need to be selected first, even when there is only a single one + if '/profile-select' in login_handle.url: + profile_id = self._search_regex( + r'data-identifier\s*=\s*["\']?(\w+)', profile_select_html, 'profile id') + + login_handle = self._request_webpage( + f'{self._AUTH_ROOT}/user/profile-select-perform/{profile_id}', None, + query={'continueUrl': '/user/login?redirect_uri=/user/'}, note='Selecting profile') + + code = traverse_obj(login_handle.url, ({parse_qs}, 'code', 0)) if not code: raise ExtractorError('Login failed', expected=True) @@ -89,10 +98,10 @@ class IPrimaIE(InfoExtractor): 'client_id': 'prima_sso', 'grant_type': 'authorization_code', 'code': code, - 'redirect_uri': 'https://auth.iprima.cz/sso/auth-check'} + 'redirect_uri': f'{self._AUTH_ROOT}/sso/auth-check'} token_data = self._download_json( - self._TOKEN_URL, None, + 
f'{self._AUTH_ROOT}/oauth2/token', None, note='Downloading token', errnote='Downloading token failed', data=urlencode_postdata(token_request_data)) @@ -115,14 +124,22 @@ class IPrimaIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_meta( + title = self._html_extract_title(webpage) or self._html_search_meta( ['og:title', 'twitter:title'], webpage, 'title', default=None) video_id = self._search_regex(( r'productId\s*=\s*([\'"])(?P<id>p\d+)\1', - r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1'), - webpage, 'real id', group='id') + r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1', + ), webpage, 'real id', group='id', default=None) + + if not video_id: + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse='data') + video_id = traverse_obj( + nuxt_data, (..., 'content', 'additionals', 'videoPlayId', {str}), get_all=False) + + if not video_id: + self.raise_no_formats('Unable to extract video ID from webpage') metadata = self._download_json( f'https://api.play-backend.iprima.cz/api/v1//products/id-{video_id}/play', diff --git a/hypervideo_dl/extractor/iqiyi.py b/hypervideo_dl/extractor/iqiyi.py index c41f6db..94bcad4 100644 --- a/hypervideo_dl/extractor/iqiyi.py +++ b/hypervideo_dl/extractor/iqiyi.py @@ -270,12 +270,14 @@ class IqIE(InfoExtractor): '1': 'zh_CN', '2': 'zh_TW', '3': 'en', - '4': 'kor', + '4': 'ko', + '5': 'ja', '18': 'th', '21': 'my', '23': 'vi', '24': 'id', '26': 'es', + '27': 'pt', '28': 'ar', } @@ -355,13 +357,16 @@ class IqIE(InfoExtractor): if player_js_cache: return player_js_cache webpack_js_url = self._proto_relative_url(self._search_regex( - r'<script src="((?:https?)?//stc.iqiyipic.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL')) + r'<script src="((?:https?:)?//stc\.iqiyipic\.com/_next/static/chunks/webpack-\w+\.js)"', webpage, 'webpack URL')) webpack_js = self._download_webpage(webpack_js_url, video_id, note='Downloading webpack JS', errnote='Unable to download webpack JS') - webpack_map1, webpack_map2 = [self._parse_json(js_map, video_id, transform_source=js_to_json) for js_map in self._search_regex( - r'\(({[^}]*})\[\w+\][^\)]*\)\s*\+\s*["\']\.["\']\s*\+\s*({[^}]*})\[\w+\]\+["\']\.js', webpack_js, 'JS locations', group=(1, 2))] - for module_index in reversed(list(webpack_map2.keys())): + webpack_map = self._search_json( + r'["\']\s*\+\s*', webpack_js, 'JS locations', video_id, + contains_pattern=r'{\s*(?:\d+\s*:\s*["\'][\da-f]+["\']\s*,?\s*)+}', + end_pattern=r'\[\w+\]\+["\']\.js', transform_source=js_to_json) + + for module_index in reversed(webpack_map): module_js = self._download_webpage( - f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js', + f'https://stc.iqiyipic.com/_next/static/chunks/{module_index}.{webpack_map[module_index]}.js', video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or '' if 'vms request' in module_js: self.cache.store('iq', 'player_js', module_js) @@ -373,11 +378,11 @@ class IqIE(InfoExtractor): self._extract_vms_player_js(webpage, video_id), 'signature function') def _update_bid_tags(self, webpage, video_id): - extracted_bid_tags = self._parse_json( - self._search_regex( - r'arguments\[1\][^,]*,\s*function\s*\([^\)]*\)\s*{\s*"use strict";?\s*var \w=({.+}})\s*,\s*\w\s*=\s*{\s*getNewVd', - self._extract_vms_player_js(webpage, video_id), 'video tags', default=''), - video_id, transform_source=js_to_json, fatal=False) + extracted_bid_tags = 
self._search_json( + r'function\s*\([^)]*\)\s*\{\s*"use strict";?\s*var \w\s*=\s*', + self._extract_vms_player_js(webpage, video_id), 'video tags', video_id, + contains_pattern=r'{\s*\d+\s*:\s*\{\s*nbid\s*:.+}\s*}', + end_pattern=r'\s*,\s*\w\s*=\s*\{\s*getNewVd', fatal=False, transform_source=js_to_json) if not extracted_bid_tags: return self._BID_TAGS = { @@ -412,7 +417,7 @@ class IqIE(InfoExtractor): 'langCode': self._get_cookie('lang', 'en_us'), 'deviceId': self._get_cookie('QC005', '') }, fatal=False) - ut_list = traverse_obj(vip_data, ('data', 'all_vip', ..., 'vipType'), expected_type=str_or_none, default=[]) + ut_list = traverse_obj(vip_data, ('data', 'all_vip', ..., 'vipType'), expected_type=str_or_none) else: ut_list = ['0'] @@ -444,7 +449,7 @@ class IqIE(InfoExtractor): self.report_warning('This preview video is limited%s' % format_field(preview_time, None, ' to %s seconds')) # TODO: Extract audio-only formats - for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])): + for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none)): dash_path = dash_paths.get(bid) if not dash_path: self.report_warning(f'Unknown format id: {bid}. It is currently not being extracted') @@ -455,7 +460,7 @@ class IqIE(InfoExtractor): fatal=False), 'data', expected_type=dict) video_format = traverse_obj(format_data, ('program', 'video', lambda _, v: str(v['bid']) == bid), - expected_type=dict, default=[], get_all=False) or {} + expected_type=dict, get_all=False) or {} extracted_formats = [] if video_format.get('m3u8Url'): extracted_formats.extend(self._extract_m3u8_formats( @@ -496,7 +501,7 @@ class IqIE(InfoExtractor): }) formats.extend(extracted_formats) - for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict, default=[]): + for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict): lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name')) subtitles.setdefault(lang, []).extend([{ 'ext': format_ext, diff --git a/hypervideo_dl/extractor/ivi.py b/hypervideo_dl/extractor/ivi.py index 27a222a..e7ba5f3 100644 --- a/hypervideo_dl/extractor/ivi.py +++ b/hypervideo_dl/extractor/ivi.py @@ -2,11 +2,8 @@ import json import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - qualities, -) +from ..dependencies import Cryptodome +from ..utils import ExtractorError, int_or_none, qualities class IviIE(InfoExtractor): @@ -94,18 +91,8 @@ class IviIE(InfoExtractor): for site in (353, 183): content_data = (data % site).encode() if site == 353: - try: - from Cryptodome.Cipher import Blowfish - from Cryptodome.Hash import CMAC - pycryptodome_found = True - except ImportError: - try: - from Crypto.Cipher import Blowfish - from Crypto.Hash import CMAC - pycryptodome_found = True - except ImportError: - pycryptodome_found = False - continue + if not Cryptodome.CMAC: + continue timestamp = (self._download_json( self._LIGHT_URL, video_id, @@ -118,7 +105,8 @@ class IviIE(InfoExtractor): query = { 'ts': timestamp, - 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, Blowfish).hexdigest(), + 'sign': Cryptodome.CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, + Cryptodome.Blowfish).hexdigest(), } else: query = {} diff --git a/hypervideo_dl/extractor/iwara.py b/hypervideo_dl/extractor/iwara.py index ec3e59c..91b87e0 100644 --- 
a/hypervideo_dl/extractor/iwara.py +++ b/hypervideo_dl/extractor/iwara.py @@ -1,239 +1,298 @@ -import itertools -import re +import functools import urllib.parse +import urllib.error +import hashlib +import json +import time from .common import InfoExtractor from ..utils import ( + ExtractorError, + OnDemandPagedList, int_or_none, + jwt_decode_hs256, mimetype2ext, - remove_end, - strip_or_none, - unified_strdate, - url_or_none, - urljoin, + qualities, + traverse_obj, + try_call, + unified_timestamp, ) class IwaraBaseIE(InfoExtractor): - _BASE_REGEX = r'(?P<base_url>https?://(?:www\.|ecchi\.)?iwara\.tv)' + _NETRC_MACHINE = 'iwara' + _USERTOKEN = None + _MEDIATOKEN = None + + def _is_token_expired(self, token, token_type): + # User token TTL == ~3 weeks, Media token TTL == ~1 hour + if (try_call(lambda: jwt_decode_hs256(token)['exp']) or 0) <= int(time.time() - 120): + self.to_screen(f'{token_type} token has expired') + return True + + def _get_user_token(self): + username, password = self._get_login_info() + if not username or not password: + return - def _extract_playlist(self, base_url, webpage): - for path in re.findall(r'class="title">\s*<a[^<]+href="([^"]+)', webpage): - yield self.url_result(urljoin(base_url, path)) + user_token = IwaraBaseIE._USERTOKEN or self.cache.load(self._NETRC_MACHINE, username) + if not user_token or self._is_token_expired(user_token, 'User'): + response = self._download_json( + 'https://api.iwara.tv/user/login', None, note='Logging in', + headers={'Content-Type': 'application/json'}, data=json.dumps({ + 'email': username, + 'password': password + }).encode(), expected_status=lambda x: True) + user_token = traverse_obj(response, ('token', {str})) + if not user_token: + error = traverse_obj(response, ('message', {str})) + if 'invalidLogin' in error: + raise ExtractorError('Invalid login credentials', expected=True) + else: + raise ExtractorError(f'Iwara API said: {error or "nothing"}') + + self.cache.store(self._NETRC_MACHINE, username, user_token) + + IwaraBaseIE._USERTOKEN = user_token + + def _get_media_token(self): + self._get_user_token() + if not IwaraBaseIE._USERTOKEN: + return # user has not passed credentials + + if not IwaraBaseIE._MEDIATOKEN or self._is_token_expired(IwaraBaseIE._MEDIATOKEN, 'Media'): + IwaraBaseIE._MEDIATOKEN = self._download_json( + 'https://api.iwara.tv/user/token', None, note='Fetching media token', + data=b'', headers={ + 'Authorization': f'Bearer {IwaraBaseIE._USERTOKEN}', + 'Content-Type': 'application/json' + })['accessToken'] + + return {'Authorization': f'Bearer {IwaraBaseIE._MEDIATOKEN}'} + + def _perform_login(self, username, password): + self._get_media_token() class IwaraIE(IwaraBaseIE): - _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/videos/(?P<id>[a-zA-Z0-9]+)' + IE_NAME = 'iwara' + _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos?/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', - # md5 is unstable + 'url': 'https://www.iwara.tv/video/k2ayoueezfkx6gvq', 'info_dict': { - 'id': 'amVwUl1EHpAD9RD', + 'id': 'k2ayoueezfkx6gvq', 'ext': 'mp4', - 'title': '【MMD R-18】ガールフレンド carry_me_off', 'age_limit': 18, - 'thumbnail': 'https://i.iwara.tv/sites/default/files/videos/thumbnails/7951/thumbnail-7951_0001.png', - 'uploader': 'Reimu丨Action', - 'upload_date': '20150828', - 'description': 'md5:1d4905ce48c66c9299c617f08e106e0f', + 'title': 'Defeat of Irybelda - アイリベルダの敗北', + 'description': 'md5:70278abebe706647a8b4cb04cf23e0d3', + 'uploader': 'Inwerwm', + 'uploader_id': 'inwerwm', + 'tags': 
'count:1', + 'like_count': 6133, + 'view_count': 1050343, + 'comment_count': 1, + 'timestamp': 1677843869, + 'modified_timestamp': 1679056362, }, + 'skip': 'this video cannot be played because of migration', }, { - 'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO', - 'md5': '7e5f1f359cd51a027ba4a7b7710a50f0', + 'url': 'https://iwara.tv/video/1ywe1sbkqwumpdxz5/', + 'md5': '7645f966f069b8ec9210efd9130c9aad', 'info_dict': { - 'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc', + 'id': '1ywe1sbkqwumpdxz5', 'ext': 'mp4', - 'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4', 'age_limit': 18, + 'title': 'Aponia アポニア SEX Party Tonight 手の脱衣 巨乳 ', + 'description': 'md5:3f60016fff22060eef1ef26d430b1f67', + 'uploader': 'Lyu ya', + 'uploader_id': 'user792540', + 'tags': [ + 'uncategorized' + ], + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'timestamp': 1678732213, + 'modified_timestamp': int, + 'thumbnail': 'https://files.iwara.tv/image/thumbnail/581d12b5-46f4-4f15-beb2-cfe2cde5d13d/thumbnail-00.jpg', + 'modified_date': '20230614', + 'upload_date': '20230313', }, - 'add_ie': ['GoogleDrive'], }, { - 'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq', - # md5 is unstable + 'url': 'https://iwara.tv/video/blggmfno8ghl725bg', 'info_dict': { - 'id': '6liAP9s2Ojc', + 'id': 'blggmfno8ghl725bg', 'ext': 'mp4', 'age_limit': 18, - 'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)', - 'description': 'md5:590c12c0df1443d833fbebe05da8c47a', - 'upload_date': '20160910', - 'uploader': 'aMMDsork', - 'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A', + 'title': 'お外でおしっこしちゃう猫耳ロリメイド', + 'description': 'md5:0342ba9bf6db09edbbb28729657c3611', + 'uploader': 'Fe_Kurosabi', + 'uploader_id': 'fekurosabi', + 'tags': [ + 'pee' + ], + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'timestamp': 1598880567, + 'modified_timestamp': int, + 'upload_date': '20200831', + 'modified_date': '20230605', + 'thumbnail': 'https://files.iwara.tv/image/thumbnail/7693e881-d302-42a4-a780-f16d66b5dadd/thumbnail-00.jpg', + # 'availability': 'needs_auth', }, - 'add_ie': ['Youtube'], }] + def _extract_formats(self, video_id, fileurl): + up = urllib.parse.urlparse(fileurl) + q = urllib.parse.parse_qs(up.query) + paths = up.path.rstrip('/').split('/') + # https://github.com/hypervideo/hypervideo/issues/6549#issuecomment-1473771047 + x_version = hashlib.sha1('_'.join((paths[-1], q['expires'][0], '5nFp9kmbNnHdAFhaqMvt')).encode()).hexdigest() + + preference = qualities(['preview', '360', '540', 'Source']) + + files = self._download_json(fileurl, video_id, headers={'X-Version': x_version}) + for fmt in files: + yield traverse_obj(fmt, { + 'format_id': 'name', + 'url': ('src', ('view', 'download'), {self._proto_relative_url}), + 'ext': ('type', {mimetype2ext}), + 'quality': ('name', {preference}), + 'height': ('name', {int_or_none}), + }, get_all=False) + def _real_extract(self, url): video_id = self._match_id(url) - - webpage, urlh = self._download_webpage_handle(url, video_id) - - hostname = urllib.parse.urlparse(urlh.geturl()).hostname - # ecchi is 'sexy' in Japanese - age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 - - video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id) - - if not video_data: - iframe_url = self._html_search_regex( - r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', - webpage, 'iframe URL', group='url') - return { - '_type': 'url_transparent', - 'url': iframe_url, - 'age_limit': age_limit, - } - - title = 
remove_end(self._html_extract_title(webpage), ' | Iwara') - - thumbnail = self._html_search_regex( - r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None) - - uploader = self._html_search_regex( - r'class="username">([^<]+)', webpage, 'uploader', fatal=False) - - upload_date = unified_strdate(self._html_search_regex( - r'作成日:([^\s]+)', webpage, 'upload_date', fatal=False)) - - description = strip_or_none(self._search_regex( - r'<p>(.+?(?=</div))', webpage, 'description', fatal=False, - flags=re.DOTALL)) - - formats = [] - for a_format in video_data: - format_uri = url_or_none(a_format.get('uri')) - if not format_uri: - continue - format_id = a_format.get('resolution') - height = int_or_none(self._search_regex( - r'(\d+)p', format_id, 'height', default=None)) - formats.append({ - 'url': self._proto_relative_url(format_uri, 'https:'), - 'format_id': format_id, - 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', - 'height': height, - 'width': int_or_none(height / 9.0 * 16.0 if height else None), - 'quality': 1 if format_id == 'Source' else 0, - }) + username, _ = self._get_login_info() + video_data = self._download_json( + f'https://api.iwara.tv/video/{video_id}', video_id, + expected_status=lambda x: True, headers=self._get_media_token()) + errmsg = video_data.get('message') + # at this point we can actually get uploaded user info, but do we need it? + if errmsg == 'errors.privateVideo': + self.raise_login_required('Private video. Login if you have permissions to watch', method='password') + elif errmsg == 'errors.notFound' and not username: + self.raise_login_required('Video may need login to view', method='password') + elif errmsg: # None if success + raise ExtractorError(f'Iwara says: {errmsg}') + + if not video_data.get('fileUrl'): + if video_data.get('embedUrl'): + return self.url_result(video_data.get('embedUrl')) + raise ExtractorError('This video is unplayable', expected=True) return { 'id': video_id, - 'title': title, - 'age_limit': age_limit, - 'formats': formats, - 'thumbnail': self._proto_relative_url(thumbnail, 'https:'), - 'uploader': uploader, - 'upload_date': upload_date, - 'description': description, + 'age_limit': 18 if video_data.get('rating') == 'ecchi' else 0, # ecchi is 'sexy' in Japanese + **traverse_obj(video_data, { + 'title': 'title', + 'description': 'body', + 'uploader': ('user', 'name'), + 'uploader_id': ('user', 'username'), + 'tags': ('tags', ..., 'id'), + 'like_count': 'numLikes', + 'view_count': 'numViews', + 'comment_count': 'numComments', + 'timestamp': ('createdAt', {unified_timestamp}), + 'modified_timestamp': ('updatedAt', {unified_timestamp}), + 'thumbnail': ('file', 'id', {str}, { + lambda x: f'https://files.iwara.tv/image/thumbnail/{x}/thumbnail-00.jpg'}), + }), + 'formats': list(self._extract_formats(video_id, video_data.get('fileUrl'))), } -class IwaraPlaylistIE(IwaraBaseIE): - _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/playlist/(?P<id>[^/?#&]+)' - IE_NAME = 'iwara:playlist' +class IwaraUserIE(IwaraBaseIE): + _VALID_URL = r'https?://(?:www\.)?iwara\.tv/profile/(?P<id>[^/?#&]+)' + IE_NAME = 'iwara:user' + _PER_PAGE = 32 _TESTS = [{ - 'url': 'https://ecchi.iwara.tv/playlist/best-enf', + 'url': 'https://iwara.tv/profile/user792540/videos', + 'info_dict': { + 'id': 'user792540', + 'title': 'Lyu ya', + }, + 'playlist_mincount': 70, + }, { + 'url': 'https://iwara.tv/profile/theblackbirdcalls/videos', 'info_dict': { - 'title': 'Best enf', - 'uploader': 'Jared98112', - 'id': 'best-enf', + 'id': 'theblackbirdcalls', + 'title': 'TheBlackbirdCalls', 
}, - 'playlist_mincount': 1097, + 'playlist_mincount': 723, + }, { + 'url': 'https://iwara.tv/profile/user792540', + 'only_matching': True, }, { - # urlencoded - 'url': 'https://ecchi.iwara.tv/playlist/%E3%83%97%E3%83%AC%E3%82%A4%E3%83%AA%E3%82%B9%E3%83%88-2', + 'url': 'https://iwara.tv/profile/theblackbirdcalls', + 'only_matching': True, + }, { + 'url': 'https://www.iwara.tv/profile/lumymmd', 'info_dict': { - 'id': 'プレイリスト-2', - 'title': 'プレイリスト', - 'uploader': 'mainyu', + 'id': 'lumymmd', + 'title': 'Lumy MMD', }, - 'playlist_mincount': 91, + 'playlist_mincount': 1, }] + def _entries(self, playlist_id, user_id, page): + videos = self._download_json( + 'https://api.iwara.tv/videos', playlist_id, + note=f'Downloading page {page}', + query={ + 'page': page, + 'sort': 'date', + 'user': user_id, + 'limit': self._PER_PAGE, + }, headers=self._get_media_token()) + for x in traverse_obj(videos, ('results', ..., 'id')): + yield self.url_result(f'https://iwara.tv/video/{x}') + def _real_extract(self, url): - playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') - playlist_id = urllib.parse.unquote(playlist_id) - webpage = self._download_webpage(url, playlist_id) + playlist_id = self._match_id(url) + user_info = self._download_json( + f'https://api.iwara.tv/profile/{playlist_id}', playlist_id, + note='Requesting user info') + user_id = traverse_obj(user_info, ('user', 'id')) - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': self._html_search_regex(r'class="title"[^>]*>([^<]+)', webpage, 'title', fatal=False), - 'uploader': self._html_search_regex(r'<h2>([^<]+)', webpage, 'uploader', fatal=False), - 'entries': self._extract_playlist(base_url, webpage), - } + return self.playlist_result( + OnDemandPagedList( + functools.partial(self._entries, playlist_id, user_id), + self._PER_PAGE), + playlist_id, traverse_obj(user_info, ('user', 'name'))) -class IwaraUserIE(IwaraBaseIE): - _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/users/(?P<id>[^/?#&]+)' - IE_NAME = 'iwara:user' +class IwaraPlaylistIE(IwaraBaseIE): + _VALID_URL = r'https?://(?:www\.)?iwara\.tv/playlist/(?P<id>[0-9a-f-]+)' + IE_NAME = 'iwara:playlist' + _PER_PAGE = 32 _TESTS = [{ - 'note': 'number of all videos page is just 1 page. less than 40 videos', - 'url': 'https://ecchi.iwara.tv/users/infinityyukarip', + 'url': 'https://iwara.tv/playlist/458e5486-36a4-4ac0-b233-7e9eef01025f', 'info_dict': { - 'title': 'Uploaded videos from Infinity_YukariP', - 'id': 'infinityyukarip', - 'uploader': 'Infinity_YukariP', - 'uploader_id': 'infinityyukarip', + 'id': '458e5486-36a4-4ac0-b233-7e9eef01025f', }, - 'playlist_mincount': 39, - }, { - 'note': 'no even all videos page. probably less than 10 videos', - 'url': 'https://ecchi.iwara.tv/users/mmd-quintet', - 'info_dict': { - 'title': 'Uploaded videos from mmd quintet', - 'id': 'mmd-quintet', - 'uploader': 'mmd quintet', - 'uploader_id': 'mmd-quintet', - }, - 'playlist_mincount': 6, - }, { - 'note': 'has paging. more than 40 videos', - 'url': 'https://ecchi.iwara.tv/users/theblackbirdcalls', - 'info_dict': { - 'title': 'Uploaded videos from TheBlackbirdCalls', - 'id': 'theblackbirdcalls', - 'uploader': 'TheBlackbirdCalls', - 'uploader_id': 'theblackbirdcalls', - }, - 'playlist_mincount': 420, - }, { - 'note': 'foreign chars in URL. 
there must be foreign characters in URL', - 'url': 'https://ecchi.iwara.tv/users/ぶた丼', - 'info_dict': { - 'title': 'Uploaded videos from ぶた丼', - 'id': 'ぶた丼', - 'uploader': 'ぶた丼', - 'uploader_id': 'ぶた丼', - }, - 'playlist_mincount': 170, + 'playlist_mincount': 3, }] - def _entries(self, playlist_id, base_url): - webpage = self._download_webpage( - f'{base_url}/users/{playlist_id}', playlist_id) - videos_url = self._search_regex(r'<a href="(/users/[^/]+/videos)(?:\?[^"]+)?">', webpage, 'all videos url', default=None) - if not videos_url: - yield from self._extract_playlist(base_url, webpage) - return - - videos_url = urljoin(base_url, videos_url) - - for n in itertools.count(1): - page = self._download_webpage( - videos_url, playlist_id, note=f'Downloading playlist page {n}', - query={'page': str(n - 1)} if n > 1 else {}) - yield from self._extract_playlist( - base_url, page) - - if f'page={n}' not in page: - break + def _entries(self, playlist_id, first_page, page): + videos = self._download_json( + 'https://api.iwara.tv/videos', playlist_id, f'Downloading page {page}', + query={'page': page, 'limit': self._PER_PAGE}, + headers=self._get_media_token()) if page else first_page + for x in traverse_obj(videos, ('results', ..., 'id')): + yield self.url_result(f'https://iwara.tv/video/{x}') def _real_extract(self, url): - playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') - playlist_id = urllib.parse.unquote(playlist_id) + playlist_id = self._match_id(url) + page_0 = self._download_json( + f'https://api.iwara.tv/playlist/{playlist_id}?page=0&limit={self._PER_PAGE}', playlist_id, + note='Requesting playlist info', headers=self._get_media_token()) return self.playlist_result( - self._entries(playlist_id, base_url), playlist_id) + OnDemandPagedList( + functools.partial(self._entries, playlist_id, page_0), + self._PER_PAGE), + playlist_id, traverse_obj(page_0, ('title', 'name'))) diff --git a/hypervideo_dl/extractor/joj.py b/hypervideo_dl/extractor/joj.py index 9b62284..ea46042 100644 --- a/hypervideo_dl/extractor/joj.py +++ b/hypervideo_dl/extractor/joj.py @@ -23,10 +23,20 @@ class JojIE(InfoExtractor): 'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932', 'ext': 'mp4', 'title': 'NOVÉ BÝVANIE', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*?$', 'duration': 3118, } }, { + 'url': 'https://media.joj.sk/embed/CSM0Na0l0p1', + 'info_dict': { + 'id': 'CSM0Na0l0p1', + 'ext': 'mp4', + 'height': 576, + 'title': 'Extrémne rodiny 2 - POKRAČOVANIE (2012/04/09 21:30:00)', + 'duration': 3937, + 'thumbnail': r're:^https?://.*?$', + } + }, { 'url': 'https://media.joj.sk/embed/9i1cxv', 'only_matching': True, }, { @@ -43,10 +53,10 @@ class JojIE(InfoExtractor): webpage = self._download_webpage( 'https://media.joj.sk/embed/%s' % video_id, video_id) - title = self._search_regex( - (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1', - r'<title>(?P<title>[^<]+)'), webpage, 'title', - default=None, group='title') or self._og_search_title(webpage) + title = (self._search_json(r'videoTitle\s*:', webpage, 'title', video_id, + contains_pattern=r'["\'].+["\']', default=None) + or self._html_extract_title(webpage, default=None) + or self._og_search_title(webpage)) bitrates = self._parse_json( self._search_regex( @@ -58,11 +68,13 @@ class JojIE(InfoExtractor): for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []: if isinstance(format_url, compat_str): height = self._search_regex( - r'(\d+)[pP]\.', format_url, 'height', default=None) + r'(\d+)[pP]|(pal)\.', format_url, 
'height', default=None) + if height == 'pal': + height = 576 formats.append({ 'url': format_url, 'format_id': format_field(height, None, '%sp'), - 'height': int(height), + 'height': int_or_none(height), }) if not formats: playlist = self._download_xml( diff --git a/hypervideo_dl/extractor/jstream.py b/hypervideo_dl/extractor/jstream.py new file mode 100644 index 0000000..3e2e627 --- /dev/null +++ b/hypervideo_dl/extractor/jstream.py @@ -0,0 +1,73 @@ +import base64 +import re +import json + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + js_to_json, + remove_start, +) + + +class JStreamIE(InfoExtractor): + # group "id" only exists for compliance, not directly used in requests + # also all components are mandatory + _VALID_URL = r'jstream:(?P<host>www\d+):(?P<id>(?P<publisher>[a-z0-9]+):(?P<mid>\d+))' + + _TESTS = [{ + 'url': 'jstream:www50:eqd638pvwx:752', + 'info_dict': { + 'id': 'eqd638pvwx:752', + 'ext': 'mp4', + 'title': '阪神淡路大震災 激震の記録2020年版 解説動画', + 'duration': 672, + 'thumbnail': r're:https?://eqd638pvwx\.eq\.webcdn\.stream\.ne\.jp/.+\.jpg', + }, + }] + + def _parse_jsonp(self, callback, string, video_id): + return self._search_json(rf'\s*{re.escape(callback)}\s*\(', string, callback, video_id) + + def _find_formats(self, video_id, movie_list_hls, host, publisher, subtitles): + for value in movie_list_hls: + text = value.get('text') or '' + if not text.startswith('auto'): + continue + m3u8_id = remove_start(remove_start(text, 'auto'), '_') or None + fmts, subs = self._extract_m3u8_formats_and_subtitles( + f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/{value.get("url")}', video_id, 'mp4', m3u8_id=m3u8_id) + self._merge_subtitles(subs, target=subtitles) + yield from fmts + + def _real_extract(self, url): + host, publisher, mid, video_id = self._match_valid_url(url).group('host', 'publisher', 'mid', 'id') + video_info_jsonp = self._download_webpage( + f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/eq_meta/v1/{mid}.jsonp', + video_id, 'Requesting video info') + video_info = self._parse_jsonp('metaDataResult', video_info_jsonp, video_id)['movie'] + subtitles = {} + formats = list(self._find_formats(video_id, video_info.get('movie_list_hls'), host, publisher, subtitles)) + self._remove_duplicate_formats(formats) + return { + 'id': video_id, + 'title': video_info.get('title'), + 'duration': float_or_none(video_info.get('duration')), + 'thumbnail': video_info.get('thumbnail_url'), + 'formats': formats, + 'subtitles': subtitles, + } + + @classmethod + def _extract_embed_urls(cls, url, webpage): + # check for eligibility of webpage + # https://support.eq.stream.co.jp/hc/ja/articles/115008388147-%E3%83%97%E3%83%AC%E3%82%A4%E3%83%A4%E3%83%BCAPI%E3%81%AE%E3%82%B5%E3%83%B3%E3%83%97%E3%83%AB%E3%82%B3%E3%83%BC%E3%83%89 + script_tag = re.search(r'<script\s*[^>]+?src="https://ssl-cache\.stream\.ne\.jp/(?P<host>www\d+)/(?P<publisher>[a-z0-9]+)/[^"]+?/if\.js"', webpage) + if not script_tag: + return + host, publisher = script_tag.groups() + for m in re.finditer(r'(?s)PlayerFactoryIF\.create\(\s*({[^\}]+?})\s*\)\s*;', webpage): + # TODO: using json.loads here as InfoExtractor._parse_json is not classmethod + info = json.loads(js_to_json(m.group(1))) + mid = base64.b64decode(info.get('m')).decode() + yield f'jstream:{host}:{publisher}:{mid}' diff --git a/hypervideo_dl/extractor/jwplatform.py b/hypervideo_dl/extractor/jwplatform.py index c949689..bc47aa6 100644 --- a/hypervideo_dl/extractor/jwplatform.py +++ 
b/hypervideo_dl/extractor/jwplatform.py @@ -8,14 +8,16 @@ class JWPlatformIE(InfoExtractor): _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', - 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'md5': '3aa16e4f6860e6e78b7df5829519aed3', 'info_dict': { 'id': 'nPripu9l', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Big Buck Bunny Trailer', 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', 'upload_date': '20081127', 'timestamp': 1227796140, + 'duration': 32.0, + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/nPripu9l/poster.jpg?width=720', } }, { 'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js', @@ -37,18 +39,31 @@ class JWPlatformIE(InfoExtractor): }, }, { # Player url not surrounded by quotes - 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/darling-berlin', + 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/school-trip', 'info_dict': { - 'id': 'R10NQdhY', - 'title': 'Playgirl', + 'id': 'jUxh5uin', + 'title': 'Klassenfahrt', 'ext': 'mp4', - 'upload_date': '20220624', - 'thumbnail': 'https://cdn.jwplayer.com/v2/media/R10NQdhY/poster.jpg?width=720', - 'timestamp': 1656064800, - 'description': 'BRD 1966, Will Tremper', - 'duration': 5146.0, + 'upload_date': '20230109', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/jUxh5uin/poster.jpg?width=720', + 'timestamp': 1673270298, + 'description': '', + 'duration': 5193.0, }, 'params': {'allowed_extractors': ['generic', 'jwplatform']}, + }, { + # iframe src attribute includes backslash before URL string + 'url': 'https://www.elespectador.com/colombia/video-asi-se-evito-la-fuga-de-john-poulos-presunto-feminicida-de-valentina-trespalacios-explicacion', + 'info_dict': { + 'id': 'QD3gsexj', + 'title': 'Así se evitó la fuga de John Poulos, presunto feminicida de Valentina Trespalacios', + 'ext': 'mp4', + 'upload_date': '20230127', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/QD3gsexj/poster.jpg?width=720', + 'timestamp': 1674862986, + 'description': 'md5:128fd74591c4e1fc2da598c5cb6f5ce4', + 'duration': 263.0, + }, }] @classmethod @@ -57,7 +72,7 @@ class JWPlatformIE(InfoExtractor): # <input value=URL> is used by hyland.com # if we find <iframe>, don't look for <input> ret = re.findall( - r'<%s[^>]+?%s=["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key), + r'<%s[^>]+?%s=\\?["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key), webpage) if ret: return ret diff --git a/hypervideo_dl/extractor/kakao.py b/hypervideo_dl/extractor/kakao.py index 1f0f0a5..43055e8 100644 --- a/hypervideo_dl/extractor/kakao.py +++ b/hypervideo_dl/extractor/kakao.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -101,8 +101,8 @@ class KakaoIE(InfoExtractor): cdn_api_base, video_id, query=query, note='Downloading video URL for profile %s' % profile_name) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - resp = self._parse_json(e.cause.read().decode(), video_id) + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + resp = self._parse_json(e.cause.response.read().decode(), 
video_id) if resp.get('code') == 'GeoBlocked': self.raise_geo_restricted() raise diff --git a/hypervideo_dl/extractor/kankanews.py b/hypervideo_dl/extractor/kankanews.py new file mode 100644 index 0000000..46e239b --- /dev/null +++ b/hypervideo_dl/extractor/kankanews.py @@ -0,0 +1,48 @@ +import time +import random +import string +import hashlib +import urllib.parse + +from .common import InfoExtractor + + +class KankaNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kankanews\.com/a/\d+\-\d+\-\d+/(?P<id>\d+)\.shtml' + _TESTS = [{ + 'url': 'https://www.kankanews.com/a/2022-11-08/00310276054.shtml?appid=1088227', + 'md5': '05e126513c74b1258d657452a6f4eef9', + 'info_dict': { + 'id': '4485057', + 'url': 'http://mediaplay.kksmg.com/2022/11/08/h264_450k_mp4_1a388ad771e0e4cc28b0da44d245054e_ncm.mp4', + 'ext': 'mp4', + 'title': '视频|第23个中国记者节,我们在进博切蛋糕', + 'thumbnail': r're:^https?://.*\.jpg*', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'omsid\s*=\s*"(\d+)"', webpage, 'video id') + + params = { + 'nonce': ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)), + 'omsid': video_id, + 'platform': 'pc', + 'timestamp': int(time.time()), + 'version': '1.0', + } + params['sign'] = hashlib.md5((hashlib.md5(( + urllib.parse.urlencode(params) + '&28c8edde3d61a0411511d3b1866f0636' + ).encode()).hexdigest()).encode()).hexdigest() + + meta = self._download_json('https://api-app.kankanews.com/kankan/pc/getvideo', + video_id, query=params)['result']['video'] + + return { + 'id': video_id, + 'url': meta['videourl'], + 'title': self._search_regex(r'g\.title\s*=\s*"([^"]+)"', webpage, 'title'), + 'thumbnail': meta.get('titlepic'), + } diff --git a/hypervideo_dl/extractor/kick.py b/hypervideo_dl/extractor/kick.py new file mode 100644 index 0000000..d124372 --- /dev/null +++ b/hypervideo_dl/extractor/kick.py @@ -0,0 +1,126 @@ +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import ( + UserNotLive, + float_or_none, + merge_dicts, + str_or_none, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class KickBaseIE(InfoExtractor): + def _real_initialize(self): + self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False) + xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN') + if not xsrf_token: + self.write_debug('kick.com did not set XSRF-TOKEN cookie') + KickBaseIE._API_HEADERS = { + 'Authorization': f'Bearer {xsrf_token.value}', + 'X-XSRF-TOKEN': xsrf_token.value, + } if xsrf_token else {} + + def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs): + return self._download_json( + f'https://kick.com/api/v1/{path}', display_id, note=note, + headers=merge_dicts(headers, self._API_HEADERS), **kwargs) + + +class KickIE(KickBaseIE): + _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://kick.com/yuppy', + 'info_dict': { + 'id': '6cde1-kickrp-joe-flemmingskick-info-heremust-knowmust-see21', + 'ext': 'mp4', + 'title': str, + 'description': str, + 'channel': 'yuppy', + 'channel_id': '33538', + 'uploader': 'Yuppy', + 'uploader_id': '33793', + 'upload_date': str, + 'live_status': 'is_live', + 'timestamp': int, + 'thumbnail': r're:^https?://.*\.jpg', + 'categories': list, + }, + 'skip': 'livestream', + }, { + 'url': 'https://kick.com/kmack710', + 'only_matching': True, + }] 
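Aside: KickBaseIE above bootstraps its session by requesting the site once and then echoing the XSRF-TOKEN cookie back as both a Bearer token and an X-XSRF-TOKEN header. A minimal standalone sketch of that handshake, for illustration only (the channel slug is invented, and kick.com may additionally sit behind Cloudflare challenges that a bare HTTP client will not pass):

    import requests

    session = requests.Session()
    # A plain GET to the site is expected to set the XSRF-TOKEN cookie.
    session.get('https://kick.com/')
    token = session.cookies.get('XSRF-TOKEN')

    # Mirrors KickBaseIE._API_HEADERS: empty when no token was issued.
    headers = {'Authorization': f'Bearer {token}', 'X-XSRF-TOKEN': token} if token else {}

    # 'xqc' is a placeholder channel slug, not taken from this patch.
    data = session.get('https://kick.com/api/v1/channels/xqc', headers=headers).json()
    print(data.get('slug'), 'live' if data.get('livestream') else 'offline')
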
+ + def _real_extract(self, url): + channel = self._match_id(url) + response = self._call_api(f'channels/{channel}', channel) + if not traverse_obj(response, 'livestream', expected_type=dict): + raise UserNotLive(video_id=channel) + + return { + 'id': str(traverse_obj( + response, ('livestream', ('slug', 'id')), get_all=False, default=channel)), + 'formats': self._extract_m3u8_formats( + response['playback_url'], channel, 'mp4', live=True), + 'title': traverse_obj( + response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), + 'description': traverse_obj(response, ('user', 'bio')), + 'channel': channel, + 'channel_id': str_or_none(traverse_obj(response, 'id', ('livestream', 'channel_id'))), + 'uploader': traverse_obj(response, 'name', ('user', 'username')), + 'uploader_id': str_or_none(traverse_obj(response, 'user_id', ('user', 'id'))), + 'is_live': True, + 'timestamp': unified_timestamp(traverse_obj(response, ('livestream', 'created_at'))), + 'thumbnail': traverse_obj( + response, ('livestream', 'thumbnail', 'url'), expected_type=url_or_none), + 'categories': traverse_obj(response, ('recent_categories', ..., 'name')), + } + + +class KickVODIE(KickBaseIE): + _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ + 'url': 'https://kick.com/video/54244b5e-050a-4df4-a013-b2433dafbe35', + 'md5': '73691206a6a49db25c5aa1588e6538fc', + 'info_dict': { + 'id': '54244b5e-050a-4df4-a013-b2433dafbe35', + 'ext': 'mp4', + 'title': 'Making 710-carBoosting. Kinda No Pixel inspired. !guilded - !links', + 'description': 'md5:a0d3546bf7955d0a8252ffe0fd6f518f', + 'channel': 'kmack710', + 'channel_id': '16278', + 'uploader': 'Kmack710', + 'uploader_id': '16412', + 'upload_date': '20221206', + 'timestamp': 1670318289, + 'duration': 40104.0, + 'thumbnail': r're:^https?://.*\.jpg', + 'categories': ['Grand Theft Auto V'], + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + response = self._call_api(f'video/{video_id}', video_id) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(response['source'], video_id, 'mp4'), + 'title': traverse_obj( + response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), + 'description': traverse_obj(response, ('livestream', 'channel', 'user', 'bio')), + 'channel': traverse_obj(response, ('livestream', 'channel', 'slug')), + 'channel_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'id'))), + 'uploader': traverse_obj(response, ('livestream', 'channel', 'user', 'username')), + 'uploader_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'user_id'))), + 'timestamp': unified_timestamp(response.get('created_at')), + 'duration': float_or_none(traverse_obj(response, ('livestream', 'duration')), scale=1000), + 'thumbnail': traverse_obj( + response, ('livestream', 'thumbnail'), expected_type=url_or_none), + 'categories': traverse_obj(response, ('livestream', 'categories', ..., 'name')), + } diff --git a/hypervideo_dl/extractor/kommunetv.py b/hypervideo_dl/extractor/kommunetv.py new file mode 100644 index 0000000..e21e556 --- /dev/null +++ b/hypervideo_dl/extractor/kommunetv.py @@ -0,0 +1,31 @@ +from .common import InfoExtractor +from ..utils import update_url + + +class KommunetvIE(InfoExtractor): + _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P<id>\w+)' + _TEST = { + 'url': 'https://oslo.kommunetv.no/archive/921', + 'md5': '5f102be308ee759be1e12b63d5da4bbc', + 
'info_dict': { + 'id': '921', + 'title': 'Bystyremøte', + 'ext': 'mp4' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'application/json' + } + data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers) + title = data['stream']['title'] + file = data['playlist'][0]['playlist'][0]['file'] + url = update_url(file, query=None, fragment=None) + formats = self._extract_m3u8_formats(url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + return { + 'id': video_id, + 'formats': formats, + 'title': title + } diff --git a/hypervideo_dl/extractor/kuwo.py b/hypervideo_dl/extractor/kuwo.py index cfec1c5..e8a061a 100644 --- a/hypervideo_dl/extractor/kuwo.py +++ b/hypervideo_dl/extractor/kuwo.py @@ -91,7 +91,7 @@ class KuwoIE(KuwoBaseIE): webpage, urlh = self._download_webpage_handle( url, song_id, note='Download song detail info', errnote='Unable to get song detail info') - if song_id not in urlh.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: + if song_id not in urlh.url or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: raise ExtractorError('this song has been offline because of copyright issues', expected=True) song_name = self._html_search_regex( diff --git a/hypervideo_dl/extractor/la7.py b/hypervideo_dl/extractor/la7.py index 68dc1d4..a3cd12b 100644 --- a/hypervideo_dl/extractor/la7.py +++ b/hypervideo_dl/extractor/la7.py @@ -1,25 +1,19 @@ import re from .common import InfoExtractor -from ..utils import ( - determine_ext, - float_or_none, - HEADRequest, - int_or_none, - parse_duration, - unified_strdate, -) +from ..networking import HEADRequest +from ..utils import float_or_none, int_or_none, parse_duration, unified_strdate class LA7IE(InfoExtractor): IE_NAME = 'la7.it' - _VALID_URL = r'''(?x)(https?://)?(?: - (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/| + _VALID_URL = r'''(?x)https?://(?: + (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video|news)/| tg\.la7\.it/repliche-tgla7\?id= )(?P<id>.+)''' _TESTS = [{ - # 'src' is a plain URL + # single quality video 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', 'info_dict': { @@ -29,6 +23,20 @@ class LA7IE(InfoExtractor): 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. 
dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', 'thumbnail': 're:^https?://.*', 'upload_date': '20151002', + 'formats': 'count:4', + }, + }, { + # multiple quality video + 'url': 'https://www.la7.it/calcio-femminile/news/il-gol-di-lindsey-thomas-fiorentina-vs-milan-serie-a-calcio-femminile-26-11-2022-461736', + 'md5': 'd2370e78f75e8d1238cb3a0db9a2eda3', + 'info_dict': { + 'id': 'il-gol-di-lindsey-thomas-fiorentina-vs-milan-serie-a-calcio-femminile-26-11-2022-461736', + 'ext': 'mp4', + 'title': 'Il gol di Lindsey Thomas | Fiorentina vs Milan | Serie A Calcio Femminile', + 'description': 'Il gol di Lindsey Thomas | Fiorentina vs Milan | Serie A Calcio Femminile', + 'thumbnail': 're:^https?://.*', + 'upload_date': '20221126', + 'formats': 'count:8', }, }, { 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', @@ -39,7 +47,7 @@ class LA7IE(InfoExtractor): def _generate_mp4_url(self, quality, m3u8_formats): for f in m3u8_formats: if f['vcodec'] != 'none' and quality in f['url']: - http_url = '%s%s.mp4' % (self._HOST, quality) + http_url = f'{self._HOST}{quality}.mp4' urlh = self._request_webpage( HEADRequest(http_url), quality, @@ -58,12 +66,13 @@ class LA7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - if not url.startswith('http'): - url = '%s//%s' % (self.http_scheme(), url) + if re.search(r'(?i)(drmsupport\s*:\s*true)\s*', webpage): + self.report_drm(video_id) - webpage = self._download_webpage(url, video_id) - video_path = self._search_regex(r'(/content/.*?).mp4', webpage, 'video_path') + video_path = self._search_regex( + r'(/content/[\w/,]+?)\.mp4(?:\.csmil)?/master\.m3u8', webpage, 'video_path') formats = self._extract_mpd_formats( f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd', @@ -90,8 +99,7 @@ class LA7IE(InfoExtractor): class LA7PodcastEpisodeIE(InfoExtractor): IE_NAME = 'la7.it:pod:episode' - _VALID_URL = r'''(?x)(https?://)? 
- (?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)''' + _VALID_URL = r'https?://(?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.la7.it/voicetown/podcast/la-carezza-delle-memoria-di-carlo-verdone-23-03-2021-371497', @@ -125,14 +133,15 @@ class LA7PodcastEpisodeIE(InfoExtractor): webpage, 'video_id', group='vid') media_url = self._search_regex( - (r'src:\s*([\'"])(?P<url>.+?mp3.+?)\1', - r'data-podcast=([\'"])(?P<url>.+?mp3.+?)\1'), + (r'src\s*:\s*([\'"])(?P<url>\S+?mp3.+?)\1', + r'data-podcast\s*=\s*([\'"])(?P<url>\S+?mp3.+?)\1'), webpage, 'media_url', group='url') - ext = determine_ext(media_url) formats = [{ 'url': media_url, - 'format_id': ext, - 'ext': ext, + 'format_id': 'http-mp3', + 'ext': 'mp3', + 'acodec': 'mp3', + 'vcodec': 'none', }] title = self._html_search_regex( @@ -173,7 +182,7 @@ class LA7PodcastEpisodeIE(InfoExtractor): # and title is the same as the show_title # add the date to the title if date and not date_alt and ppn and ppn.lower() == title.lower(): - title += ' del %s' % date + title = f'{title} del {date}' return { 'id': video_id, 'title': title, @@ -193,7 +202,7 @@ class LA7PodcastEpisodeIE(InfoExtractor): class LA7PodcastIE(LA7PodcastEpisodeIE): # XXX: Do not subclass from concrete IE IE_NAME = 'la7.it:podcast' - _VALID_URL = r'(https?://)?(www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])' + _VALID_URL = r'https?://(?:www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])' _TESTS = [{ 'url': 'https://www.la7.it/propagandalive/podcast', @@ -201,7 +210,7 @@ class LA7PodcastIE(LA7PodcastEpisodeIE): # XXX: Do not subclass from concrete I 'id': 'propagandalive', 'title': "Propaganda Live", }, - 'playlist_count': 10, + 'playlist_count_min': 10, }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/lastfm.py b/hypervideo_dl/extractor/lastfm.py index f14198c..6710335 100644 --- a/hypervideo_dl/extractor/lastfm.py +++ b/hypervideo_dl/extractor/lastfm.py @@ -1,33 +1,24 @@ +import itertools import re from .common import InfoExtractor -from ..utils import int_or_none, format_field +from ..utils import int_or_none, parse_qs, traverse_obj class LastFMPlaylistBaseIE(InfoExtractor): def _entries(self, url, playlist_id): - webpage = self._download_webpage(url, playlist_id) - start_page_number = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) or 1 - last_page_number = int_or_none(self._search_regex( - r'>(\d+)</a>[^<]*</li>[^<]*<li[^>]+class="pagination-next', webpage, 'last_page', default=None)) - - for page_number in range(start_page_number, (last_page_number or start_page_number) + 1): + single_page = traverse_obj(parse_qs(url), ('page', -1, {int_or_none})) + for page in itertools.count(single_page or 1): webpage = self._download_webpage( - url, playlist_id, - note='Downloading page %d%s' % (page_number, format_field(last_page_number, None, ' of %d')), - query={'page': page_number}) - page_entries = [ - self.url_result(player_url, 'Youtube') - for player_url in set(re.findall(r'data-youtube-url="([^"]+)"', webpage)) - ] - - for e in page_entries: - yield e + url, playlist_id, f'Downloading page {page}', query={'page': page}) + videos = re.findall(r'data-youtube-url="([^"]+)"', webpage) + yield from videos + if single_page or not videos: + return def _real_extract(self, url): playlist_id = self._match_id(url) - return self.playlist_result(self._entries(url, playlist_id), playlist_id) + return self.playlist_from_matches(self._entries(url, playlist_id), playlist_id, ie='Youtube') class 
LastFMPlaylistIE(LastFMPlaylistBaseIE): @@ -37,7 +28,7 @@ class LastFMPlaylistIE(LastFMPlaylistBaseIE): 'info_dict': { 'id': 'Oasis', }, - 'playlist_count': 11, + 'playlist_mincount': 11, }, { 'url': 'https://www.last.fm/music/Oasis', 'only_matching': True, @@ -73,6 +64,18 @@ class LastFMUserIE(LastFMPlaylistBaseIE): 'id': '12319471', }, 'playlist_count': 30, + }, { + 'url': 'https://www.last.fm/user/naamloos1/playlists/12543760', + 'info_dict': { + 'id': '12543760', + }, + 'playlist_mincount': 80, + }, { + 'url': 'https://www.last.fm/user/naamloos1/playlists/12543760?page=3', + 'info_dict': { + 'id': '12543760', + }, + 'playlist_count': 32, }] diff --git a/hypervideo_dl/extractor/lbry.py b/hypervideo_dl/extractor/lbry.py index b5def1e..9a9f925 100644 --- a/hypervideo_dl/extractor/lbry.py +++ b/hypervideo_dl/extractor/lbry.py @@ -1,18 +1,22 @@ import functools import json +import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_str, compat_urllib_parse_unquote +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, OnDemandPagedList, UnsupportedError, determine_ext, int_or_none, mimetype2ext, parse_qs, + traverse_obj, try_get, + url_or_none, + urlhandle_detect_ext, urljoin, ) @@ -52,38 +56,25 @@ class LBRYBaseIE(InfoExtractor): '/%s:%s' % (claim_name, claim_id)) def _parse_stream(self, stream, url): - stream_value = stream.get('value') or {} - stream_type = stream_value.get('stream_type') - source = stream_value.get('source') or {} - media = stream_value.get(stream_type) or {} - signing_channel = stream.get('signing_channel') or {} - channel_name = signing_channel.get('name') - channel_claim_id = signing_channel.get('claim_id') - channel_url = None - if channel_name and channel_claim_id: - channel_url = self._permanent_url(url, channel_name, channel_claim_id) + stream_type = traverse_obj(stream, ('value', 'stream_type', {str})) + + info = traverse_obj(stream, { + 'title': ('value', 'title', {str}), + 'thumbnail': ('value', 'thumbnail', 'url', {url_or_none}), + 'description': ('value', 'description', {str}), + 'license': ('value', 'license', {str}), + 'timestamp': ('timestamp', {int_or_none}), + 'release_timestamp': ('value', 'release_time', {int_or_none}), + 'tags': ('value', 'tags', ..., {lambda x: x or None}), + 'duration': ('value', stream_type, 'duration', {int_or_none}), + 'channel': ('signing_channel', 'value', 'title', {str}), + 'channel_id': ('signing_channel', 'claim_id', {str}), + }) + + channel_name = traverse_obj(stream, ('signing_channel', 'name', {str})) + if channel_name and info.get('channel_id'): + info['channel_url'] = self._permanent_url(url, channel_name, info['channel_id']) - info = { - 'thumbnail': try_get(stream_value, lambda x: x['thumbnail']['url'], compat_str), - 'description': stream_value.get('description'), - 'license': stream_value.get('license'), - 'timestamp': int_or_none(stream.get('timestamp')), - 'release_timestamp': int_or_none(stream_value.get('release_time')), - 'tags': stream_value.get('tags'), - 'duration': int_or_none(media.get('duration')), - 'channel': try_get(signing_channel, lambda x: x['value']['title']), - 'channel_id': channel_claim_id, - 'channel_url': channel_url, - 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), - 'filesize': int_or_none(source.get('size')), - } - if stream_type == 'audio': - info['vcodec'] = 'none' - else: - info.update({ - 'width': int_or_none(media.get('width')), - 'height': int_or_none(media.get('height')), - }) 
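Aside: the _parse_stream rewrite here is the clearest instance of a pattern that recurs through this patch (iwara, lbry, magellantv): hand-rolled try_get/int_or_none plumbing collapses into one declarative traverse_obj template, where a trailing {type} set filters by isinstance and a {callable} set applies a coercion. A toy run under that assumption (sample data invented; import path follows this tree's layout, mirroring yt-dlp):

    from hypervideo_dl.utils import int_or_none, traverse_obj

    stream = {
        'timestamp': '1677843869',
        'value': {'title': 'Demo clip', 'tags': ['mmd', '', 'dance']},
        'signing_channel': {'claim_id': 'abc123'},
    }
    info = traverse_obj(stream, {
        'title': ('value', 'title', {str}),  # kept only if it is a str
        'timestamp': ('timestamp', {int_or_none}),  # coerced via int_or_none
        'tags': ('value', 'tags', ..., {lambda x: x or None}),  # branch, drop falsy
        'channel_id': ('signing_channel', 'claim_id', {str}),
    })
    # info == {'title': 'Demo clip', 'timestamp': 1677843869,
    #          'tags': ['mmd', 'dance'], 'channel_id': 'abc123'}
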
return info @@ -93,7 +84,7 @@ class LBRYIE(LBRYBaseIE): _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', - 'md5': 'fffd15d76062e9a985c22c7c7f2f4805', + 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', 'info_dict': { 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', 'ext': 'mp4', @@ -142,9 +133,8 @@ class LBRYIE(LBRYBaseIE): 'license': 'None', } }, { - # HLS 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', - 'md5': '25049011f3c8bc2f8b60ad88a031837e', + 'md5': 'c35fac796f62a14274b4dc2addb5d0ba', 'info_dict': { 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', 'ext': 'mp4', @@ -187,6 +177,28 @@ class LBRYIE(LBRYBaseIE): }, 'params': {'skip_download': True} }, { + # original quality format w/higher resolution than HLS formats + 'url': 'https://odysee.com/@wickedtruths:2/Biotechnological-Invasion-of-Skin-(April-2023):4', + 'md5': '305b0b3b369bde1b984961f005b67193', + 'info_dict': { + 'id': '41fbfe805eb73c8d3012c0c49faa0f563274f634', + 'ext': 'mp4', + 'title': 'Biotechnological Invasion of Skin (April 2023)', + 'description': 'md5:709a2f4c07bd8891cda3a7cc2d6fcf5c', + 'channel': 'Wicked Truths', + 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'channel_url': 'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'timestamp': 1685790036, + 'upload_date': '20230603', + 'release_timestamp': 1685617473, + 'release_date': '20230601', + 'duration': 1063, + 'thumbnail': 'https://thumbs.odycdn.com/4e6d39da4df0cfdad45f64e253a15959.webp', + 'tags': ['smart skin surveillance', 'biotechnology invasion of skin', 'morgellons'], + 'license': 'None', + 'protocol': 'https', # test for direct mp4 download + }, + }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, }, { @@ -221,41 +233,65 @@ class LBRYIE(LBRYBaseIE): display_id = display_id.split('/', 2)[-1].replace('/', ':') else: display_id = display_id.replace(':', '#') - display_id = compat_urllib_parse_unquote(display_id) + display_id = urllib.parse.unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') headers = {'Referer': 'https://odysee.com/'} - if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES: + + formats = [] + stream_type = traverse_obj(result, ('value', 'stream_type', {str})) + + if stream_type in self._SUPPORTED_STREAM_TYPES: claim_id, is_live = result['claim_id'], False streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] + + # GET request to v3 API returns original video/audio file if available + direct_url = re.sub(r'/api/v\d+/', '/api/v3/', streaming_url) + urlh = self._request_webpage( + direct_url, display_id, 'Checking for original quality', headers=headers, fatal=False) + if urlh and urlhandle_detect_ext(urlh) != 'm3u8': + formats.append({ + 'url': direct_url, + 'format_id': 'original', + 'quality': 1, + **traverse_obj(result, ('value', { + 'ext': ('source', (('name', {determine_ext}), ('media_type', {mimetype2ext}))), + 'filesize': ('source', 'size', {int_or_none}), + 'width': ('video', 'width', {int_or_none}), + 'height': ('video', 'height', {int_or_none}), + }), get_all=False), + 'vcodec': 'none' if stream_type == 'audio' else None, + }) + + # HEAD request returns redirect response to m3u8 URL if available final_url = self._request_webpage( HEADRequest(streaming_url), display_id, headers=headers, - note='Downloading streaming redirect url info').geturl() + 
note='Downloading streaming redirect url info').url + elif result.get('value_type') == 'stream': claim_id, is_live = result['signing_channel']['claim_id'], True live_data = self._download_json( 'https://api.odysee.live/livestream/is_live', claim_id, query={'channel_claim_id': claim_id}, note='Downloading livestream JSON metadata')['data'] - streaming_url = final_url = live_data.get('VideoURL') + final_url = live_data.get('VideoURL') # Upcoming videos may still give VideoURL if not live_data.get('Live'): - streaming_url = final_url = None + final_url = None self.raise_no_formats('This stream is not live', True, claim_id) + else: raise UnsupportedError(url) - info = self._parse_stream(result, url) if determine_ext(final_url) == 'm3u8': - info['formats'] = self._extract_m3u8_formats( - final_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers) - else: - info['url'] = streaming_url + formats.extend(self._extract_m3u8_formats( + final_url, display_id, 'mp4', m3u8_id='hls', live=is_live, headers=headers)) + return { - **info, + **self._parse_stream(result, url), 'id': claim_id, - 'title': result['value']['title'], + 'formats': formats, 'is_live': is_live, 'http_headers': headers, } @@ -299,14 +335,12 @@ class LBRYChannelIE(LBRYBaseIE): if not (stream_claim_name and stream_claim_id): continue - info = self._parse_stream(item, url) - info.update({ + yield { + **self._parse_stream(item, url), '_type': 'url', 'id': stream_claim_id, - 'title': try_get(item, lambda x: x['value']['title']), 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), - }) - yield info + } def _real_extract(self, url): display_id = self._match_id(url).replace(':', '#') diff --git a/hypervideo_dl/extractor/lecturio.py b/hypervideo_dl/extractor/lecturio.py index 973764c..bb059d3 100644 --- a/hypervideo_dl/extractor/lecturio.py +++ b/hypervideo_dl/extractor/lecturio.py @@ -25,7 +25,7 @@ class LecturioBaseIE(InfoExtractor): self._LOGIN_URL, None, 'Downloading login popup') def is_logged(url_handle): - return self._LOGIN_URL not in url_handle.geturl() + return self._LOGIN_URL not in url_handle.url # Already logged in if is_logged(urlh): diff --git a/hypervideo_dl/extractor/lefigaro.py b/hypervideo_dl/extractor/lefigaro.py new file mode 100644 index 0000000..9465095 --- /dev/null +++ b/hypervideo_dl/extractor/lefigaro.py @@ -0,0 +1,135 @@ +import json +import math + +from .common import InfoExtractor +from ..utils import ( + InAdvancePagedList, + traverse_obj, +) + + +class LeFigaroVideoEmbedIE(InfoExtractor): + _VALID_URL = r'https?://video\.lefigaro\.fr/embed/[^?#]+/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://video.lefigaro.fr/embed/figaro/video/les-francais-ne-veulent-ils-plus-travailler-suivez-en-direct-le-club-le-figaro-idees/', + 'md5': 'e94de44cd80818084352fcf8de1ce82c', + 'info_dict': { + 'id': 'g9j7Eovo', + 'title': 'Les Français ne veulent-ils plus travailler ? 
Retrouvez Le Club Le Figaro Idées', + 'description': 'md5:862b8813148ba4bf10763a65a69dfe41', + 'upload_date': '20230216', + 'timestamp': 1676581615, + 'duration': 3076, + 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)', + 'ext': 'mp4', + }, + }, { + 'url': 'https://video.lefigaro.fr/embed/figaro/video/intelligence-artificielle-faut-il-sen-mefier/', + 'md5': '0b3f10332b812034b3a3eda1ef877c5f', + 'info_dict': { + 'id': 'LeAgybyc', + 'title': 'Intelligence artificielle : faut-il s’en méfier ?', + 'description': 'md5:249d136e3e5934a67c8cb704f8abf4d2', + 'upload_date': '20230124', + 'timestamp': 1674584477, + 'duration': 860, + 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)', + 'ext': 'mp4', + }, + }] + + _WEBPAGE_TESTS = [{ + 'url': 'https://video.lefigaro.fr/figaro/video/suivez-en-direct-le-club-le-figaro-international-avec-philippe-gelie-9/', + 'md5': '3972ddf2d5f8b98699f191687258e2f9', + 'info_dict': { + 'id': 'QChnbPYA', + 'title': 'Où en est le couple franco-allemand ? Retrouvez Le Club Le Figaro International', + 'description': 'md5:6f47235b7e7c93b366fd8ebfa10572ac', + 'upload_date': '20230123', + 'timestamp': 1674503575, + 'duration': 3153, + 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)', + 'age_limit': 0, + 'ext': 'mp4', + }, + }, { + 'url': 'https://video.lefigaro.fr/figaro/video/la-philosophe-nathalie-sarthou-lajus-est-linvitee-du-figaro-live/', + 'md5': '3ac0a0769546ee6be41ab52caea5d9a9', + 'info_dict': { + 'id': 'QJzqoNbf', + 'title': 'La philosophe Nathalie Sarthou-Lajus est l’invitée du Figaro Live', + 'description': 'md5:c586793bb72e726c83aa257f99a8c8c4', + 'upload_date': '20230217', + 'timestamp': 1676661986, + 'duration': 1558, + 'thumbnail': r're:^https?://[^?#]+\.(?:jpeg|jpg)', + 'age_limit': 0, + 'ext': 'mp4', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + player_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['pageData']['playerData'] + + return self.url_result( + f'jwplatform:{player_data["videoId"]}', title=player_data.get('title'), + description=player_data.get('description'), thumbnail=player_data.get('poster')) + + +class LeFigaroVideoSectionIE(InfoExtractor): + _VALID_URL = r'https?://video\.lefigaro\.fr/figaro/(?P<id>[\w-]+)/?(?:[#?]|$)' + + _TESTS = [{ + 'url': 'https://video.lefigaro.fr/figaro/le-club-le-figaro-idees/', + 'info_dict': { + 'id': 'le-club-le-figaro-idees', + 'title': 'Le Club Le Figaro Idées', + }, + 'playlist_mincount': 14, + }, { + 'url': 'https://video.lefigaro.fr/figaro/factu/', + 'info_dict': { + 'id': 'factu', + 'title': 'Factu', + }, + 'playlist_mincount': 519, + }] + + _PAGE_SIZE = 20 + + def _get_api_response(self, display_id, page_num, note=None): + return self._download_json( + 'https://api-graphql.lefigaro.fr/graphql', display_id, note=note, + query={ + 'id': 'flive-website_UpdateListPage_1fb260f996bca2d78960805ac382544186b3225f5bedb43ad08b9b8abef79af6', + 'variables': json.dumps({ + 'slug': display_id, + 'videosLimit': self._PAGE_SIZE, + 'sort': 'DESC', + 'order': 'PUBLISHED_AT', + 'page': page_num, + }).encode(), + }) + + def _real_extract(self, url): + display_id = self._match_id(url) + initial_response = self._get_api_response(display_id, page_num=1)['data']['playlist'] + + def page_func(page_num): + api_response = self._get_api_response(display_id, page_num + 1, note=f'Downloading page {page_num + 1}') + + return [self.url_result( + video['embedUrl'], LeFigaroVideoEmbedIE, **traverse_obj(video, { + 'title': 
'name', + 'description': 'description', + 'thumbnail': 'thumbnailUrl', + })) for video in api_response['data']['playlist']['jsonLd'][0]['itemListElement']] + + entries = InAdvancePagedList( + page_func, math.ceil(initial_response['videoCount'] / self._PAGE_SIZE), self._PAGE_SIZE) + + return self.playlist_result(entries, playlist_id=display_id, playlist_title=initial_response.get('title')) diff --git a/hypervideo_dl/extractor/lego.py b/hypervideo_dl/extractor/lego.py index 811b447..46fc7a9 100644 --- a/hypervideo_dl/extractor/lego.py +++ b/hypervideo_dl/extractor/lego.py @@ -1,7 +1,7 @@ import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -75,7 +75,7 @@ class LEGOIE(InfoExtractor): 'videoId': '%s_%s' % (uuid.UUID(video_id), locale), }, headers=self.geo_verification_headers()) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 451: + if isinstance(e.cause, HTTPError) and e.cause.status == 451: self.raise_geo_restricted(countries=countries) raise diff --git a/hypervideo_dl/extractor/limelight.py b/hypervideo_dl/extractor/limelight.py index e11ec43..4e50f10 100644 --- a/hypervideo_dl/extractor/limelight.py +++ b/hypervideo_dl/extractor/limelight.py @@ -1,7 +1,7 @@ import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, float_or_none, @@ -69,8 +69,8 @@ class LimelightBaseIE(InfoExtractor): item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal, headers=headers) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission'] + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + error = self._parse_json(e.cause.response.read().decode(), item_id)['detail']['contentAccessPermission'] if error == 'CountryDisabled': self.raise_geo_restricted() raise ExtractorError(error, expected=True) diff --git a/hypervideo_dl/extractor/linuxacademy.py b/hypervideo_dl/extractor/linuxacademy.py index a570248..0b16442 100644 --- a/hypervideo_dl/extractor/linuxacademy.py +++ b/hypervideo_dl/extractor/linuxacademy.py @@ -2,11 +2,8 @@ import json import random from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_HTTPError, - compat_str, -) +from ..compat import compat_b64decode, compat_str +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, ExtractorError, @@ -75,9 +72,8 @@ class LinuxAcademyIE(InfoExtractor): def _perform_login(self, username, password): def random_string(): - return ''.join([ - random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') - for _ in range(32)]) + return ''.join(random.choices( + '0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~', k=32)) webpage, urlh = self._download_webpage_handle( self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ @@ -108,7 +104,7 @@ class LinuxAcademyIE(InfoExtractor): 'sso': 'true', }) - login_state_url = urlh.geturl() + login_state_url = urlh.url try: login_page = self._download_webpage( @@ -120,8 +116,8 @@ class LinuxAcademyIE(InfoExtractor): 'Referer': login_state_url, }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read(), None) + 
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read(), None) + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + error = self._parse_json(e.cause.response.read(), None) message = error.get('description') or error['code'] raise ExtractorError( '%s said: %s' % (self.IE_NAME, message), expected=True) @@ -138,7 +134,7 @@ class LinuxAcademyIE(InfoExtractor): }) access_token = self._search_regex( - r'access_token=([^=&]+)', urlh.geturl(), + r'access_token=([^=&]+)', urlh.url, 'access token', default=None) if not access_token: access_token = self._parse_json( diff --git a/hypervideo_dl/extractor/litv.py b/hypervideo_dl/extractor/litv.py index 31826ac..19b298e 100644 --- a/hypervideo_dl/extractor/litv.py +++ b/hypervideo_dl/extractor/litv.py @@ -4,8 +4,8 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - traverse_obj, smuggle_url, + traverse_obj, unsmuggle_url, ) @@ -113,7 +113,7 @@ class LiTVIE(InfoExtractor): entry_protocol='m3u8_native', m3u8_id='hls') for a_format in formats: # LiTV HLS segments don't like compression - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + a_format.setdefault('http_headers', {})['Accept-Encoding'] = 'identity' title = program_info['title'] + program_info.get('secondaryMark', '') description = program_info.get('description') diff --git a/hypervideo_dl/extractor/livestream.py b/hypervideo_dl/extractor/livestream.py index d883eaf..a05a0fa 100644 --- a/hypervideo_dl/extractor/livestream.py +++ b/hypervideo_dl/extractor/livestream.py @@ -1,33 +1,36 @@ -import re import itertools +import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str, compat_urlparse from ..utils import ( + determine_ext, find_xpath_attr, - xpath_attr, - xpath_with_ns, - xpath_text, - orderedSet, - update_url_query, - int_or_none, float_or_none, + int_or_none, + orderedSet, parse_iso8601, - determine_ext, + traverse_obj, + update_url_query, + xpath_attr, + xpath_text, + xpath_with_ns, ) class LivestreamIE(InfoExtractor): IE_NAME = 'livestream' - _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?' + _VALID_URL = r'''(?x) + https?://(?:new\.)?livestream\.com/ + (?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+)) + (?:/events/(?P<event_id>\d+)|/(?P<event_name>[^/]+))? + (?:/videos/(?P<id>\d+))? 
+ ''' _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"'] _TESTS = [{ 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', - 'md5': '53274c76ba7754fb0e8d072716f2292b', + 'md5': '7876c5f5dc3e711b6b73acce4aac1527', 'info_dict': { 'id': '4719370', 'ext': 'mp4', @@ -37,22 +40,37 @@ class LivestreamIE(InfoExtractor): 'duration': 5968.0, 'like_count': int, 'view_count': int, + 'comment_count': int, 'thumbnail': r're:^http://.*\.jpg$' } }, { - 'url': 'http://new.livestream.com/tedx/cityenglish', + 'url': 'https://livestream.com/coheedandcambria/websterhall', 'info_dict': { - 'title': 'TEDCity2.0 (English)', - 'id': '2245590', + 'id': '1585861', + 'title': 'Live From Webster Hall' }, - 'playlist_mincount': 4, + 'playlist_mincount': 1, }, { - 'url': 'http://new.livestream.com/chess24/tatasteelchess', + 'url': 'https://livestream.com/dayananda/events/7954027', 'info_dict': { - 'title': 'Tata Steel Chess', - 'id': '3705884', + 'title': 'Live from Mevo', + 'id': '7954027', }, - 'playlist_mincount': 60, + 'playlist_mincount': 4, + }, { + 'url': 'https://livestream.com/accounts/82', + 'info_dict': { + 'id': '253978', + 'view_count': int, + 'title': 'trsr', + 'comment_count': int, + 'like_count': int, + 'upload_date': '20120306', + 'timestamp': 1331042383, + 'thumbnail': 'http://img.new.livestream.com/videos/0000000000000372/cacbeed6-fb68-4b5e-ad9c-e148124e68a9_640x427.jpg', + 'duration': 15.332, + 'ext': 'mp4' + } }, { 'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640', 'only_matching': True, @@ -62,7 +80,8 @@ class LivestreamIE(InfoExtractor): }] _API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s' - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base_ele = find_xpath_attr( smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/' @@ -86,7 +105,7 @@ class LivestreamIE(InfoExtractor): 'tbr': tbr, 'preference': -1000, # Strictly inferior to all other formats? 
}) - return formats + return formats, {} def _extract_video_info(self, video_data): video_id = compat_str(video_data['id']) @@ -179,7 +198,7 @@ class LivestreamIE(InfoExtractor): 'is_live': is_live, } - def _extract_event(self, event_data): + def _generate_event_playlist(self, event_data): event_id = compat_str(event_data['id']) account_id = compat_str(event_data['owner_account_id']) feed_root_url = self._API_URL_TEMPLATE % (account_id, event_id) + '/feed.json' @@ -189,7 +208,6 @@ class LivestreamIE(InfoExtractor): return self._extract_stream_info(stream_info) last_video = None - entries = [] for i in itertools.count(1): if last_video is None: info_url = feed_root_url @@ -197,31 +215,38 @@ class LivestreamIE(InfoExtractor): info_url = '{root}?&id={id}&newer=-1&type=video'.format( root=feed_root_url, id=last_video) videos_info = self._download_json( - info_url, event_id, 'Downloading page {0}'.format(i))['data'] + info_url, event_id, f'Downloading page {i}')['data'] videos_info = [v['data'] for v in videos_info if v['type'] == 'video'] if not videos_info: break for v in videos_info: v_id = compat_str(v['id']) - entries.append(self.url_result( - 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id), - 'Livestream', v_id, v.get('caption'))) + yield self.url_result( + f'http://livestream.com/accounts/{account_id}/events/{event_id}/videos/{v_id}', + LivestreamIE, v_id, v.get('caption')) last_video = videos_info[-1]['id'] - return self.playlist_result(entries, event_id, event_data['full_name']) def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') event = mobj.group('event_id') or mobj.group('event_name') account = mobj.group('account_id') or mobj.group('account_name') - api_url = self._API_URL_TEMPLATE % (account, event) + api_url = f'http://livestream.com/api/accounts/{account}' + if video_id: video_data = self._download_json( - api_url + '/videos/%s' % video_id, video_id) + f'{api_url}/events/{event}/videos/{video_id}', video_id) return self._extract_video_info(video_data) - else: - event_data = self._download_json(api_url, video_id) - return self._extract_event(event_data) + elif event: + event_data = self._download_json(f'{api_url}/events/{event}', None) + return self.playlist_result( + self._generate_event_playlist(event_data), str(event_data['id']), event_data['full_name']) + + account_data = self._download_json(api_url, None) + items = traverse_obj(account_data, (('upcoming_events', 'past_events'), 'data', ...)) + return self.playlist_result( + itertools.chain.from_iterable(map(self._generate_event_playlist, items)), + account_data.get('id'), account_data.get('full_name')) # The original version of Livestream uses a different system diff --git a/hypervideo_dl/extractor/lumni.py b/hypervideo_dl/extractor/lumni.py new file mode 100644 index 0000000..5810da0 --- /dev/null +++ b/hypervideo_dl/extractor/lumni.py @@ -0,0 +1,24 @@ +from .common import InfoExtractor +from .francetv import FranceTVIE + + +class LumniIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lumni\.fr/video/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.lumni.fr/video/l-homme-et-son-environnement-dans-la-revolution-industrielle', + 'md5': '960e8240c4f2c7a20854503a71e52f5e', + 'info_dict': { + 'id': 'd2b9a4e5-a526-495b-866c-ab72737e3645', + 'ext': 'mp4', + 'title': "L'homme et son environnement dans la révolution industrielle - L'ère de l'homme", + 'thumbnail': 
+            'duration': 230,
+        }
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._html_search_regex(
+            r'<div[^>]+data-factoryid\s*=\s*["\']([^"\']+)', webpage, 'video id')
+        return self.url_result(f'francetv:{video_id}', FranceTVIE, video_id)
diff --git a/hypervideo_dl/extractor/magellantv.py b/hypervideo_dl/extractor/magellantv.py
new file mode 100644
index 0000000..0947a45
--- /dev/null
+++ b/hypervideo_dl/extractor/magellantv.py
@@ -0,0 +1,50 @@
+from .common import InfoExtractor
+from ..utils import parse_age_limit, parse_duration, traverse_obj
+
+
+class MagellanTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?magellantv\.com/(?:watch|video)/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://www.magellantv.com/watch/my-dads-on-death-row?type=v',
+        'info_dict': {
+            'id': 'my-dads-on-death-row',
+            'ext': 'mp4',
+            'title': 'My Dad\'s On Death Row',
+            'description': 'md5:33ba23b9f0651fc4537ed19b1d5b0d7a',
+            'duration': 3780.0,
+            'age_limit': 14,
+            'tags': ['Justice', 'Reality', 'United States', 'True Crime'],
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.magellantv.com/video/james-bulger-the-new-revelations',
+        'info_dict': {
+            'id': 'james-bulger-the-new-revelations',
+            'ext': 'mp4',
+            'title': 'James Bulger: The New Revelations',
+            'description': 'md5:7b97922038bad1d0fe8d0470d8a189f2',
+            'duration': 2640.0,
+            'age_limit': 0,
+            'tags': ['Investigation', 'True Crime', 'Justice', 'Europe'],
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['reactContext']['video']['detail']
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(data['jwpVideoUrl'], video_id)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            **traverse_obj(data, {
+                'title': ('title', {str}),
+                'description': ('metadata', 'description', {str}),
+                'duration': ('duration', {parse_duration}),
+                'age_limit': ('ratingCategory', {parse_age_limit}),
+                'tags': ('tags', ..., {str}),
+            }),
+        }
diff --git a/hypervideo_dl/extractor/mailru.py b/hypervideo_dl/extractor/mailru.py
index 387d211..0f0550c 100644
--- a/hypervideo_dl/extractor/mailru.py
+++ b/hypervideo_dl/extractor/mailru.py
@@ -1,6 +1,7 @@
 import itertools
 import json
 import re
+import urllib.parse
 
 from .common import InfoExtractor
 from ..compat import compat_urllib_parse_unquote
@@ -140,17 +141,15 @@ class MailRuIE(InfoExtractor):
             'http://api.video.mail.ru/videos/%s.json?new=1' % video_id,
             video_id, 'Downloading video JSON')
 
-        headers = {}
         video_key = self._get_cookies('https://my.mail.ru').get('video_key')
-        if video_key:
-            headers['Cookie'] = 'video_key=%s' % video_key.value
 
         formats = []
         for f in video_data['videos']:
             video_url = f.get('url')
             if not video_url:
                 continue
+            if video_key:
+                self._set_cookie(urllib.parse.urlparse(video_url).hostname, 'video_key', video_key.value)
             format_id = f.get('key')
             height = int_or_none(self._search_regex(
                 r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None
@@ -158,7 +157,6 @@ class MailRuIE(InfoExtractor):
                 'url': video_url,
                 'format_id': format_id,
                 'height': height,
-                'http_headers': headers,
             })
 
         meta_data = video_data['meta']
diff --git a/hypervideo_dl/extractor/medaltv.py b/hypervideo_dl/extractor/medaltv.py
index 82be823..9e57ee2 100644
--- a/hypervideo_dl/extractor/medaltv.py
+++ b/hypervideo_dl/extractor/medaltv.py
@@ -8,12 +8,12 @@ from ..utils import (
     float_or_none,
     int_or_none,
     str_or_none,
-    traverse_obj,
+    traverse_obj
 )
 
 
 class MedalTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?medal\.tv/(?P<path>games/[^/?#&]+/clips)/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?medal\.tv/games/[^/?#&]+/clips/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'https://medal.tv/games/valorant/clips/jTBFnLKdLy15K',
         'md5': '6930f8972914b6b9fdc2bb3918098ba0',
@@ -80,25 +80,14 @@ class MedalTVIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        path = self._match_valid_url(url).group('path')
         webpage = self._download_webpage(url, video_id)
 
-        next_data = self._search_json(
-            '<script[^>]*__NEXT_DATA__[^>]*>', webpage,
+        hydration_data = self._search_json(
+            r'<script[^>]*>[^<]*\bhydrationData\s*=', webpage,
             'next data', video_id, end_pattern='</script>', fatal=False)
 
-        build_id = next_data.get('buildId')
-        if not build_id:
-            raise ExtractorError(
-                'Could not find build ID.', video_id=video_id)
-
-        locale = next_data.get('locale', 'en')
-
-        api_response = self._download_json(
-            f'https://medal.tv/_next/data/{build_id}/{locale}/{path}/{video_id}.json', video_id)
-
-        clip = traverse_obj(api_response, ('pageProps', 'clip')) or {}
+        clip = traverse_obj(hydration_data, ('clips', ...), get_all=False)
         if not clip:
             raise ExtractorError(
                 'Could not find video information.', video_id=video_id)
@@ -152,7 +141,7 @@ class MedalTVIE(InfoExtractor):
         # Necessary because the id of the author is not known in advance.
         # Won't raise an issue if no profile can be found as this is optional.
-        author = traverse_obj(api_response, ('pageProps', 'profile')) or {}
+        author = traverse_obj(hydration_data, ('profiles', ...), get_all=False) or {}
         author_id = str_or_none(author.get('userId'))
         author_url = format_field(author_id, None, 'https://medal.tv/users/%s')
diff --git a/hypervideo_dl/extractor/mediaite.py b/hypervideo_dl/extractor/mediaite.py
index 0f9079b..ab25392 100644
--- a/hypervideo_dl/extractor/mediaite.py
+++ b/hypervideo_dl/extractor/mediaite.py
@@ -81,10 +81,24 @@ class MediaiteIE(InfoExtractor):
             'upload_date': '20210930',
         },
         'params': {'skip_download': True}
+    }, {
+        'url': 'https://www.mediaite.com/politics/i-cant-read-it-fast-enough-while-defending-trump-larry-kudlow-overwhelmed-by-volume-of-ex-presidents-legal-troubles/',
+        'info_dict': {
+            'id': 'E6EhDX5z',
+            'ext': 'mp4',
+            'title': 'Fox Business Network - 4:00 PM - 5:00 PM - 1:39:42 pm - 1:42:20 pm',
+            'description': '',
+            'thumbnail': 'https://cdn.jwplayer.com/v2/media/E6EhDX5z/poster.jpg?width=720',
+            'duration': 157,
+            'timestamp': 1691015535,
+            'upload_date': '20230802',
+        },
+        'params': {'skip_download': True}
     }]
 
     def _real_extract(self, url):
         webpage = self._download_webpage(url, None)
-        id = self._search_regex(r'data-video-id\s?=\s?\"([^\"]+)\"', webpage, 'id')
-        data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{id}', id)
+        video_id = self._search_regex(
+            [r'"https://cdn\.jwplayer\.com/players/(\w+)', r'data-video-id\s*=\s*\"([^\"]+)\"'], webpage, 'id')
+        data_json = self._download_json(f'https://cdn.jwplayer.com/v2/media/{video_id}', video_id)
         return self._parse_jwplayer_data(data_json)
diff --git a/hypervideo_dl/extractor/mediaset.py b/hypervideo_dl/extractor/mediaset.py
index 61bdb2a..e3b728d 100644
--- a/hypervideo_dl/extractor/mediaset.py
+++ b/hypervideo_dl/extractor/mediaset.py
@@ -7,7 +7,6 @@ from ..utils import (
     GeoRestrictedError,
     int_or_none,
     OnDemandPagedList,
-    parse_qs,
     try_get,
     urljoin,
     update_url_query,
@@ -16,20 +15,25 @@ class MediasetIE(ThePlatformBaseIE):
     _TP_TLD = 'eu'
-    _VALID_URL = r'''(?x)
+    _GUID_RE = r'F[0-9A-Z]{15}'
+    _VALID_URL = rf'''(?x)
                     (?:
                         mediaset:|
                         https?://
                             (?:\w+\.)+mediaset\.it/
                             (?:
                                 (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_|
-                                player/(?:v\d+/)?index\.html\?.*?\bprogramGuid=
+                                player/(?:v\d+/)?index\.html\?\S*?\bprogramGuid=
                             )
-                    )(?P<id>[0-9A-Z]{16,})
+                    )(?P<id>{_GUID_RE})
                     '''
+
+    _EMBED_REGEX = [
+        rf'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//(?:\w+\.)+mediaset\.it/player/(?:v\d+/)?index\.html\?\S*?programGuid={_GUID_RE})[\'"&]'
+    ]
     _TESTS = [{
         # full episode
-        'url': 'https://www.mediasetplay.mediaset.it/video/mrwronglezionidamore/episodio-1_F310575103000102',
+        'url': 'https://mediasetinfinity.mediaset.it/video/mrwronglezionidamore/episodio-1_F310575103000102',
         'md5': 'a7e75c6384871f322adb781d3bd72c26',
         'info_dict': {
             'id': 'F310575103000102',
@@ -50,7 +54,7 @@ class MediasetIE(ThePlatformBaseIE):
             'chapters': [{'start_time': 0.0, 'end_time': 439.88}, {'start_time': 439.88, 'end_time': 1685.84}, {'start_time': 1685.84, 'end_time': 2682.0}],
         },
     }, {
-        'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
+        'url': 'https://mediasetinfinity.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
         'md5': '1276f966ac423d16ba255ce867de073e',
         'info_dict': {
             'id': 'F309013801000501',
@@ -71,51 +75,8 @@ class MediasetIE(ThePlatformBaseIE):
             'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}],
         },
     }, {
-        'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-69-pezzo-di-luna_F303843101017801',
-        'md5': 'd1650ac9ff944f185556126a736df148',
-        'info_dict': {
-            'id': 'F303843101017801',
-            'ext': 'mp4',
-            'title': 'Episodio 69 - Pezzo di luna',
-            'description': 'md5:7c32c8ec4118b72588b9412f11353f73',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 263.008,
-            'upload_date': '20200902',
-            'series': 'Camera Café 5',
-            'timestamp': 1599064700,
-            'uploader': 'Italia 1',
-            'uploader_id': 'I1',
-            'season': 'Season 5',
-            'episode': 'Episode 178',
-            'season_number': 5,
-            'episode_number': 178,
-            'chapters': [{'start_time': 0.0, 'end_time': 261.88}, {'start_time': 261.88, 'end_time': 263.008}],
-        },
-    }, {
-        'url': 'https://www.mediasetplay.mediaset.it/video/cameracafe5/episodio-51-tu-chi-sei_F303843107000601',
-        'md5': '567e9ad375b7a27a0e370650f572a1e3',
-        'info_dict': {
-            'id': 'F303843107000601',
-            'ext': 'mp4',
-            'title': 'Episodio 51 - Tu chi sei?',
-            'description': 'md5:42ef006e56824cc31787a547590923f4',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 367.021,
-            'upload_date': '20200902',
-            'series': 'Camera Café 5',
-            'timestamp': 1599069817,
-            'uploader': 'Italia 1',
-            'uploader_id': 'I1',
-            'season': 'Season 5',
-            'episode': 'Episode 6',
-            'season_number': 5,
-            'episode_number': 6,
-            'chapters': [{'start_time': 0.0, 'end_time': 358.68}, {'start_time': 358.68, 'end_time': 367.021}],
-        },
-    }, {
-        # movie
-        'url': 'https://www.mediasetplay.mediaset.it/movie/selvaggi/selvaggi_F006474501000101',
-        'md5': '720440187a2ae26af8148eb9e6b901ed',
+        # DRM
+        'url': 'https://mediasetinfinity.mediaset.it/movie/selvaggi/selvaggi_F006474501000101',
         'info_dict': {
             'id': 'F006474501000101',
             'ext': 'mp4',
@@ -129,75 +90,76 @@ class MediasetIE(ThePlatformBaseIE):
             'uploader_id': 'B6',
             'chapters': [{'start_time': 0.0, 'end_time': 1938.56}, {'start_time': 1938.56, 'end_time': 5233.01}],
         },
+        'params': {
+            'ignore_no_formats_error': True,
+        },
+        'expected_warnings': [
+            'None of the available releases match the specified AssetType, ProtectionScheme, and/or Format preferences',
+            'Content behind paywall and DRM',
+        ],
+        'skip': True,
     }, {
-        # clip
-        'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680',
+        # old domain
+        'url': 'https://www.mediasetplay.mediaset.it/video/mrwronglezionidamore/episodio-1_F310575103000102',
         'only_matching': True,
     }, {
-        # iframe simple
+        # iframe
         'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924',
         'only_matching': True,
     }, {
-        # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
-        'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104',
-        'only_matching': True,
-    }, {
-        # embedUrl (from https://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/)
-        'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323&autoplay=true&purl=http://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/',
-        'only_matching': True,
-    }, {
         'url': 'mediaset:FAFU000000665924',
         'only_matching': True,
+    }]
+    _WEBPAGE_TESTS = [{
+        # Mediaset embed
+        'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
+        'info_dict': {
+            'id': 'FD00000000004929',
+            'ext': 'mp4',
+            'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
+            'duration': 67.013,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Mediaset Play',
+            'uploader_id': 'QY',
+            'upload_date': '20201005',
+            'timestamp': 1601866168,
+            'chapters': [],
+        },
+        'params': {
+            'skip_download': True,
+        }
    }, {
-        'url': 'https://www.mediasetplay.mediaset.it/video/mediasethaacuoreilfuturo/palmieri-alicudi-lisola-dei-tre-bambini-felici--un-decreto-per-alicudi-e-tutte-le-microscuole_FD00000000102295',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.mediasetplay.mediaset.it/video/cherryseason/anticipazioni-degli-episodi-del-23-ottobre_F306837101005C02',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.mediasetplay.mediaset.it/video/tg5/ambiente-onda-umana-per-salvare-il-pianeta_F309453601079D01',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102',
-        'only_matching': True,
-    }, {
-        'url': 'https://mediasetinfinity.mediaset.it/video/braveandbeautiful/episodio-113_F310948005000402',
-        'only_matching': True,
-    }, {
-        'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323',
-        'only_matching': True,
+        # WittyTV embed
+        'url': 'https://www.wittytv.it/mauriziocostanzoshow/ultima-puntata-venerdi-25-novembre/',
+        'info_dict': {
+            'id': 'F312172801000801',
+            'ext': 'mp4',
+            'title': 'Ultima puntata - Venerdì 25 novembre',
+            'description': 'Una serata all\'insegna della musica e del buonumore ma non priva di spunti di riflessione',
+            'duration': 6203.01,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Canale 5',
+            'uploader_id': 'C5',
+            'upload_date': '20221126',
+            'timestamp': 1669428689,
+            'chapters': list,
+            'series': 'Maurizio Costanzo Show',
+            'season': 'Season 12',
+            'season_number': 12,
+            'episode': 'Episode 8',
+            'episode_number': 8,
+        },
+        'params': {
+            'skip_download': True,
+        }
     }]
 
-    def _extract_from_webpage(self, url, webpage):
-        def _program_guid(qs):
-            return qs.get('programGuid', [None])[0]
-
-        for mobj in re.finditer(
-                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml.*?)\1',
-                webpage):
-            embed_url = mobj.group('url')
-            embed_qs = parse_qs(embed_url)
-            program_guid = _program_guid(embed_qs)
-            if program_guid:
-                yield self.url_result(embed_url)
-                continue
-
-            video_id = embed_qs.get('id', [None])[0]
-            if not video_id:
-                continue
-            urlh = self._request_webpage(embed_url, video_id, note='Following embed URL redirect')
-            embed_url = urlh.geturl()
-            program_guid = _program_guid(parse_qs(embed_url))
-            if program_guid:
-                yield self.url_result(embed_url)
-
-    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+    def _parse_smil_formats_and_subtitles(
+            self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
         for video in smil.findall(self._xpath_ns('.//video', namespace)):
             video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?.mpd)\?.+', r'\1\2', video.attrib['src'])
-        return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)
+        return super(MediasetIE, self)._parse_smil_formats_and_subtitles(
+            smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)
 
     def _check_drm_formats(self, tp_formats, video_id):
         has_nondrm, drm_manifest = False, ''
@@ -217,7 +179,7 @@ class MediasetIE(ThePlatformBaseIE):
 
     def _real_extract(self, url):
         guid = self._match_id(url)
-        tp_path = 'PR1GhC/media/guid/2702976343/' + guid
+        tp_path = f'PR1GhC/media/guid/2702976343/{guid}'
         info = self._extract_theplatform_metadata(tp_path, guid)
 
         formats = []
@@ -225,15 +187,17 @@ class MediasetIE(ThePlatformBaseIE):
         first_e = geo_e = None
         asset_type = 'geoNo:HD,browser,geoIT|geoNo:HD,geoIT|geoNo:SD,browser,geoIT|geoNo:SD,geoIT|geoNo|HD|SD'
         # TODO: fixup ISM+none manifest URLs
-        for f in ('MPEG4', 'M3U'):
+        for f in ('MPEG4', 'MPEG-DASH', 'M3U'):
             try:
                 tp_formats, tp_subtitles = self._extract_theplatform_smil(
-                    update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
+                    update_url_query(f'http://link.theplatform.{self._TP_TLD}/s/{tp_path}', {
                         'mbr': 'true',
                         'formats': f,
                         'assetTypes': asset_type,
-                    }), guid, 'Downloading %s SMIL data' % (f.split('+')[0]))
+                    }), guid, f'Downloading {f.split("+")[0]} SMIL data')
             except ExtractorError as e:
+                if e.orig_msg == 'None of the available releases match the specified AssetType, ProtectionScheme, and/or Format preferences':
+                    e.orig_msg = 'This video is DRM protected'
                 if not geo_e and isinstance(e, GeoRestrictedError):
                     geo_e = e
                 if not first_e:
@@ -248,7 +212,7 @@ class MediasetIE(ThePlatformBaseIE):
             raise geo_e or first_e
 
         feed_data = self._download_json(
-            'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2/guid/-/' + guid,
+            f'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2/guid/-/{guid}',
             guid, fatal=False)
         if feed_data:
             publish_info = feed_data.get('mediasetprogram$publishInfo') or {}
@@ -299,23 +263,23 @@ class MediasetShowIE(MediasetIE):  # XXX: Do not subclass from concrete IE
                     '''
     _TESTS = [{
         # TV Show webpage (general webpage)
-        'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061',
+        'url': 'https://mediasetinfinity.mediaset.it/programmi-tv/leiene/leiene_SE000000000061',
         'info_dict': {
             'id': '000000000061',
-            'title': 'Le Iene',
+            'title': 'Le Iene 2022/2023',
         },
-        'playlist_mincount': 7,
+        'playlist_mincount': 6,
     }, {
         # TV Show webpage (specific season)
-        'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763',
+        'url': 'https://mediasetinfinity.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763',
         'info_dict': {
             'id': '000000002763',
-            'title': 'Le Iene',
+            'title': 'Le Iene 2021/2022',
        },
         'playlist_mincount': 7,
     }, {
         # TV Show specific playlist (with multiple pages)
-        'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375',
+        'url': 'https://mediasetinfinity.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375',
         'info_dict': {
             'id': '100013375',
             'title': 'I servizi',
@@ -340,10 +304,9 @@ class MediasetShowIE(MediasetIE):  # XXX: Do not subclass from concrete IE
         playlist_id, st, sb = self._match_valid_url(url).group('id', 'st', 'sb')
         if not sb:
             page = self._download_webpage(url, st or playlist_id)
-            entries = [self.url_result(urljoin('https://www.mediasetplay.mediaset.it', url))
+            entries = [self.url_result(urljoin('https://mediasetinfinity.mediaset.it', url))
                        for url in re.findall(r'href="([^<>=]+SE\d{12},ST\d{12},sb\d{9})">[^<]+<', page)]
-            title = (self._html_search_regex(r'(?s)<h1[^>]*>(.+?)</h1>', page, 'title', default=None)
-                     or self._og_search_title(page))
+            title = self._html_extract_title(page).split('|')[0].strip()
             return self.playlist_result(entries, st or playlist_id, title)
 
         entries = OnDemandPagedList(
diff --git a/hypervideo_dl/extractor/mediasite.py b/hypervideo_dl/extractor/mediasite.py
index fe549c4..7ea78ab 100644
--- a/hypervideo_dl/extractor/mediasite.py
+++ b/hypervideo_dl/extractor/mediasite.py
@@ -171,7 +171,7 @@ class MediasiteIE(InfoExtractor):
         query = mobj.group('query')
 
         webpage, urlh = self._download_webpage_handle(url, resource_id)  # XXX: add UrlReferrer?
-        redirect_url = urlh.geturl()
+        redirect_url = urlh.url
 
         # XXX: might have also extracted UrlReferrer and QueryString from the html
         service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex(
diff --git a/hypervideo_dl/extractor/mediastream.py b/hypervideo_dl/extractor/mediastream.py
new file mode 100644
index 0000000..cef769f
--- /dev/null
+++ b/hypervideo_dl/extractor/mediastream.py
@@ -0,0 +1,208 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    remove_end,
+    traverse_obj,
+    urljoin,
+)
+
+
+class MediaStreamBaseIE(InfoExtractor):
+    _EMBED_BASE_URL = 'https://mdstrm.com/embed'
+    _BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)'
+
+    def _extract_mediastream_urls(self, webpage):
+        yield from traverse_obj(list(self._yield_json_ld(webpage, None)), (
+            lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'),
+            {lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None}))
+
+        for mobj in re.finditer(r'<script[^>]+>[^>]*playerMdStream\.mdstreamVideo\(\s*[\'"](?P<video_id>\w+)', webpage):
+            yield f'{self._EMBED_BASE_URL}/{mobj.group("video_id")}'
+
+        yield from re.findall(
+            rf'<iframe[^>]+\bsrc="({self._BASE_URL_RE}/\w+)', webpage)
+
+        for mobj in re.finditer(
+                r'''(?x)
+                    <(?:div|ps-mediastream)[^>]+
+                    (class="[^"]*MediaStreamVideoPlayer)[^"]*"[^>]+
+                    data-video-id="(?P<video_id>\w+)"
+                    (?:\s*data-video-type="(?P<video_type>[^"]+))?
+                    (?:[^>]*>\s*<div[^>]+\1[^"]*"[^>]+data-mediastream=["\'][^>]+
+                        https://mdstrm\.com/(?P<live>live-stream))?
+                ''', webpage):
+
+            video_type = 'live-stream' if mobj.group('video_type') == 'live' or mobj.group('live') else 'embed'
+            yield f'https://mdstrm.com/{video_type}/{mobj.group("video_id")}'
+
+
+class MediaStreamIE(MediaStreamBaseIE):
+    _VALID_URL = MediaStreamBaseIE._BASE_URL_RE + r'/(?P<id>\w+)'
+
+    _TESTS = [{
+        'url': 'https://mdstrm.com/embed/6318e3f1d1d316083ae48831',
+        'md5': '97b4f2634b8e8612cc574dfcd504df05',
+        'info_dict': {
+            'id': '6318e3f1d1d316083ae48831',
+            'title': 'Video: Así fue el despido de Thomas Tuchel del Chelsea',
+            'description': 'md5:358ce1e1396010d50a1ece1be3633c95',
+            'thumbnail': r're:^https?://[^?#]+6318e3f1d1d316083ae48831',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    _WEBPAGE_TESTS = [{
+        'url': 'https://www.multimedios.com/video/costa-rica-tv-en-vivo/v2616',
+        'info_dict': {
+            'id': '5a7b1e63a8da282c34d65445',
+            'title': 're:mmtv-costarica',
+            'description': 'mmtv-costarica',
+            'thumbnail': 're:^https?://[^?#]+5a7b1e63a8da282c34d65445',
+            'ext': 'mp4',
+            'live_status': 'is_live',
+        },
+        'params': {'skip_download': 'Livestream'},
+    }, {
+        'url': 'https://www.multimedios.com/television/clases-de-llaves-y-castigos-quien-sabe-mas',
+        'md5': 'de31f0b1ecc321fb35bf22d58734ea40',
+        'info_dict': {
+            'id': '63731bab8ec9b308a2c9ed28',
+            'title': 'Clases de llaves y castigos ¿Quién sabe más?',
+            'description': 'md5:1b49aa1ee5a4b32fbd66104b2d629e9d',
+            'thumbnail': 're:^https?://[^?#]+63731bab8ec9b308a2c9ed28',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.americatv.com.pe/videos/esto-es-guerra/facundo-gonzalez-sufrio-fuerte-golpe-durante-competencia-frente-hugo-garcia-eeg-noticia-139120',
+        'info_dict': {
+            'id': '63756df1c638b008a5659dec',
+            'title': 'Facundo González sufrió fuerte golpe durante competencia frente a Hugo García en EEG',
+            'description': 'md5:9490c034264afd756eef7b2c3adee69e',
+            'thumbnail': 're:^https?://[^?#]+63756df1c638b008a5659dec',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.americatv.com.pe/videos/al-fondo-hay-sitio/nuevas-lomas-town-bernardo-mata-se-enfrento-sujeto-luchar-amor-macarena-noticia-139083',
+        'info_dict': {
+            'id': '637307669609130f74cd3a6e',
+            'title': 'Las Nuevas Lomas Town: Bernardo De La Mata se enfrentó a sujeto para luchar por el amor de Macarena',
+            'description': 'md5:60d71772f1e1496923539ae58aa17124',
+            'thumbnail': 're:^https?://[^?#]+637307669609130f74cd3a6e',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    def _extract_from_webpage(self, url, webpage):
+        for embed_url in self._extract_mediastream_urls(webpage):
+            yield self.url_result(embed_url, MediaStreamIE, None)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        if 'Debido a tu ubicación no puedes ver el contenido' in webpage:
+            self.raise_geo_restricted()
+
+        player_config = self._search_json(r'window\.MDSTRM\.OPTIONS\s*=', webpage, 'metadata', video_id)
+
+        formats, subtitles = [], {}
+        for video_format in player_config['src']:
+            if video_format == 'hls':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(player_config['src'][video_format], video_id)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
+            elif video_format == 'mpd':
+                fmts, subs = self._extract_mpd_formats_and_subtitles(player_config['src'][video_format], video_id)
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
+            else:
+                formats.append({
+                    'url': player_config['src'][video_format],
+                })
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage) or player_config.get('title'),
+            'description': self._og_search_description(webpage),
+            'formats': formats,
+            'subtitles': subtitles,
+            'is_live': player_config.get('type') == 'live',
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
+
+
+class WinSportsVideoIE(MediaStreamBaseIE):
+    _VALID_URL = r'https?://www\.winsports\.co/videos/(?P<id>[\w-]+)'
+
+    _TESTS = [{
+        'url': 'https://www.winsports.co/videos/siempre-castellanos-gran-atajada-del-portero-cardenal-para-evitar-la-caida-de-su-arco-60536',
+        'info_dict': {
+            'id': '62dc8357162c4b0821fcfb3c',
+            'display_id': 'siempre-castellanos-gran-atajada-del-portero-cardenal-para-evitar-la-caida-de-su-arco-60536',
+            'title': '¡Siempre Castellanos! Gran atajada del portero \'cardenal\' para evitar la caída de su arco',
+            'description': 'md5:eb811b2b2882bdc59431732c06b905f2',
+            'thumbnail': r're:^https?://[^?#]+62dc8357162c4b0821fcfb3c',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.winsports.co/videos/observa-aqui-los-goles-del-empate-entre-tolima-y-nacional-60548',
+        'info_dict': {
+            'id': '62dcb875ef12a5526790b552',
+            'display_id': 'observa-aqui-los-goles-del-empate-entre-tolima-y-nacional-60548',
+            'title': 'Observa aquí los goles del empate entre Tolima y Nacional',
+            'description': 'md5:b19402ba6e46558b93fd24b873eea9c9',
+            'thumbnail': r're:^https?://[^?#]+62dcb875ef12a5526790b552',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.winsports.co/videos/equidad-vuelve-defender-su-arco-de-remates-de-junior',
+        'info_dict': {
+            'id': '63fa7eca72f1741ad3a4d515',
+            'display_id': 'equidad-vuelve-defender-su-arco-de-remates-de-junior',
+            'title': '⚽ Equidad vuelve a defender su arco de remates de Junior',
+            'description': 'Remate de Sierra',
+            'thumbnail': r're:^https?://[^?#]+63fa7eca72f1741ad3a4d515',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.winsports.co/videos/bucaramanga-se-quedo-con-el-grito-de-gol-en-la-garganta',
+        'info_dict': {
+            'id': '6402adb62bbf3b18d454e1b0',
+            'display_id': 'bucaramanga-se-quedo-con-el-grito-de-gol-en-la-garganta',
+            'title': '⚽Bucaramanga se quedó con el grito de gol en la garganta',
+            'description': 'Gol anulado Bucaramanga',
+            'thumbnail': r're:^https?://[^?#]+6402adb62bbf3b18d454e1b0',
+            'ext': 'mp4',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        data = self._search_json(
+            r'<script\s*[^>]+data-drupal-selector="drupal-settings-json">', webpage, 'data', display_id)
+
+        mediastream_url = urljoin(f'{self._EMBED_BASE_URL}/', (
+            traverse_obj(data, (
+                (('settings', 'mediastream_formatter', ..., 'mediastream_id'), 'url'), {str}), get_all=False)
+            or next(self._extract_mediastream_urls(webpage), None)))
+
+        if not mediastream_url:
+            self.raise_no_formats('No MediaStream embed found in webpage')
+
+        title = clean_html(remove_end(
+            self._search_json_ld(webpage, display_id, expected_type='VideoObject', default={}).get('title')
+            or self._og_search_title(webpage), '| Win Sports'))
+
+        return self.url_result(
+            mediastream_url, MediaStreamIE, display_id, url_transparent=True, display_id=display_id, video_title=title)
diff --git a/hypervideo_dl/extractor/megatvcom.py b/hypervideo_dl/extractor/megatvcom.py
index 2f3f11f..93c7e7d 100644
--- a/hypervideo_dl/extractor/megatvcom.py
+++ b/hypervideo_dl/extractor/megatvcom.py
@@ -1,14 +1,14 @@
 import re
 
 from .common import InfoExtractor
+from ..networking import HEADRequest
 from ..utils import (
+    ExtractorError,
     clean_html,
     determine_ext,
-    ExtractorError,
     extract_attributes,
     get_element_by_class,
     get_element_html_by_id,
-    HEADRequest,
     parse_qs,
     unescapeHTML,
     unified_timestamp,
@@ -160,5 +160,5 @@ class MegaTVComEmbedIE(MegaTVComBaseIE):
         canonical_url = self._request_webpage(
             HEADRequest(canonical_url), video_id,
             note='Resolve canonical URL',
-            errnote='Could not resolve canonical URL').geturl()
+            errnote='Could not resolve canonical URL').url
         return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id)
diff --git a/hypervideo_dl/extractor/mgtv.py b/hypervideo_dl/extractor/mgtv.py
index edc92b3..31ccf00 100644
--- a/hypervideo_dl/extractor/mgtv.py
+++ b/hypervideo_dl/extractor/mgtv.py
@@ -3,15 +3,15 @@ import time
 import uuid
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_str,
-)
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
+    parse_resolution,
+    traverse_obj,
     try_get,
     url_or_none,
+    urljoin,
 )
 
 
@@ -30,16 +30,18 @@ class MGTVIE(InfoExtractor):
             'duration': 7461,
             'thumbnail': r're:^https?://.*\.jpg$',
         },
+        'params': {'skip_download': 'm3u8'},
     }, {
         'url': 'https://w.mgtv.com/b/427837/15588271.html',
         'info_dict': {
             'id': '15588271',
             'ext': 'mp4',
-            'title': '春日迟迟再出发 沉浸版',
+            'title': '春日迟迟再出发 沉浸版第1期:陆莹结婚半年查出肾炎被离婚 吴雅婷把一半票根退给前夫',
             'description': 'md5:a7a05a05b1aa87bd50cae619b19bbca6',
             'thumbnail': r're:^https?://.+\.jpg',
             'duration': 4026,
         },
+        'params': {'skip_download': 'm3u8'},
     }, {
         'url': 'https://w.mgtv.com/b/333652/7329822.html',
         'info_dict': {
@@ -50,6 +52,7 @@ class MGTVIE(InfoExtractor):
             'thumbnail': r're:^https?://.+\.jpg',
             'duration': 2656,
         },
+        'params': {'skip_download': 'm3u8'},
     }, {
         'url': 'https://w.mgtv.com/b/427837/15591647.html',
         'only_matching': True,
@@ -64,6 +67,13 @@ class MGTVIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    _RESOLUTIONS = {
+        '标清': ('480p', '854x480'),
+        '高清': ('540p', '960x540'),
+        '超清': ('720p', '1280x720'),
+        '蓝光': ('1080p', '1920x1080'),
+    }
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         tk2 = base64.urlsafe_b64encode(
@@ -76,55 +86,60 @@ class MGTVIE(InfoExtractor):
                 'type': 'pch5'
             }, headers=self.geo_verification_headers())['data']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
-                error = self._parse_json(e.cause.read().decode(), None)
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+                error = self._parse_json(e.cause.response.read().decode(), None)
                 if error.get('code') == 40005:
                     self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
                 raise ExtractorError(error['msg'], expected=True)
             raise
-        info = api_data['info']
-        title = info['title'].strip()
+
         stream_data = self._download_json(
             'https://pcweb.api.mgtv.com/player/getSource', video_id, query={
-                'pm2': api_data['atc']['pm2'],
                 'tk2': tk2,
+                'pm2': api_data['atc']['pm2'],
                 'video_id': video_id,
+                'type': 'pch5',
                 'src': 'intelmgtv',
             }, headers=self.geo_verification_headers())['data']
-        stream_domain = stream_data['stream_domain'][0]
+        stream_domain = traverse_obj(stream_data, ('stream_domain', ..., {url_or_none}), get_all=False)
 
         formats = []
-        for idx, stream in enumerate(stream_data['stream']):
-            stream_path = stream.get('url')
-            if not stream_path:
-                continue
-            format_data = self._download_json(
-                stream_domain + stream_path, video_id,
-                note=f'Download video info for format #{idx}')
-            format_url = format_data.get('info')
+        for idx, stream in enumerate(traverse_obj(stream_data, ('stream', lambda _, v: v['url']))):
+            stream_name = traverse_obj(stream, 'name', 'standardName', 'barName', expected_type=str)
+            resolution = traverse_obj(
+                self._RESOLUTIONS, (stream_name, 1 if stream.get('scale') == '16:9' else 0))
+            format_url = traverse_obj(self._download_json(
+                urljoin(stream_domain, stream['url']), video_id, fatal=False,
+                note=f'Downloading video info for format {resolution or stream_name}'),
+                ('info', {url_or_none}))
             if not format_url:
                 continue
             tbr = int_or_none(stream.get('filebitrate') or self._search_regex(
                 r'_(\d+)_mp4/', format_url, 'tbr', default=None))
             formats.append({
-                'format_id': compat_str(tbr or idx),
-                'url': url_or_none(format_url),
+                'format_id': str(tbr or idx),
+                'url': format_url,
                 'ext': 'mp4',
                 'tbr': tbr,
+                'vcodec': stream.get('videoFormat'),
+                'acodec': stream.get('audioFormat'),
+                **parse_resolution(resolution),
                 'protocol': 'm3u8_native',
                 'http_headers': {
                     'Referer': url,
                 },
-                'format_note': stream.get('name'),
+                'format_note': stream_name,
             })
 
         return {
             'id': video_id,
-            'title': title,
             'formats': formats,
-            'description': info.get('desc'),
-            'duration': int_or_none(info.get('duration')),
-            'thumbnail': info.get('thumb'),
+            **traverse_obj(api_data, ('info', {
+                'title': ('title', {str.strip}),
+                'description': ('desc', {str}),
+                'duration': ('duration', {int_or_none}),
+                'thumbnail': ('thumb', {url_or_none}),
+            })),
             'subtitles': self.extract_subtitles(video_id, stream_domain),
         }
diff --git a/hypervideo_dl/extractor/minds.py b/hypervideo_dl/extractor/minds.py
index 2fb1792..27a6e38 100644
--- a/hypervideo_dl/extractor/minds.py
+++ b/hypervideo_dl/extractor/minds.py
@@ -106,7 +106,7 @@ class MindsIE(MindsBaseIE):
         if poster:
             urlh = self._request_webpage(poster, video_id, fatal=False)
             if urlh:
-                thumbnail = urlh.geturl()
+                thumbnail = urlh.url
 
         return {
             'id': video_id,
diff --git a/hypervideo_dl/extractor/miomio.py b/hypervideo_dl/extractor/miomio.py
index a0a041e..8df8cba 100644
--- a/hypervideo_dl/extractor/miomio.py
+++ b/hypervideo_dl/extractor/miomio.py
@@ -2,12 +2,8 @@ import random
 
 from .common import InfoExtractor
 from ..compat import compat_urlparse
-from ..utils import (
-    xpath_text,
-    int_or_none,
-    ExtractorError,
-    sanitized_Request,
-)
+from ..networking import Request
+from ..utils import ExtractorError, int_or_none, xpath_text
 
 
 class MioMioIE(InfoExtractor):
@@ -61,7 +57,7 @@ class MioMioIE(InfoExtractor):
             'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)),
             video_id)
 
-        vid_config_request = sanitized_Request(
+        vid_config_request = Request(
             'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config),
             headers=http_headers)
diff --git a/hypervideo_dl/extractor/mixch.py b/hypervideo_dl/extractor/mixch.py
index 3f430a7..4be6947 100644
--- a/hypervideo_dl/extractor/mixch.py
+++ b/hypervideo_dl/extractor/mixch.py
@@ -1,8 +1,5 @@
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    traverse_obj,
-)
+from ..utils import UserNotLive, traverse_obj
 
 
 class MixchIE(InfoExtractor):
@@ -33,7 +30,7 @@ class MixchIE(InfoExtractor):
         initial_js_state = self._parse_json(self._search_regex(
             r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id)
         if not initial_js_state.get('liveInfo'):
-            raise ExtractorError('Livestream has ended.', expected=True)
+            raise UserNotLive(video_id=video_id)
 
         return {
             'id': video_id,
@@ -45,7 +42,8 @@ class MixchIE(InfoExtractor):
             'uploader_id': video_id,
             'formats': [{
                 'format_id': 'hls',
-                'url': traverse_obj(initial_js_state, ('liveInfo', 'hls')) or 'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_%s.m3u8' % video_id,
+                'url': (traverse_obj(initial_js_state, ('liveInfo', 'hls'))
+                        or f'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_{video_id}.m3u8'),
                 'ext': 'mp4',
                 'protocol': 'm3u8',
             }],
diff --git a/hypervideo_dl/extractor/motherless.py b/hypervideo_dl/extractor/motherless.py
index c24ef9b..769b52c 100644
--- a/hypervideo_dl/extractor/motherless.py
+++ b/hypervideo_dl/extractor/motherless.py
@@ -1,32 +1,39 @@
 import datetime
 import re
+import urllib.parse
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
 from ..utils import (
     ExtractorError,
-    InAdvancePagedList,
-    orderedSet,
+    OnDemandPagedList,
+    remove_end,
     str_to_int,
     unified_strdate,
 )
 
 
 class MotherlessIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/|G[VIG]?[A-F0-9]+/)?(?P<id>[A-F0-9]+)'
     _TESTS = [{
-        'url': 'http://motherless.com/AC3FFE1',
-        'md5': '310f62e325a9fafe64f68c0bccb6e75f',
+        'url': 'http://motherless.com/EE97006',
+        'md5': 'cb5e7438f7a3c4e886b7bccc1292a3bc',
         'info_dict': {
-            'id': 'AC3FFE1',
+            'id': 'EE97006',
             'ext': 'mp4',
-            'title': 'Fucked in the ass while playing PS3',
-            'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
-            'upload_date': '20100913',
-            'uploader_id': 'famouslyfuckedup',
+            'title': 'Dogging blond Brit getting glazed (comp)',
+            'categories': ['UK', 'slag', 'whore', 'dogging', 'cunt', 'cumhound', 'big tits', 'Pearl Necklace'],
+            'upload_date': '20230519',
+            'uploader_id': 'deathbird',
             'thumbnail': r're:https?://.*\.jpg',
             'age_limit': 18,
-        }
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+        },
+        'params': {
+            # Incomplete cert chains
+            'nocheckcertificate': True,
+        },
     }, {
         'url': 'http://motherless.com/532291B',
         'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
@@ -49,16 +56,36 @@ class MotherlessIE(InfoExtractor):
             'id': '633979F',
             'ext': 'mp4',
             'title': 'Turtlette',
-            'categories': ['superheroine heroine superher'],
+            'categories': ['superheroine heroine superher'],
             'upload_date': '20140827',
             'uploader_id': 'shade0230',
             'thumbnail': r're:https?://.*\.jpg',
             'age_limit': 18,
-        }
+            'like_count': int,
+            'comment_count': int,
+            'view_count': int,
+        },
+        'params': {
+            'nocheckcertificate': True,
+        },
     }, {
-        # no keywords
         'url': 'http://motherless.com/8B4BBC1',
-        'only_matching': True,
+        'info_dict': {
+            'id': '8B4BBC1',
+            'ext': 'mp4',
+            'title': 'VIDEO00441.mp4',
+            'categories': [],
+            'upload_date': '20160214',
+            'uploader_id': 'NMWildGirl',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+            'like_count': int,
+            'comment_count': int,
+            'view_count': int,
+        },
+        'params': {
+            'nocheckcertificate': True,
+        },
     }, {
         # see https://motherless.com/videos/recent for recent videos with
         # uploaded date in "ago" format
@@ -72,9 +99,12 @@ class MotherlessIE(InfoExtractor):
             'uploader_id': 'anonymous',
             'thumbnail': r're:https?://.*\.jpg',
             'age_limit': 18,
+            'like_count': int,
+            'comment_count': int,
+            'view_count': int,
         },
         'params': {
-            'skip_download': True,
+            'nocheckcertificate': True,
        },
     }]
 
@@ -128,10 +158,8 @@ class MotherlessIE(InfoExtractor):
             (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''',
              r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''),
             webpage, 'uploader_id', fatal=False)
-
-        categories = self._html_search_meta('keywords', webpage, default=None)
-        if categories:
-            categories = [cat.strip() for cat in categories.split(',')]
+        categories = self._html_search_meta('keywords', webpage, default='')
+        categories = [cat.strip() for cat in categories.split(',') if cat.strip()]
 
         return {
             'id': video_id,
@@ -148,102 +176,97 @@ class MotherlessIE(InfoExtractor):
         }
 
 
-class MotherlessGroupIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
+class MotherlessPaginatedIE(InfoExtractor):
+    _PAGE_SIZE = 60
+
+    def _correct_path(self, url, item_id):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
+    def _extract_entries(self, webpage, base):
+        for mobj in re.finditer(r'href="[^"]*(?P<href>/[A-F0-9]+)"\s+title="(?P<title>[^"]+)',
+                                webpage):
+            video_url = urllib.parse.urljoin(base, mobj.group('href'))
+            video_id = MotherlessIE.get_temp_id(video_url)
+
+            if video_id:
+                yield self.url_result(video_url, MotherlessIE, video_id, mobj.group('title'))
+
+    def _real_extract(self, url):
+        item_id = self._match_id(url)
+        real_url = self._correct_path(url, item_id)
+        webpage = self._download_webpage(real_url, item_id, 'Downloading page 1')
+
+        def get_page(idx):
+            page = idx + 1
+            current_page = webpage if not idx else self._download_webpage(
+                real_url, item_id, note=f'Downloading page {page}', query={'page': page})
+            yield from self._extract_entries(current_page, real_url)
+
+        return self.playlist_result(
+            OnDemandPagedList(get_page, self._PAGE_SIZE), item_id,
+            remove_end(self._html_extract_title(webpage), ' | MOTHERLESS.COM ™'))
+
+
+class MotherlessGroupIE(MotherlessPaginatedIE):
+    _VALID_URL = r'https?://(?:www\.)?motherless\.com/g[vifm]?/(?P<id>[a-z0-9_]+)/?(?:$|[#?])'
     _TESTS = [{
-        'url': 'http://motherless.com/g/movie_scenes',
+        'url': 'http://motherless.com/gv/movie_scenes',
         'info_dict': {
             'id': 'movie_scenes',
             'title': 'Movie Scenes',
-            'description': 'Hot and sexy scenes from "regular" movies... '
-                           'Beautiful actresses fully nude... A looot of '
-                           'skin! :)Enjoy!',
         },
-        'playlist_mincount': 662,
+        'playlist_mincount': 540,
     }, {
-        'url': 'http://motherless.com/gv/sex_must_be_funny',
+        'url': 'http://motherless.com/g/sex_must_be_funny',
         'info_dict': {
             'id': 'sex_must_be_funny',
             'title': 'Sex must be funny',
-            'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
-                           'any kind!'
         },
-        'playlist_mincount': 0,
-        'expected_warnings': [
-            'This group has no videos.',
-        ]
+        'playlist_count': 0,
     }, {
-        'url': 'https://motherless.com/g/beautiful_cock',
+        'url': 'https://motherless.com/gv/beautiful_cock',
         'info_dict': {
             'id': 'beautiful_cock',
             'title': 'Beautiful Cock',
-            'description': 'Group for lovely cocks yours, mine, a friends anything human',
         },
-        'playlist_mincount': 2500,
+        'playlist_mincount': 2040,
     }]
 
-    @classmethod
-    def suitable(cls, url):
-        return (False if MotherlessIE.suitable(url)
-                else super(MotherlessGroupIE, cls).suitable(url))
+    def _correct_path(self, url, item_id):
+        return urllib.parse.urljoin(url, f'/gv/{item_id}')
 
-    def _extract_entries(self, webpage, base):
-        entries = []
-        for mobj in re.finditer(
-                r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
-                webpage):
-            video_url = compat_urlparse.urljoin(base, mobj.group('href'))
-            if not MotherlessIE.suitable(video_url):
-                continue
-            video_id = MotherlessIE._match_id(video_url)
-            title = mobj.group('title')
-            entries.append(self.url_result(
-                video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
-                video_title=title))
-        # Alternative fallback
-        if not entries:
-            entries = [
-                self.url_result(
-                    compat_urlparse.urljoin(base, '/' + entry_id),
-                    ie=MotherlessIE.ie_key(), video_id=entry_id)
-                for entry_id in orderedSet(re.findall(
-                    r'data-codename=["\']([A-Z0-9]+)', webpage))]
-        return entries
-
     def _real_extract(self, url):
-        group_id = self._match_id(url)
-        page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id)
-        webpage = self._download_webpage(page_url, group_id)
-        title = self._search_regex(
-            r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
-        description = self._html_search_meta(
-            'description', webpage, fatal=False)
-        page_count = str_to_int(self._search_regex(
-            r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b',
-            webpage, 'page_count', default=0))
-        if not page_count:
-            message = self._search_regex(
-                r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''',
-                webpage, 'error_msg', default=None) or 'This group has no videos.'
-            self.report_warning(message, group_id)
-            page_count = 1
-        PAGE_SIZE = 80
-
-        def _get_page(idx):
-            if idx > 0:
-                webpage = self._download_webpage(
-                    page_url, group_id, query={'page': idx + 1},
-                    note='Downloading page %d/%d' % (idx + 1, page_count)
-                )
-            for entry in self._extract_entries(webpage, url):
-                yield entry
-
-        playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
+class MotherlessGalleryIE(MotherlessPaginatedIE):
+    _VALID_URL = r'https?://(?:www\.)?motherless\.com/G[VIG]?(?P<id>[A-F0-9]+)/?(?:$|[#?])'
+    _TESTS = [{
+        'url': 'https://motherless.com/GV338999F',
+        'info_dict': {
+            'id': '338999F',
+            'title': 'Random',
+        },
+        'playlist_mincount': 190,
+    }, {
+        'url': 'https://motherless.com/GVABD6213',
+        'info_dict': {
+            'id': 'ABD6213',
+            'title': 'Cuties',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'https://motherless.com/GVBCF7622',
+        'info_dict': {
+            'id': 'BCF7622',
+            'title': 'Vintage',
+        },
+        'playlist_count': 0,
+    }, {
+        'url': 'https://motherless.com/G035DE2F',
+        'info_dict': {
+            'id': '035DE2F',
+            'title': 'General',
+        },
+        'playlist_mincount': 420,
+    }]
-        return {
-            '_type': 'playlist',
-            'id': group_id,
-            'title': title,
-            'description': description,
-            'entries': playlist
-        }
+
+    def _correct_path(self, url, item_id):
+        return urllib.parse.urljoin(url, f'/GV{item_id}')
diff --git a/hypervideo_dl/extractor/moviepilot.py b/hypervideo_dl/extractor/moviepilot.py
index ca54156..668c098 100644
--- a/hypervideo_dl/extractor/moviepilot.py
+++ b/hypervideo_dl/extractor/moviepilot.py
@@ -1,11 +1,5 @@
 from .dailymotion import DailymotionIE
 from .common import InfoExtractor
-from ..utils import (
-    parse_iso8601,
-    try_get,
-)
-
-import re
 
 
 class MoviepilotIE(InfoExtractor):
@@ -16,21 +10,21 @@ class MoviepilotIE(InfoExtractor):
     _TESTS = [{
         'url': 'https://www.moviepilot.de/movies/interstellar-2/',
         'info_dict': {
-            'id': 'x7xdut5',
+            'id': 'x7xdpkk',
             'display_id': 'interstellar-2',
             'ext': 'mp4',
             'title': 'Interstellar',
-            'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaXev1VvzitVZMFsR/x720',
-            'timestamp': 1400491705,
-            'description': 'md5:7dfc5c1758e7322a7346934f1f0c489c',
+            'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaV-q1ZganMw4HVXg/x1080',
+            'timestamp': 1605010596,
+            'description': 'md5:0ae9cb452af52610c9ffc60f2fd0474c',
             'uploader': 'Moviepilot',
             'like_count': int,
             'view_count': int,
             'uploader_id': 'x6nd9k',
-            'upload_date': '20140519',
-            'duration': 140,
+            'upload_date': '20201110',
+            'duration': 97,
             'age_limit': 0,
-            'tags': ['Alle Trailer', 'Movie', 'Third Party'],
+            'tags': ['Alle Trailer', 'Movie', 'Verleih'],
         },
     }, {
         'url': 'https://www.moviepilot.de/movies/interstellar-2/trailer',
@@ -45,14 +39,14 @@ class MoviepilotIE(InfoExtractor):
             'display_id': 'queen-slim',
             'title': 'Queen & Slim',
             'ext': 'mp4',
-            'thumbnail': r're:https://\w+\.dmcdn\.net/v/SbUM71WtomSjVmI_q/x720',
-            'timestamp': 1571838685,
-            'description': 'md5:73058bcd030aa12d991e4280d65fbebe',
+            'thumbnail': r're:https://\w+\.dmcdn\.net/v/SbUM71ZeG2N975lf2/x1080',
+            'timestamp': 1605555825,
+            'description': 'md5:83228bb86f5367dd181447fdc4873989',
             'uploader': 'Moviepilot',
             'like_count': int,
             'view_count': int,
             'uploader_id': 'x6nd9k',
-            'upload_date': '20191023',
+            'upload_date': '20201116',
             'duration': 138,
             'age_limit': 0,
             'tags': ['Movie', 'Verleih', 'Neue Trailer'],
@@ -72,12 +66,12 @@ class MoviepilotIE(InfoExtractor):
             'display_id': 'muellers-buero',
             'title': 'Müllers Büro',
             'ext': 'mp4',
-            'description': 'md5:57501251c05cdc61ca314b7633e0312e',
'md5:4d23a8f4ca035196cd4523863c4fe5a4', + 'timestamp': 1604958457, 'age_limit': 0, 'duration': 82, - 'upload_date': '20101020', - 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaMes1WfAm1d6maq_/x720', + 'upload_date': '20201109', + 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaMes1Zg3lxLv9j5u/x1080', 'uploader': 'Moviepilot', 'like_count': int, 'view_count': int, @@ -91,22 +85,13 @@ class MoviepilotIE(InfoExtractor): webpage = self._download_webpage(f'https://www.moviepilot.de/movies/{video_id}/trailer', video_id) - duration = try_get( - re.match(r'P(?P<hours>\d+)H(?P<mins>\d+)M(?P<secs>\d+)S', - self._html_search_meta('duration', webpage, fatal=False) or ''), - lambda mobj: sum(float(x) * y for x, y in zip(mobj.groups(), (3600, 60, 1)))) - # _html_search_meta is not used since we don't want name=description to match - description = self._html_search_regex( - '<meta[^>]+itemprop="description"[^>]+content="([^>"]+)"', webpage, 'description', fatal=False) + clip = self._search_nextjs_data(webpage, video_id)['props']['initialProps']['pageProps'] return { '_type': 'url_transparent', 'ie_key': DailymotionIE.ie_key(), 'display_id': video_id, - 'title': self._og_search_title(webpage), - 'url': self._html_search_meta('embedURL', webpage), - 'thumbnail': self._html_search_meta('thumbnailURL', webpage), - 'description': description, - 'duration': duration, - 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage), delimiter=' ') + 'title': clip.get('title'), + 'url': f'https://www.dailymotion.com/video/{clip["videoRemoteId"]}', + 'description': clip.get('summary'), } diff --git a/hypervideo_dl/extractor/mtv.py b/hypervideo_dl/extractor/mtv.py index d91be62..0d700b9 100644 --- a/hypervideo_dl/extractor/mtv.py +++ b/hypervideo_dl/extractor/mtv.py @@ -2,16 +2,15 @@ import re from .common import InfoExtractor from ..compat import compat_str +from ..networking import HEADRequest, Request from ..utils import ( ExtractorError, + RegexNotFoundError, find_xpath_attr, fix_xml_ampersands, float_or_none, - HEADRequest, int_or_none, join_nonempty, - RegexNotFoundError, - sanitized_Request, strip_or_none, timeconvert, try_get, @@ -51,15 +50,15 @@ class MTVServicesInfoExtractor(InfoExtractor): def _extract_mobile_video_formats(self, mtvn_id): webpage_url = self._MOBILE_TEMPLATE % mtvn_id - req = sanitized_Request(webpage_url) + req = Request(webpage_url) # Otherwise we get a webpage that would execute some javascript - req.add_header('User-Agent', 'curl/7') + req.headers['User-Agent'] = 'curl/7' webpage = self._download_webpage(req, mtvn_id, 'Downloading mobile page') metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url')) req = HEADRequest(metrics_url) response = self._request_webpage(req, mtvn_id, 'Resolving url') - url = response.geturl() + url = response.url # Transform the url to get the best quality: url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1) return [{'url': url, 'ext': 'mp4'}] diff --git a/hypervideo_dl/extractor/museai.py b/hypervideo_dl/extractor/museai.py new file mode 100644 index 0000000..7f66928 --- /dev/null +++ b/hypervideo_dl/extractor/museai.py @@ -0,0 +1,112 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + float_or_none, + int_or_none, + js_to_json, + traverse_obj, + url_or_none, +) + + +class MuseAIIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?muse\.ai/(?:v|embed)/(?P<id>\w+)' + _TESTS = [{ + 'url': 
'https://muse.ai/embed/YdTWvUW', + 'md5': 'f994f9a38be1c3aaf9e37cbd7d76fe7c', + 'info_dict': { + 'id': 'YdTWvUW', + 'ext': 'mp4', + 'title': '2023-05-28-Grabien-1941111 (1)', + 'description': '', + 'uploader': 'Today News Africa', + 'uploader_id': 'TodayNewsAfrica', + 'upload_date': '20230528', + 'timestamp': 1685285044, + 'duration': 1291.3, + 'view_count': int, + 'availability': 'public', + }, + }, { + 'url': 'https://muse.ai/v/gQ4gGAA-0756', + 'md5': '52dbfc78e865e56dc19a1715badc35e8', + 'info_dict': { + 'id': 'gQ4gGAA', + 'ext': 'mp4', + 'title': '0756', + 'description': 'md5:0ca1483f9aac423e9a96ad00bb3a0785', + 'uploader': 'Aerial.ie', + 'uploader_id': 'aerial', + 'upload_date': '20210306', + 'timestamp': 1615072842, + 'duration': 21.4, + 'view_count': int, + 'availability': 'public', + }, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://muse.ai/docs', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'docs', + 'title': 'muse.ai | docs', + 'description': 'md5:6c0293431481582739c82ee8902687fa', + 'age_limit': 0, + 'thumbnail': 'https://muse.ai/static/imgs/poster-img-docs.png', + }, + 'params': {'allowed_extractors': ['all', '-html5']}, + }] + _EMBED_REGEX = [r'<iframe[^>]*\bsrc=["\'](?P<url>https://muse\.ai/embed/\w+)'] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for embed_id in re.findall(r'<script>[^<]*\bMusePlayer\(\{[^}<]*\bvideo:\s*["\'](\w+)["\']', webpage): + yield f'https://muse.ai/embed/{embed_id}' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://muse.ai/embed/{video_id}', video_id) + data = self._search_json( + r'player\.setData\(', webpage, 'player data', video_id, transform_source=js_to_json) + + source_url = data['url'] + if not url_or_none(source_url): + raise ExtractorError('Unable to extract video URL') + + formats = [{ + 'url': source_url, + 'format_id': 'source', + 'quality': 1, + **traverse_obj(data, { + 'ext': ('filename', {determine_ext}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + }), + }] + if source_url.endswith('/data'): + base_url = f'{source_url[:-5]}/videos' + formats.extend(self._extract_m3u8_formats( + f'{base_url}/hls.m3u8', video_id, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_mpd_formats( + f'{base_url}/dash.mpd', video_id, mpd_id='dash', fatal=False)) + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {float_or_none}), + 'timestamp': ('tcreated', {int_or_none}), + 'uploader': ('owner_name', {str}), + 'uploader_id': ('owner_username', {str}), + 'view_count': ('views', {int_or_none}), + 'age_limit': ('mature', {lambda x: 18 if x else None}), + 'availability': ('visibility', {lambda x: x if x in ('private', 'unlisted') else 'public'}), + }), + } diff --git a/hypervideo_dl/extractor/myvideoge.py b/hypervideo_dl/extractor/myvideoge.py index 513d4cb..64cee48 100644 --- a/hypervideo_dl/extractor/myvideoge.py +++ b/hypervideo_dl/extractor/myvideoge.py @@ -1,5 +1,16 @@ +import re + from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + MONTH_NAMES, + clean_html, + get_element_by_class, + get_element_by_id, + int_or_none, + js_to_json, + qualities, + unified_strdate, +) class MyVideoGeIE(InfoExtractor): @@ -11,37 +22,50 @@ class MyVideoGeIE(InfoExtractor): 'id': '3941048', 'ext': 
'mp4', 'title': 'The best prikol', + 'upload_date': '20200611', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'md5:d72addd357b0dd914e704781f7f777d8', - 'description': 'md5:5c0371f540f5888d603ebfedd46b6df3' - } + 'uploader': 'chixa33', + 'description': 'md5:5b067801318e33c2e6eea4ab90b1fdd3', + }, } + _MONTH_NAMES_KA = ['იანვარი', 'თებერვალი', 'მარტი', 'აპრილი', 'მაისი', 'ივნისი', 'ივლისი', 'აგვისტო', 'სექტემბერი', 'ოქტომბერი', 'ნოემბერი', 'დეკემბერი'] + + _quality = staticmethod(qualities(('SD', 'HD'))) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title') - description = self._og_search_description(webpage) - thumbnail = self._html_search_meta(['og:image'], webpage) - uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) + title = ( + self._og_search_title(webpage, default=None) + or clean_html(get_element_by_class('my_video_title', webpage)) + or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title\b', webpage, 'title')) jwplayer_sources = self._parse_json( self._search_regex( - r"(?s)jwplayer\(\"mvplayer\"\).setup\(.*?sources: (.*?])", webpage, 'jwplayer sources'), - video_id, transform_source=js_to_json) + r'''(?s)jwplayer\s*\(\s*['"]mvplayer['"]\s*\)\s*\.\s*setup\s*\(.*?\bsources\s*:\s*(\[.*?])\s*[,});]''', webpage, 'jwplayer sources', fatal=False) + or '', + video_id, transform_source=js_to_json, fatal=False) + + formats = self._parse_jwplayer_formats(jwplayer_sources or [], video_id) + for f in formats or []: + f['quality'] = self._quality(f['format_id']) - def _formats_key(f): - if f['label'] == 'SD': - return -1 - elif f['label'] == 'HD': - return 1 - else: - return 0 + description = ( + self._og_search_description(webpage) + or get_element_by_id('long_desc_holder', webpage) + or self._html_search_meta('description', webpage)) - jwplayer_sources = sorted(jwplayer_sources, key=_formats_key) + uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) - formats = self._parse_jwplayer_formats(jwplayer_sources, video_id) + upload_date = get_element_by_class('mv_vid_upl_date', webpage) + # as ka locale may not be present roll a local date conversion + upload_date = (unified_strdate( + # translate any ka month to an en one + re.sub('|'.join(self._MONTH_NAMES_KA), + lambda m: MONTH_NAMES['en'][self._MONTH_NAMES_KA.index(m.group(0))], + upload_date, re.I)) + if upload_date else None) return { 'id': video_id, @@ -49,5 +73,9 @@ class MyVideoGeIE(InfoExtractor): 'description': description, 'uploader': uploader, 'formats': formats, - 'thumbnail': thumbnail + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': upload_date, + 'view_count': int_or_none(get_element_by_class('mv_vid_views', webpage)), + 'like_count': int_or_none(get_element_by_id('likes_count', webpage)), + 'dislike_count': int_or_none(get_element_by_id('dislikes_count', webpage)), } diff --git a/hypervideo_dl/extractor/mzaalo.py b/hypervideo_dl/extractor/mzaalo.py new file mode 100644 index 0000000..1996368 --- /dev/null +++ b/hypervideo_dl/extractor/mzaalo.py @@ -0,0 +1,95 @@ +from .common import InfoExtractor +from ..utils import ( + parse_age_limit, + parse_duration, + traverse_obj, + url_or_none, +) + + +class MzaaloIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?mzaalo\.com/(?:play|watch)/(?P<type>movie|original|clip)/(?P<id>[a-f0-9-]+)/[\w-]+' + _TESTS = 
[{ + # Movies + 'url': 'https://www.mzaalo.com/play/movie/c0958d9f-f90e-4503-a755-44358758921d/Jamun', + 'info_dict': { + 'id': 'c0958d9f-f90e-4503-a755-44358758921d', + 'title': 'Jamun', + 'ext': 'mp4', + 'description': 'md5:24fe9ebb9bbe5b36f7b54b90ab1e2f31', + 'thumbnails': 'count:15', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5527.0, + 'language': 'hin', + 'categories': ['Drama'], + 'age_limit': 13, + }, + 'params': {'skip_download': 'm3u8'} + }, { + # Shows + 'url': 'https://www.mzaalo.com/play/original/93d42b2b-f373-4c2d-bca4-997412cb069d/Modi-Season-2-CM-TO-PM/Episode-1:Decision,-Not-Promises', + 'info_dict': { + 'id': '93d42b2b-f373-4c2d-bca4-997412cb069d', + 'title': 'Episode 1:Decision, Not Promises', + 'ext': 'mp4', + 'description': 'md5:16f76058432a54774fbb2561a1955652', + 'thumbnails': 'count:22', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2040.0, + 'language': 'hin', + 'categories': ['Drama'], + 'age_limit': 13, + }, + 'params': {'skip_download': 'm3u8'} + }, { + # Streams/Clips + 'url': 'https://www.mzaalo.com/play/clip/83cdbcb5-400a-42f1-a1d2-459053cfbda5/Manto-Ki-Kahaaniya', + 'info_dict': { + 'id': '83cdbcb5-400a-42f1-a1d2-459053cfbda5', + 'title': 'Manto Ki Kahaaniya', + 'ext': 'mp4', + 'description': 'md5:c3c5f1d05f0fd1bfcb05b673d1cc9f2f', + 'thumbnails': 'count:3', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1937.0, + 'language': 'hin', + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://mzaalo.com/watch/MOVIE/389c892d-0b65-4019-bf73-d4edcb1c014f/Chalo-Dilli', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, type_ = self._match_valid_url(url).group('id', 'type') + path = (f'partner/streamurl?&assetId={video_id}&getClipDetails=YES' if type_ == 'clip' + else f'api/v2/player/details?assetType={type_.upper()}&assetId={video_id}') + data = self._download_json( + f'https://production.mzaalo.com/platform/{path}', video_id, headers={ + 'Ocp-Apim-Subscription-Key': '1d0caac2702049b89a305929fdf4cbae', + })['data'] + + formats = self._extract_m3u8_formats(data['streamURL'], video_id) + + subtitles = {} + for subs_lang, subs_url in traverse_obj(data, ('subtitles', {dict.items}, ...)): + if url_or_none(subs_url): + subtitles[subs_lang] = [{'url': subs_url, 'ext': 'vtt'}] + + lang = traverse_obj(data, ('language', {str.lower})) + for f in formats: + f['language'] = lang + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {parse_duration}), + 'age_limit': ('maturity_rating', {parse_age_limit}), + 'thumbnails': ('images', ..., {'url': {url_or_none}}), + 'categories': ('genre', ..., {str}), + }), + } diff --git a/hypervideo_dl/extractor/naver.py b/hypervideo_dl/extractor/naver.py index e2e6e97..d79caf5 100644 --- a/hypervideo_dl/extractor/naver.py +++ b/hypervideo_dl/extractor/naver.py @@ -21,6 +21,23 @@ from ..utils import ( class NaverBaseIE(InfoExtractor): _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' + @staticmethod # NB: Used in VLiveWebArchiveIE, WeverseIE + def process_subtitles(vod_data, process_url): + ret = {'subtitles': {}, 'automatic_captions': {}} + for caption in traverse_obj(vod_data, ('captions', 'list', ...)): + caption_url = caption.get('source') + if not caption_url: + continue + type_ = 'automatic_captions' if caption.get('type') == 'auto' else 'subtitles' + lang = caption.get('locale') or join_nonempty('language', 'country', from_dict=caption) or 'und' 
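The subtitle loop in the Mzaalo extractor above iterates the mapping via `traverse_obj(data, ('subtitles', {dict.items}, ...))` and keeps only values that survive `url_or_none`. A plain-Python equivalent, assuming the same `{lang: url}` shape (helper name and sample data are illustrative):

```python
from urllib.parse import urlparse

def collect_subtitles(data):
    subtitles = {}
    for lang, subs_url in (data.get('subtitles') or {}).items():
        # keep only values that parse as http(s) URLs, roughly what url_or_none checks
        if isinstance(subs_url, str) and urlparse(subs_url).scheme in ('http', 'https'):
            subtitles.setdefault(lang, []).append({'url': subs_url, 'ext': 'vtt'})
    return subtitles


assert collect_subtitles({'subtitles': {'en': 'https://cdn.example/en.vtt', 'hi': 'N/A'}}) \
    == {'en': [{'url': 'https://cdn.example/en.vtt', 'ext': 'vtt'}]}
```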
+ if caption.get('type') == 'fan': + lang += '_fan%d' % next(i for i in itertools.count(1) if f'{lang}_fan{i}' not in ret[type_]) + ret[type_].setdefault(lang, []).extend({ + 'url': sub_url, + 'name': join_nonempty('label', 'fanName', from_dict=caption, delim=' - '), + } for sub_url in process_url(caption_url)) + return ret + def _extract_video_info(self, video_id, vid, key): video_data = self._download_json( 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid, @@ -79,34 +96,18 @@ class NaverBaseIE(InfoExtractor): ] return [caption_url] - automatic_captions = {} - subtitles = {} - for caption in get_list('caption'): - caption_url = caption.get('source') - if not caption_url: - continue - sub_dict = automatic_captions if caption.get('type') == 'auto' else subtitles - lang = caption.get('locale') or join_nonempty('language', 'country', from_dict=caption) or 'und' - if caption.get('type') == 'fan': - lang += '_fan%d' % next(i for i in itertools.count(1) if f'{lang}_fan{i}' not in sub_dict) - sub_dict.setdefault(lang, []).extend({ - 'url': sub_url, - 'name': join_nonempty('label', 'fanName', from_dict=caption, delim=' - '), - } for sub_url in get_subs(caption_url)) - user = meta.get('user', {}) return { 'id': video_id, 'title': title, 'formats': formats, - 'subtitles': subtitles, - 'automatic_captions': automatic_captions, 'thumbnail': try_get(meta, lambda x: x['cover']['source']), 'view_count': int_or_none(meta.get('count')), 'uploader_id': user.get('id'), 'uploader': user.get('name'), 'uploader_url': user.get('url'), + **self.process_subtitles(video_data, get_subs), } diff --git a/hypervideo_dl/extractor/nbc.py b/hypervideo_dl/extractor/nbc.py index 1ea6355..b3c28ab 100644 --- a/hypervideo_dl/extractor/nbc.py +++ b/hypervideo_dl/extractor/nbc.py @@ -3,29 +3,34 @@ import json import re from .common import InfoExtractor -from .theplatform import ThePlatformIE +from .theplatform import ThePlatformIE, default_ns from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote +from ..networking import HEADRequest from ..utils import ( ExtractorError, + RegexNotFoundError, + UserNotLive, + clean_html, + determine_ext, + float_or_none, int_or_none, + mimetype2ext, parse_age_limit, parse_duration, - RegexNotFoundError, + remove_end, smuggle_url, - str_or_none, traverse_obj, try_get, - unified_strdate, + unescapeHTML, unified_timestamp, update_url_query, url_basename, - variadic, ) class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))' + _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>(?:NBCE|n)?\d+))' _TESTS = [ { @@ -38,10 +43,18 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'timestamp': 1424246400, 'upload_date': '20150218', 'uploader': 'NBCU-COM', + 'episode': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', + 'episode_number': 86, + 'season': 'Season 2', + 'season_number': 2, + 'series': 'Tonight Show: Jimmy Fallon', + 'duration': 237.0, + 'chapters': 'count:1', + 'tags': 'count:4', + 'thumbnail': r're:https?://.+\.jpg', }, 'params': { - # m3u8 download - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { @@ -55,11 +68,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'upload_date': '20141206', 'uploader': 'NBCU-COM', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Only works from US', + 'skip': 'page not found', }, { # HLS 
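In `process_subtitles()` above, fan-made captions for the same locale are disambiguated by probing `<lang>_fan1`, `<lang>_fan2`, … until a free key is found. The allocation in isolation (function name is illustrative):

```python
import itertools

def fan_lang_key(lang, existing_keys):
    # first unused '<lang>_fanN' key, counting from 1
    return lang + '_fan%d' % next(
        i for i in itertools.count(1) if f'{lang}_fan{i}' not in existing_keys)


assert fan_lang_key('ko', {'ko_fan1': []}) == 'ko_fan2'
assert fan_lang_key('en', {'ko_fan1': []}) == 'en_fan1'
```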
streams requires the 'hdnea3' cookie @@ -73,10 +82,58 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'upload_date': '20090315', 'uploader': 'NBCU-COM', }, + 'skip': 'page not found', + }, + { + # manifest url does not have extension + 'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439', + 'info_dict': { + 'id': '3646439', + 'ext': 'mp4', + 'title': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', + 'episode': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes', + 'episode_number': 1, + 'season': 'Season 75', + 'season_number': 75, + 'series': 'The Golden Globe Awards', + 'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.', + 'uploader': 'NBCU-COM', + 'upload_date': '20180107', + 'timestamp': 1515312000, + 'duration': 570.0, + 'tags': 'count:8', + 'thumbnail': r're:https?://.+\.jpg', + 'chapters': 'count:1', + }, 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', + }, + }, + { + # new video_id format + 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', + 'info_dict': { + 'id': 'NBCE125189978', + 'ext': 'mp4', + 'title': 'Ben\'s First Leap | NBC\'s Quantum Leap', + 'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e', + 'uploader': 'NBCU-COM', + 'series': 'Quantum Leap', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap', + 'episode_number': 1, + 'duration': 170.171, + 'chapters': [], + 'timestamp': 1663956155, + 'upload_date': '20220923', + 'tags': 'count:10', + 'age_limit': 0, + 'thumbnail': r're:https?://.+\.jpg', + }, + 'params': { + 'skip_download': 'm3u8', }, - 'skip': 'Only works from US', }, { 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', @@ -136,6 +193,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE query = { 'mbr': 'true', 'manifest': 'm3u', + 'switch': 'HLSServiceSecure', } video_id = video_data['mpxGuid'] tp_path = 'NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id) @@ -599,32 +657,54 @@ class NBCStationsIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/', - 'md5': '462041d91bd762ef5a38b7d85d6dc18f', 'info_dict': { 'id': '2968618', 'ext': 'mp4', 'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', - 'description': None, + 'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182', + 'duration': 112.513, 'timestamp': 1661135892, - 'upload_date': '20220821', + 'upload_date': '20220822', 'uploader': 'NBC 4', - 'uploader_id': 'KNBC', + 'channel_id': 'KNBC', 'channel': 'nbclosangeles', }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/', - 'md5': '0917dcf7885be1023a9220630d415f67', 'info_dict': { 'id': '2247002', 'ext': 'mp4', - 'title': 'Huracán complica que televidente de Tucson reciba reembolso', + 'title': 'Huracán complica que televidente de Tucson reciba reembolso', 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', + 'duration': 172.406, 'timestamp': 1660886507, 'upload_date': '20220819', 'uploader': 'Telemundo Arizona', - 'uploader_id': 'KTAZ', + 'channel_id': 'KTAZ', 'channel': 'telemundoarizona', }, + 'params': 
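The widened `_VALID_URL` above accepts both the legacy `n`-prefixed/numeric ids and the new `NBCE` ids; both test URLs from these hunks match:

```python
import re

_VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>(?:NBCE|n)?\d+))'

for u, vid in [
    ('https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', 'NBCE125189978'),
    ('https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', 'n3310'),
]:
    assert re.match(_VALID_URL, u).group('id') == vid
```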
{ + 'skip_download': 'm3u8', + }, + }, { + # direct mp4 link + 'url': 'https://www.nbcboston.com/weather/video-weather/highs-near-freezing-in-boston-on-wednesday/2961135/', + 'md5': '9bf8c41dc7abbb75b1a44f1491a4cc85', + 'info_dict': { + 'id': '2961135', + 'ext': 'mp4', + 'title': 'Highs Near Freezing in Boston on Wednesday', + 'description': 'md5:3ec486609a926c99f00a3512e6c0e85b', + 'duration': 235.669, + 'timestamp': 1675268656, + 'upload_date': '20230201', + 'uploader': '', + 'channel_id': 'WBTS', + 'channel': 'nbcboston', + }, }] _RESOLUTIONS = { @@ -640,51 +720,42 @@ class NBCStationsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) nbc_data = self._search_json( - r'<script>var\s*nbc\s*=', webpage, 'NBC JSON data', video_id) + r'<script>\s*var\s+nbc\s*=', webpage, 'NBC JSON data', video_id) pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC' fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID')) - fw_network_id = traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114') - video_data = self._parse_json(self._html_search_regex( - r'data-videos="([^"]*)"', webpage, 'video data', default='{}'), video_id) - video_data = variadic(video_data)[0] - video_data.update(self._parse_json(self._html_search_regex( - r'data-meta="([^"]*)"', webpage, 'metadata', default='{}'), video_id)) + video_data = self._search_json( + r'data-videos="\[', webpage, 'video data', video_id, default={}, transform_source=unescapeHTML) + video_data.update(self._search_json( + r'data-meta="', webpage, 'metadata', video_id, default={}, transform_source=unescapeHTML)) + if not video_data: + raise ExtractorError('No video metadata found in webpage', expected=True) - formats = [] + info, formats = {}, [] + is_live = int_or_none(video_data.get('mpx_is_livestream')) == 1 + query = { + 'formats': 'MPEG-DASH none,M3U none,MPEG-DASH none,MPEG4,MP3', + 'format': 'SMIL', + 'fwsitesection': fw_ssid, + 'fwNetworkID': traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114'), + 'pprofile': 'ots_desktop_html', + 'sensitive': 'false', + 'w': '1920', + 'h': '1080', + 'mode': 'LIVE' if is_live else 'on-demand', + 'vpaid': 'script', + 'schema': '2.0', + 'sdk': 'PDK 6.1.3', + } - if video_data.get('mpx_is_livestream') == '1': - live = True - player_id = traverse_obj( - video_data, 'mpx_m3upid', ('video', 'meta', 'mpx_m3upid'), 'mpx_pid', - ('video', 'meta', 'mpx_pid'), 'pid_streaming_web_medium') - query = { - 'mbr': 'true', - 'assetTypes': 'LegacyRelease', - 'fwsitesection': fw_ssid, - 'fwNetworkID': fw_network_id, - 'pprofile': 'ots_desktop_html', - 'sensitive': 'false', - 'w': '1920', - 'h': '1080', - 'rnd': '1660303', - 'mode': 'LIVE', - 'format': 'SMIL', - 'tracking': 'true', - 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', - 'vpaid': 'script', - 'schema': '2.0', - 'SDK': 'PDK+6.1.3', - } - info = { - 'title': f'{channel} livestream', - } + if is_live: + player_id = traverse_obj(video_data, ((None, ('video', 'meta')), ( + 'mpx_m3upid', 'mpx_pid', 'pid_streaming_web_medium')), get_all=False) + info['title'] = f'{channel} livestream' else: - live = False - player_id = traverse_obj( - video_data, ('video', 'meta', 'pid_streaming_web_high'), 'pid_streaming_web_high', - ('video', 'meta', 'mpx_pid'), 'mpx_pid') + player_id = traverse_obj(video_data, ( + (None, ('video', 'meta')), ('pid_streaming_web_high', 'mpx_pid')), get_all=False) date_string = traverse_obj(video_data, 'date_string', 'date_gmt') if date_string: @@ -692,63 +763,70 @@ class NBCStationsIE(InfoExtractor): r'datetime="([^"]+)"', date_string, 'date 
string', fatal=False) else: date_string = traverse_obj( - nbc_data, ('dataLayer', 'adobe', 'prop70'), ('dataLayer', 'adobe', 'eVar70'), - ('dataLayer', 'adobe', 'eVar59')) + nbc_data, ('dataLayer', 'adobe', ('prop70', 'eVar70', 'eVar59')), get_all=False) - video_url = traverse_obj(video_data, ('video', 'meta', 'mp4_url'), 'mp4_url') + video_url = traverse_obj(video_data, ((None, ('video', 'meta')), 'mp4_url'), get_all=False) if video_url: - height = url_basename(video_url).split('-')[1].split('p')[0] + ext = determine_ext(video_url) + height = self._search_regex(r'\d+-(\d+)p', url_basename(video_url), 'height', default=None) formats.append({ 'url': video_url, - 'ext': 'mp4', + 'ext': ext, 'width': int_or_none(self._RESOLUTIONS.get(height)), 'height': int_or_none(height), - 'format_id': f'http-{height}', + 'format_id': f'http-{ext}', }) - query = { - 'mbr': 'true', - 'assetTypes': 'LegacyRelease', - 'fwsitesection': fw_ssid, - 'fwNetworkID': fw_network_id, - 'format': 'redirect', - 'manifest': 'm3u', - 'Tracking': 'true', - 'Embedded': 'true', - 'formats': 'MPEG4', - } - info = { - 'title': video_data.get('title') or traverse_obj( - nbc_data, ('dataLayer', 'contenttitle'), ('dataLayer', 'title'), - ('dataLayer', 'adobe', 'prop22'), ('dataLayer', 'id')), - 'description': traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text'), - 'upload_date': str_or_none(unified_strdate(date_string)), - 'timestamp': int_or_none(unified_timestamp(date_string)), - } - - if not player_id: - raise ExtractorError( - 'No video player ID or livestream player ID found in webpage', expected=True) - - headers = {'Origin': f'https://www.{channel}.com'} - manifest, urlh = self._download_webpage_handle( - f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id, - headers=headers, query=query, note='Downloading manifest') - if live: - manifest_url = self._search_regex(r'<video src="([^"]*)', manifest, 'manifest URL') - else: - manifest_url = urlh.geturl() + info.update({ + 'title': video_data.get('title') or traverse_obj(nbc_data, ( + 'dataLayer', (None, 'adobe'), ('contenttitle', 'title', 'prop22')), get_all=False), + 'description': + traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text') + or clean_html(traverse_obj(nbc_data, ('dataLayer', 'summary'))), + 'timestamp': unified_timestamp(date_string), + }) + + smil = None + if player_id and fw_ssid: + smil = self._download_xml( + f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id, + note='Downloading SMIL data', query=query, fatal=is_live) + subtitles = self._parse_smil_subtitles(smil, default_ns) if smil else {} + for video in smil.findall(self._xpath_ns('.//video', default_ns)) if smil else []: + info['duration'] = float_or_none(remove_end(video.get('dur'), 'ms'), 1000) + video_src_url = video.get('src') + ext = mimetype2ext(video.get('type'), default=determine_ext(video_src_url)) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_src_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live, + live=is_live, errnote='No HLS formats found') + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif video_src_url: + formats.append({ + 'url': video_src_url, + 'format_id': f'https-{ext}', + 'ext': ext, + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + }) - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', headers=headers, m3u8_id='hls', - fatal=live, live=live, errnote='No HLS formats found')) + if not formats: + 
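The SMIL handling above reads `<video dur="…ms">` and scales to seconds via `float_or_none(remove_end(…, 'ms'), 1000)`. The same conversion without the yt-dlp helpers (function name is illustrative):

```python
def smil_dur_to_seconds(dur):
    # '112513ms' -> 112.513; SMIL durations here carry milliseconds
    if not dur:
        return None
    if dur.endswith('ms'):
        dur = dur[:-2]
    try:
        return float(dur) / 1000
    except ValueError:
        return None


assert smil_dur_to_seconds('112513ms') == 112.513
```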
self.raise_no_formats('No video content found in webpage', expected=True) + elif is_live: + try: + self._request_webpage( + HEADRequest(formats[0]['url']), video_id, note='Checking live status') + except ExtractorError: + raise UserNotLive(video_id=channel) return { - 'id': str_or_none(video_id), + 'id': video_id, 'channel': channel, - 'uploader': str_or_none(nbc_data.get('on_air_name')), - 'uploader_id': str_or_none(nbc_data.get('callLetters')), + 'channel_id': nbc_data.get('callLetters'), + 'uploader': nbc_data.get('on_air_name'), 'formats': formats, - 'is_live': live, + 'subtitles': subtitles, + 'is_live': is_live, **info, } diff --git a/hypervideo_dl/extractor/nebula.py b/hypervideo_dl/extractor/nebula.py index 861fcb1..4f3e691 100644 --- a/hypervideo_dl/extractor/nebula.py +++ b/hypervideo_dl/extractor/nebula.py @@ -1,13 +1,11 @@ import itertools import json -import time -import urllib.error -import urllib.parse from .common import InfoExtractor -from ..utils import ExtractorError, parse_iso8601, try_get +from ..networking.exceptions import HTTPError +from ..utils import ExtractorError, make_archive_id, parse_iso8601, remove_start -_BASE_URL_RE = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' +_BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' class NebulaBaseIE(InfoExtractor): @@ -15,11 +13,10 @@ class NebulaBaseIE(InfoExtractor): _nebula_api_token = None _nebula_bearer_token = None - _zype_access_token = None def _perform_nebula_auth(self, username, password): if not username or not password: - self.raise_login_required() + self.raise_login_required(method='password') data = json.dumps({'email': username, 'password': password}).encode('utf8') response = self._download_json( @@ -33,38 +30,10 @@ class NebulaBaseIE(InfoExtractor): note='Logging in to Nebula with supplied credentials', errnote='Authentication failed or rejected') if not response or not response.get('key'): - self.raise_login_required() - - # save nebula token as cookie - self._set_cookie( - 'nebula.app', 'nebula-auth', - urllib.parse.quote( - json.dumps({ - "apiToken": response["key"], - "isLoggingIn": False, - "isLoggingOut": False, - }, separators=(",", ":"))), - expire_time=int(time.time()) + 86400 * 365, - ) + self.raise_login_required(method='password') return response['key'] - def _retrieve_nebula_api_token(self, username=None, password=None): - """ - Check cookie jar for valid token. Try to authenticate using credentials if no valid token - can be found in the cookie jar. 
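The live branch above probes the first format URL with a `HEADRequest` and maps any failure to `UserNotLive`. A stdlib sketch of the same probe (the real code goes through `self._request_webpage`; this stand-in only mirrors the control flow):

```python
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

def stream_is_up(manifest_url):
    # HEAD-probe the manifest; any HTTP or transport error is taken to mean
    # the channel is not currently live
    try:
        with urlopen(Request(manifest_url, method='HEAD'), timeout=10):
            return True
    except (HTTPError, URLError):
        return False
```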
- """ - nebula_cookies = self._get_cookies('https://nebula.app') - nebula_cookie = nebula_cookies.get('nebula-auth') - if nebula_cookie: - self.to_screen('Authenticating to Nebula with token from cookie jar') - nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value) - nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken') - if nebula_api_token: - return nebula_api_token - - return self._perform_nebula_auth(username, password) - def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''): assert method in ('GET', 'POST',) assert auth_type in ('api', 'bearer',) @@ -79,7 +48,7 @@ class NebulaBaseIE(InfoExtractor): return inner_call() except ExtractorError as exc: # if 401 or 403, attempt credential re-auth and retry - if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403): + if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.status in (401, 403): self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') self._perform_login() return inner_call() @@ -95,35 +64,25 @@ class NebulaBaseIE(InfoExtractor): note='Authorizing to Nebula') return response['token'] - def _fetch_zype_access_token(self): - """ - Get a Zype access token, which is required to access video streams -- in our case: to - generate video URLs. - """ - user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token') - - access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str) - if not access_token: - if try_get(user_object, lambda x: x['is_subscribed'], bool): - # TODO: Reimplement the same Zype token polling the Nebula frontend implements - # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 - raise ExtractorError( - 'Unable to extract Zype access token from Nebula API authentication endpoint. 
' - 'Open an arbitrary video in a browser with this account to generate a token', - expected=True) - raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') - return access_token + def _fetch_video_formats(self, slug): + stream_info = self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/stream/', + video_id=slug, + auth_type='bearer', + note='Fetching video stream info') + manifest_url = stream_info['manifest'] + return self._extract_m3u8_formats_and_subtitles(manifest_url, slug, 'mp4') def _build_video_info(self, episode): - zype_id = episode['zype_id'] - zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}' + fmts, subs = self._fetch_video_formats(episode['slug']) channel_slug = episode['channel_slug'] + channel_title = episode['channel_title'] + zype_id = episode.get('zype_id') return { - 'id': episode['zype_id'], + 'id': remove_start(episode['id'], 'video_episode:'), 'display_id': episode['slug'], - '_type': 'url_transparent', - 'ie_key': 'Zype', - 'url': zype_video_url, + 'formats': fmts, + 'subtitles': subs, + 'webpage_url': f'https://nebula.tv/{episode["slug"]}', 'title': episode['title'], 'description': episode['description'], 'timestamp': parse_iso8601(episode['published_at']), @@ -133,30 +92,32 @@ class NebulaBaseIE(InfoExtractor): 'height': key, } for key, tn in episode['assets']['thumbnail'].items()], 'duration': episode['duration'], - 'channel': episode['channel_title'], + 'channel': channel_title, 'channel_id': channel_slug, - 'channel_url': f'https://nebula.app/{channel_slug}', - 'uploader': episode['channel_title'], + 'channel_url': f'https://nebula.tv/{channel_slug}', + 'uploader': channel_title, 'uploader_id': channel_slug, - 'uploader_url': f'https://nebula.app/{channel_slug}', - 'series': episode['channel_title'], - 'creator': episode['channel_title'], + 'uploader_url': f'https://nebula.tv/{channel_slug}', + 'series': channel_title, + 'creator': channel_title, + 'extractor_key': NebulaIE.ie_key(), + 'extractor': NebulaIE.IE_NAME, + '_old_archive_ids': [make_archive_id(NebulaIE, zype_id)] if zype_id else None, } def _perform_login(self, username=None, password=None): - self._nebula_api_token = self._retrieve_nebula_api_token(username, password) + self._nebula_api_token = self._perform_nebula_auth(username, password) self._nebula_bearer_token = self._fetch_nebula_bearer_token() - self._zype_access_token = self._fetch_zype_access_token() class NebulaIE(NebulaBaseIE): _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)' _TESTS = [ { - 'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast', + 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', 'md5': '14944cfee8c7beeea106320c47560efc', 'info_dict': { - 'id': '5c271b40b13fd613090034fd', + 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf', 'ext': 'mp4', 'title': 'That Time Disney Remade Beauty and the Beast', 'description': 'Note: this video was originally posted on YouTube with the sponsor read included. 
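Because Nebula video ids changed from Zype ids to UUIDs, `_build_video_info` above records the old id via `make_archive_id` so existing download archives still match. A simplified form of that helper (the real one also accepts an extractor class and derives the key from it):

```python
def make_archive_id(ie_key, video_id):
    # download-archive entries are '<lowercased extractor key> <video id>'
    return f'{ie_key.lower()} {video_id}'


assert make_archive_id('Nebula', '5c271b40b13fd613090034fd') == 'nebula 5c271b40b13fd613090034fd'
```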
We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.', @@ -167,47 +128,43 @@ class NebulaIE(NebulaBaseIE): 'uploader': 'Lindsay Ellis', 'uploader_id': 'lindsayellis', 'timestamp': 1533009600, - 'uploader_url': 'https://nebula.app/lindsayellis', + 'uploader_url': 'https://nebula.tv/lindsayellis', 'series': 'Lindsay Ellis', - 'average_rating': int, 'display_id': 'that-time-disney-remade-beauty-and-the-beast', - 'channel_url': 'https://nebula.app/lindsayellis', + 'channel_url': 'https://nebula.tv/lindsayellis', 'creator': 'Lindsay Ellis', 'duration': 2212, - 'view_count': int, 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', }, }, { - 'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', 'md5': 'd05739cf6c38c09322422f696b569c23', 'info_dict': { - 'id': '5e7e78171aaf320001fbd6be', + 'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34', 'ext': 'mp4', 'title': 'Landing Craft - How The Allies Got Ashore', 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', 'upload_date': '20200327', 'timestamp': 1585348140, - 'channel': 'Real Engineering', - 'channel_id': 'realengineering', - 'uploader': 'Real Engineering', - 'uploader_id': 'realengineering', - 'view_count': int, - 'series': 'Real Engineering', - 'average_rating': int, + 'channel': 'Real Engineering — The Logistics of D-Day', + 'channel_id': 'd-day', + 'uploader': 'Real Engineering — The Logistics of D-Day', + 'uploader_id': 'd-day', + 'series': 'Real Engineering — The Logistics of D-Day', 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'creator': 'Real Engineering', + 'creator': 'Real Engineering — The Logistics of D-Day', 'duration': 841, - 'channel_url': 'https://nebula.app/realengineering', - 'uploader_url': 'https://nebula.app/realengineering', + 'channel_url': 'https://nebula.tv/d-day', + 'uploader_url': 'https://nebula.tv/d-day', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', }, }, { - 'url': 'https://nebula.app/videos/money-episode-1-the-draw', + 'url': 'https://nebula.tv/videos/money-episode-1-the-draw', 'md5': 'ebe28a7ad822b9ee172387d860487868', 'info_dict': { - 'id': '5e779ebdd157bc0001d1c75a', + 'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553', 'ext': 'mp4', 'title': 'Episode 1: The Draw', 'description': r'contains:There’s free money on offer… if the players can all work together.', @@ -217,14 +174,12 @@ class NebulaIE(NebulaBaseIE): 'channel_id': 'tom-scott-presents-money', 'uploader': 'Tom Scott Presents: Money', 'uploader_id': 'tom-scott-presents-money', - 'uploader_url': 'https://nebula.app/tom-scott-presents-money', + 'uploader_url': 'https://nebula.tv/tom-scott-presents-money', 'duration': 825, - 'channel_url': 'https://nebula.app/tom-scott-presents-money', - 'view_count': int, + 'channel_url': 'https://nebula.tv/tom-scott-presents-money', 'series': 'Tom Scott Presents: Money', 'display_id': 'money-episode-1-the-draw', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', - 'average_rating': int, 'creator': 'Tom Scott Presents: Money', }, }, @@ -232,10 +187,14 @@ class NebulaIE(NebulaBaseIE): 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', 'only_matching': True, }, + { + 'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw', + 'only_matching': True, + }, ] def _fetch_video_metadata(self, slug): 
- return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/', + return self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/', video_id=slug, auth_type='bearer', note='Fetching video meta data') @@ -251,7 +210,7 @@ class NebulaSubscriptionsIE(NebulaBaseIE): _VALID_URL = rf'{_BASE_URL_RE}/myshows' _TESTS = [ { - 'url': 'https://nebula.app/myshows', + 'url': 'https://nebula.tv/myshows', 'playlist_mincount': 1, 'info_dict': { 'id': 'myshows', @@ -279,7 +238,7 @@ class NebulaChannelIE(NebulaBaseIE): _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P<id>[-\w]+)' _TESTS = [ { - 'url': 'https://nebula.app/tom-scott-presents-money', + 'url': 'https://nebula.tv/tom-scott-presents-money', 'info_dict': { 'id': 'tom-scott-presents-money', 'title': 'Tom Scott Presents: Money', @@ -287,13 +246,13 @@ class NebulaChannelIE(NebulaBaseIE): }, 'playlist_count': 5, }, { - 'url': 'https://nebula.app/lindsayellis', + 'url': 'https://nebula.tv/lindsayellis', 'info_dict': { 'id': 'lindsayellis', 'title': 'Lindsay Ellis', 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', }, - 'playlist_mincount': 100, + 'playlist_mincount': 2, }, ] diff --git a/hypervideo_dl/extractor/nekohacker.py b/hypervideo_dl/extractor/nekohacker.py new file mode 100644 index 0000000..e10ffe9 --- /dev/null +++ b/hypervideo_dl/extractor/nekohacker.py @@ -0,0 +1,217 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + extract_attributes, + get_element_by_class, + get_element_text_and_html_by_tag, + parse_duration, + traverse_obj, + try_call, + url_or_none, +) + + +class NekoHackerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nekohacker\.com/(?P<id>(?!free-dl)[\w-]+)' + _TESTS = [{ + 'url': 'https://nekohacker.com/nekoverse/', + 'info_dict': { + 'id': 'nekoverse', + 'title': 'Nekoverse', + }, + 'playlist': [ + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/01-Spaceship.mp3', + 'md5': '44223701ebedba0467ebda4cc07fb3aa', + 'info_dict': { + 'id': '1712', + 'ext': 'mp3', + 'title': 'Spaceship', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Spaceship', + 'track_number': 1, + 'duration': 195.0 + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/02-City-Runner.mp3', + 'md5': '8f853c71719389d32bbbd3f1a87b3f08', + 'info_dict': { + 'id': '1713', + 'ext': 'mp3', + 'title': 'City Runner', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'City Runner', + 'track_number': 2, + 'duration': 148.0 + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/03-Nature-Talk.mp3', + 'md5': '5a8a8ae852720cee4c0ac95c7d1a7450', + 'info_dict': { + 'id': '1714', + 'ext': 'mp3', + 'title': 'Nature Talk', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Nature Talk', + 'track_number': 3, + 'duration': 174.0 + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2022/11/04-Crystal-World.mp3', + 'md5': 'd8e59a48061764e50d92386a294abd50', + 
'info_dict': { + 'id': '1715', + 'ext': 'mp3', + 'title': 'Crystal World', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2022/11/Nekoverse_Artwork-1024x1024.jpg', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20221101', + 'album': 'Nekoverse', + 'artist': 'Neko Hacker', + 'track': 'Crystal World', + 'track_number': 4, + 'duration': 199.0 + } + } + ] + }, { + 'url': 'https://nekohacker.com/susume/', + 'info_dict': { + 'id': 'susume', + 'title': '進め!むじなカンパニー', + }, + 'playlist': [ + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/進め!むじなカンパニー-feat.-六科なじむ-CV_-日高里菜-割戶真友-CV_-金元寿子-軽井沢ユキ-CV_-上坂すみれ-出稼ぎガルシア-CV_-金子彩花-.mp3', + 'md5': 'fb13f008aa81f26ba48f91fd2d6186ce', + 'info_dict': { + 'id': '711', + 'ext': 'mp3', + 'title': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0', + 'track_number': 1, + 'duration': None + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/むじな-de-なじむ-feat.-六科なじむ-CV_-日高里菜-.mp3', + 'md5': '028803f70241df512b7764e73396fdd1', + 'info_dict': { + 'id': '709', + 'ext': 'mp3', + 'title': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )', + 'track_number': 2, + 'duration': None + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/進め!むじなカンパニー-instrumental.mp3', + 'md5': 'adde9e9a16e1da5e602b579c247d0fb9', + 'info_dict': { + 'id': '710', + 'ext': 'mp3', + 'title': '進め!むじなカンパニー (instrumental)', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': '進め!むじなカンパニー (instrumental)', + 'track_number': 3, + 'duration': None + } + }, + { + 'url': 'https://nekohacker.com/wp-content/uploads/2021/01/むじな-de-なじむ-instrumental.mp3', + 'md5': 'ebb0443039cf5f9ff7fd557ed9b23599', + 'info_dict': { + 'id': '712', + 'ext': 'mp3', + 'title': 'むじな de なじむ (instrumental)', + 'thumbnail': 'https://nekohacker.com/wp-content/uploads/2021/01/OP表-1024x1024.png', + 'vcodec': 'none', + 'acodec': 'mp3', + 'release_date': '20210115', + 'album': '進め!むじなカンパニー', + 'artist': 'Neko Hacker', + 'track': 'むじな de なじむ (instrumental)', + 'track_number': 4, + 'duration': None + } + } + ] + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + playlist = get_element_by_class('playlist', webpage) + + if not playlist: + iframe = try_call(lambda: get_element_text_and_html_by_tag('iframe', webpage)[1]) or '' + iframe_src = url_or_none(extract_attributes(iframe).get('src')) + if not iframe_src: + raise ExtractorError('No playlist or embed found in webpage') + elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src): + raise ExtractorError('Spotify embeds are not supported', expected=True) + return self.url_result(url, 'Generic') + + entries = [] + for track_number, track in enumerate(re.findall(r'(<li[^>]+data-audiopath[^>]+>)', playlist), 1): + entry = traverse_obj(extract_attributes(track), { + 'url': ('data-audiopath', 
{url_or_none}), + 'ext': ('data-audiopath', {determine_ext}), + 'id': 'data-trackid', + 'title': 'data-tracktitle', + 'track': 'data-tracktitle', + 'album': 'data-albumtitle', + 'duration': ('data-tracktime', {parse_duration}), + 'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0), + 'thumbnail': ('data-albumart', {url_or_none}), + }) + entries.append({ + **entry, + 'track_number': track_number, + 'artist': 'Neko Hacker', + 'vcodec': 'none', + 'acodec': 'mp3' if entry['ext'] == 'mp3' else None, + }) + + return self.playlist_result(entries, playlist_id, traverse_obj(entries, (0, 'album'))) diff --git a/hypervideo_dl/extractor/neteasemusic.py b/hypervideo_dl/extractor/neteasemusic.py index 5957098..5b7307b 100644 --- a/hypervideo_dl/extractor/neteasemusic.py +++ b/hypervideo_dl/extractor/neteasemusic.py @@ -11,6 +11,7 @@ from random import randint from .common import InfoExtractor from ..aes import aes_ecb_encrypt, pkcs7_padding from ..compat import compat_urllib_parse_urlencode +from ..networking import Request from ..utils import ( ExtractorError, bytes_to_intlist, @@ -18,7 +19,6 @@ from ..utils import ( float_or_none, int_or_none, intlist_to_bytes, - sanitized_Request, try_get, ) @@ -146,8 +146,8 @@ class NetEaseMusicBaseIE(InfoExtractor): return int(round(ms / 1000.0)) def query_api(self, endpoint, video_id, note): - req = sanitized_Request('%s%s' % (self._API_BASE, endpoint)) - req.add_header('Referer', self._API_BASE) + req = Request('%s%s' % (self._API_BASE, endpoint)) + req.headers['Referer'] = self._API_BASE return self._download_json(req, video_id, note) diff --git a/hypervideo_dl/extractor/netverse.py b/hypervideo_dl/extractor/netverse.py index 719a9da..ef53e15 100644 --- a/hypervideo_dl/extractor/netverse.py +++ b/hypervideo_dl/extractor/netverse.py @@ -1,4 +1,6 @@ -from .common import InfoExtractor +import itertools + +from .common import InfoExtractor, SearchInfoExtractor from .dailymotion import DailymotionIE from ..utils import smuggle_url, traverse_obj @@ -16,6 +18,26 @@ class NetverseBaseIE(InfoExtractor): f'https://api.netverse.id/medias/api/v2/{self._ENDPOINTS[endpoint]}/{slug}/{season_id}', display_id or slug, query=query) + def _get_comments(self, video_id): + last_page_number = None + for i in itertools.count(1): + comment_data = self._download_json( + f'https://api.netverse.id/mediadetails/api/v3/videos/comments/{video_id}', + video_id, data=b'', fatal=False, query={'page': i}, + note=f'Downloading JSON comment metadata page {i}') or {} + yield from traverse_obj(comment_data, ('response', 'comments', 'data', ..., { + 'id': '_id', + 'text': 'comment', + 'author_id': 'customer_id', + 'author': ('customer', 'name'), + 'author_thumbnail': ('customer', 'profile_picture'), + })) + + if not last_page_number: + last_page_number = traverse_obj(comment_data, ('response', 'comments', 'last_page')) + if i >= (last_page_number or 0): + break + class NetverseIE(NetverseBaseIE): _VALID_URL = r'https?://(?:\w+\.)?netverse\.id/(?P<type>watch|video)/(?P<display_id>[^/?#&]+)' @@ -28,7 +50,7 @@ class NetverseIE(NetverseBaseIE): 'ext': 'mp4', 'season': 'Season 2016', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/T7aV31Y0eGRWBbwkK/x1080', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', 'episode_number': 22, 'episode': 'Episode 22', 'uploader_id': 'x2ir3vq', @@ -51,7 +73,7 @@ class NetverseIE(NetverseBaseIE): 'ext': 'mp4', 'season': 'Season 2', 'description': 
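In the NekoHacker playlist parsing above, `traverse_obj` applies a transform chain per `<li>` attribute; `('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0)` strips dots and keeps the leading eight digits as a YYYYMMDD release date, the trailing `0` indexing group 0 of the match. The same transform standalone (the dotted input format is inferred from the tests' `20210115`-style dates):

```python
import re

def release_date_from_attr(value):
    # '.'-separated dates collapse to YYYYMMDD
    m = re.match(r'\d{8}', value.replace('.', ''))
    return m.group(0) if m else None


assert release_date_from_attr('2021.01.15') == '20210115'
assert release_date_from_attr('TBA') is None
```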
'md5:8a74f70812cca267e19ee0635f0af835', - 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/Thwuy1YURicFmGu0v/x1080', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', 'episode_number': 2, 'episode': 'Episode 2', 'view_count': int, @@ -75,7 +97,7 @@ class NetverseIE(NetverseBaseIE): 'title': 'Tetangga Baru', 'season': 'Season 1', 'description': 'md5:23fcf70e97d461d3029d25d59b2ccfb9', - 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/T3Ogm1YEnnyjVKAFF/x1080', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', 'episode_number': 1, 'episode': 'Episode 1', 'timestamp': 1624538169, @@ -96,7 +118,7 @@ class NetverseIE(NetverseBaseIE): 'info_dict': { 'id': 'x887jzz', 'ext': 'mp4', - 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/TfuZ_1Y6PboJ5An_s/x1080', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', 'season': 'Season 1', 'episode_number': 1, 'description': 'md5:d4f627b3e7a3f9acdc55f6cdd5ea41d5', @@ -114,6 +136,60 @@ class NetverseIE(NetverseBaseIE): 'upload_date': '20220225', }, 'skip': 'This video get Geo-blocked for some country' + }, { + # video with comments + 'url': 'https://netverse.id/video/episode-1-season-2016-ok-food', + 'info_dict': { + 'id': 'k6hetBPiQMljSxxvAy7', + 'ext': 'mp4', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', + 'display_id': 'episode-1-season-2016-ok-food', + 'like_count': int, + 'description': '', + 'duration': 1471, + 'age_limit': 0, + 'timestamp': 1642405848, + 'episode_number': 1, + 'season': 'Season 2016', + 'uploader_id': 'x2ir3vq', + 'title': 'Episode 1 - Season 2016 - Ok Food', + 'upload_date': '20220117', + 'tags': [], + 'view_count': int, + 'episode': 'Episode 1', + 'uploader': 'Net Prime', + 'comment_count': int, + }, + 'params': { + 'getcomments': True + } + }, { + # video with multiple page comment + 'url': 'https://netverse.id/video/match-island-eps-1-fix', + 'info_dict': { + 'id': 'x8aznjc', + 'ext': 'mp4', + 'like_count': int, + 'tags': ['Match-Island', 'Pd00111'], + 'display_id': 'match-island-eps-1-fix', + 'view_count': int, + 'episode': 'Episode 1', + 'uploader': 'Net Prime', + 'duration': 4070, + 'timestamp': 1653068165, + 'description': 'md5:e9cf3b480ad18e9c33b999e3494f223f', + 'age_limit': 0, + 'title': 'Welcome To Match Island', + 'upload_date': '20220520', + 'episode_number': 1, + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/[^/]+/x1080', + 'uploader_id': 'x2ir3vq', + 'season': 'Season 1', + 'comment_count': int, + }, + 'params': { + 'getcomments': True + } }] def _real_extract(self, url): @@ -131,6 +207,7 @@ class NetverseIE(NetverseBaseIE): 'thumbnail': traverse_obj(videos, ('program_detail', 'thumbnail_image')), 'description': traverse_obj(videos, ('program_detail', 'description')), 'episode_number': videos.get('episode_order'), + '__post_extractor': self.extract_comments(display_id), } @@ -174,3 +251,31 @@ class NetversePlaylistIE(NetverseBaseIE): self.parse_playlist(playlist_data['response'], playlist_id), traverse_obj(playlist_data, ('response', 'webseries_info', 'slug')), traverse_obj(playlist_data, ('response', 'webseries_info', 'title'))) + + +class NetverseSearchIE(SearchInfoExtractor): + _SEARCH_KEY = 'netsearch' + + _TESTS = [{ + 'url': 'netsearch10:tetangga', + 'info_dict': { + 'id': 'tetangga', + 'title': 'tetangga', + }, + 'playlist_count': 10, + }] + + def _search_results(self, query): + last_page = None + for i in itertools.count(1): + search_data = self._download_json( + 'https://api.netverse.id/search/elastic/search', query, + query={'q': query, 'page': i}, note=f'Downloading 
page {i}') + + videos = traverse_obj(search_data, ('response', 'data', ...)) + for video in videos: + yield self.url_result(f'https://netverse.id/video/{video["slug"]}', NetverseIE) + + last_page = last_page or traverse_obj(search_data, ('response', 'lastpage')) + if not videos or i >= (last_page or 0): + break diff --git a/hypervideo_dl/extractor/nfl.py b/hypervideo_dl/extractor/nfl.py index 29c53d5..cc3f449 100644 --- a/hypervideo_dl/extractor/nfl.py +++ b/hypervideo_dl/extractor/nfl.py @@ -1,10 +1,18 @@ +import base64 +import json import re +import time +import uuid +from .anvato import AnvatoIE from .common import InfoExtractor from ..utils import ( + ExtractorError, clean_html, determine_ext, get_element_by_class, + traverse_obj, + urlencode_postdata, ) @@ -54,15 +62,14 @@ class NFLBaseIE(InfoExtractor): )/ ''' _VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+});?\s*</script>' + _ANVATO_PREFIX = 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' def _parse_video_config(self, video_config, display_id): video_config = self._parse_json(video_config, display_id) item = video_config['playlist'][0] mcp_id = item.get('mcpID') if mcp_id: - info = self.url_result( - 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' + mcp_id, - 'Anvato', mcp_id) + info = self.url_result(f'{self._ANVATO_PREFIX}{mcp_id}', AnvatoIE, mcp_id) else: media_id = item.get('id') or item['entityId'] title = item.get('title') @@ -157,3 +164,138 @@ class NFLArticleIE(NFLBaseIE): 'nfl-c-article__title', webpage)) or self._html_search_meta( ['og:title', 'twitter:title'], webpage) return self.playlist_result(entries, display_id, title) + + +class NFLPlusReplayIE(NFLBaseIE): + IE_NAME = 'nfl.com:plus:replay' + _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/games/[\w-]+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1/1572108', + 'info_dict': { + 'id': '1572108', + 'ext': 'mp4', + 'title': 'New York Giants at Minnesota Vikings', + 'description': 'New York Giants play the Minnesota Vikings at U.S. Bank Stadium on January 15, 2023', + 'uploader': 'NFL', + 'upload_date': '20230116', + 'timestamp': 1673864520, + 'duration': 7157, + 'categories': ['Game Highlights'], + 'tags': ['Minnesota Vikings', 'New York Giants', 'Minnesota Vikings vs. 
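Both `_get_comments` and `_search_results` above share one pagination shape: count pages from 1, remember the reported last page once seen, and stop when past it or out of items. The loop in isolation, with `fetch_page` standing in for the JSON API call:

```python
import itertools

def paginate(fetch_page):
    last_page = None
    for i in itertools.count(1):
        page = fetch_page(i) or {}
        items = page.get('data') or []
        yield from items
        # the last-page number is only read once, from the first response that has it
        last_page = last_page or page.get('lastpage')
        if not items or i >= (last_page or 0):
            break


pages = {1: {'data': ['a', 'b'], 'lastpage': 2}, 2: {'data': ['c'], 'lastpage': 2}}
assert list(paginate(lambda i: pages.get(i))) == ['a', 'b', 'c']
```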
New York Giants'], + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + + +class NFLPlusEpisodeIE(NFLBaseIE): + IE_NAME = 'nfl.com:plus:episode' + _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/episodes/(?P<id>[\w-]+)' + _TESTS = [{ + 'note': 'premium content', + 'url': 'https://www.nfl.com/plus/episodes/kurt-s-qb-insider-conference-championships', + 'info_dict': { + 'id': '1576832', + 'ext': 'mp4', + 'title': 'Kurt\'s QB Insider: Conference Championships', + 'description': 'md5:944f7fab56f7a37430bf8473f5473857', + 'uploader': 'NFL', + 'upload_date': '20230127', + 'timestamp': 1674782760, + 'duration': 730, + 'categories': ['Analysis'], + 'tags': ['Cincinnati Bengals at Kansas City Chiefs (2022-POST-3)'], + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + _CLIENT_DATA = { + 'clientKey': '4cFUW6DmwJpzT9L7LrG3qRAcABG5s04g', + 'clientSecret': 'CZuvCL49d9OwfGsR', + 'deviceId': str(uuid.uuid4()), + 'deviceInfo': base64.b64encode(json.dumps({ + 'model': 'desktop', + 'version': 'Chrome', + 'osName': 'Windows', + 'osVersion': '10.0', + }, separators=(',', ':')).encode()).decode(), + 'networkType': 'other', + 'nflClaimGroupsToAdd': [], + 'nflClaimGroupsToRemove': [], + } + _ACCOUNT_INFO = {} + _API_KEY = None + + _TOKEN = None + _TOKEN_EXPIRY = 0 + + def _get_account_info(self, url, video_id): + cookies = self._get_cookies('https://www.nfl.com/') + login_token = traverse_obj(cookies, ( + (f'glt_{self._API_KEY}', f'gig_loginToken_{self._API_KEY}', + lambda k, _: k.startswith('glt_') or k.startswith('gig_loginToken_')), + {lambda x: x.value}), get_all=False) + if not login_token: + self.raise_login_required() + + account = self._download_json( + 'https://auth-id.nfl.com/accounts.getAccountInfo', video_id, + note='Downloading account info', data=urlencode_postdata({ + 'include': 'profile,data', + 'lang': 'en', + 'APIKey': self._API_KEY, + 'sdk': 'js_latest', + 'login_token': login_token, + 'authMode': 'cookie', + 'pageURL': url, + 'sdkBuild': traverse_obj(cookies, ( + 'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='13642'), + 'format': 'json', + }), headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + self._ACCOUNT_INFO = traverse_obj(account, { + 'signatureTimestamp': 'signatureTimestamp', + 'uid': 'UID', + 'uidSignature': 'UIDSignature', + }) + + if len(self._ACCOUNT_INFO) != 3: + raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True) + + def _get_auth_token(self, url, video_id): + if not self._ACCOUNT_INFO: + self._get_account_info(url, video_id) + + token = self._download_json( + 'https://api.nfl.com/identity/v3/token%s' % ( + '/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''), + video_id, headers={'Content-Type': 'application/json'}, note='Downloading access token', + data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode()) + + self._TOKEN = token['accessToken'] + self._TOKEN_EXPIRY = token['expiresIn'] + self._ACCOUNT_INFO['refreshToken'] = token['refreshToken'] + + def _real_extract(self, url): + slug = self._match_id(url) + + if not self._API_KEY: + webpage = self._download_webpage(url, slug, fatal=False) or '' + self._API_KEY = self._search_regex( + r'window\.gigyaApiKey=["\'](\w+)["\'];', webpage, 'API key', + 
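The `deviceInfo` field of `_CLIENT_DATA` above is compact JSON, base64-encoded and sent as a plain string. The same construction on its own (function name is illustrative):

```python
import base64
import json

def encode_device_info():
    # compact separators, so no spaces end up in the encoded payload
    return base64.b64encode(json.dumps({
        'model': 'desktop',
        'version': 'Chrome',
        'osName': 'Windows',
        'osVersion': '10.0',
    }, separators=(',', ':')).encode()).decode()


assert json.loads(base64.b64decode(encode_device_info()))['osName'] == 'Windows'
```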
default='3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f') + + if not self._TOKEN or self._TOKEN_EXPIRY <= int(time.time()): + self._get_auth_token(url, slug) + + video_id = self._download_json( + f'https://api.nfl.com/content/v1/videos/episodes/{slug}', slug, headers={ + 'Authorization': f'Bearer {self._TOKEN}', + })['mcpPlaybackId'] + + return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) diff --git a/hypervideo_dl/extractor/nhk.py b/hypervideo_dl/extractor/nhk.py index 59702b2..fbd6a18 100644 --- a/hypervideo_dl/extractor/nhk.py +++ b/hypervideo_dl/extractor/nhk.py @@ -2,11 +2,15 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, + int_or_none, + join_nonempty, parse_duration, traverse_obj, unescapeHTML, unified_timestamp, - urljoin + url_or_none, + urljoin, ) @@ -66,7 +70,7 @@ class NhkBaseIE(InfoExtractor): info.update({ '_type': 'url_transparent', 'ie_key': 'Piksel', - 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id, + 'url': 'https://movie-s.nhk.or.jp/v/refid/nhkworld/prefid/' + vod_id, 'id': vod_id, }) else: @@ -93,6 +97,19 @@ class NhkVodIE(NhkBaseIE): # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2061601/', + 'info_dict': { + 'id': 'yd8322ch', + 'ext': 'mp4', + 'description': 'md5:109c8b05d67a62d0592f2b445d2cd898', + 'title': 'GRAND SUMO Highlights - [Recap] May Tournament Day 1 (Opening Day)', + 'upload_date': '20230514', + 'timestamp': 1684083791, + 'series': 'GRAND SUMO Highlights', + 'episode': '[Recap] May Tournament Day 1 (Opening Day)', + 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1684084443/4028649.jpg?w=1920&h=1080', + }, + }, { # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', @@ -103,6 +120,9 @@ class NhkVodIE(NhkBaseIE): 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', 'timestamp': 1565965194, 'upload_date': '20190816', + 'thumbnail': 'https://mz-edge.stream.co.jp/thumbs/aid/t1567086278/3715195.jpg?w=1920&h=1080', + 'series': 'Dining with the Chef', + 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', }, }, { # audio clip @@ -113,10 +133,7 @@ class NhkVodIE(NhkBaseIE): 'title': "Japan's Top Inventions - Miniature Video Cameras", 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'skip': '404 Not Found', }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, @@ -132,7 +149,6 @@ class NhkVodIE(NhkBaseIE): }, { # video, alphabetic character in ID #29670 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/', - 'only_matching': True, 'info_dict': { 'id': 'qfjay6cg', 'ext': 'mp4', @@ -141,7 +157,8 @@ class NhkVodIE(NhkBaseIE): 'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$', 'upload_date': '20210615', 'timestamp': 1623722008, - } + }, + 'skip': '404 Not Found', }] def _real_extract(self, url): @@ -152,12 +169,19 @@ class NhkVodProgramIE(NhkBaseIE): _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' 
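The token handling above fetches once and refreshes only when `_TOKEN_EXPIRY <= int(time.time())`, switching to the `/token/refresh` endpoint once a `refreshToken` is held. A sketch of the expiry bookkeeping, with `fetch` standing in for the `/identity/v3/token` request; note that, matching the extractor, the API's `expiresIn` is treated as an absolute epoch timestamp:

```python
import time

class TokenCache:
    def __init__(self, fetch):
        self._fetch = fetch
        self._token, self._expiry = None, 0

    def get(self):
        if not self._token or self._expiry <= int(time.time()):
            resp = self._fetch()  # /token, or /token/refresh once a refreshToken is held
            self._token, self._expiry = resp['accessToken'], resp['expiresIn']
        return self._token
```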
% (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) _TESTS = [{ # video program episodes + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo', + 'info_dict': { + 'id': 'sumo', + 'title': 'GRAND SUMO Highlights', + }, + 'playlist_mincount': 12, + }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', }, - 'playlist_mincount': 1, + 'playlist_mincount': 12, }, { # video program clips 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip', @@ -334,3 +358,210 @@ class NhkForSchoolProgramListIE(InfoExtractor): for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []] return self.playlist_result(bangumis, program_id, title, description) + + +class NhkRadiruIE(InfoExtractor): + _GEO_COUNTRIES = ['JP'] + IE_DESC = 'NHK らじる (Radiru/Rajiru)' + _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/(?:player/ondemand|ondemand/detail)\.html\?p=(?P<site>[\da-zA-Z]+)_(?P<corner>[\da-zA-Z]+)(?:_(?P<headline>[\da-zA-Z]+))?' + _TESTS = [{ + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=0449_01_3853544', + 'skip': 'Episode expired on 2023-04-16', + 'info_dict': { + 'channel': 'NHK-FM', + 'description': 'md5:94b08bdeadde81a97df4ec882acce3e9', + 'ext': 'm4a', + 'id': '0449_01_3853544', + 'series': 'ジャズ・トゥナイト', + 'thumbnail': 'https://www.nhk.or.jp/prog/img/449/g449.jpg', + 'timestamp': 1680969600, + 'title': 'ジャズ・トゥナイト NEWジャズ特集', + 'upload_date': '20230408', + 'release_timestamp': 1680962400, + 'release_date': '20230408', + 'was_live': True, + }, + }, { + # playlist, airs every weekday so it should _hopefully_ be okay forever + 'url': 'https://www.nhk.or.jp/radio/ondemand/detail.html?p=0458_01', + 'info_dict': { + 'id': '0458_01', + 'title': 'ベストオブクラシック', + 'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。', + 'channel': 'NHK-FM', + 'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg', + }, + 'playlist_mincount': 3, + }, { + # one with letters in the id + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F300_06_3738470', + 'note': 'Expires on 2024-03-31', + 'info_dict': { + 'id': 'F300_06_3738470', + 'ext': 'm4a', + 'title': '有島武郎「一房のぶどう」', + 'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n(2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より)', + 'channel': 'NHKラジオ第1、NHK-FM', + 'timestamp': 1635757200, + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg', + 'release_date': '20161207', + 'series': 'らじる文庫 by ラジオ深夜便 ', + 'release_timestamp': 1481126700, + 'upload_date': '20211101', + } + }, { + # news + 'url': 'https://www.nhk.or.jp/radio/player/ondemand.html?p=F261_01_3855109', + 'skip': 'Expires on 2023-04-17', + 'info_dict': { + 'id': 'F261_01_3855109', + 'ext': 'm4a', + 'channel': 'NHKラジオ第1', + 'timestamp': 1681635900, + 'release_date': '20230416', + 'series': 'NHKラジオニュース', + 'title': '午後6時のNHKニュース', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', + 'upload_date': '20230416', + 'release_timestamp': 1681635600, + }, + }] + + def _extract_episode_info(self, headline, programme_id, series_meta): + episode_id = f'{programme_id}_{headline["headline_id"]}' + episode = traverse_obj(headline, ('file_list', 0, {dict})) + + return { + **series_meta, + 'id': episode_id, + 'formats': self._extract_m3u8_formats(episode.get('file_name'), episode_id, fatal=False), + 'container': 'm4a_dash', # force fixup, AAC-only HLS + 'was_live': True, + 'series': 
series_meta.get('title'), + 'thumbnail': url_or_none(headline.get('headline_image')) or series_meta.get('thumbnail'), + **traverse_obj(episode, { + 'title': 'file_title', + 'description': 'file_title_sub', + 'timestamp': ('open_time', {unified_timestamp}), + 'release_timestamp': ('aa_vinfo4', {lambda x: x.split('_')[0]}, {unified_timestamp}), + }), + } + + def _real_extract(self, url): + site_id, corner_id, headline_id = self._match_valid_url(url).group('site', 'corner', 'headline') + programme_id = f'{site_id}_{corner_id}' + + if site_id == 'F261': + json_url = 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json' + else: + json_url = f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json' + + meta = self._download_json(json_url, programme_id)['main'] + + series_meta = traverse_obj(meta, { + 'title': 'program_name', + 'channel': 'media_name', + 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}), + }, get_all=False) + + if headline_id: + return self._extract_episode_info( + traverse_obj(meta, ( + 'detail_list', lambda _, v: v['headline_id'] == headline_id), get_all=False), + programme_id, series_meta) + + def entries(): + for headline in traverse_obj(meta, ('detail_list', ..., {dict})): + yield self._extract_episode_info(headline, programme_id, series_meta) + + return self.playlist_result( + entries(), programme_id, playlist_description=meta.get('site_detail'), **series_meta) + + +class NhkRadioNewsPageIE(InfoExtractor): + _VALID_URL = r'https?://www\.nhk\.or\.jp/radionews/?(?:$|[?#])' + _TESTS = [{ + # airs daily, on-the-hour most hours + 'url': 'https://www.nhk.or.jp/radionews/', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'F261_01', + 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', + 'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d', + 'channel': 'NHKラジオ第1', + 'title': 'NHKラジオニュース', + } + }] + + def _real_extract(self, url): + return self.url_result('https://www.nhk.or.jp/radio/ondemand/detail.html?p=F261_01', NhkRadiruIE) + + +class NhkRadiruLiveIE(InfoExtractor): + _GEO_COUNTRIES = ['JP'] + _VALID_URL = r'https?://www\.nhk\.or\.jp/radio/player/\?ch=(?P<id>r[12]|fm)' + _TESTS = [{ + # radio 1, no area specified + 'url': 'https://www.nhk.or.jp/radio/player/?ch=r1', + 'info_dict': { + 'id': 'r1-tokyo', + 'title': 're:^NHKネットラジオ第1 東京.+$', + 'ext': 'm4a', + 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r1-200x200.png', + 'live_status': 'is_live', + }, + }, { + # radio 2, area specified + # (the area doesnt actually matter, r2 is national) + 'url': 'https://www.nhk.or.jp/radio/player/?ch=r2', + 'params': {'extractor_args': {'nhkradirulive': {'area': ['fukuoka']}}}, + 'info_dict': { + 'id': 'r2-fukuoka', + 'title': 're:^NHKネットラジオ第2 福岡.+$', + 'ext': 'm4a', + 'thumbnail': 'https://www.nhk.or.jp/common/img/media/r2-200x200.png', + 'live_status': 'is_live', + }, + }, { + # fm, area specified + 'url': 'https://www.nhk.or.jp/radio/player/?ch=fm', + 'params': {'extractor_args': {'nhkradirulive': {'area': ['sapporo']}}}, + 'info_dict': { + 'id': 'fm-sapporo', + 'title': 're:^NHKネットラジオFM 札幌.+$', + 'ext': 'm4a', + 'thumbnail': 'https://www.nhk.or.jp/common/img/media/fm-200x200.png', + 'live_status': 'is_live', + } + }] + + _NOA_STATION_IDS = {'r1': 'n1', 'r2': 'n2', 'fm': 'n3'} + + def _real_extract(self, url): + station = self._match_id(url) + area = self._configuration_arg('area', ['tokyo'])[0] + + config = self._download_xml( + 'https://www.nhk.or.jp/radio/config/config_web.xml', station, 
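The endpoint selection in `NhkRadiruIE._real_extract` above special-cases the radio news programme (site id `F261`), which lives on a different JSON endpoint than regular on-demand series. In isolation (function name is illustrative):

```python
def radiru_json_url(site_id, corner_id):
    programme_id = f'{site_id}_{corner_id}'
    if site_id == 'F261':  # NHKラジオニュース
        return 'https://www.nhk.or.jp/s-media/news/news-site/list/v1/all.json'
    return f'https://www.nhk.or.jp/radioondemand/json/{site_id}/bangumi_{programme_id}.json'


assert radiru_json_url('0449', '01').endswith('/0449/bangumi_0449_01.json')
assert radiru_json_url('F261', '01').endswith('/v1/all.json')
```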
'Downloading area information') + data = config.find(f'.//data//area[.="{area}"]/..') + + if not data: + raise ExtractorError('Invalid area. Valid areas are: %s' % ', '.join( + [i.text for i in config.findall('.//data//area')]), expected=True) + + noa_info = self._download_json( + f'https:{config.find(".//url_program_noa").text}'.format(area=data.find('areakey').text), + station, note=f'Downloading {area} station metadata') + present_info = traverse_obj(noa_info, ('nowonair_list', self._NOA_STATION_IDS.get(station), 'present')) + + return { + 'title': ' '.join(traverse_obj(present_info, (('service', 'area',), 'name', {str}))), + 'id': join_nonempty(station, area), + 'thumbnails': traverse_obj(present_info, ('service', 'images', ..., { + 'url': 'url', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + })), + 'formats': self._extract_m3u8_formats(data.find(f'{station}hls').text, station), + 'is_live': True, + } diff --git a/hypervideo_dl/extractor/niconico.py b/hypervideo_dl/extractor/niconico.py index 2103037..fa2d709 100644 --- a/hypervideo_dl/extractor/niconico.py +++ b/hypervideo_dl/extractor/niconico.py @@ -5,13 +5,15 @@ import json import re import time +from urllib.parse import urlparse + from .common import InfoExtractor, SearchInfoExtractor -from ..compat import ( - compat_HTTPError, -) +from ..dependencies import websockets +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, OnDemandPagedList, + WebSocketsWrapper, bug_reports_message, clean_html, float_or_none, @@ -392,7 +394,7 @@ class NiconicoIE(InfoExtractor): webpage, handle = self._download_webpage_handle( 'https://www.nicovideo.jp/watch/' + video_id, video_id) if video_id.startswith('so'): - video_id = self._match_id(handle.geturl()) + video_id = self._match_id(handle.url) api_data = self._parse_json(self._html_search_regex( 'data-api-data="([^"]+)"', webpage, @@ -403,9 +405,9 @@ class NiconicoIE(InfoExtractor): 'https://www.nicovideo.jp/api/watch/v3/%s?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_%d' % (video_id, round(time.time() * 1000)), video_id, note='Downloading API JSON', errnote='Unable to fetch data')['data'] except ExtractorError as e: - if not isinstance(e.cause, compat_HTTPError): + if not isinstance(e.cause, HTTPError): raise - webpage = e.cause.read().decode('utf-8', 'replace') + webpage = e.cause.response.read().decode('utf-8', 'replace') error_msg = self._html_search_regex( r'(?s)<section\s+class="(?:(?:ErrorMessage|WatchExceptionPage-message)\s*)+">(.+?)</section>', webpage, 'error reason', default=None) @@ -477,23 +479,32 @@ class NiconicoIE(InfoExtractor): user_id_str = session_api_data.get('serviceUserId') thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive'])) - raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key) - if not raw_danmaku: + legacy_danmaku = self._extract_legacy_comments(video_id, thread_ids, user_id_str, comment_user_key) or [] + + new_comments = traverse_obj(api_data, ('comment', 'nvComment')) + new_danmaku = self._extract_new_comments( + new_comments.get('server'), video_id, + new_comments.get('params'), new_comments.get('threadKey')) + + if not legacy_danmaku and not new_danmaku: self.report_warning(f'Failed to get comments.
{bug_reports_message()}') return + return { 'comments': [{ 'ext': 'json', - 'data': json.dumps(raw_danmaku), + 'data': json.dumps(legacy_danmaku + new_danmaku), }], } - def _extract_all_comments(self, video_id, threads, user_id, user_key): + def _extract_legacy_comments(self, video_id, threads, user_id, user_key): auth_data = { 'user_id': user_id, 'userkey': user_key, } if user_id and user_key else {'user_id': ''} + api_url = traverse_obj(threads, (..., 'server'), get_all=False) + # Request Start post_data = [{'ping': {'content': 'rs:0'}}] for i, thread in enumerate(threads): @@ -532,17 +543,32 @@ class NiconicoIE(InfoExtractor): # Request Final post_data.append({'ping': {'content': 'rf:0'}}) - for api_url in self._COMMENT_API_ENDPOINTS: - comments = self._download_json( - api_url, video_id, data=json.dumps(post_data).encode(), fatal=False, - headers={ - 'Referer': 'https://www.nicovideo.jp/watch/%s' % video_id, - 'Origin': 'https://www.nicovideo.jp', - 'Content-Type': 'text/plain;charset=UTF-8', - }, - note='Downloading comments', errnote=f'Failed to access endpoint {api_url}') - if comments: - return comments + return self._download_json( + f'{api_url}/api.json', video_id, data=json.dumps(post_data).encode(), fatal=False, + headers={ + 'Referer': f'https://www.nicovideo.jp/watch/{video_id}', + 'Origin': 'https://www.nicovideo.jp', + 'Content-Type': 'text/plain;charset=UTF-8', + }, + note='Downloading comments', errnote=f'Failed to access endpoint {api_url}') + + def _extract_new_comments(self, endpoint, video_id, params, thread_key): + comments = self._download_json( + f'{endpoint}/v1/threads', video_id, data=json.dumps({ + 'additionals': {}, + 'params': params, + 'threadKey': thread_key, + }).encode(), fatal=False, + headers={ + 'Referer': 'https://www.nicovideo.jp/', + 'Origin': 'https://www.nicovideo.jp', + 'Content-Type': 'text/plain;charset=UTF-8', + 'x-client-os-type': 'others', + 'x-frontend-id': '6', + 'x-frontend-version': '0', + }, + note='Downloading comments (new)', errnote='Failed to download comments (new)') + return traverse_obj(comments, ('data', 'threads', ..., 'comments', ...)) class NiconicoPlaylistBaseIE(InfoExtractor): @@ -636,10 +662,10 @@ class NiconicoPlaylistIE(NiconicoPlaylistBaseIE): class NiconicoSeriesIE(InfoExtractor): IE_NAME = 'niconico:series' - _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp|nico\.ms)/series/(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.|sp\.)?nicovideo\.jp(?:/user/\d+)?|nico\.ms)/series/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://www.nicovideo.jp/series/110226', + 'url': 'https://www.nicovideo.jp/user/44113208/series/110226', 'info_dict': { 'id': '110226', 'title': 'ご立派ァ!のシリーズ', @@ -659,7 +685,7 @@ class NiconicoSeriesIE(InfoExtractor): def _real_extract(self, url): list_id = self._match_id(url) - webpage = self._download_webpage(f'https://www.nicovideo.jp/series/{list_id}', list_id) + webpage = self._download_webpage(url, list_id) title = self._search_regex( (r'<title>「(.+)(全', @@ -667,16 +693,15 @@ class NiconicoSeriesIE(InfoExtractor): webpage, 'title', fatal=False) if title: title = unescapeHTML(title) - playlist = [ - self.url_result(f'https://www.nicovideo.jp/watch/{v_id}', video_id=v_id) - for v_id in re.findall(r'data-href=[\'"](?:https://www\.nicovideo\.jp)?/watch/([a-z0-9]+)', webpage)] - return self.playlist_result(playlist, list_id, title) + json_data = next(self._yield_json_ld(webpage, None, fatal=False)) + return self.playlist_from_matches( + traverse_obj(json_data, ('itemListElement', ..., 'url')), list_id, 
title, ie=NiconicoIE) class NiconicoHistoryIE(NiconicoPlaylistBaseIE): IE_NAME = 'niconico:history' - IE_DESC = 'NicoNico user history. Requires cookies.' - _VALID_URL = r'https?://(?:www\.|sp\.)?nicovideo\.jp/my/history' + IE_DESC = 'NicoNico user history or likes. Requires cookies.' + _VALID_URL = r'https?://(?:www\.|sp\.)?nicovideo\.jp/my/(?P<id>history(?:/like)?)' _TESTS = [{ 'note': 'PC page, with /video', @@ -694,23 +719,29 @@ class NiconicoHistoryIE(NiconicoPlaylistBaseIE): 'note': 'mobile page, without /video', 'url': 'https://sp.nicovideo.jp/my/history', 'only_matching': True, + }, { + 'note': 'PC page', + 'url': 'https://www.nicovideo.jp/my/history/like', + 'only_matching': True, + }, { + 'note': 'Mobile page', + 'url': 'https://sp.nicovideo.jp/my/history/like', + 'only_matching': True, }] def _call_api(self, list_id, resource, query): + path = 'likes' if list_id == 'history/like' else 'watch/history' return self._download_json( - 'https://nvapi.nicovideo.jp/v1/users/me/watch/history', 'history', - f'Downloading {resource}', query=query, - headers=self._API_HEADERS)['data'] + f'https://nvapi.nicovideo.jp/v1/users/me/{path}', list_id, + f'Downloading {resource}', query=query, headers=self._API_HEADERS)['data'] def _real_extract(self, url): - list_id = 'history' + list_id = self._match_id(url) try: - mylist = self._call_api(list_id, 'list', { - 'pageSize': 1, - }) + mylist = self._call_api(list_id, 'list', {'pageSize': 1}) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - self.raise_login_required('You have to be logged in to get your watch history') + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + self.raise_login_required('You have to be logged in to get your history') raise return self.playlist_result(self._entries(list_id), list_id, **self._parse_owner(mylist)) @@ -866,3 +897,162 @@ class NiconicoUserIE(InfoExtractor): def _real_extract(self, url): list_id = self._match_id(url) return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key()) + + +class NiconicoLiveIE(InfoExtractor): + IE_NAME = 'niconico:live' + IE_DESC = 'ニコニコ生放送' + _VALID_URL = r'https?://(?:sp\.)?live2?\.nicovideo\.jp/(?:watch|gate)/(?P<id>lv\d+)' + _TESTS = [{ + 'note': 'this test case includes invisible characters for title, pasting them as-is', + 'url': 'https://live.nicovideo.jp/watch/lv339533123', + 'info_dict': { + 'id': 'lv339533123', + 'title': '激辛ペヤング食べます( ;ᯅ; )(歌枠オーディション参加中)', + 'view_count': 1526, + 'comment_count': 1772, + 'description': '初めましてもかって言います❕\nのんびり自由に適当に暮らしてます', + 'uploader': 'もか', + 'channel': 'ゲストさんのコミュニティ', + 'channel_id': 'co5776900', + 'channel_url': 'https://com.nicovideo.jp/community/co5776900', + 'timestamp': 1670677328, + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://live2.nicovideo.jp/watch/lv339533123', + 'only_matching': True, + }, { + 'url': 'https://sp.live.nicovideo.jp/watch/lv339533123', + 'only_matching': True, + }, { + 'url': 'https://sp.live2.nicovideo.jp/watch/lv339533123', + 'only_matching': True, + }] + + _KNOWN_LATENCY = ('high', 'low') + + def _real_extract(self, url): + if not websockets: + raise ExtractorError('websockets library is not available. 
Please install it.', expected=True) + video_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id) + + embedded_data = self._parse_json(unescapeHTML(self._search_regex( + r'<script\s+id="embedded-data"\s*data-props="(.+?)"', webpage, 'embedded data')), video_id) + + ws_url = traverse_obj(embedded_data, ('site', 'relive', 'webSocketUrl')) + if not ws_url: + raise ExtractorError('The live hasn\'t started yet or has already ended.', expected=True) + ws_url = update_url_query(ws_url, { + 'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9', + }) + + hostname = remove_start(urlparse(urlh.url).hostname, 'sp.') + cookies = try_get(urlh.url, self._downloader._calc_cookies) + latency = try_get(self._configuration_arg('latency'), lambda x: x[0]) + if latency not in self._KNOWN_LATENCY: + latency = 'high' + + ws = WebSocketsWrapper(ws_url, { + 'Cookies': str_or_none(cookies) or '', + 'Origin': f'https://{hostname}', + 'Accept': '*/*', + 'User-Agent': self.get_param('http_headers')['User-Agent'], + }) + + self.write_debug('Sending HLS server request') + ws.send(json.dumps({ + 'type': 'startWatching', + 'data': { + 'stream': { + 'quality': 'abr', + 'protocol': 'hls+fmp4', + 'latency': latency, + 'chasePlay': False + }, + 'room': { + 'protocol': 'webSocket', + 'commentable': True + }, + 'reconnect': False, + } + })) + + while True: + recv = ws.recv() + if not recv: + continue + data = json.loads(recv) + if not isinstance(data, dict): + continue + if data.get('type') == 'stream': + m3u8_url = data['data']['uri'] + qualities = data['data']['availableQualities'] + break + elif data.get('type') == 'disconnect': + self.write_debug(recv) + raise ExtractorError('Disconnected in the middle of extraction') + elif data.get('type') == 'error': + self.write_debug(recv) + message = traverse_obj(data, ('body', 'code')) or recv + raise ExtractorError(message) + elif self.get_param('verbose', False): + if len(recv) > 100: + recv = recv[:100] + '...'
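+ # Any message type other than stream/disconnect/error is informational: the loop ignores it, apart from the (truncated) debug log below when running with --verbose.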
+ self.write_debug('Server said: %s' % recv) + + title = traverse_obj(embedded_data, ('program', 'title')) or self._html_search_meta( + ('og:title', 'twitter:title'), webpage, 'live title', fatal=False) + + raw_thumbs = traverse_obj(embedded_data, ('program', 'thumbnail')) or {} + thumbnails = [] + for name, value in raw_thumbs.items(): + if not isinstance(value, dict): + thumbnails.append({ + 'id': name, + 'url': value, + **parse_resolution(value, lenient=True), + }) + continue + + for k, img_url in value.items(): + res = parse_resolution(k, lenient=True) or parse_resolution(img_url, lenient=True) + width, height = res.get('width'), res.get('height') + + thumbnails.append({ + 'id': f'{name}_{width}x{height}', + 'url': img_url, + **res, + }) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True) + for fmt, q in zip(formats, reversed(qualities[1:])): + fmt.update({ + 'format_id': q, + 'protocol': 'niconico_live', + 'ws': ws, + 'video_id': video_id, + 'cookies': cookies, + 'live_latency': latency, + 'origin': hostname, + }) + + return { + 'id': video_id, + 'title': title, + **traverse_obj(embedded_data, { + 'view_count': ('program', 'statistics', 'watchCount'), + 'comment_count': ('program', 'statistics', 'commentCount'), + 'uploader': ('program', 'supplier', 'name'), + 'channel': ('socialGroup', 'name'), + 'channel_id': ('socialGroup', 'id'), + 'channel_url': ('socialGroup', 'socialGroupPageUrl'), + }), + 'description': clean_html(traverse_obj(embedded_data, ('program', 'description'))), + 'timestamp': int_or_none(traverse_obj(embedded_data, ('program', 'openTime'))), + 'is_live': True, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/hypervideo_dl/extractor/ninenow.py b/hypervideo_dl/extractor/ninenow.py index b970f8c..c655b75 100644 --- a/hypervideo_dl/extractor/ninenow.py +++ b/hypervideo_dl/extractor/ninenow.py @@ -53,7 +53,7 @@ class NineNowIE(InfoExtractor): 'upload_date': '20210421', }, 'expected_warnings': ['Ignoring subtitle tracks'], - 'params':{ + 'params': { 'skip_download': True, } }] diff --git a/hypervideo_dl/extractor/nitter.py b/hypervideo_dl/extractor/nitter.py index 251bf44..5d1ca1f 100644 --- a/hypervideo_dl/extractor/nitter.py +++ b/hypervideo_dl/extractor/nitter.py @@ -39,59 +39,99 @@ class NitterIE(InfoExtractor): ) HTTP_INSTANCES = ( - 'nitter.42l.fr', - 'nitter.pussthecat.org', - 'nitter.nixnet.services', + 'nitter.lacontrevoie.fr', 'nitter.fdn.fr', 'nitter.1d4.us', 'nitter.kavin.rocks', 'nitter.unixfox.eu', 'nitter.domain.glass', - 'nitter.eu', 'nitter.namazso.eu', - 'nitter.actionsack.com', 'birdsite.xanny.family', - 'nitter.hu', - 'twitr.gq', 'nitter.moomoo.me', - 'nittereu.moomoo.me', - 'bird.from.tf', + 'bird.trom.tf', 'nitter.it', 'twitter.censors.us', - 'twitter.grimneko.de', - 'nitter.alefvanoon.xyz', - 'n.hyperborea.cloud', - 'nitter.ca', + 'nitter.grimneko.de', 'twitter.076.ne.jp', - 'twitter.mstdn.social', 'nitter.fly.dev', 'notabird.site', 'nitter.weiler.rocks', - 'nitter.silkky.cloud', 'nitter.sethforprivacy.com', - 'nttr.stream', 'nitter.cutelab.space', 'nitter.nl', 'nitter.mint.lgbt', 'nitter.bus-hit.me', - 'fuckthesacklers.network', - 'nitter.govt.land', - 'nitter.datatunnel.xyz', 'nitter.esmailelbob.xyz', 'tw.artemislena.eu', - 'de.nttr.stream', 'nitter.winscloud.net', 'nitter.tiekoetter.com', 'nitter.spaceint.fr', - 'twtr.bch.bar', - 'nitter.exonip.de', - 'nitter.mastodon.pro', - 'nitter.notraxx.ch', - - - # not in the list anymore - 'nitter.skrep.in', - 'nitter.snopyta.org', + 'nitter.privacy.com.de', 
+ 'nitter.poast.org', + 'nitter.bird.froth.zone', + 'nitter.dcs0.hu', + 'twitter.dr460nf1r3.org', + 'nitter.garudalinux.org', + 'twitter.femboy.hu', + 'nitter.cz', + 'nitter.privacydev.net', + 'nitter.evil.site', + 'tweet.lambda.dance', + 'nitter.kylrth.com', + 'nitter.foss.wtf', + 'nitter.priv.pw', + 'nitter.tokhmi.xyz', + 'nitter.catalyst.sx', + 'unofficialbird.com', + 'nitter.projectsegfau.lt', + 'nitter.eu.projectsegfau.lt', + 'singapore.unofficialbird.com', + 'canada.unofficialbird.com', + 'india.unofficialbird.com', + 'nederland.unofficialbird.com', + 'uk.unofficialbird.com', + 'n.l5.ca', + 'nitter.slipfox.xyz', + 'nitter.soopy.moe', + 'nitter.qwik.space', + 'read.whatever.social', + 'nitter.rawbit.ninja', + 'nt.vern.cc', + 'ntr.odyssey346.dev', + 'nitter.ir', + 'nitter.privacytools.io', + 'nitter.sneed.network', + 'n.sneed.network', + 'nitter.manasiwibi.com', + 'nitter.smnz.de', + 'nitter.twei.space', + 'nitter.inpt.fr', + 'nitter.d420.de', + 'nitter.caioalonso.com', + 'nitter.at', + 'nitter.drivet.xyz', + 'nitter.pw', + 'nitter.nicfab.eu', + 'bird.habedieeh.re', + 'nitter.hostux.net', + 'nitter.adminforge.de', + 'nitter.platypush.tech', + 'nitter.mask.sh', + 'nitter.pufe.org', + 'nitter.us.projectsegfau.lt', + 'nitter.arcticfoxes.net', + 't.com.sb', + 'nitter.kling.gg', + 'nitter.ktachibana.party', + 'nitter.riverside.rocks', + 'nitter.girlboss.ceo', + 'nitter.lunar.icu', + 'twitter.moe.ngo', + 'nitter.freedit.eu', + 'ntr.frail.duckdns.org', + 'nitter.librenode.org', + 'n.opnxng.com', + 'nitter.plus.st', ) DEAD_INSTANCES = ( @@ -117,6 +157,32 @@ class NitterIE(InfoExtractor): 'nitter.weaponizedhumiliation.com', 'nitter.vxempire.xyz', 'tweet.lambda.dance', + 'nitter.ca', + 'nitter.42l.fr', + 'nitter.pussthecat.org', + 'nitter.nixnet.services', + 'nitter.eu', + 'nitter.actionsack.com', + 'nitter.hu', + 'twitr.gq', + 'nittereu.moomoo.me', + 'bird.from.tf', + 'twitter.grimneko.de', + 'nitter.alefvanoon.xyz', + 'n.hyperborea.cloud', + 'twitter.mstdn.social', + 'nitter.silkky.cloud', + 'nttr.stream', + 'fuckthesacklers.network', + 'nitter.govt.land', + 'nitter.datatunnel.xyz', + 'de.nttr.stream', + 'twtr.bch.bar', + 'nitter.exonip.de', + 'nitter.mastodon.pro', + 'nitter.notraxx.ch', + 'nitter.skrep.in', + 'nitter.snopyta.org', ) INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES diff --git a/hypervideo_dl/extractor/njpwworld.py b/hypervideo_dl/extractor/njpwworld.py index 7b8a526..6078381 100644 --- a/hypervideo_dl/extractor/njpwworld.py +++ b/hypervideo_dl/extractor/njpwworld.py @@ -51,7 +51,7 @@ class NJPWWorldIE(InfoExtractor): data=urlencode_postdata({'login_id': username, 'pw': password}), headers={'Referer': 'https://front.njpwworld.com/auth'}) # /auth/login will return 302 for successful logins - if urlh.geturl() == self._LOGIN_URL: + if urlh.url == self._LOGIN_URL: self.report_warning('unable to login') return False diff --git a/hypervideo_dl/extractor/noice.py b/hypervideo_dl/extractor/noice.py new file mode 100644 index 0000000..e6e3433 --- /dev/null +++ b/hypervideo_dl/extractor/noice.py @@ -0,0 +1,116 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + parse_iso8601, + traverse_obj, + variadic, +) + + +class NoicePodcastIE(InfoExtractor): + _VALID_URL = r'https?://open\.noice\.id/content/(?P<id>[a-fA-F0-9-]+)' + _TESTS = [{ + 'url': 'https://open.noice.id/content/7694bb04-ff0f-40fa-a60b-5b39f29584b2', + 'info_dict': { + 'id': '7694bb04-ff0f-40fa-a60b-5b39f29584b2', + 'ext': 'm4a', + 'season': 'Season 1', + 
'description': 'md5:58d1274e6857b6fbbecf47075885380d', + 'release_date': '20221115', + 'timestamp': 1668496642, + 'season_number': 1, + 'upload_date': '20221115', + 'release_timestamp': 1668496642, + 'title': 'Eps 1. Belajar dari Wishnutama: Kreatif Bukan Followers! (bersama Wishnutama)', + 'modified_date': '20221121', + 'categories': ['Bisnis dan Keuangan'], + 'duration': 3567, + 'modified_timestamp': 1669030647, + 'thumbnail': 'https://images.noiceid.cc/catalog/content-1668496302560', + 'channel_id': '9dab1024-5b92-4265-ae1c-63da87359832', + 'like_count': int, + 'channel': 'Noice Space Talks', + 'comment_count': int, + 'dislike_count': int, + 'channel_follower_count': int, + } + }, { + 'url': 'https://open.noice.id/content/222134e4-99f2-456f-b8a2-b8be404bf063', + 'info_dict': { + 'id': '222134e4-99f2-456f-b8a2-b8be404bf063', + 'ext': 'm4a', + 'release_timestamp': 1653488220, + 'description': 'md5:35074f6190cef52b05dd133bb2ef460e', + 'upload_date': '20220525', + 'timestamp': 1653460637, + 'release_date': '20220525', + 'thumbnail': 'https://images.noiceid.cc/catalog/content-1653460337625', + 'title': 'Eps 1: Dijodohin Sama Anak Pak RT', + 'modified_timestamp': 1669030647, + 'season_number': 1, + 'modified_date': '20221121', + 'categories': ['Cerita dan Drama'], + 'duration': 1830, + 'season': 'Season 1', + 'channel_id': '60193f6b-d24d-4b23-913b-ceed5a731e74', + 'dislike_count': int, + 'like_count': int, + 'comment_count': int, + 'channel': 'Dear Jerome', + 'channel_follower_count': int, + } + }] + + def _get_formats_and_subtitles(self, media_url, video_id): + formats, subtitles = [], {} + for url in variadic(media_url): + ext = determine_ext(url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles(url, video_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': url, + 'ext': 'mp3', + 'vcodec': 'none', + 'acodec': 'mp3', + }) + return formats, subtitles + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + nextjs_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['contentDetails'] + + media_url_list = traverse_obj(nextjs_data, (('rawContentUrl', 'url'), )) + formats, subtitles = self._get_formats_and_subtitles(media_url_list, display_id) + + return { + 'id': nextjs_data.get('id') or display_id, + 'title': nextjs_data.get('title') or self._html_search_meta('og:title', webpage), + 'formats': formats, + 'subtitles': subtitles, + 'description': (nextjs_data.get('description') or clean_html(nextjs_data.get('htmlDescription')) + or self._html_search_meta(['description', 'og:description'], webpage)), + 'thumbnail': nextjs_data.get('image') or self._html_search_meta('og:image', webpage), + 'timestamp': parse_iso8601(nextjs_data.get('createdAt')), + 'release_timestamp': parse_iso8601(nextjs_data.get('publishedAt')), + 'modified_timestamp': parse_iso8601( + nextjs_data.get('updatedAt') or self._html_search_meta('og:updated_time', webpage)), + 'duration': int_or_none(nextjs_data.get('duration')), + 'categories': traverse_obj(nextjs_data, ('genres', ..., 'name')), + 'season': nextjs_data.get('seasonName'), + 'season_number': int_or_none(nextjs_data.get('seasonNumber')), + 'channel': traverse_obj(nextjs_data, ('catalog', 'title')), + 'channel_id': traverse_obj(nextjs_data, ('catalog', 'id'), 'catalogId'), + **traverse_obj(nextjs_data, ('meta', 'aggregations', { + 'like_count': 'likes', + 'dislike_count': 'dislikes', + 
'comment_count': 'comments', + 'channel_follower_count': 'followers', + })) + } diff --git a/hypervideo_dl/extractor/noodlemagazine.py b/hypervideo_dl/extractor/noodlemagazine.py index e620895..1cea0db 100644 --- a/hypervideo_dl/extractor/noodlemagazine.py +++ b/hypervideo_dl/extractor/noodlemagazine.py @@ -1,9 +1,14 @@ from .common import InfoExtractor from ..utils import ( - parse_duration, + extract_attributes, + get_element_html_by_id, + int_or_none, parse_count, - unified_strdate + parse_duration, + unified_strdate, + urljoin, ) +from ..utils.traversal import traverse_obj class NoodleMagazineIE(InfoExtractor): @@ -37,15 +42,21 @@ class NoodleMagazineIE(InfoExtractor): like_count = parse_count(self._html_search_meta('ya:ovs:likes', webpage, default=None)) upload_date = unified_strdate(self._html_search_meta('ya:ovs:upload_date', webpage, default='')) - key = self._html_search_regex(rf'/{video_id}\?(?:.*&)?m=([^&"\'\s,]+)', webpage, 'key') - playlist_info = self._download_json(f'https://adult.noodlemagazine.com/playlist/{video_id}?m={key}', video_id) - thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image') + player_path = extract_attributes(get_element_html_by_id('iplayer', webpage) or '')['src'] + player_iframe = self._download_webpage( + urljoin('https://adult.noodlemagazine.com', player_path), video_id, 'Downloading iframe page') + playlist_url = self._search_regex( + r'window\.playlistUrl\s*=\s*["\']([^"\']+)["\']', player_iframe, 'playlist url') + playlist_info = self._download_json( + urljoin('https://adult.noodlemagazine.com', playlist_url), video_id, headers={'Referer': url}) - formats = [{ - 'url': source.get('file'), - 'quality': source.get('label'), - 'ext': source.get('type'), - } for source in playlist_info.get('sources')] + thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image') + formats = traverse_obj(playlist_info, ('sources', lambda _, v: v['file'], { + 'url': 'file', + 'format_id': 'label', + 'height': ('label', {int_or_none}), + 'ext': 'type', + })) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/nosnl.py b/hypervideo_dl/extractor/nosnl.py index eba94c4..cea54c9 100644 --- a/hypervideo_dl/extractor/nosnl.py +++ b/hypervideo_dl/extractor/nosnl.py @@ -3,7 +3,7 @@ from ..utils import parse_duration, parse_iso8601, traverse_obj class NOSNLArticleIE(InfoExtractor): - _VALID_URL = r'https?://nos\.nl/((?!video)(\w+/)?\w+/)\d+-(?P<display_id>[\w-]+)' + _VALID_URL = r'https?://nos\.nl/(?P<type>video|(\w+/)?\w+)/?\d+-(?P<display_id>[\w-]+)' _TESTS = [ { # only 1 video @@ -22,13 +22,14 @@ class NOSNLArticleIE(InfoExtractor): 'info_dict': { 'id': '2440409', 'title': 'Vannacht sliepen weer enkele honderden asielzoekers in Ter Apel buiten', - 'description': 'Er werd wel geprobeerd om kwetsbare migranten onderdak te bieden, zegt het COA.', + 'description': 'md5:72b1e1674d798460e79d78fa37e9f56d', 'tags': ['aanmeldcentrum', 'Centraal Orgaan opvang asielzoekers', 'COA', 'asielzoekers', 'Ter Apel'], 'modified_timestamp': 1660452773, 'modified_date': '20220814', 'upload_date': '20220813', 'thumbnail': 'https://cdn.nos.nl/image/2022/07/18/880346/1024x576a.jpg', 'timestamp': 1660401384, + 'categories': ['Regionaal nieuws', 'Binnenland'], }, 'playlist_count': 2, }, { @@ -37,20 +38,37 @@ class NOSNLArticleIE(InfoExtractor): 'info_dict': { 'id': '2440789', 'title': 'Wekdienst 16/8: Groningse acties tien jaar na zware aardbeving • Femke Bol in actie op EK atletiek ', - 'description': 'Nieuws, 
weer, verkeer: met dit overzicht begin je geïnformeerd aan de dag.', + 'description': 'md5:0bd277ed7a44fc15cb12a9d27d8f6641', 'tags': ['wekdienst'], 'modified_date': '20220816', 'modified_timestamp': 1660625449, 'timestamp': 1660625449, 'upload_date': '20220816', 'thumbnail': 'https://cdn.nos.nl/image/2022/08/16/888178/1024x576a.jpg', + 'categories': ['Binnenland', 'Buitenland'], }, 'playlist_count': 2, + }, { + # video url + 'url': 'https://nos.nl/video/2452718-xi-en-trudeau-botsen-voor-de-camera-op-g20-top-je-hebt-gelekt', + 'info_dict': { + 'id': '2452718', + 'title': 'Xi en Trudeau botsen voor de camera op G20-top: \'Je hebt gelekt\'', + 'modified_date': '20221117', + 'description': 'md5:61907dac576f75c11bf8ffffd4a3cc0f', + 'tags': ['Xi', 'Trudeau', 'G20', 'indonesié'], + 'upload_date': '20221117', + 'thumbnail': 'https://cdn.nos.nl/image/2022/11/17/916155/1024x576a.jpg', + 'modified_timestamp': 1668663388, + 'timestamp': 1668663388, + 'categories': ['Buitenland'], + }, + 'playlist_mincount': 1, } ] def _entries(self, nextjs_json, display_id): - for item in nextjs_json['items']: + for item in nextjs_json: if item.get('type') == 'video': formats, subtitle = self._extract_m3u8_formats_and_subtitles( traverse_obj(item, ('source', 'url')), display_id, ext='mp4') @@ -77,13 +95,14 @@ class NOSNLArticleIE(InfoExtractor): } def _real_extract(self, url): - display_id = self._match_valid_url(url).group('display_id') + site_type, display_id = self._match_valid_url(url).group('type', 'display_id') webpage = self._download_webpage(url, display_id) nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['data'] return { '_type': 'playlist', - 'entries': self._entries(nextjs_json, display_id), + 'entries': self._entries( + [nextjs_json['video']] if site_type == 'video' else nextjs_json['items'], display_id), 'id': str(nextjs_json['id']), 'title': nextjs_json.get('title') or self._html_search_meta(['title', 'og:title', 'twitter:title'], webpage), 'description': (nextjs_json.get('description') @@ -91,5 +110,6 @@ class NOSNLArticleIE(InfoExtractor): 'tags': nextjs_json.get('keywords'), 'modified_timestamp': parse_iso8601(nextjs_json.get('modifiedAt')), 'thumbnail': nextjs_json.get('shareImageSrc') or self._html_search_meta(['og:image', 'twitter:image'], webpage), - 'timestamp': parse_iso8601(nextjs_json.get('publishedAt')) + 'timestamp': parse_iso8601(nextjs_json.get('publishedAt')), + 'categories': traverse_obj(nextjs_json, ('categories', ..., 'label')), } diff --git a/hypervideo_dl/extractor/nosvideo.py b/hypervideo_dl/extractor/nosvideo.py index b6d3ea4..7e9688c 100644 --- a/hypervideo_dl/extractor/nosvideo.py +++ b/hypervideo_dl/extractor/nosvideo.py @@ -1,9 +1,9 @@ import re from .common import InfoExtractor +from ..networking import Request from ..utils import ( ExtractorError, - sanitized_Request, urlencode_postdata, xpath_text, xpath_with_ns, @@ -36,8 +36,8 @@ class NosVideoIE(InfoExtractor): 'op': 'download1', 'method_free': 'Continue to Video', } - req = sanitized_Request(url, urlencode_postdata(fields)) - req.add_header('Content-type', 'application/x-www-form-urlencoded') + req = Request(url, urlencode_postdata(fields)) + req.headers['Content-type'] = 'application/x-www-form-urlencoded' webpage = self._download_webpage(req, video_id, 'Downloading download page') if re.search(self._FILE_DELETED_REGEX, webpage) is not None: diff --git a/hypervideo_dl/extractor/nowness.py b/hypervideo_dl/extractor/nowness.py index 18bb880..b86b7e2 100644 --- 
a/hypervideo_dl/extractor/nowness.py +++ b/hypervideo_dl/extractor/nowness.py @@ -4,10 +4,8 @@ from .brightcove import ( ) from .common import InfoExtractor from ..compat import compat_str -from ..utils import ( - ExtractorError, - sanitized_Request, -) +from ..networking import Request +from ..utils import ExtractorError class NownessBaseIE(InfoExtractor): @@ -40,7 +38,7 @@ class NownessBaseIE(InfoExtractor): def _api_request(self, url, request_path): display_id = self._match_id(url) - request = sanitized_Request( + request = Request( 'http://api.nowness.com/api/' + request_path % display_id, headers={ 'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us', diff --git a/hypervideo_dl/extractor/npo.py b/hypervideo_dl/extractor/npo.py index f18cb9e..40fee24 100644 --- a/hypervideo_dl/extractor/npo.py +++ b/hypervideo_dl/extractor/npo.py @@ -1,36 +1,22 @@ +import random import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) from ..utils import ( determine_ext, - ExtractorError, - fix_xml_ampersands, int_or_none, merge_dicts, orderedSet, - parse_duration, - qualities, str_or_none, - strip_jsonp, - unified_strdate, + try_call, unified_timestamp, url_or_none, urlencode_postdata, ) -class NPOBaseIE(InfoExtractor): - def _get_token(self, video_id): - return self._download_json( - 'http://ida.omroep.nl/app.php/auth', video_id, - note='Downloading token')['token'] - - -class NPOIE(NPOBaseIE): +class NPOIE(InfoExtractor): IE_NAME = 'npo' IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl' _VALID_URL = r'''(?x) @@ -58,6 +44,7 @@ class NPOIE(NPOBaseIE): 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', 'upload_date': '20140622', }, + 'skip': 'Video was removed', }, { 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', @@ -69,29 +56,41 @@ class NPOIE(NPOBaseIE): 'upload_date': '20090227', 'duration': 2400, }, + 'skip': 'Video was removed', }, { 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', - 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', + 'md5': '1b279c0547f6b270e014c576415268c5', 'info_dict': { 'id': 'VPWON_1169289', - 'ext': 'm4v', - 'title': 'Tegenlicht: Zwart geld. 
De toekomst komt uit Afrika', - 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', + 'ext': 'mp4', + 'title': 'Zwart geld: de toekomst komt uit Afrika', + 'description': 'md5:dffaf3d628a9c36f78ca48d834246261', 'upload_date': '20130225', 'duration': 3000, + 'creator': 'NED2', + 'series': 'Tegenlicht', + 'timestamp': 1361822340, + 'thumbnail': 'https://images.npo.nl/tile/1280x720/142854.jpg', + 'episode': 'Zwart geld: de toekomst komt uit Afrika', + 'episode_number': 18, }, }, { 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706', 'info_dict': { 'id': 'WO_VPRO_043706', - 'ext': 'm4v', + 'ext': 'mp4', 'title': 'De nieuwe mens - Deel 1', 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b', 'duration': 4680, + 'episode': 'De nieuwe mens - Deel 1', + 'thumbnail': 'https://images.npo.nl/tile/1280x720/6289.jpg', + 'timestamp': 1279716057, + 'series': 'De nieuwe mens - Deel 1', + 'upload_date': '20100721', }, 'params': { 'skip_download': True, - } + }, }, { # non asf in streams 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', @@ -102,20 +101,25 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video was removed', }, { 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', 'info_dict': { 'id': 'VPWON_1233944', - 'ext': 'm4v', + 'ext': 'mp4', 'title': 'Aap, poot, pies', - 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', + 'description': 'md5:4b46b1b9553b4c036a04d2a532a137e6', 'upload_date': '20150508', 'duration': 599, + 'episode': 'Aap, poot, pies', + 'thumbnail': 'https://images.poms.omroep.nl/image/s1280/c1280x720/608118.jpg', + 'timestamp': 1431064200, + 'series': 'Aap, poot, pies', }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', 'info_dict': { @@ -128,7 +132,8 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video was removed', }, { # audio 'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437', @@ -140,7 +145,8 @@ class NPOIE(NPOBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Video was removed', }, { 'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547', 'only_matching': True, @@ -169,6 +175,25 @@ class NPOIE(NPOBaseIE): }, { 'url': 'https://npo.nl/KN_1698996', 'only_matching': True, + }, { + 'url': 'https://www.npo3.nl/the-genius/21-11-2022/VPWON_1341105', + 'info_dict': { + 'id': 'VPWON_1341105', + 'ext': 'mp4', + 'duration': 2658, + 'series': 'The Genius', + 'description': 'md5:db02f1456939ca63f7c408f858044e94', + 'title': 'The Genius', + 'timestamp': 1669062000, + 'creator': 'NED3', + 'episode': 'The Genius', + 'thumbnail': 'https://images.npo.nl/tile/1280x720/1827650.jpg', + 'episode_number': 8, + 'upload_date': '20221121', + }, + 'params': { + 'skip_download': True, + }, }] @classmethod @@ -179,25 +204,32 @@ class NPOIE(NPOBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - return self._get_info(url, video_id) or self._get_old_info(video_id) - - def _get_info(self, url, video_id): - token = self._download_json( - 'https://www.npostart.nl/api/token', video_id, - 'Downloading token', headers={ - 'Referer': url, - 'X-Requested-With': 'XMLHttpRequest', - })['token'] - - player = self._download_json( - 'https://www.npostart.nl/player/%s' % video_id, video_id, - 'Downloading player JSON', data=urlencode_postdata({ - 'autoplay': 0, - 'share': 1, - 
'pageUrl': url, - 'hasAdConsent': 0, - '_token': token, - })) + if urllib.parse.urlparse(url).netloc in ['www.ntr.nl', 'ntr.nl']: + player = self._download_json( + f'https://www.ntr.nl/ajax/player/embed/{video_id}', video_id, + 'Downloading player JSON', query={ + 'parameters[elementId]': f'npo{random.randint(0, 999)}', + 'parameters[sterReferralUrl]': url, + 'parameters[autoplay]': 0, + }) + else: + self._request_webpage( + 'https://www.npostart.nl/api/token', video_id, + 'Downloading token', headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }) + player = self._download_json( + f'https://www.npostart.nl/player/{video_id}', video_id, + 'Downloading player JSON', data=urlencode_postdata({ + 'autoplay': 0, + 'share': 1, + 'pageUrl': url, + 'hasAdConsent': 0, + }), headers={ + 'x-xsrf-token': try_call(lambda: urllib.parse.unquote( + self._get_cookies('https://www.npostart.nl')['XSRF-TOKEN'].value)) + }) player_token = player['token'] @@ -210,7 +242,7 @@ class NPOIE(NPOBaseIE): video_id, 'Downloading %s profile JSON' % profile, fatal=False, query={ 'profile': profile, - 'quality': 'npo', + 'quality': 'npoplus', 'tokenId': player_token, 'streamType': 'broadcast', }) @@ -291,188 +323,8 @@ class NPOIE(NPOBaseIE): return info - def _get_old_info(self, video_id): - metadata = self._download_json( - 'http://e.omroep.nl/metadata/%s' % video_id, - video_id, - # We have to remove the javascript callback - transform_source=strip_jsonp, - ) - - error = metadata.get('error') - if error: - raise ExtractorError(error, expected=True) - - # For some videos actual video id (prid) is different (e.g. for - # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698 - # video id is POMS_WNL_853698 but prid is POW_00996502) - video_id = metadata.get('prid') or video_id - - # titel is too generic in some cases so utilize aflevering_titel as well - # when available (e.g. 
http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html) - title = metadata['titel'] - sub_title = metadata.get('aflevering_titel') - if sub_title and sub_title != title: - title += ': %s' % sub_title - - token = self._get_token(video_id) - - formats = [] - urls = set() - - def is_legal_url(format_url): - return format_url and format_url not in urls and re.match( - r'^(?:https?:)?//', format_url) - - QUALITY_LABELS = ('Laag', 'Normaal', 'Hoog') - QUALITY_FORMATS = ('adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std') - - quality_from_label = qualities(QUALITY_LABELS) - quality_from_format_id = qualities(QUALITY_FORMATS) - items = self._download_json( - 'http://ida.omroep.nl/app.php/%s' % video_id, video_id, - 'Downloading formats JSON', query={ - 'adaptive': 'yes', - 'token': token, - })['items'][0] - for num, item in enumerate(items): - item_url = item.get('url') - if not is_legal_url(item_url): - continue - urls.add(item_url) - format_id = self._search_regex( - r'video/ida/([^/]+)', item_url, 'format id', - default=None) - - item_label = item.get('label') - - def add_format_url(format_url): - width = int_or_none(self._search_regex( - r'(\d+)[xX]\d+', format_url, 'width', default=None)) - height = int_or_none(self._search_regex( - r'\d+[xX](\d+)', format_url, 'height', default=None)) - if item_label in QUALITY_LABELS: - quality = quality_from_label(item_label) - f_id = item_label - elif item_label in QUALITY_FORMATS: - quality = quality_from_format_id(format_id) - f_id = format_id - else: - quality, f_id = [None] * 2 - formats.append({ - 'url': format_url, - 'format_id': f_id, - 'width': width, - 'height': height, - 'quality': quality, - }) - - # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 - if item.get('contentType') in ('url', 'audio'): - add_format_url(item_url) - continue - - try: - stream_info = self._download_json( - item_url + '&type=json', video_id, - 'Downloading %s stream JSON' - % item_label or item.get('format') or format_id or num) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: - error = (self._parse_json( - ee.cause.read().decode(), video_id, - fatal=False) or {}).get('errorstring') - if error: - raise ExtractorError(error, expected=True) - raise - # Stream URL instead of JSON, example: npo:LI_NL1_4188102 - if isinstance(stream_info, compat_str): - if not stream_info.startswith('http'): - continue - video_url = stream_info - # JSON - else: - video_url = stream_info.get('url') - if not video_url or 'vodnotavailable.' 
in video_url or video_url in urls: - continue - urls.add(video_url) - if determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - else: - add_format_url(video_url) - - is_live = metadata.get('medium') == 'live' - - if not is_live: - for num, stream in enumerate(metadata.get('streams', [])): - stream_url = stream.get('url') - if not is_legal_url(stream_url): - continue - urls.add(stream_url) - # smooth streaming is not supported - stream_type = stream.get('type', '').lower() - if stream_type in ['ss', 'ms']: - continue - if stream_type == 'hds': - f4m_formats = self._extract_f4m_formats( - stream_url, video_id, fatal=False) - # f4m downloader downloads only piece of live stream - for f4m_format in f4m_formats: - f4m_format['preference'] = -5 - formats.extend(f4m_formats) - elif stream_type == 'hls': - formats.extend(self._extract_m3u8_formats( - stream_url, video_id, ext='mp4', fatal=False)) - # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 - elif '.asf' in stream_url: - asx = self._download_xml( - stream_url, video_id, - 'Downloading stream %d ASX playlist' % num, - transform_source=fix_xml_ampersands, fatal=False) - if not asx: - continue - ref = asx.find('./ENTRY/Ref') - if ref is None: - continue - video_url = ref.get('href') - if not video_url or video_url in urls: - continue - urls.add(video_url) - formats.append({ - 'url': video_url, - 'ext': stream.get('formaat', 'asf'), - 'quality': stream.get('kwaliteit'), - 'preference': -10, - }) - else: - formats.append({ - 'url': stream_url, - 'quality': stream.get('kwaliteit'), - }) - - subtitles = {} - if metadata.get('tt888') == 'ja': - subtitles['nl'] = [{ - 'ext': 'vtt', - 'url': 'http://tt888.omroep.nl/tt888/%s' % video_id, - }] - - return { - 'id': video_id, - 'title': title, - 'description': metadata.get('info'), - 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], - 'upload_date': unified_strdate(metadata.get('gidsdatum')), - 'duration': parse_duration(metadata.get('tijdsduur')), - 'formats': formats, - 'subtitles': subtitles, - 'is_live': is_live, - } - -class NPOLiveIE(NPOBaseIE): +class NPOLiveIE(InfoExtractor): IE_NAME = 'npo.nl:live' _VALID_URL = r'https?://(?:www\.)?npo(?:start)?\.nl/live(?:/(?P<id>[^/?#&]+))?' 
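The npo.py rewrite above drops the old JSON token exchange: the GET to /api/token is now needed only for its Set-Cookie side effect, and the XSRF-TOKEN cookie it sets must be echoed back, URL-unquoted, as an x-xsrf-token header on the player POST. A minimal standalone sketch of that handshake, assuming the endpoints behave as the diff implies; it uses the third-party requests library (not a dependency of this codebase), and fetch_npo_player_json is an illustrative name:

import urllib.parse

import requests  # third-party, for illustration only


def fetch_npo_player_json(video_id, page_url):
    session = requests.Session()
    # GET /api/token only primes the XSRF-TOKEN cookie; its body is not used
    session.get('https://www.npostart.nl/api/token',
                headers={'Referer': page_url, 'X-Requested-With': 'XMLHttpRequest'})
    # the cookie value is URL-quoted; the header must carry the unquoted form
    xsrf = urllib.parse.unquote(session.cookies.get('XSRF-TOKEN', ''))
    resp = session.post(
        f'https://www.npostart.nl/player/{video_id}',
        data={'autoplay': 0, 'share': 1, 'pageUrl': page_url, 'hasAdConsent': 0},
        headers={'x-xsrf-token': xsrf})
    resp.raise_for_status()
    return resp.json()  # contains the 'token' used for the stream-profile request

The extractor itself performs the same steps through self._request_webpage and self._get_cookies, as shown in the hunk above.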
diff --git a/hypervideo_dl/extractor/nrk.py b/hypervideo_dl/extractor/nrk.py index 88d08e5..384865a 100644 --- a/hypervideo_dl/extractor/nrk.py +++ b/hypervideo_dl/extractor/nrk.py @@ -3,7 +3,8 @@ import random import re from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_str +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, @@ -148,7 +149,7 @@ class NRKIE(NRKBaseIE): try: return self._call_api(f'playback/{item}/program/{video_id}', video_id, item, query=query) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: return self._call_api(f'playback/{item}/{video_id}', video_id, item, query=query) raise diff --git a/hypervideo_dl/extractor/ntvru.py b/hypervideo_dl/extractor/ntvru.py index 8d5877d..91b7724 100644 --- a/hypervideo_dl/extractor/ntvru.py +++ b/hypervideo_dl/extractor/ntvru.py @@ -21,6 +21,7 @@ class NTVRuIE(InfoExtractor): 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', 'thumbnail': r're:^http://.*\.jpg', 'duration': 136, + 'view_count': int, }, }, { 'url': 'http://www.ntv.ru/video/novosti/750370/', @@ -32,6 +33,7 @@ class NTVRuIE(InfoExtractor): 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', 'thumbnail': r're:^http://.*\.jpg', 'duration': 172, + 'view_count': int, }, }, { 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416', @@ -43,6 +45,7 @@ class NTVRuIE(InfoExtractor): 'description': '«Сегодня». 21 марта 2014 года. 16:00', 'thumbnail': r're:^http://.*\.jpg', 'duration': 1496, + 'view_count': int, }, }, { 'url': 'https://www.ntv.ru/kino/Koma_film/m70281/o336036/video/', @@ -54,6 +57,7 @@ class NTVRuIE(InfoExtractor): 'description': 'Остросюжетный фильм «Кома»', 'thumbnail': r're:^http://.*\.jpg', 'duration': 5592, + 'view_count': int, }, }, { 'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/', @@ -65,6 +69,7 @@ class NTVRuIE(InfoExtractor): 'description': '«Дело врачей»: «Деревце жизни»', 'thumbnail': r're:^http://.*\.jpg', 'duration': 2590, + 'view_count': int, }, }, { # Schemeless file URL @@ -115,6 +120,14 @@ class NTVRuIE(InfoExtractor): 'url': file_, 'filesize': int_or_none(xpath_text(video, './%ssize' % format_id)), }) + hls_manifest = xpath_text(video, './playback/hls') + if hls_manifest: + formats.extend(self._extract_m3u8_formats( + hls_manifest, video_id, m3u8_id='hls', fatal=False)) + dash_manifest = xpath_text(video, './playback/dash') + if dash_manifest: + formats.extend(self._extract_mpd_formats( + dash_manifest, video_id, mpd_id='dash', fatal=False)) return { 'id': xpath_text(video, './id'), diff --git a/hypervideo_dl/extractor/nubilesporn.py b/hypervideo_dl/extractor/nubilesporn.py new file mode 100644 index 0000000..d4f1d9d --- /dev/null +++ b/hypervideo_dl/extractor/nubilesporn.py @@ -0,0 +1,99 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, + format_field, + get_element_by_class, + get_element_by_id, + get_element_html_by_class, + get_elements_by_class, + int_or_none, + try_call, + unified_timestamp, + urlencode_postdata, +) + + +class NubilesPornIE(InfoExtractor): + _NETRC_MACHINE = 'nubiles-porn' + _VALID_URL = r'''(?x) + https://members.nubiles-porn.com/video/watch/(?P<id>\d+) + (?:/(?P<display_id>[\w\-]+-s(?P<season>\d+)e(?P<episode>\d+)))? 
+ ''' + + _TESTS = [{ + 'url': 'https://members.nubiles-porn.com/video/watch/165320/trying-to-focus-my-one-track-mind-s3e1', + 'md5': 'fa7f09da8027c35e4bdf0f94f55eac82', + 'info_dict': { + 'id': '165320', + 'title': 'Trying To Focus My One Track Mind - S3:E1', + 'ext': 'mp4', + 'display_id': 'trying-to-focus-my-one-track-mind-s3e1', + 'thumbnail': 'https://images.nubiles-porn.com/videos/trying_to_focus_my_one_track_mind/samples/cover1280.jpg', + 'description': 'md5:81f3d4372e0e39bff5c801da277a5141', + 'timestamp': 1676160000, + 'upload_date': '20230212', + 'channel': 'Younger Mommy', + 'channel_id': '64', + 'channel_url': 'https://members.nubiles-porn.com/video/website/64', + 'like_count': int, + 'average_rating': float, + 'age_limit': 18, + 'categories': ['Big Boobs', 'Big Naturals', 'Blowjob', 'Brunette', 'Cowgirl', 'Girl Orgasm', 'Girl-Boy', + 'Glasses', 'Hardcore', 'Milf', 'Shaved Pussy', 'Tattoos', 'YoungerMommy.com'], + 'tags': list, + 'cast': ['Kenzie Love'], + 'availability': 'needs_auth', + 'series': 'Younger Mommy', + 'series_id': '64', + 'season': 'Season 3', + 'season_number': 3, + 'episode': 'Episode 1', + 'episode_number': 1 + } + }] + + def _perform_login(self, username, password): + login_webpage = self._download_webpage('https://nubiles-porn.com/login', video_id=None) + inputs = self._hidden_inputs(login_webpage) + inputs.update({'username': username, 'password': password}) + self._request_webpage('https://nubiles-porn.com/authentication/login', None, data=urlencode_postdata(inputs)) + + def _real_extract(self, url): + url_match = self._match_valid_url(url) + video_id = url_match.group('id') + page = self._download_webpage(url, video_id) + + media_entries = self._parse_html5_media_entries( + url, get_element_by_class('watch-page-video-wrapper', page), video_id)[0] + + channel_id, channel_name = self._search_regex( + r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page), + 'channel', fatal=False, group=('id', 'name')) or (None, None) + channel_name = re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name) + + return { + 'id': video_id, + 'title': self._search_regex('<h2>([^<]+)</h2>', page, 'title', fatal=False), + 'formats': media_entries.get('formats'), + 'display_id': url_match.group('display_id'), + 'thumbnail': media_entries.get('thumbnail'), + 'description': clean_html(get_element_html_by_class('content-pane-description', page)), + 'timestamp': unified_timestamp(get_element_by_class('date', page)), + 'channel': channel_name, + 'channel_id': channel_id, + 'channel_url': format_field(channel_id, None, 'https://members.nubiles-porn.com/video/website/%s'), + 'like_count': int_or_none(get_element_by_id('likecount', page)), + 'average_rating': float_or_none(get_element_by_class('score', page)), + 'age_limit': 18, + 'categories': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_element_by_class('categories', page))))), + 'tags': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_elements_by_class('tags', page)[1])))), + 'cast': get_elements_by_class('content-pane-performer', page), + 'availability': 'needs_auth', + 'series': channel_name, + 'series_id': channel_id, + 'season_number': int_or_none(url_match.group('season')), + 'episode_number': int_or_none(url_match.group('episode')) + } diff --git a/hypervideo_dl/extractor/nzonscreen.py b/hypervideo_dl/extractor/nzonscreen.py new file mode 100644 index 0000000..6926bc5 --- /dev/null +++ b/hypervideo_dl/extractor/nzonscreen.py @@ -0,0 +1,93 @@ 
+from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + remove_end, + strip_or_none, + traverse_obj, + url_or_none, +) + + +class NZOnScreenIE(InfoExtractor): + _VALID_URL = r'^https://www\.nzonscreen\.com/title/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.nzonscreen.com/title/shoop-shoop-diddy-wop-cumma-cumma-wang-dang-1982', + 'info_dict': { + 'id': '726ed6585c6bfb30', + 'ext': 'mp4', + 'format_id': 'hi', + 'display_id': 'shoop-shoop-diddy-wop-cumma-cumma-wang-dang-1982', + 'title': 'Monte Video - "Shoop Shoop, Diddy Wop"', + 'description': 'Monte Video - "Shoop Shoop, Diddy Wop"', + 'alt_title': 'Shoop Shoop Diddy Wop Cumma Cumma Wang Dang | Music Video', + 'thumbnail': r're:https://www\.nzonscreen\.com/content/images/.+\.jpg', + 'duration': 158, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.nzonscreen.com/title/shes-a-mod-1964?collection=best-of-the-60s', + 'info_dict': { + 'id': '3dbe709ff03c36f1', + 'ext': 'mp4', + 'format_id': 'hi', + 'display_id': 'shes-a-mod-1964', + 'title': 'Ray Columbus - \'She\'s A Mod\'', + 'description': 'Ray Columbus - \'She\'s A Mod\'', + 'alt_title': 'She\'s a Mod | Music Video', + 'thumbnail': r're:https://www\.nzonscreen\.com/content/images/.+\.jpg', + 'duration': 130, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.nzonscreen.com/title/puha-and-pakeha-1968/overview', + 'info_dict': { + 'id': 'f86342544385ad8a', + 'ext': 'mp4', + 'format_id': 'hi', + 'display_id': 'puha-and-pakeha-1968', + 'title': 'Looking At New Zealand - Puha and Pakeha', + 'alt_title': 'Looking at New Zealand - \'Pūhā and Pākehā\' | Television', + 'description': 'An excerpt from this television programme.', + 'duration': 212, + 'thumbnail': r're:https://www\.nzonscreen\.com/content/images/.+\.jpg', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _extract_formats(self, playlist): + for quality, (id_, url) in enumerate(traverse_obj( + playlist, ('h264', {'lo': 'lo_res', 'hi': 'hi_res'}), expected_type=url_or_none).items()): + yield { + 'url': url, + 'format_id': id_, + 'ext': 'mp4', + 'quality': quality, + 'height': int_or_none(playlist.get('height')) if id_ == 'hi' else None, + 'width': int_or_none(playlist.get('width')) if id_ == 'hi' else None, + 'filesize_approx': float_or_none(traverse_obj(playlist, ('h264', f'{id_}_res_mb')), invscale=1024**2), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + playlist = self._parse_json(self._html_search_regex( + r'data-video-config=\'([^\']+)\'', webpage, 'media data'), video_id) + + return { + 'id': playlist['uuid'], + 'display_id': video_id, + 'title': strip_or_none(playlist.get('label')), + 'description': strip_or_none(playlist.get('description')), + 'alt_title': strip_or_none(remove_end( + self._html_extract_title(webpage, default=None) or self._og_search_title(webpage), + ' | NZ On Screen')), + 'thumbnail': traverse_obj(playlist, ('thumbnail', 'path')), + 'duration': float_or_none(playlist.get('duration')), + 'formats': list(self._extract_formats(playlist)), + 'http_headers': { + 'Referer': 'https://www.nzonscreen.com/', + 'Origin': 'https://www.nzonscreen.com/', + } + } diff --git a/hypervideo_dl/extractor/odkmedia.py b/hypervideo_dl/extractor/odkmedia.py new file mode 100644 index 0000000..b852160 --- /dev/null +++ b/hypervideo_dl/extractor/odkmedia.py @@ -0,0 +1,105 @@ +import json + +from .common import InfoExtractor +from ..networking.exceptions import 
HTTPError +from ..utils import ( + ExtractorError, + GeoRestrictedError, + float_or_none, + traverse_obj, + try_call +) + + +class OnDemandChinaEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://www\.ondemandchina\.com/\w+/watch/(?P<series>[\w-]+)/(?P<id>ep-(?P<ep>\d+))' + _TESTS = [{ + 'url': 'https://www.ondemandchina.com/en/watch/together-against-covid-19/ep-1', + 'info_dict': { + 'id': '264394', + 'ext': 'mp4', + 'duration': 3256.88, + 'title': 'EP 1 The Calling', + 'alt_title': '第1集 令出如山', + 'thumbnail': 'https://d2y2efdi5wgkcl.cloudfront.net/fit-in/256x256/media-io/2020/9/11/image.d9816e81.jpg', + 'description': '疫情严峻,党政军民学、东西南北中协同应考', + 'tags': ['Social Humanities', 'Documentary', 'Medical', 'Social'], + } + }] + + _QUERY = ''' + query Episode($programSlug: String!, $episodeNumber: Int!) { + episode( + programSlug: $programSlug + episodeNumber: $episodeNumber + kind: "series" + part: null + ) { + id + title + titleEn + titleKo + titleZhHans + titleZhHant + synopsis + synopsisEn + synopsisKo + synopsisZhHans + synopsisZhHant + videoDuration + images { + thumbnail + } + } + }''' + + def _real_extract(self, url): + program_slug, display_id, ep_number = self._match_valid_url(url).group('series', 'id', 'ep') + webpage = self._download_webpage(url, display_id) + + video_info = self._download_json( + 'https://odc-graphql.odkmedia.io/graphql', display_id, + headers={'Content-type': 'application/json'}, + data=json.dumps({ + 'operationName': 'Episode', + 'query': self._QUERY, + 'variables': { + 'programSlug': program_slug, + 'episodeNumber': int(ep_number), + }, + }).encode())['data']['episode'] + + try: + source_json = self._download_json( + f'https://odkmedia.io/odc/api/v2/playback/{video_info["id"]}/', display_id, + headers={'Authorization': '', 'service-name': 'odc'}) + except ExtractorError as e: + if isinstance(e.cause, HTTPError): + error_data = self._parse_json(e.cause.response.read(), display_id)['detail'] + raise GeoRestrictedError(error_data) + + formats, subtitles = [], {} + for source in traverse_obj(source_json, ('sources', ...)): + if source.get('type') == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(source.get('url'), display_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + self.report_warning(f'Unsupported format {source.get("type")}', display_id) + + return { + 'id': str(video_info['id']), + 'duration': float_or_none(video_info.get('videoDuration'), 1000), + 'thumbnail': (traverse_obj(video_info, ('images', 'thumbnail')) + or self._html_search_meta(['og:image', 'twitter:image'], webpage)), + 'title': (traverse_obj(video_info, 'title', 'titleEn') + or self._html_search_meta(['og:title', 'twitter:title'], webpage) + or self._html_extract_title(webpage)), + 'alt_title': traverse_obj(video_info, 'titleKo', 'titleZhHans', 'titleZhHant'), + 'description': (traverse_obj( + video_info, 'synopsisEn', 'synopsisKo', 'synopsisZhHans', 'synopsisZhHant', 'synopisis') + or self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)), + 'formats': formats, + 'subtitles': subtitles, + 'tags': try_call(lambda: self._html_search_meta('keywords', webpage).split(', ')) + } diff --git a/hypervideo_dl/extractor/odnoklassniki.py b/hypervideo_dl/extractor/odnoklassniki.py index 4f325f0..1be45d8 100644 --- a/hypervideo_dl/extractor/odnoklassniki.py +++ b/hypervideo_dl/extractor/odnoklassniki.py @@ -1,3 +1,5 @@ +import urllib.parse + from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, @@ 
-5,15 +7,18 @@ from ..compat import ( compat_urllib_parse_unquote, compat_urllib_parse_urlparse, ) +from ..networking import HEADRequest from ..utils import ( ExtractorError, float_or_none, int_or_none, qualities, smuggle_url, + traverse_obj, unescapeHTML, unified_strdate, unsmuggle_url, + url_or_none, urlencode_postdata, ) @@ -40,7 +45,7 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1545580896, 'view_count': int, - 'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'title': 'Народная забава', 'uploader': 'Nevata', 'upload_date': '20181223', @@ -64,13 +69,14 @@ class OdnoklassnikiIE(InfoExtractor): 'title': str, 'uploader': str, }, + 'skip': 'vk extractor error', }, { - # metadata in JSON + # metadata in JSON, webm_dash with Firefox UA 'url': 'http://ok.ru/video/20079905452', - 'md5': '5d2b64756e2af296e3b383a0bc02a6aa', + 'md5': '8f477d8931c531374a3e36daec617b2c', 'info_dict': { 'id': '20079905452', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Культура меняет нас (прекрасный ролик!))', 'thumbnail': str, 'duration': 100, @@ -80,10 +86,14 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, + 'params': { + 'format': 'bv[ext=webm]', + 'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'}, + }, }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', - 'md5': 'f8c951122516af72e6e6ffdd3c41103b', + 'md5': '2bae2f58eefe1b3d26f3926c4a64d2f3', 'info_dict': { 'id': '63567059965189-0', 'ext': 'mp4', @@ -97,10 +107,11 @@ class OdnoklassnikiIE(InfoExtractor): 'age_limit': 0, 'start_time': 5, }, + 'params': {'skip_download': 'm3u8'}, }, { # YouTube embed (metadataUrl, provider == USER_YOUTUBE) 'url': 'https://ok.ru/video/3952212382174', - 'md5': '91749d0bd20763a28d083fa335bbd37a', + 'md5': '5fb5f83ce16cb212d6bf887282b5da53', 'info_dict': { 'id': '5axVgHHDBvU', 'ext': 'mp4', @@ -115,7 +126,7 @@ class OdnoklassnikiIE(InfoExtractor): 'live_status': 'not_live', 'view_count': int, 'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8', - 'uploader_url': 'http://www.youtube.com/user/MrKewlkid94', + 'uploader_url': 'https://www.youtube.com/@MrKewlkid94', 'channel_follower_count': int, 'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'], 'channel_id': 'UCVGtvURtEURYHtJFUegdSug', @@ -144,7 +155,6 @@ class OdnoklassnikiIE(InfoExtractor): }, 'skip': 'Video has not been found', }, { - # TODO: HTTP Error 400: Bad Request, it only works if there's no cookies when downloading 'note': 'Only available in mobile webpage', 'url': 'https://m.ok.ru/video/2361249957145', 'info_dict': { @@ -152,6 +162,26 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'title': 'Быковское крещение', 'duration': 3038.181, + 'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+', + }, + }, { + 'note': 'subtitles', + 'url': 'https://ok.ru/video/4249587550747', + 'info_dict': { + 'id': '4249587550747', + 'ext': 'mp4', + 'title': 'Small Country An African Childhood (2020) (1080p) +subtitle', + 'uploader': 'Sunflower Movies', + 'uploader_id': '595802161179', + 'upload_date': '20220816', + 'duration': 6728, + 'age_limit': 0, + 'thumbnail': r're:^https?://i\.mycdn\.me/videoPreview\?.+', + 'like_count': int, + 'subtitles': dict, + }, + 'params': { + 'skip_download': True, }, }, { 
'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', @@ -202,8 +232,15 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': 0, 'duration': 10444, }, + 'skip': 'Site no longer embeds', }] + def _clear_cookies(self, cdn_url): + # Direct http downloads will fail if CDN cookies are set + # so we need to reset them after each format extraction + self.cookiejar.clear(domain='.mycdn.me') + self.cookiejar.clear(domain=urllib.parse.urlparse(cdn_url).hostname) + @classmethod def _extract_embed_urls(cls, url, webpage): for x in super()._extract_embed_urls(url, webpage): @@ -294,6 +331,16 @@ class OdnoklassnikiIE(InfoExtractor): like_count = int_or_none(metadata.get('likeCount')) + subtitles = {} + for sub in traverse_obj(metadata, ('movie', 'subtitleTracks', ...), expected_type=dict): + sub_url = sub.get('url') + if not sub_url: + continue + subtitles.setdefault(sub.get('language') or 'en', []).append({ + 'url': sub_url, + 'ext': 'vtt', + }) + info = { 'id': video_id, 'title': title, @@ -305,6 +352,7 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': like_count, 'age_limit': age_limit, 'start_time': start_time, + 'subtitles': subtitles, } # pladform @@ -331,14 +379,22 @@ class OdnoklassnikiIE(InfoExtractor): formats = [{ 'url': f['url'], 'ext': 'mp4', - 'format_id': f['name'], - } for f in metadata['videos']] + 'format_id': f.get('name'), + } for f in traverse_obj(metadata, ('videos', lambda _, v: url_or_none(v['url'])))] - m3u8_url = metadata.get('hlsManifestUrl') + m3u8_url = traverse_obj(metadata, 'hlsManifestUrl', 'ondemandHls') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._clear_cookies(m3u8_url) + + for mpd_id, mpd_key in [('dash', 'ondemandDash'), ('webm', 'metadataWebmUrl')]: + mpd_url = metadata.get(mpd_key) + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, video_id, mpd_id=mpd_id, fatal=False)) + self._clear_cookies(mpd_url) dash_manifest = metadata.get('metadataEmbedded') if dash_manifest: @@ -357,6 +413,7 @@ class OdnoklassnikiIE(InfoExtractor): if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + self._clear_cookies(m3u8_url) rtmp_url = metadata.get('rtmpUrl') if rtmp_url: formats.append({ @@ -390,6 +447,10 @@ class OdnoklassnikiIE(InfoExtractor): r'data-video="(.+?)"', webpage, 'json data') json_data = self._parse_json(unescapeHTML(json_data), video_id) or {} + redirect_url = self._request_webpage(HEADRequest( + json_data['videoSrc']), video_id, 'Requesting download URL').url + self._clear_cookies(redirect_url) + return { 'id': video_id, 'title': json_data.get('videoName'), @@ -397,7 +458,7 @@ class OdnoklassnikiIE(InfoExtractor): 'thumbnail': json_data.get('videoPosterSrc'), 'formats': [{ 'format_id': 'mobile', - 'url': json_data.get('videoSrc'), + 'url': redirect_url, 'ext': 'mp4', }] } diff --git a/hypervideo_dl/extractor/oneplace.py b/hypervideo_dl/extractor/oneplace.py new file mode 100644 index 0000000..86337ad --- /dev/null +++ b/hypervideo_dl/extractor/oneplace.py @@ -0,0 +1,43 @@ +from .common import InfoExtractor + + +class OnePlacePodcastIE(InfoExtractor): + _VALID_URL = r'https?://www\.oneplace\.com/[\w]+/[^/]+/listen/[\w-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.oneplace.com/ministries/a-daily-walk/listen/living-in-the-last-days-part-2-958461.html', + 'info_dict': { + 'id': '958461', + 'ext': 'mp3', + 'title': 'Living in the Last Days Part 2 | A Daily Walk with John Randall', + 
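# Editorial sketch, not from the diff: the _clear_cookies helper added to the
# Odnoklassniki extractor above exists because direct HTTP downloads from
# mycdn.me fail once CDN cookies are set, so they are dropped after each
# manifest fetch. The same idea on a plain http.cookiejar jar, with
# clear_cdn_cookies as a hypothetical helper name (the extractor's own
# cookiejar.clear() tolerates unknown domains):
import urllib.parse
from http.cookiejar import CookieJar

def clear_cdn_cookies(jar: CookieJar, cdn_url: str) -> None:
    for domain in ('.mycdn.me', urllib.parse.urlparse(cdn_url).hostname):
        try:
            jar.clear(domain=domain)
        except KeyError:  # stdlib clear() raises if no cookie matches the domain
            pass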
'description': 'md5:fbb8f1cf21447ac54ecaa2887fc20c6e', + } + }, { + 'url': 'https://www.oneplace.com/ministries/ankerberg-show/listen/ep-3-relying-on-the-constant-companionship-of-the-holy-spirit-part-2-922513.html', + 'info_dict': { + 'id': '922513', + 'ext': 'mp3', + 'description': 'md5:8b810b4349aa40a5d033b4536fe428e1', + 'title': 'md5:ce10f7d8d5ddcf485ed8905ef109659d', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + return { + 'id': video_id, + 'url': self._search_regex(( + r'mp3-url\s*=\s*"([^"]+)', + r'<div[^>]+id\s*=\s*"player"[^>]+data-media-url\s*=\s*"(?P<media_url>[^"]+)', + ), webpage, 'media url'), + 'ext': 'mp3', + 'vcodec': 'none', + 'title': self._html_search_regex(( + r'<div[^>]class\s*=\s*"details"[^>]+>[^<]<h2[^>]+>(?P<content>[^>]+)>', + self._meta_regex('og:title'), self._meta_regex('title'), + ), webpage, 'title', group='content', default=None), + 'description': self._html_search_regex( + r'<div[^>]+class="[^"]+epDesc"[^>]*>\s*(?P<desc>.+?)\s*</div>', + webpage, 'description', default=None), + } diff --git a/hypervideo_dl/extractor/opencast.py b/hypervideo_dl/extractor/opencast.py index fa46757..1fafd9a 100644 --- a/hypervideo_dl/extractor/opencast.py +++ b/hypervideo_dl/extractor/opencast.py @@ -55,9 +55,9 @@ class OpencastBaseIE(InfoExtractor): transport = track.get('transport') if transport == 'DASH' or ext == 'mpd': - formats.extend(self._extract_mpd_formats_and_subtitles(href, video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_mpd_formats(href, video_id, mpd_id='dash', fatal=False)) elif transport == 'HLS' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats_and_subtitles( + formats.extend(self._extract_m3u8_formats( href, video_id, m3u8_id='hls', entry_protocol='m3u8_native', fatal=False)) elif transport == 'HDS' or ext == 'f4m': formats.extend(self._extract_f4m_formats(href, video_id, f4m_id='hds', fatal=False)) @@ -105,10 +105,9 @@ class OpencastBaseIE(InfoExtractor): class OpencastIE(OpencastBaseIE): - _VALID_URL = r'''(?x) - https?://(?P<host>%s)/paella/ui/watch.html\?.*? - id=(?P<id>%s) - ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + _VALID_URL = rf'''(?x) + https?://(?P<host>{OpencastBaseIE._INSTANCES_RE})/paella/ui/watch\.html\? + (?:[^#]+&)?id=(?P<id>{OpencastBaseIE._UUID_RE})''' _API_BASE = 'https://%s/search/episode.json?id=%s' @@ -123,6 +122,9 @@ class OpencastIE(OpencastBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1606208400, 'upload_date': '20201124', + 'season_id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'series': 'Kryptographie - WiSe 15/16', + 'creator': 'Alexander May', }, } ] @@ -134,10 +136,11 @@ class OpencastIE(OpencastBaseIE): class OpencastPlaylistIE(OpencastBaseIE): - _VALID_URL = r'''(?x) - https?://(?P<host>%s)/engage/ui/index.html\?.*? 
- epFrom=(?P<id>%s) - ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + _VALID_URL = rf'''(?x) + https?://(?P<host>{OpencastBaseIE._INSTANCES_RE})(?: + /engage/ui/index\.html\?(?:[^#]+&)?epFrom=| + /ltitools/index\.html\?(?:[^#]+&)?series= + )(?P<id>{OpencastBaseIE._UUID_RE})''' _API_BASE = 'https://%s/search/episode.json?sid=%s' @@ -148,15 +151,23 @@ class OpencastPlaylistIE(OpencastBaseIE): 'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', 'title': 'Kryptographie - WiSe 15/16', }, - 'playlist_mincount': 28, + 'playlist_mincount': 29, }, { - 'url': 'https://oc-video.ruhr-uni-bochum.de/engage/ui/index.html?e=1&p=1&epFrom=b1a54262-3684-403f-9731-8e77c3766f9a', + 'url': 'https://oc-video1.ruhr-uni-bochum.de/ltitools/index.html?subtool=series&series=cf68a4a1-36b1-4a53-a6ba-61af5705a0d0&lng=de', 'info_dict': { - 'id': 'b1a54262-3684-403f-9731-8e77c3766f9a', - 'title': 'inSTUDIES-Social movements and prefigurative politics in a global perspective', + 'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'title': 'Kryptographie - WiSe 15/16', + }, + 'playlist_mincount': 29, + }, + { + 'url': 'https://electures.uni-muenster.de/engage/ui/index.html?e=1&p=1&epFrom=39391d10-a711-4d23-b21d-afd2ed7d758c', + 'info_dict': { + 'id': '39391d10-a711-4d23-b21d-afd2ed7d758c', + 'title': '021670 Theologische Themen bei Hans Blumenberg WiSe 2017/18', }, - 'playlist_mincount': 6, + 'playlist_mincount': 13, }, ] diff --git a/hypervideo_dl/extractor/orf.py b/hypervideo_dl/extractor/orf.py index e9d23a4..cc3c003 100644 --- a/hypervideo_dl/extractor/orf.py +++ b/hypervideo_dl/extractor/orf.py @@ -2,11 +2,11 @@ import functools import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( clean_html, determine_ext, float_or_none, - HEADRequest, InAdvancePagedList, int_or_none, join_nonempty, diff --git a/hypervideo_dl/extractor/owncloud.py b/hypervideo_dl/extractor/owncloud.py new file mode 100644 index 0000000..79fd830 --- /dev/null +++ b/hypervideo_dl/extractor/owncloud.py @@ -0,0 +1,80 @@ +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext, + url_or_none, + urlencode_postdata, +) + + +class OwnCloudIE(InfoExtractor): + _INSTANCES_RE = '|'.join(( + r'(?:[^\.]+\.)?sciebo\.de', + r'cloud\.uni-koblenz-landau\.de', + )) + _VALID_URL = rf'https?://(?:{_INSTANCES_RE})/s/(?P<id>[\w.-]+)' + + _TESTS = [ + { + 'url': 'https://ruhr-uni-bochum.sciebo.de/s/wWhqZzh9jTumVFN', + 'info_dict': { + 'id': 'wWhqZzh9jTumVFN', + 'ext': 'mp4', + 'title': 'CmvpJST.mp4', + }, + }, + { + 'url': 'https://ruhr-uni-bochum.sciebo.de/s/WNDuFu0XuFtmm3f', + 'info_dict': { + 'id': 'WNDuFu0XuFtmm3f', + 'ext': 'mp4', + 'title': 'CmvpJST.mp4', + }, + 'params': { + 'videopassword': '12345', + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage, urlh = self._download_webpage_handle(url, video_id) + + if re.search(r'<label[^>]+for="password"', webpage): + webpage = self._verify_video_password(webpage, urlh.url, video_id) + + hidden_inputs = self._hidden_inputs(webpage) + title = hidden_inputs.get('filename') + parsed_url = urllib.parse.urlparse(url) + + return { + 'id': video_id, + 'title': title, + 'url': url_or_none(hidden_inputs.get('downloadURL')) or parsed_url._replace( + path=urllib.parse.urljoin(parsed_url.path, 'download')).geturl(), + 'ext': determine_ext(title), + } + + def _verify_video_password(self, webpage, url, video_id): + password = self.get_param('videopassword') + if password is 
None: + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', + expected=True) + + validation_response = self._download_webpage( + url, video_id, 'Validating Password', 'Wrong password?', + data=urlencode_postdata({ + 'requesttoken': self._hidden_inputs(webpage)['requesttoken'], + 'password': password, + })) + + if re.search(r'<label[^>]+for="password"', validation_response): + warning = self._search_regex( + r'<div[^>]+class="warning">([^<]*)</div>', validation_response, + 'warning', default='The password is wrong') + raise ExtractorError(f'Opening the video failed, {self.IE_NAME} said: {warning!r}', expected=True) + return validation_response diff --git a/hypervideo_dl/extractor/packtpub.py b/hypervideo_dl/extractor/packtpub.py index 51778d8..5620330 100644 --- a/hypervideo_dl/extractor/packtpub.py +++ b/hypervideo_dl/extractor/packtpub.py @@ -1,10 +1,7 @@ import json from .common import InfoExtractor -from ..compat import ( - # compat_str, - compat_HTTPError, -) +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, ExtractorError, @@ -54,8 +51,8 @@ class PacktPubIE(PacktPubBaseIE): 'password': password, }).encode())['data']['access'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 404): - message = self._parse_json(e.cause.read().decode(), None)['message'] + if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401, 404): + message = self._parse_json(e.cause.response.read().decode(), None)['message'] raise ExtractorError(message, expected=True) raise @@ -70,7 +67,7 @@ class PacktPubIE(PacktPubBaseIE): 'https://services.packtpub.com/products-v1/products/%s/%s/%s' % (course_id, chapter_id, video_id), video_id, 'Downloading JSON video', headers=headers)['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: self.raise_login_required('This video is locked') raise diff --git a/hypervideo_dl/extractor/panopto.py b/hypervideo_dl/extractor/panopto.py index 32c103b..6e3c9f4 100644 --- a/hypervideo_dl/extractor/panopto.py +++ b/hypervideo_dl/extractor/panopto.py @@ -412,7 +412,7 @@ class PanoptoIE(PanoptoBaseIE): return { 'id': video_id, 'title': delivery.get('SessionName'), - 'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), default=[], expected_type=lambda x: x or None), + 'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), expected_type=lambda x: x or None), 'timestamp': session_start_time - 11640000000 if session_start_time else None, 'duration': delivery.get('Duration'), 'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}', @@ -563,7 +563,7 @@ class PanoptoListIE(PanoptoBaseIE): base_url, '/Services/Data.svc/GetFolderInfo', folder_id, data={'folderID': folder_id}, fatal=False) return { - 'title': get_first(response, 'Name', default=[]) + 'title': get_first(response, 'Name') } def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/parler.py b/hypervideo_dl/extractor/parler.py index 68a60bc..2af805e 100644 --- a/hypervideo_dl/extractor/parler.py +++ b/hypervideo_dl/extractor/parler.py @@ -1,13 +1,14 @@ +import functools + from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( clean_html, - format_field, int_or_none, strip_or_none, traverse_obj, unified_timestamp, - urlencode_postdata, + urljoin, ) @@ -24,7 
+25,7 @@ class ParlerIE(InfoExtractor): 'thumbnail': 'https://bl-images.parler.com/videos/6ce7cdf3-a27a-4d72-bf9c-d3e17ce39a66/thumbnail.jpeg', 'title': 'Parler video #df79fdba-07cc-48fe-b085-3293897520d7', 'description': 'md5:6f220bde2df4a97cbb89ac11f1fd8197', - 'timestamp': 1659744000, + 'timestamp': 1659785481, 'upload_date': '20220806', 'uploader': 'Tulsi Gabbard', 'uploader_id': 'TulsiGabbard', @@ -35,77 +36,56 @@ class ParlerIE(InfoExtractor): }, }, { - 'url': 'https://parler.com/feed/a7406eb4-91e5-4793-b5e3-ade57a24e287', - 'md5': '11687e2f5bb353682cee338d181422ed', - 'info_dict': { - 'id': 'a7406eb4-91e5-4793-b5e3-ade57a24e287', - 'ext': 'mp4', - 'thumbnail': 'https://bl-images.parler.com/videos/317827a8-1e48-4cbc-981f-7dd17d4c1183/thumbnail.jpeg', - 'title': 'Parler video #a7406eb4-91e5-4793-b5e3-ade57a24e287', - 'description': 'This man should run for office', - 'timestamp': 1659657600, - 'upload_date': '20220805', - 'uploader': 'Benny Johnson', - 'uploader_id': 'BennyJohnson', - 'uploader_url': 'https://parler.com/BennyJohnson', - 'view_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, - { 'url': 'https://parler.com/feed/f23b85c1-6558-470f-b9ff-02c145f28da5', 'md5': 'eaba1ff4a10fe281f5ce74e930ab2cb4', 'info_dict': { 'id': 'r5vkSaz8PxQ', 'ext': 'mp4', - 'thumbnail': 'https://i.ytimg.com/vi_webp/r5vkSaz8PxQ/maxresdefault.webp', - 'title': 'Tom MacDonald Names Reaction', - 'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea', - 'upload_date': '20220716', - 'duration': 1267, - 'uploader': 'Mahesh Chookolingo', - 'uploader_id': 'maheshchookolingo', - 'uploader_url': 'http://www.youtube.com/user/maheshchookolingo', - 'channel': 'Mahesh Chookolingo', - 'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w', - 'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w', - 'categories': ['Entertainment'], - 'tags': list, - 'availability': 'public', 'live_status': 'not_live', - 'view_count': int, 'comment_count': int, + 'duration': 1267, 'like_count': int, 'channel_follower_count': int, - 'age_limit': 0, + 'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w', + 'upload_date': '20220716', + 'thumbnail': 'https://i.ytimg.com/vi/r5vkSaz8PxQ/maxresdefault.jpg', + 'tags': 'count:17', + 'availability': 'public', + 'categories': ['Entertainment'], 'playable_in_embed': True, + 'channel': 'Who Knows What! With Mahesh & Friends', + 'title': 'Tom MacDonald Names Reaction', + 'uploader': 'Who Knows What! 
With Mahesh & Friends', + 'uploader_id': '@maheshchookolingo', + 'age_limit': 0, + 'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea', + 'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w', + 'view_count': int, + 'uploader_url': 'http://www.youtube.com/@maheshchookolingo', }, }, ] def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'https://parler.com/open-api/ParleyDetailEndpoint.php', video_id, - data=urlencode_postdata({'uuid': video_id}))['data'][0] - primary = data['primary'] - - embed = self._parse_json(primary.get('V2LINKLONG') or '', video_id, fatal=False) - if embed: - return self.url_result(embed[0], YoutubeIE) + data = self._download_json(f'https://api.parler.com/v0/public/parleys/{video_id}', + video_id)['data'] + if data.get('link'): + return self.url_result(data['link'], YoutubeIE) return { 'id': video_id, - 'url': traverse_obj(primary, ('video_data', 'videoSrc')), - 'thumbnail': traverse_obj(primary, ('video_data', 'thumbnailUrl')), - 'title': '', - 'description': strip_or_none(clean_html(primary.get('full_body'))) or None, - 'timestamp': unified_timestamp(primary.get('date_created')), - 'uploader': strip_or_none(primary.get('name')), - 'uploader_id': strip_or_none(primary.get('username')), - 'uploader_url': format_field(strip_or_none(primary.get('username')), None, 'https://parler.com/%s'), - 'view_count': int_or_none(primary.get('view_count')), - 'comment_count': int_or_none(traverse_obj(data, ('engagement', 'commentCount'))), - 'repost_count': int_or_none(traverse_obj(data, ('engagement', 'echoCount'))), + 'title': strip_or_none(data.get('title')) or '', + **traverse_obj(data, { + 'url': ('video', 'videoSrc'), + 'thumbnail': ('video', 'thumbnailUrl'), + 'description': ('body', {clean_html}), + 'timestamp': ('date_created', {unified_timestamp}), + 'uploader': ('user', 'name', {strip_or_none}), + 'uploader_id': ('user', 'username', {str}), + 'uploader_url': ('user', 'username', {functools.partial(urljoin, 'https://parler.com/')}), + 'view_count': ('views', {int_or_none}), + 'comment_count': ('total_comments', {int_or_none}), + 'repost_count': ('echos', {int_or_none}), + }) } diff --git a/hypervideo_dl/extractor/patreon.py b/hypervideo_dl/extractor/patreon.py index 4dc0298..79b041d 100644 --- a/hypervideo_dl/extractor/patreon.py +++ b/hypervideo_dl/extractor/patreon.py @@ -1,22 +1,22 @@ import itertools -from urllib.error import HTTPError from .common import InfoExtractor from .vimeo import VimeoIE - from ..compat import compat_urllib_parse_unquote +from ..networking.exceptions import HTTPError from ..utils import ( + KNOWN_EXTENSIONS, + ExtractorError, clean_html, determine_ext, - ExtractorError, int_or_none, - KNOWN_EXTENSIONS, mimetype2ext, parse_iso8601, str_or_none, traverse_obj, try_get, url_or_none, + urljoin, ) @@ -37,9 +37,9 @@ class PatreonBaseIE(InfoExtractor): item_id, note='Downloading API JSON' if not note else note, query=query, fatal=fatal, headers=headers) except ExtractorError as e: - if not isinstance(e.cause, HTTPError) or mimetype2ext(e.cause.headers.get('Content-Type')) != 'json': + if not isinstance(e.cause, HTTPError) or mimetype2ext(e.cause.response.headers.get('Content-Type')) != 'json': raise - err_json = self._parse_json(self._webpage_read_content(e.cause, None, item_id), item_id, fatal=False) + err_json = self._parse_json(self._webpage_read_content(e.cause.response, None, item_id), item_id, fatal=False) err_message = traverse_obj(err_json, ('errors', ..., 'detail'), get_all=False) 
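# Editorial sketch, not part of the patch: the change recurring in this patreon
# hunk, and in the neighbouring packtpub, peloton and playplustv hunks, swaps
# the file-like compat_HTTPError (read() on the exception itself, status in
# .code) for ..networking.exceptions.HTTPError, where the status lives in
# .status and the body is read from .response. The new idiom, assuming this
# fork's import paths and a hypothetical api_error_detail helper:
from hypervideo_dl.networking.exceptions import HTTPError
from hypervideo_dl.utils import ExtractorError

def api_error_detail(e: ExtractorError):
    # Return the server's error body for auth-style failures, else None
    if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401, 403, 404):
        return e.cause.response.read().decode()
    return None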
if err_message: raise ExtractorError(f'Patreon said: {err_message}', expected=True) @@ -310,7 +310,7 @@ class PatreonIE(PatreonBaseIE): f'posts/{post_id}/comments', post_id, query=params, note='Downloading comments page %d' % page) cursor = None - for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...), default=[]): + for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...)): count += 1 comment_id = comment.get('id') attributes = comment.get('attributes') or {} @@ -404,8 +404,8 @@ class PatreonCampaignIE(PatreonBaseIE): posts_json = self._call_api('posts', campaign_id, query=params, note='Downloading posts page %d' % page) cursor = traverse_obj(posts_json, ('meta', 'pagination', 'cursors', 'next')) - for post in posts_json.get('data') or []: - yield self.url_result(url_or_none(traverse_obj(post, ('attributes', 'patreon_url'))), 'Patreon') + for post_url in traverse_obj(posts_json, ('data', ..., 'attributes', 'patreon_url')): + yield self.url_result(urljoin('https://www.patreon.com/', post_url), PatreonIE) if cursor is None: break diff --git a/hypervideo_dl/extractor/pbs.py b/hypervideo_dl/extractor/pbs.py index 5bdf561..2bb2ea9 100644 --- a/hypervideo_dl/extractor/pbs.py +++ b/hypervideo_dl/extractor/pbs.py @@ -11,6 +11,7 @@ from ..utils import ( orderedSet, strip_jsonp, strip_or_none, + traverse_obj, unified_strdate, url_or_none, US_RATINGS, @@ -696,3 +697,61 @@ class PBSIE(InfoExtractor): 'subtitles': subtitles, 'chapters': chapters, } + + +class PBSKidsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pbskids\.org/video/[\w-]+/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://pbskids.org/video/molly-of-denali/3030407927', + 'md5': '1ded20a017cc6b53446238f1804ce4c7', + 'info_dict': { + 'id': '3030407927', + 'title': 'Bird in the Hand/Bye-Bye Birdie', + 'channel': 'molly-of-denali', + 'duration': 1540, + 'ext': 'mp4', + 'series': 'Molly of Denali', + 'description': 'md5:d006b2211633685d8ebc8d03b6d5611e', + 'categories': ['Episode'], + 'upload_date': '20190718', + } + }, + { + 'url': 'https://pbskids.org/video/plum-landing/2365205059', + 'md5': '92e5d189851a64ae1d0237a965be71f5', + 'info_dict': { + 'id': '2365205059', + 'title': 'Cooper\'s Favorite Place in Nature', + 'channel': 'plum-landing', + 'duration': 67, + 'ext': 'mp4', + 'series': 'Plum Landing', + 'description': 'md5:657e5fc4356a84ead1c061eb280ff05d', + 'categories': ['Episode'], + 'upload_date': '20140302', + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + meta = self._search_json(r'window\._PBS_KIDS_DEEPLINK\s*=', webpage, 'video info', video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + traverse_obj(meta, ('video_obj', 'URI', {url_or_none})), video_id, ext='mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(meta, { + 'categories': ('video_obj', 'video_type', {str}, {lambda x: [x] if x else None}), + 'channel': ('show_slug', {str}), + 'description': ('video_obj', 'description', {str}), + 'duration': ('video_obj', 'duration', {int_or_none}), + 'series': ('video_obj', 'program_title', {str}), + 'title': ('video_obj', 'title', {str}), + 'upload_date': ('video_obj', 'air_date', {unified_strdate}), + }) + } diff --git a/hypervideo_dl/extractor/peekvids.py b/hypervideo_dl/extractor/peekvids.py index 2d9b9a7..d1fc058 100644 --- a/hypervideo_dl/extractor/peekvids.py +++ 
b/hypervideo_dl/extractor/peekvids.py @@ -1,71 +1,128 @@ +import re + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_class, + int_or_none, + merge_dicts, + url_or_none, +) + + +class PeekVidsBaseIE(InfoExtractor): + def _real_extract(self, url): + domain, video_id = self._match_valid_url(url).group('domain', 'id') + webpage = self._download_webpage(url, video_id, expected_status=429) + if '>Rate Limit Exceeded' in webpage: + raise ExtractorError( + f'You are suspected as a bot. Wait, or pass the captcha on the site and provide cookies. {self._login_hint()}', + video_id=video_id, expected=True) + + title = self._html_search_regex(r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title') + + display_id = video_id + video_id = self._search_regex(r'(?s)<video\b[^>]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID') + srcs = self._download_json( + f'https://www.{domain}/v-alt/{video_id}', video_id, + note='Downloading list of source files') + + formats = [] + for k, v in srcs.items(): + f_url = url_or_none(v) + if not f_url: + continue + + height = self._search_regex(r'^data-src(\d{3,})$', k, 'height', default=None) + if not height: + continue + + formats.append({ + 'url': f_url, + 'format_id': height, + 'height': int_or_none(height), + }) + + if not formats: + formats = [{'url': url} for url in srcs.values()] + info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={}) + info.pop('url', None) -class PeekVidsIE(InfoExtractor): + # may not have found the thumbnail if it was in a list in the ld+json + info.setdefault('thumbnail', self._og_search_thumbnail(webpage)) + detail = (get_element_by_class('detail-video-block', webpage) + or get_element_by_class('detail-block', webpage) or '') + info['description'] = self._html_search_regex( + rf'(?s)(.+?)(?:{re.escape(info.get("description", ""))}\s*<|<ul\b)', + detail, 'description', default=None) or None + info['title'] = re.sub(r'\s*[,-][^,-]+$', '', info.get('title') or title) or self._generic_title(url) + + def cat_tags(name, html): + l = self._html_search_regex( + rf'(?s)<span\b[^>]*>\s*{re.escape(name)}\s*:\s*</span>(.+?)</li>', + html, name, default='') + return list(filter(None, re.split(r'\s+', l))) + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'age_limit': 18, + 'formats': formats, + 'categories': cat_tags('Categories', detail), + 'tags': cat_tags('Tags', detail), + 'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None), + }, info) + + +class PeekVidsIE(PeekVidsBaseIE): _VALID_URL = r'''(?x) - https?://(?:www\.)?peekvids\.com/ + https?://(?:www\.)?(?P<domain>peekvids\.com)/ (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=) (?P<id>[^/?&#]*) ''' _TESTS = [{ 'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd', - 'md5': 'a00940646c428e232407e3e62f0e8ef5', + 'md5': '2ff6a357a9717dc9dc9894b51307e9a2', 'info_dict': { - 'id': 'BSyLMbN0YCd', - 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub', + 'id': '1262717', + 'display_id': 'BSyLMbN0YCd', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', 'ext': 'mp4', 'thumbnail': r're:^https?://.*\.jpg$', - 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp (7 min), uploaded by SEXYhub.com', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', 'timestamp': 1642579329, 'upload_date': '20220119', 'duration': 416, 'view_count': int, 'age_limit': 
18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, + }, + }] - _DOMAIN = 'www.peekvids.com' - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - short_video_id = self._html_search_regex(r'<video [^>]*data-id="(.+?)"', webpage, 'short video ID') - srcs = self._download_json( - f'https://{self._DOMAIN}/v-alt/{short_video_id}', video_id, - note='Downloading list of source files') - formats = [{ - 'url': url, - 'ext': 'mp4', - 'format_id': name[8:], - } for name, url in srcs.items() if len(name) > 8 and name.startswith('data-src')] - if not formats: - formats = [{'url': url} for url in srcs.values()] - info = self._search_json_ld(webpage, video_id, expected_type='VideoObject') - info.update({ - 'id': video_id, - 'age_limit': 18, - 'formats': formats, - }) - return info - - -class PlayVidsIE(PeekVidsIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|[^/]{2}/)?(?P<id>[^/?#]*)' +class PlayVidsIE(PeekVidsBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P<domain>playvids\.com)/(?:embed/|\w\w?/)?(?P<id>[^/?#]*)' _TESTS = [{ 'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', - 'md5': 'cd7dfd8a2e815a45402369c76e3c1825', + 'md5': '2f12e50213dd65f142175da633c4564c', 'info_dict': { - 'id': 'U3pBrYhsjXM', - 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub', + 'id': '1978030', + 'display_id': 'U3pBrYhsjXM', + 'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp', 'ext': 'mp4', 'thumbnail': r're:^https?://.*\.jpg$', - 'description': 'Watch Dane Jones - Cute redhead with perfect tits with Mini Vamp video in HD, uploaded by SEXYhub.com', + 'description': 'md5:0a61df3620de26c0af8963b1a730cd69', 'timestamp': 1640435839, 'upload_date': '20211225', 'duration': 416, 'view_count': int, 'age_limit': 18, + 'uploader': 'SEXYhub.com', + 'categories': list, + 'tags': list, }, }, { 'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', @@ -73,5 +130,61 @@ class PlayVidsIE(PeekVidsIE): # XXX: Do not subclass from concrete IE }, { 'url': 'https://www.playvids.com/embed/U3pBrYhsjXM', 'only_matching': True, + }, { + 'url': 'https://www.playvids.com/bKmGLe3IwjZ/sv/brazzers-800-phone-sex-madison-ivy-always-on-the-line', + 'md5': 'e783986e596cafbf46411a174ab42ba6', + 'info_dict': { + 'id': '762385', + 'display_id': 'bKmGLe3IwjZ', + 'ext': 'mp4', + 'title': 'Brazzers - 1 800 Phone Sex: Madison Ivy Always On The Line 6', + 'description': 'md5:bdcd2db2b8ad85831a491d7c8605dcef', + 'timestamp': 1516958544, + 'upload_date': '20180126', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 480, + 'uploader': 'Brazzers', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/v/47iUho33toY', + 'md5': 'b056b5049d34b648c1e86497cf4febce', + 'info_dict': { + 'id': '700621', + 'display_id': '47iUho33toY', + 'ext': 'mp4', + 'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE', + 'description': None, + 'timestamp': 1507052209, + 'upload_date': '20171003', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 332, + 'uploader': 'Cacerenele', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://www.playvids.com/z3_7iwWCmqt/sexy-teen-filipina-striptease-beautiful-pinay-bargirl-strips-and-dances', + 'md5':
'efa09be9f031314b7b7e3bc6510cd0df', + 'info_dict': { + 'id': '1523518', + 'display_id': 'z3_7iwWCmqt', + 'ext': 'mp4', + 'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances', + 'description': None, + 'timestamp': 1607470323, + 'upload_date': '20201208', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 593, + 'uploader': 'yorours', + 'age_limit': 18, + 'view_count': int, + 'categories': list, + 'tags': list, + }, }] - _DOMAIN = 'www.playvids.com' diff --git a/hypervideo_dl/extractor/peloton.py b/hypervideo_dl/extractor/peloton.py index 4835822..7864299 100644 --- a/hypervideo_dl/extractor/peloton.py +++ b/hypervideo_dl/extractor/peloton.py @@ -3,7 +3,7 @@ import re import urllib.parse from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -83,8 +83,8 @@ class PelotonIE(InfoExtractor): }).encode(), headers={'Content-Type': 'application/json', 'User-Agent': 'web'}) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - json_string = self._webpage_read_content(e.cause, None, video_id) + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + json_string = self._webpage_read_content(e.cause.response, None, video_id) res = self._parse_json(json_string, video_id) raise ExtractorError(res['message'], expected=res['message'] == 'Login failed') else: @@ -96,8 +96,8 @@ class PelotonIE(InfoExtractor): 'https://api.onepeloton.com/api/subscription/stream', video_id, note='Downloading token', data=json.dumps({}).encode(), headers={'Content-Type': 'application/json'}) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - json_string = self._webpage_read_content(e.cause, None, video_id) + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + json_string = self._webpage_read_content(e.cause.response, None, video_id) res = self._parse_json(json_string, video_id) raise ExtractorError(res['message'], expected=res['message'] == 'Stream limit reached') else: @@ -109,7 +109,7 @@ class PelotonIE(InfoExtractor): try: self._start_session(video_id) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: self._login(video_id) self._start_session(video_id) else: diff --git a/hypervideo_dl/extractor/pgatour.py b/hypervideo_dl/extractor/pgatour.py new file mode 100644 index 0000000..36c2c62 --- /dev/null +++ b/hypervideo_dl/extractor/pgatour.py @@ -0,0 +1,47 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + + +class PGATourIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pgatour\.com/video/[\w-]+/(?P<tc>T)?(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.pgatour.com/video/competition/T6322447785112/adam-hadwin-2023-the-players-round-4-18th-hole-shot-1', + 'info_dict': { + 'id': '6322447785112', + 'ext': 'mp4', + 'title': 'Adam Hadwin | 2023 THE PLAYERS | Round 4 | 18th hole | Shot 1', + 'uploader_id': '6116716431001', + 'upload_date': '20230312', + 'timestamp': 1678653136, + 'duration': 20.011, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': 'count:7', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.pgatour.com/video/features/6322506425112/follow-the-players-trophy-on-championship-sunday', + 'info_dict': { + 'id': '6322506425112', + 'ext': 'mp4', + 'title': 'Follow THE PLAYERS trophy on Championship 
Sunday', + 'description': 'md5:4d29e4bdfa03694a0ebfd08950398568', + 'uploader_id': '6082840763001', + 'upload_date': '20230313', + 'timestamp': 1678739835, + 'duration': 123.435, + 'thumbnail': r're:^https://.+\.jpg', + 'tags': 'count:8', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id, is_tourcast = self._match_valid_url(url).group('id', 'tc') + + # From https://www.pgatour.com/_next/static/chunks/pages/_app-8bcf849560daf38d.js + account_id = '6116716431001' if is_tourcast else '6082840763001' + player_id = 'Vsd5Umu8r' if is_tourcast else 'FWIBYMBPj' + + return self.url_result( + f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}', + BrightcoveNewIE) diff --git a/hypervideo_dl/extractor/piapro.py b/hypervideo_dl/extractor/piapro.py index d8d9c78..5f39e06 100644 --- a/hypervideo_dl/extractor/piapro.py +++ b/hypervideo_dl/extractor/piapro.py @@ -12,17 +12,22 @@ from ..utils import ( class PiaproIE(InfoExtractor): _NETRC_MACHINE = 'piapro' - _VALID_URL = r'https?://piapro\.jp/t/(?P<id>\w+)/?' + _VALID_URL = r'https?://piapro\.jp/(?:t|content)/(?P<id>\w+)/?' _TESTS = [{ 'url': 'https://piapro.jp/t/NXYR', - 'md5': 'a9d52f27d13bafab7ee34116a7dcfa77', + 'md5': 'f7c0f760913fb1d44a1c45a4af793909', 'info_dict': { 'id': 'NXYR', 'ext': 'mp3', 'uploader': 'wowaka', 'uploader_id': 'wowaka', 'title': '裏表ラバーズ', - 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'http://www.nicovideo.jp/watch/sm8082467', + 'duration': 189.0, + 'timestamp': 1251785475, + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', + 'upload_date': '20090901', + 'view_count': int, } }, { 'note': 'There are break lines in description, mandating (?s) flag', @@ -34,8 +39,16 @@ class PiaproIE(InfoExtractor): 'title': '青に溶けた風船 / 初音ミク', 'description': 'md5:d395a9bd151447631a5a1460bc7f9132', 'uploader': 'シアン・キノ', + 'duration': 229.0, + 'timestamp': 1644030039, + 'upload_date': '20220205', + 'view_count': int, + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', 'uploader_id': 'cyankino', } + }, { + 'url': 'https://piapro.jp/content/hcw0z3a169wtemz6', + 'only_matching': True }] _login_status = False @@ -56,7 +69,7 @@ class PiaproIE(InfoExtractor): if urlh is False: login_ok = False else: - parts = compat_urlparse.urlparse(urlh.geturl()) + parts = compat_urlparse.urlparse(urlh.url) if parts.path != '/': login_ok = False if not login_ok: diff --git a/hypervideo_dl/extractor/picarto.py b/hypervideo_dl/extractor/picarto.py index 36a062d..d415ba2 100644 --- a/hypervideo_dl/extractor/picarto.py +++ b/hypervideo_dl/extractor/picarto.py @@ -1,7 +1,10 @@ +import urllib.parse + from .common import InfoExtractor from ..utils import ( ExtractorError, - js_to_json, + str_or_none, + traverse_obj, ) @@ -84,7 +87,7 @@ class PicartoIE(InfoExtractor): class PicartoVodIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?:videopopout|\w+/videos)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', @@ -94,6 +97,18 @@ class PicartoVodIE(InfoExtractor): 'title': 'ArtofZod_2017.12.12.00.13.23.flv', 'thumbnail': r're:^https?://.*\.jpg' }, + 'skip': 'The VOD does not exist', + }, { + 'url': 'https://picarto.tv/ArtofZod/videos/772650', + 'md5': '00067a0889f1f6869cc512e3e79c521b', + 'info_dict': { + 'id': '772650', + 'ext': 'mp4', + 'title': 'Art of Zod - Drawing and Painting', + 'thumbnail': 
r're:^https?://.*\.jpg', + 'channel': 'ArtofZod', + 'age_limit': 18, + } }, { 'url': 'https://picarto.tv/videopopout/Plague', 'only_matching': True, @@ -102,21 +117,36 @@ class PicartoVodIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - vod_info = self._parse_json( - self._search_regex( - r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, - 'vod player'), - video_id, transform_source=js_to_json) + data = self._download_json( + 'https://ptvintern.picarto.tv/ptvapi', video_id, query={ + 'query': f'''{{ + video(id: "{video_id}") {{ + id + title + adult + file_name + video_recording_image_url + channel {{ + name + }} + }} +}}''' + })['data']['video'] + + file_name = data['file_name'] + netloc = urllib.parse.urlparse(data['video_recording_image_url']).netloc formats = self._extract_m3u8_formats( - vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + f'https://{netloc}/stream/hls/{file_name}/index.m3u8', video_id, 'mp4', m3u8_id='hls') return { 'id': video_id, - 'title': video_id, - 'thumbnail': vod_info.get('vodThumb'), + **traverse_obj(data, { + 'id': ('id', {str_or_none}), + 'title': ('title', {str}), + 'thumbnail': 'video_recording_image_url', + 'channel': ('channel', 'name', {str}), + 'age_limit': ('adult', {lambda x: 18 if x else 0}), + }), 'formats': formats, } diff --git a/hypervideo_dl/extractor/piksel.py b/hypervideo_dl/extractor/piksel.py index cc60b30..97a9bf5 100644 --- a/hypervideo_dl/extractor/piksel.py +++ b/hypervideo_dl/extractor/piksel.py @@ -7,8 +7,10 @@ from ..utils import ( int_or_none, join_nonempty, parse_iso8601, + traverse_obj, try_get, unescapeHTML, + urljoin, ) @@ -63,11 +65,11 @@ class PikselIE(InfoExtractor): } ] - def _call_api(self, app_token, resource, display_id, query, fatal=True): - response = (self._download_json( - 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), - display_id, query=query, fatal=fatal) or {}).get('response') - failure = try_get(response, lambda x: x['failure']['reason']) + def _call_api(self, app_token, resource, display_id, query, host='https://player.piksel.com', fatal=True): + url = urljoin(host, f'/ws/ws_{resource}/api/{app_token}/mode/json/apiv/5') + response = traverse_obj( + self._download_json(url, display_id, query=query, fatal=fatal), ('response', {dict})) or {} + failure = traverse_obj(response, ('failure', 'reason')) if response else 'Empty response from API' if failure: if fatal: raise ExtractorError(failure, expected=True) @@ -83,7 +85,7 @@ class PikselIE(InfoExtractor): ], webpage, 'app token') query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} program = self._call_api( - app_token, 'program', display_id, query)['WsProgramResponse']['program'] + app_token, 'program', display_id, query, url)['WsProgramResponse']['program'] video_id = program['uuid'] video_data = program['asset'] title = video_data['title'] @@ -129,7 +131,7 @@ class PikselIE(InfoExtractor): process_asset_files(try_get(self._call_api( app_token, 'asset_file', display_id, { 'assetid': asset_id, - }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + }, url, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) m3u8_url = dict_get(video_data, [ 'm3u8iPadURL', diff --git a/hypervideo_dl/extractor/pinterest.py b/hypervideo_dl/extractor/pinterest.py index 2c6cd6d..8361fbb 100644 --- a/hypervideo_dl/extractor/pinterest.py +++ b/hypervideo_dl/extractor/pinterest.py @@ -1,19 
+1,24 @@ import json from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, int_or_none, - try_get, + str_or_none, + strip_or_none, + traverse_obj, unified_timestamp, url_or_none, ) class PinterestBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + _VALID_URL_BASE = r'''(?x) + https?://(?:[^/]+\.)?pinterest\.(?: + com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx| + dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu| + co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)''' def _call_api(self, resource, video_id, options): return self._download_json( @@ -24,14 +29,53 @@ class PinterestBaseIE(InfoExtractor): def _extract_video(self, data, extract_formats=True): video_id = data['id'] + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) - title = (data.get('title') or data.get('grid_title') or video_id).strip() + info = { + 'title': strip_or_none(traverse_obj(data, 'title', 'grid_title', default='')), + 'description': traverse_obj(data, 'seo_description', 'description'), + 'timestamp': unified_timestamp(data.get('created_at')), + 'thumbnails': thumbnails, + 'uploader': traverse_obj(data, ('closeup_attribution', 'full_name')), + 'uploader_id': str_or_none(traverse_obj(data, ('closeup_attribution', 'id'))), + 'repost_count': int_or_none(data.get('repin_count')), + 'comment_count': int_or_none(data.get('comment_count')), + 'categories': traverse_obj(data, ('pin_join', 'visual_annotation'), expected_type=list), + 'tags': traverse_obj(data, 'hashtags', expected_type=list), + } urls = [] formats = [] duration = None - if extract_formats: - for format_id, format_dict in data['videos']['video_list'].items(): + domain = data.get('domain', '') + if domain.lower() != 'uploaded by user' and traverse_obj(data, ('embed', 'src')): + if not info['title']: + info['title'] = None + return { + '_type': 'url_transparent', + 'url': data['embed']['src'], + **info, + } + + elif extract_formats: + video_list = traverse_obj( + data, ('videos', 'video_list'), + ('story_pin_data', 'pages', ..., 'blocks', ..., 'video', 'video_list'), + expected_type=dict, get_all=False, default={}) + for format_id, format_dict in video_list.items(): if not isinstance(format_dict, dict): continue format_url = url_or_none(format_dict.get('url')) @@ -53,72 +97,79 @@ class PinterestBaseIE(InfoExtractor): 'duration': duration, }) - description = data.get('description') or data.get('description_html') or data.get('seo_description') - timestamp = unified_timestamp(data.get('created_at')) - - def _u(field): - return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) - - uploader = _u('full_name') - uploader_id = _u('id') - - repost_count = int_or_none(data.get('repin_count')) - comment_count = int_or_none(data.get('comment_count')) - categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) - tags = data.get('hashtags') - - 
thumbnails = [] - images = data.get('images') - if isinstance(images, dict): - for thumbnail_id, thumbnail in images.items(): - if not isinstance(thumbnail, dict): - continue - thumbnail_url = url_or_none(thumbnail.get('url')) - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) - return { 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'repost_count': repost_count, - 'comment_count': comment_count, - 'categories': categories, - 'tags': tags, 'formats': formats, + 'duration': duration, + 'webpage_url': f'https://www.pinterest.com/pin/{video_id}/', 'extractor_key': PinterestIE.ie_key(), + 'extractor': PinterestIE.IE_NAME, + **info, } class PinterestIE(PinterestBaseIE): _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE _TESTS = [{ + # formats found in data['videos'] 'url': 'https://www.pinterest.com/pin/664281013778109217/', 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', 'info_dict': { 'id': '664281013778109217', 'ext': 'mp4', 'title': 'Origami', - 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', + 'description': 'md5:e29801cab7d741ea8c741bc50c8d00ab', 'duration': 57.7, 'timestamp': 1593073622, 'upload_date': '20200625', - 'uploader': 'Love origami -I am Dafei', - 'uploader_id': '586523688879454212', - 'repost_count': 50, - 'comment_count': 0, + 'repost_count': int, + 'comment_count': int, 'categories': list, 'tags': list, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + }, { + # formats found in data['story_pin_data'] + 'url': 'https://www.pinterest.com/pin/1084663891475263837/', + 'md5': '069ac19919ab9e1e13fa60de46290b03', + 'info_dict': { + 'id': '1084663891475263837', + 'ext': 'mp4', + 'title': 'Gadget, Cool products, Amazon product, technology, Kitchen gadgets', + 'description': 'md5:d0a4b6ae996ff0c6eed83bc869598d13', + 'uploader': 'CoolCrazyGadgets', + 'uploader_id': '1084664028912989237', + 'upload_date': '20211003', + 'timestamp': 1633246654.0, + 'duration': 14.9, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': list, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + }, { + # vimeo.com embed + 'url': 'https://www.pinterest.ca/pin/441282463481903715/', + 'info_dict': { + 'id': '111691128', + 'ext': 'mp4', + 'title': 'Tonite Let\'s All Make Love In London (1967)', + 'description': 'md5:8190f37b3926807809ec57ec21aa77b2', + 'uploader': 'Vimeo', + 'uploader_id': '473792960706651251', + 'upload_date': '20180120', + 'timestamp': 1516409040, + 'duration': 3404, + 'comment_count': int, + 'repost_count': int, + 'categories': 'count:9', + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + 'uploader_url': 'https://vimeo.com/willardandrade', + }, + 'params': { + 'skip_download': 'm3u8', }, }, { 'url': 'https://co.pinterest.com/pin/824721750502199491/', diff --git a/hypervideo_dl/extractor/pladform.py b/hypervideo_dl/extractor/pladform.py index dcf18e1..0050068 100644 --- a/hypervideo_dl/extractor/pladform.py +++ b/hypervideo_dl/extractor/pladform.py @@ -78,7 +78,7 @@ class PladformIE(InfoExtractor): expected=True) if not video: - targetUrl = self._request_webpage(url, video_id, note='Resolving final URL').geturl() + targetUrl = self._request_webpage(url, video_id, note='Resolving final URL').url if targetUrl == url: raise 
ExtractorError('Can\'t parse page') return self.url_result(targetUrl) diff --git a/hypervideo_dl/extractor/platzi.py b/hypervideo_dl/extractor/platzi.py index b8a4414..166b98c 100644 --- a/hypervideo_dl/extractor/platzi.py +++ b/hypervideo_dl/extractor/platzi.py @@ -36,7 +36,7 @@ class PlatziBaseIE(InfoExtractor): headers={'Referer': self._LOGIN_URL}) # login succeeded - if 'platzi.com/login' not in urlh.geturl(): + if 'platzi.com/login' not in urlh.url: return login_error = self._webpage_read_content( diff --git a/hypervideo_dl/extractor/playplustv.py b/hypervideo_dl/extractor/playplustv.py index 316f220..a4439c8 100644 --- a/hypervideo_dl/extractor/playplustv.py +++ b/hypervideo_dl/extractor/playplustv.py @@ -1,13 +1,9 @@ import json from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - clean_html, - ExtractorError, - int_or_none, - PUTRequest, -) +from ..networking import PUTRequest +from ..networking.exceptions import HTTPError +from ..utils import ExtractorError, clean_html, int_or_none class PlayPlusTVIE(InfoExtractor): @@ -47,9 +43,9 @@ class PlayPlusTVIE(InfoExtractor): try: self._token = self._download_json(req, None)['token'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: raise ExtractorError(self._parse_json( - e.cause.read(), None)['errorMessage'], expected=True) + e.cause.response.read(), None)['errorMessage'], expected=True) raise self._profile = self._call_api('Profiles')['list'][0]['_id'] diff --git a/hypervideo_dl/extractor/playsuisse.py b/hypervideo_dl/extractor/playsuisse.py index a635ac9..76288c7 100644 --- a/hypervideo_dl/extractor/playsuisse.py +++ b/hypervideo_dl/extractor/playsuisse.py @@ -5,10 +5,16 @@ from ..utils import int_or_none, traverse_obj class PlaySuisseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/watch/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/(?:watch|detail)/(?:[^#]*[?&]episodeId=)?(?P<id>[0-9]+)' _TESTS = [ { + # Old URL 'url': 'https://www.playsuisse.ch/watch/763211/0', + 'only_matching': True, + }, + { + # episode in a series + 'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211', 'md5': '82df2a470b2dfa60c2d33772a8a60cf8', 'info_dict': { 'id': '763211', @@ -21,11 +27,11 @@ class PlaySuisseIE(InfoExtractor): 'season_number': 1, 'episode': 'Knochen', 'episode_number': 1, - 'thumbnail': 'md5:9260abe0c0ec9b69914d0a10d54c5878' + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', } - }, - { - 'url': 'https://www.playsuisse.ch/watch/808675/0', + }, { + # film + 'url': 'https://www.playsuisse.ch/watch/808675', 'md5': '818b94c1d2d7c4beef953f12cb8f3e75', 'info_dict': { 'id': '808675', @@ -33,26 +39,60 @@ class PlaySuisseIE(InfoExtractor): 'title': 'Der Läufer', 'description': 'md5:9f61265c7e6dcc3e046137a792b275fd', 'duration': 5280, - 'episode': 'Der Läufer', - 'thumbnail': 'md5:44af7d65ee02bbba4576b131868bb783' + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', } - }, - { - 'url': 'https://www.playsuisse.ch/watch/817193/0', - 'md5': '1d6c066f92cd7fffd8b28a53526d6b59', + }, { + # series (treated as a playlist) + 'url': 'https://www.playsuisse.ch/detail/1115687', 'info_dict': { - 'id': '817193', - 'ext': 'mp4', - 'title': 'Die Einweihungsparty', - 'description': 'md5:91ebf04d3a42cb3ab70666acf750a930', - 'duration': 1380, - 'series': 'Nr. 
47', - 'season': 'Season 1', - 'season_number': 1, - 'episode': 'Die Einweihungsparty', - 'episode_number': 1, - 'thumbnail': 'md5:637585fb106e3a4bcd991958924c7e44' - } + 'description': 'md5:e4a2ae29a8895823045b5c3145a02aa3', + 'id': '1115687', + 'series': 'They all came out to Montreux', + 'title': 'They all came out to Montreux', + }, + 'playlist': [{ + 'info_dict': { + 'description': 'md5:f2462744834b959a31adc6292380cda2', + 'duration': 3180, + 'episode': 'Folge 1', + 'episode_number': 1, + 'id': '1112663', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 1', + 'ext': 'mp4' + }, + }, { + 'info_dict': { + 'description': 'md5:9dfd308699fe850d3bce12dc1bad9b27', + 'duration': 2935, + 'episode': 'Folge 2', + 'episode_number': 2, + 'id': '1112661', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 2', + 'ext': 'mp4' + }, + }, { + 'info_dict': { + 'description': 'md5:14a93a3356b2492a8f786ab2227ef602', + 'duration': 2994, + 'episode': 'Folge 3', + 'episode_number': 3, + 'id': '1112664', + 'season': 'Season 1', + 'season_number': 1, + 'series': 'They all came out to Montreux', + 'thumbnail': 're:https://playsuisse-img.akamaized.net/', + 'title': 'Folge 3', + 'ext': 'mp4' + } + }], } ] @@ -142,6 +182,6 @@ class PlaySuisseIE(InfoExtractor): 'subtitles': subtitles, 'series': media_data.get('seriesName'), 'season_number': int_or_none(media_data.get('seasonNumber')), - 'episode': media_data.get('name'), + 'episode': media_data.get('name') if media_data.get('episodeNumber') else None, 'episode_number': int_or_none(media_data.get('episodeNumber')), } diff --git a/hypervideo_dl/extractor/plutotv.py b/hypervideo_dl/extractor/plutotv.py index 71a05cc..caffeb2 100644 --- a/hypervideo_dl/extractor/plutotv.py +++ b/hypervideo_dl/extractor/plutotv.py @@ -84,6 +84,17 @@ class PlutoTVIE(InfoExtractor): }, { 'url': 'https://pluto.tv/it/on-demand/series/csi-vegas/episode/legacy-2021-1-1', 'only_matching': True, + }, + { + 'url': 'https://pluto.tv/en/on-demand/movies/attack-of-the-killer-tomatoes-1977-1-1-ptv1', + 'md5': '7db56369c0da626a32d505ec6eb3f89f', + 'info_dict': { + 'id': '5b190c7bb0875c36c90c29c4', + 'ext': 'mp4', + 'title': 'Attack of the Killer Tomatoes', + 'description': 'A group of scientists band together to save the world from mutated tomatoes that KILL! 
(1978)', + 'duration': 5700, + } } ] @@ -103,7 +114,7 @@ class PlutoTVIE(InfoExtractor): compat_urlparse.urljoin(first_segment_url.group(1), '0-end/master.m3u8')) continue first_segment_url = re.search( - r'^(https?://.*/).+\-0+\.ts$', res, + r'^(https?://.*/).+\-0+[0-1]0\.ts$', res, re.MULTILINE) if first_segment_url: m3u8_urls.add( diff --git a/hypervideo_dl/extractor/polskieradio.py b/hypervideo_dl/extractor/polskieradio.py index 99244f6..5bf92b9 100644 --- a/hypervideo_dl/extractor/polskieradio.py +++ b/hypervideo_dl/extractor/polskieradio.py @@ -2,24 +2,24 @@ import itertools import json import math import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, - compat_urlparse -) +from ..compat import compat_str from ..utils import ( - extract_attributes, ExtractorError, InAdvancePagedList, + determine_ext, + extract_attributes, int_or_none, js_to_json, parse_iso8601, strip_or_none, - unified_timestamp, + traverse_obj, unescapeHTML, + unified_timestamp, url_or_none, + urljoin, ) @@ -42,34 +42,17 @@ class PolskieRadioBaseExtractor(InfoExtractor): 'duration': int_or_none(media.get('length')), 'vcodec': 'none' if media.get('provider') == 'audio' else None, }) - entry_title = compat_urllib_parse_unquote(media['desc']) + entry_title = urllib.parse.unquote(media['desc']) if entry_title: entry['title'] = entry_title yield entry -class PolskieRadioIE(PolskieRadioBaseExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' - _TESTS = [{ # Old-style single broadcast. - 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', - 'info_dict': { - 'id': '1587943', - 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', - 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', - }, - 'playlist': [{ - 'md5': '2984ee6ce9046d91fc233bc1a864a09a', - 'info_dict': { - 'id': '1540576', - 'ext': 'mp3', - 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', - 'timestamp': 1456594200, - 'upload_date': '20160227', - 'duration': 2364, - 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' - }, - }], - }, { # New-style single broadcast. +class PolskieRadioLegacyIE(PolskieRadioBaseExtractor): + # legacy sites + IE_NAME = 'polskieradio:legacy' + _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo', 'info_dict': { 'id': '2534482', @@ -97,16 +80,6 @@ class PolskieRadioIE(PolskieRadioBaseExtractor): 'title': 'Pogłos 29 października godz. 
23:01', }, }, { - 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', - 'only_matching': True, - }, { - 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', - 'only_matching': True, - }, { - # with mp4 video - 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', - 'only_matching': True, - }, { 'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci', 'only_matching': True, }] @@ -114,7 +87,9 @@ class PolskieRadioIE(PolskieRadioBaseExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage, urlh = self._download_webpage_handle(url, playlist_id) + if PolskieRadioIE.suitable(urlh.url): + return self.url_result(urlh.url, PolskieRadioIE, playlist_id) content = self._search_regex( r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', @@ -153,26 +128,201 @@ class PolskieRadioIE(PolskieRadioBaseExtractor): return self.playlist_result(entries, playlist_id, title, description) -class PolskieRadioCategoryIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)' +class PolskieRadioIE(PolskieRadioBaseExtractor): + # new next.js sites + _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)' + _TESTS = [{ + # articleData, attachments + 'url': 'https://jedynka.polskieradio.pl/artykul/1587943', + 'info_dict': { + 'id': '1587943', + 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '7a85d429-5356-4def-a347-925e4ae7406b', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + }, + }], + }, { + # post, legacy html players + 'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager', + 'info_dict': { + 'id': '2589163', + 'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?', + 'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473', + }, + 'playlist': [{ + 'info_dict': { + 'id': '2577880', + 'ext': 'mp3', + 'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a', + 'duration': 321, + }, + }], + }, { + # data, legacy + 'url': 'https://radiokierowcow.pl/artykul/2694529', + 'info_dict': { + 'id': '2694529', + 'title': 'Zielona fala reliktem przeszłości?', + 'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0', + }, + 'playlist_count': 3, + }, { + 'url': 'https://trojka.polskieradio.pl/artykul/1632955', + 'only_matching': True, + }, { + # with mp4 video + 'url': 'https://trojka.polskieradio.pl/artykul/1634903', + 'only_matching': True, + }, { + 'url': 'https://jedynka.polskieradio.pl/artykul/3042436,Polityka-wschodnia-ojca-i-syna-Wladyslawa-Lokietka-i-Kazimierza-Wielkiego', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + article_data = traverse_obj( + self._search_nextjs_data(webpage, playlist_id), ( + 'props', 'pageProps', (('data', 'articleData'), 'post', 'data')), get_all=False) + + title = strip_or_none(article_data['title']) + + description = strip_or_none(article_data.get('lead')) + + entries = [{ + 'url': entry['file'], + 'ext': determine_ext(entry.get('fileName')), + 'id': self._search_regex( 
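# The attachment 'file' URLs embed an RFC 4122 UUID (8-4-4-4-12 hex groups);
# the regex below lifts that out as a stable entry id, e.g.
# '7a85d429-5356-4def-a347-925e4ae7406b' in the articleData test above.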
+ r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry['file'], 'entry id'), + 'title': strip_or_none(entry.get('description')) or title, + } for entry in article_data.get('attachments') or () if entry.get('fileType') in ('Audio', )] + + if not entries: + # some legacy articles have no json attachments, but players in body + entries = self._extract_webpage_player_entries(article_data['content'], playlist_id, { + 'title': title, + }) + + return self.playlist_result(entries, playlist_id, title, description) + + +class PolskieRadioAuditionIE(InfoExtractor): + # new next.js sites + IE_NAME = 'polskieradio:audition' + _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio\.pl/audycj[ae]/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', + # articles, PR1 + 'url': 'https://jedynka.polskieradio.pl/audycje/5102', 'info_dict': { 'id': '5102', - 'title': 'HISTORIA ŻYWA', + 'title': 'Historia żywa', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', }, 'playlist_mincount': 38, }, { - 'url': 'http://www.polskieradio.pl/7/4807', + # episodes, PR1 + 'url': 'https://jedynka.polskieradio.pl/audycje/5769', 'info_dict': { - 'id': '4807', - 'title': 'Vademecum 1050. rocznicy Chrztu Polski' + 'id': '5769', + 'title': 'AgroFakty', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', }, - 'playlist_mincount': 5 + 'playlist_mincount': 269, }, { - 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', - 'only_matching': True + # both episodes and articles, PR3 + 'url': 'https://trojka.polskieradio.pl/audycja/8906', + 'info_dict': { + 'id': '8906', + 'title': 'Trójka budzi', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', + }, + 'playlist_mincount': 722, }, { + # some articles were "promoted to main page" and thus link to old frontend + 'url': 'https://trojka.polskieradio.pl/audycja/305', + 'info_dict': { + 'id': '305', + 'title': 'Co w mowie piszczy?', + 'thumbnail': r're:https://static\.prsa\.pl/images/.+', + }, + 'playlist_count': 1523, + }] + + def _call_lp3(self, path, query, video_id, note): + return self._download_json( + f'https://lp3test.polskieradio.pl/{path}', video_id, note, + query=query, headers={'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'}) + + def _entries(self, playlist_id, has_episodes, has_articles): + for i in itertools.count(1) if has_episodes else []: + page = self._call_lp3( + 'AudioArticle/GetListByCategoryId', { + 'categoryId': playlist_id, + 'PageSize': 10, + 'skip': i, + 'format': 400, + }, playlist_id, f'Downloading episode list page {i}') + if not traverse_obj(page, 'data'): + break + for episode in page['data']: + yield { + 'id': str(episode['id']), + 'url': episode['file'], + 'title': episode.get('title'), + 'duration': int_or_none(episode.get('duration')), + 'timestamp': parse_iso8601(episode.get('datePublic')), + } + + for i in itertools.count(1) if has_articles else []: + page = self._call_lp3( + 'Article/GetListByCategoryId', { + 'categoryId': playlist_id, + 'PageSize': 9, + 'skip': i, + 'format': 400, + }, playlist_id, f'Downloading article list page {i}') + if not traverse_obj(page, 'data'): + break + for article in page['data']: + yield { + '_type': 'url_transparent', + 'id': str(article['id']), + 'url': article['url'], + 'title': article.get('shortTitle'), + 'description': traverse_obj(article, ('description', 'lead')), + 'timestamp': parse_iso8601(article.get('datePublic')), + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + page_props = traverse_obj( + 
self._search_nextjs_data(self._download_webpage(url, playlist_id), playlist_id), + ('props', 'pageProps', ('data', None)), get_all=False) + + has_episodes = bool(traverse_obj(page_props, 'episodes', 'audios')) + has_articles = bool(traverse_obj(page_props, 'articles')) + + return self.playlist_result( + self._entries(playlist_id, has_episodes, has_articles), playlist_id, + title=traverse_obj(page_props, ('details', 'name')), + description=traverse_obj(page_props, ('details', 'description', 'lead')), + thumbnail=traverse_obj(page_props, ('details', 'photo'))) + + +class PolskieRadioCategoryIE(InfoExtractor): + # legacy sites + IE_NAME = 'polskieradio:category' + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', 'info_dict': { 'id': '4143', @@ -187,8 +337,35 @@ class PolskieRadioCategoryIE(InfoExtractor): }, 'playlist_mincount': 61 }, { - 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', - 'only_matching': True, + # billennium tabs + 'url': 'https://www.polskieradio.pl/8/2385', + 'info_dict': { + 'id': '2385', + 'title': 'Droga przez mąkę', + }, + 'playlist_mincount': 111, + }, { + 'url': 'https://www.polskieradio.pl/10/4930', + 'info_dict': { + 'id': '4930', + 'title': 'Teraz K-pop!', + }, + 'playlist_mincount': 392, + }, { + # post back pages, audio content directly without articles + 'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa', + 'info_dict': { + 'id': '7376', + 'title': 'Nowa mowa', + }, + 'playlist_mincount': 244, + }, { + 'url': 'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458', + 'info_dict': { + 'id': '175458', + 'title': 'Krzysztof Dziuba', + }, + 'playlist_mincount': 420, }, { 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', 'only_matching': True, @@ -196,35 +373,73 @@ class PolskieRadioCategoryIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url) + return False if PolskieRadioLegacyIE.suitable(url) else super().suitable(url) def _entries(self, url, page, category_id): content = page + is_billennium_tabs = 'onclick="TB_LoadTab(' in page + is_post_back = 'onclick="__doPostBack(' in page + pagination = page if is_billennium_tabs else None for page_num in itertools.count(2): for a_entry, entry_id in re.findall( - r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>', + r'(?s)<article[^>]+>.*?(<a[^>]+href=["\'](?:(?:https?)?://[^/]+)?/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>', content): entry = extract_attributes(a_entry) - href = entry.get('href') - if not href: - continue - yield self.url_result( - compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(), - entry_id, entry.get('title')) - mobj = re.search( - r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', - content) - if not mobj: - break - next_url = compat_urlparse.urljoin(url, mobj.group('url')) - content = self._download_webpage( - next_url, category_id, 'Downloading page %s' % page_num) + if entry.get('href'): + yield self.url_result( + urljoin(url, entry['href']), PolskieRadioLegacyIE, entry_id, entry.get('title')) + for a_entry in re.findall(r'<span data-media=({[^ ]+})', content): + yield traverse_obj(self._parse_json(a_entry, category_id), { + 'url': 'file', + 'id': 'uid', + 'duration': 'length', + 'title': ('title', {urllib.parse.unquote}), + 'description': ('desc', 
{urllib.parse.unquote}), + }) + if is_billennium_tabs: + params = self._search_json( + r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+onclick=["\']TB_LoadTab\(', + pagination, 'next page params', category_id, default=None, close_objects=1, + contains_pattern='.+', transform_source=lambda x: '[%s' % js_to_json(unescapeHTML(x))) + if not params: + break + tab_content = self._download_json( + 'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent', + category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'}, + data=json.dumps(dict(zip(( + 'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode', + 'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate', + 'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber' + ), params))).encode())['d'] + content, pagination = tab_content['Content'], tab_content.get('PagerContent') + elif is_post_back: + target = self._search_regex( + r'onclick=(?:["\'])__doPostBack\((?P<q1>["\'])(?P<target>[\w$]+)(?P=q1)\s*,\s*(?P<q2>["\'])Next(?P=q2)', + content, 'pagination postback target', group='target', default=None) + if not target: + break + content = self._download_webpage( + url, category_id, f'Downloading page {page_num}', + data=urllib.parse.urlencode({ + **self._hidden_inputs(content), + '__EVENTTARGET': target, + '__EVENTARGUMENT': 'Next', + }).encode()) + else: + next_url = urljoin(url, self._search_regex( + r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', + content, 'next page url', group='url', default=None)) + if not next_url: + break + content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}') def _real_extract(self, url): category_id = self._match_id(url) - webpage = self._download_webpage(url, category_id) + webpage, urlh = self._download_webpage_handle(url, category_id) + if PolskieRadioAuditionIE.suitable(urlh.url): + return self.url_result(urlh.url, PolskieRadioAuditionIE, category_id) title = self._html_search_regex( - r'<title>([^<]+) - [^<]+ - [^<]+</title>', + r'<title>([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)</title>', webpage, 'title', fatal=False) return self.playlist_result( self._entries(url, webpage, category_id), @@ -358,7 +573,7 @@ class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor): 'entries': InAdvancePagedList( get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE), 'id': str(data['id']), - 'title': data['title'], + 'title': data.get('title'), 'description': data.get('description'), 'uploader': data.get('announcer'), } @@ -374,6 +589,10 @@ class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor): 'ext': 'mp3', 'title': 'Theresa May rezygnuje. Co dalej z brexitem?', 'description': 'md5:e41c409a29d022b70ef0faa61dbded60', + 'episode': 'Theresa May rezygnuje. 
Co dalej z brexitem?', + 'duration': 2893, + 'thumbnail': 'https://static.prsa.pl/images/58649376-c8a0-4ba2-a714-78b383285f5f.jpg', + 'series': 'Raport o stanie świata', }, }] @@ -389,39 +608,3 @@ class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor): 'Content-Type': 'application/json', }) return self._parse_episode(data[0]) - - -class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor): - _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P<id>[0-9]+)' - IE_NAME = 'polskieradio:kierowcow' - - _TESTS = [{ - 'url': 'https://radiokierowcow.pl/artykul/2694529', - 'info_dict': { - 'id': '2694529', - 'title': 'Zielona fala reliktem przeszłości?', - 'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2', - }, - 'playlist_count': 3, - }] - - def _real_extract(self, url): - media_id = self._match_id(url) - webpage = self._download_webpage(url, media_id) - nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId'] - article = self._download_json( - f'https://radiokierowcow.pl/_next/data/{nextjs_build}/artykul/{media_id}.json?articleId={media_id}', - media_id) - data = article['pageProps']['data'] - title = data['title'] - entries = self._extract_webpage_player_entries(data['content'], media_id, { - 'title': title, - }) - - return { - '_type': 'playlist', - 'id': media_id, - 'entries': entries, - 'title': title, - 'description': data.get('lead'), - } diff --git a/hypervideo_dl/extractor/porn91.py b/hypervideo_dl/extractor/porn91.py index af4a0dc..7d16a16 100644 --- a/hypervideo_dl/extractor/porn91.py +++ b/hypervideo_dl/extractor/porn91.py @@ -1,26 +1,48 @@ +import urllib.parse from .common import InfoExtractor from ..utils import ( - parse_duration, + determine_ext, int_or_none, + parse_duration, + remove_end, + unified_strdate, ExtractorError, ) class Porn91IE(InfoExtractor): IE_NAME = '91porn' - _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P<id>[\w\d]+)' + _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/view_video.php\?([^#]+&)?viewkey=(?P<id>\w+)' - _TEST = { + _TESTS = [{ 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', - 'md5': '7fcdb5349354f40d41689bd0fa8db05a', + 'md5': 'd869db281402e0ef4ddef3c38b866f86', 'info_dict': { 'id': '7e42283b4f5ab36da134', 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', + 'description': 'md5:1ff241f579b07ae936a54e810ad2e891', 'ext': 'mp4', 'duration': 431, + 'upload_date': '20150520', + 'comment_count': int, + 'view_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://91porn.com/view_video.php?viewkey=7ef0cf3d362c699ab91c', + 'md5': 'f8fd50540468a6d795378cd778b40226', + 'info_dict': { + 'id': '7ef0cf3d362c699ab91c', + 'title': '真实空乘,冲上云霄第二部', + 'description': 'md5:618bf9652cafcc66cd277bd96789baea', + 'ext': 'mp4', + 'duration': 248, + 'upload_date': '20221119', + 'comment_count': int, + 'view_count': int, 'age_limit': 18, } - } + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -29,32 +51,45 @@ class Porn91IE(InfoExtractor): webpage = self._download_webpage( 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) - if '作为游客,你每天只可观看10个视频' in webpage: - raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) + if '视频不存在,可能已经被删除或者被举报为不良内容!' 
in webpage: + raise ExtractorError('91 Porn says: Video does not exist', expected=True) - title = self._search_regex( - r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title') - title = title.replace('\n', '') + daily_limit = self._search_regex( + r'作为游客,你每天只可观看([\d]+)个视频', webpage, 'exceeded daily limit', default=None, fatal=False) + if daily_limit: + raise ExtractorError(f'91 Porn says: Daily limit {daily_limit} videos exceeded', expected=True) video_link_url = self._search_regex( - r'<textarea[^>]+id=["\']fm-video_link[^>]+>([^<]+)</textarea>', - webpage, 'video link') - videopage = self._download_webpage(video_link_url, video_id) - - info_dict = self._parse_html5_media_entries(url, videopage, video_id)[0] - - duration = parse_duration(self._search_regex( - r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False)) + r'document\.write\(\s*strencode2\s*\(\s*((?:"[^"]+")|(?:\'[^\']+\'))', webpage, 'video link') + video_link_url = self._search_regex( + r'src=["\']([^"\']+)["\']', urllib.parse.unquote(video_link_url), 'unquoted video link') - comment_count = int_or_none(self._search_regex( - r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False)) + formats, subtitles = self._get_formats_and_subtitle(video_link_url, video_id) - info_dict.update({ + return { 'id': video_id, - 'title': title, - 'duration': duration, - 'comment_count': comment_count, - 'age_limit': self._rta_search(webpage), - }) + 'title': remove_end(self._html_extract_title(webpage).replace('\n', ''), 'Chinese homemade video').strip(), + 'formats': formats, + 'subtitles': subtitles, + 'upload_date': unified_strdate(self._search_regex( + r'<span\s+class=["\']title-yakov["\']>(\d{4}-\d{2}-\d{2})</span>', webpage, 'upload_date', fatal=False)), + 'description': self._html_search_regex( + r'<span\s+class=["\']more title["\']>\s*([^<]+)', webpage, 'description', fatal=False), + 'duration': parse_duration(self._search_regex( + r'时长:\s*<span[^>]*>\s*(\d+(?::\d+){1,2})', webpage, 'duration', fatal=False)), + 'comment_count': int_or_none(self._search_regex( + r'留言:\s*<span[^>]*>\s*(\d+)\s*</span>', webpage, 'comment count', fatal=False)), + 'view_count': int_or_none(self._search_regex( + r'热度:\s*<span[^>]*>\s*(\d+)\s*</span>', webpage, 'view count', fatal=False)), + 'age_limit': 18, + } + + def _get_formats_and_subtitle(self, video_link_url, video_id): + ext = determine_ext(video_link_url) + if ext == 'm3u8': + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_link_url, video_id, ext='mp4') + else: + formats = [{'url': video_link_url, 'ext': ext}] + subtitles = {} - return info_dict + return formats, subtitles diff --git a/hypervideo_dl/extractor/pornez.py b/hypervideo_dl/extractor/pornez.py index df0e44a..bc45f86 100644 --- a/hypervideo_dl/extractor/pornez.py +++ b/hypervideo_dl/extractor/pornez.py @@ -1,41 +1,60 @@ from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + clean_html, + int_or_none, + get_element_by_class, + urljoin, +) class PornezIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornez\.net/video(?P<id>[0-9]+)/' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?pornez\.net/(?:video(?P<id>\w+)|watch)/' + _TESTS = [{ 'url': 'https://pornez.net/video344819/mistresst-funny_penis_names-wmv/', - 'md5': '2e19a0a1cff3a5dbea0ef1b9e80bcbbc', 'info_dict': { 'id': '344819', 'ext': 'mp4', - 'title': r'mistresst funny_penis_names wmv', + 'title': 'mistresst funny_penis_names wmv', 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 18, - } - } + }, + 
'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://pornez.net/watch/leana+lovings+stiff+for+stepdaughter/', + 'info_dict': { + 'id': '156161', + 'ext': 'mp4', + 'title': 'Watch leana lovings stiff for stepdaughter porn video.', + 'age_limit': 18, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://pornez.net/videovzs27fj/tutor4k-e14-blue-wave-1080p-nbq-tutor4k-e14-blue-wave/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - iframe_src = self._html_search_regex( - r'<iframe[^>]+src="(https?://pornez\.net/player/\?[^"]+)"', webpage, 'iframe', fatal=True) - title = self._html_search_meta(['name', 'twitter:title', 'og:title'], webpage, 'title', default=None) - if title is None: - title = self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title', fatal=True) - thumbnail = self._html_search_meta(['thumbnailUrl'], webpage, 'title', default=None) - webpage = self._download_webpage(iframe_src, video_id) - entries = self._parse_html5_media_entries(iframe_src, webpage, video_id)[0] - for format in entries['formats']: - height = self._search_regex(r'_(\d+)\.m3u8', format['url'], 'height') - format['format_id'] = '%sp' % height - format['height'] = int_or_none(height) + if not video_id: + video_id = self._search_regex( + r'<link[^>]+\bhref=["\']https?://pornez.net/\?p=(\w+)["\']', webpage, 'id') + + iframe_src = self._html_search_regex(r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe') + iframe = self._download_webpage(urljoin('https://pornez.net', iframe_src), video_id) + + entries = self._parse_html5_media_entries(iframe_src, iframe, video_id)[0] + for fmt in entries['formats']: + height = self._search_regex(r'_(\d+)\.m3u8', fmt['url'], 'height') + fmt['format_id'] = '%sp' % height + fmt['height'] = int_or_none(height) entries.update({ 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'age_limit': 18 + 'title': (clean_html(get_element_by_class('video-title', webpage)) + or self._html_search_meta( + ['twitter:title', 'og:title', 'description'], webpage, 'title', default=None)), + 'thumbnail': self._html_search_meta(['thumbnailUrl'], webpage, 'thumb', default=None), + 'age_limit': 18, }) return entries diff --git a/hypervideo_dl/extractor/pornhub.py b/hypervideo_dl/extractor/pornhub.py index 5d8d7c1..999d038 100644 --- a/hypervideo_dl/extractor/pornhub.py +++ b/hypervideo_dl/extractor/pornhub.py @@ -3,11 +3,12 @@ import itertools import math import operator import re -import urllib.request from .common import InfoExtractor from .openload import PhantomJSwrapper -from ..compat import compat_HTTPError, compat_str +from ..compat import compat_str +from ..networking import Request +from ..networking.exceptions import HTTPError from ..utils import ( NO_DEFAULT, ExtractorError, @@ -46,8 +47,8 @@ class PornHubBaseIE(InfoExtractor): r'document\.cookie\s*=\s*["\']RNKEY=', r'document\.location\.reload\(true\)')): url_or_request = args[0] - url = (url_or_request.get_full_url() - if isinstance(url_or_request, urllib.request.Request) + url = (url_or_request.url + if isinstance(url_or_request, Request) else url_or_request) phantom = PhantomJSwrapper(self, required_version='2.0') phantom.get(url, html=webpage) @@ -58,6 +59,12 @@ class PornHubBaseIE(InfoExtractor): def _real_initialize(self): self._logged_in = False + def _set_age_cookies(self, host): + self._set_cookie(host, 'age_verified', '1') + self._set_cookie(host, 'accessAgeDisclaimerPH', '1') + self._set_cookie(host, 
'accessAgeDisclaimerUK', '1')
+        self._set_cookie(host, 'accessPH', '1')
+
     def _login(self, host):
         if self._logged_in:
             return
@@ -267,8 +274,7 @@ class PornHubIE(PornHubBaseIE):
         video_id = mobj.group('id')
 
         self._login(host)
-
-        self._set_cookie(host, 'age_verified', '1')
+        self._set_age_cookies(host)
 
         def dl_webpage(platform):
             self._set_cookie(host, 'platform', platform)
@@ -569,6 +575,7 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
         mobj = self._match_valid_url(url)
         user_id = mobj.group('id')
         videos_url = '%s/videos' % mobj.group('url')
+        self._set_age_cookies(mobj.group('host'))
         page = self._extract_page(url)
         if page:
             videos_url = update_url_query(videos_url, {'page': page})
@@ -597,7 +604,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
                 base_url, item_id, note, query={'page': num})
 
         def is_404(e):
-            return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
+            return isinstance(e.cause, HTTPError) and e.cause.status == 404
 
         base_url = url
         has_page = page is not None
@@ -633,6 +640,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
         item_id = mobj.group('id')
 
         self._login(host)
+        self._set_age_cookies(host)
 
         return self.playlist_result(self._entries(url, host, item_id), item_id)
 
@@ -812,5 +820,6 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE):
         item_id = mobj.group('id')
 
         self._login(host)
+        self._set_age_cookies(host)
 
         return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)
diff --git a/hypervideo_dl/extractor/pr0gramm.py b/hypervideo_dl/extractor/pr0gramm.py
new file mode 100644
index 0000000..2eb327f
--- /dev/null
+++ b/hypervideo_dl/extractor/pr0gramm.py
@@ -0,0 +1,97 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import merge_dicts
+
+
+class Pr0grammStaticIE(InfoExtractor):
+    # Possible urls:
+    # https://pr0gramm.com/static/5466437
+    _VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://pr0gramm.com/static/5466437',
+        'md5': '52fa540d70d3edc286846f8ca85938aa',
+        'info_dict': {
+            'id': '5466437',
+            'ext': 'mp4',
+            'title': 'pr0gramm-5466437 by g11st',
+            'uploader': 'g11st',
+            'upload_date': '20221221',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # Fetch media sources
+        entries = self._parse_html5_media_entries(url, webpage, video_id)
+        media_info = entries[0]
+
+        # Fetch author
+        uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader')
+
+        # Fetch approx upload timestamp from filename
+        # Have None-defaults in case the extraction fails
+        uploadDay = None
+        uploadMon = None
+        uploadYear = None
+        uploadTimestr = None
+        # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4)
+        m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage)
+
+        if (m):
+            # Up to a day of accuracy should suffice...
+            uploadDay = m.groupdict().get('day')
+            uploadMon = m.groupdict().get('mon')
+            uploadYear = m.groupdict().get('year')
+            uploadTimestr = uploadYear + uploadMon + uploadDay
+
+        return merge_dicts({
+            'id': video_id,
+            'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''),
+            'uploader': uploader,
+            'upload_date': uploadTimestr
+        }, media_info)
+
+
+# This extractor is for the primary url (used for sharing, and appears in the
+# location bar). Since this page loads the DOM via JS, yt-dl can't find any
+# video information here. So let's redirect to a compatibility version of
+# the site, which does contain the <video>-element by itself, without requiring
+# js to be run.
+class Pr0grammIE(InfoExtractor):
+    # Possible urls:
+    # https://pr0gramm.com/new/546637
+    # https://pr0gramm.com/new/video/546637
+    # https://pr0gramm.com/top/546637
+    # https://pr0gramm.com/top/video/546637
+    # https://pr0gramm.com/user/g11st/uploads/5466437
+    # https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290
+    # https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030
+    # https://pr0gramm.com/user/froschler/1elf/5232030
+    # https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id!
+    # https://pr0gramm.com/top/fruher war alles damals/5498175
+
+    _VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)'
+    _TEST = {
+        'url': 'https://pr0gramm.com/new/video/5466437',
+        'info_dict': {
+            'id': '5466437',
+            'ext': 'mp4',
+            'title': 'pr0gramm-5466437 by g11st',
+            'uploader': 'g11st',
+            'upload_date': '20221221',
+        }
+    }
+
+    def _generic_title():
+        return "oof"
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        return self.url_result(
+            'https://pr0gramm.com/static/' + video_id,
+            video_id=video_id,
+            ie=Pr0grammStaticIE.ie_key())
diff --git a/hypervideo_dl/extractor/prankcast.py b/hypervideo_dl/extractor/prankcast.py
index 0eb5f98..b2ec5bb 100644
--- a/hypervideo_dl/extractor/prankcast.py
+++ b/hypervideo_dl/extractor/prankcast.py
@@ -18,7 +18,7 @@ class PrankCastIE(InfoExtractor):
             'cast': ['Devonanustart', 'Phonelosers'],
             'description': '',
             'categories': ['prank'],
-            'tags': ['prank call', 'prank'],
+            'tags': ['prank call', 'prank', 'live show'],
             'upload_date': '20220825'
         }
     }, {
@@ -35,7 +35,7 @@ class PrankCastIE(InfoExtractor):
             'cast': ['phonelosers'],
             'description': '',
             'categories': ['prank'],
-            'tags': ['prank call', 'prank'],
+            'tags': ['prank call', 'prank', 'live show'],
             'upload_date': '20221006'
         }
     }]
@@ -62,5 +62,5 @@ class PrankCastIE(InfoExtractor):
             'cast': list(filter(None, [uploader] + traverse_obj(guests_json, (..., 'name')))),
             'description': json_info.get('broadcast_description'),
             'categories': [json_info.get('broadcast_category')],
-            'tags': self._parse_json(json_info.get('broadcast_tags') or '{}', video_id)
+            'tags': try_call(lambda: json_info['broadcast_tags'].split(','))
         }
diff --git a/hypervideo_dl/extractor/puhutv.py b/hypervideo_dl/extractor/puhutv.py
index 482e570..4b8e5e9 100644
--- a/hypervideo_dl/extractor/puhutv.py
+++ b/hypervideo_dl/extractor/puhutv.py
@@ -1,8 +1,6 @@
 from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_str,
-)
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -72,7 +70,7 @@ class PuhuTVIE(InfoExtractor):
                 display_id, 'Downloading video JSON',
                 headers=self.geo_verification_headers())
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                 self.raise_geo_restricted()
             raise
diff --git a/hypervideo_dl/extractor/qdance.py b/hypervideo_dl/extractor/qdance.py
new file mode 100644
index 0000000..62b08b3
--- /dev/null
+++ b/hypervideo_dl/extractor/qdance.py
@@ -0,0 +1,150 @@
+import json
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    jwt_decode_hs256,
+    str_or_none,
+    traverse_obj,
+    try_call,
+    url_or_none,
+)
+
+
+class QDanceIE(InfoExtractor):
+    _NETRC_MACHINE = 
'qdance' + _VALID_URL = r'https?://(?:www\.)?q-dance\.com/network/(?:library|live)/(?P<id>\d+)' + _TESTS = [{ + 'note': 'vod', + 'url': 'https://www.q-dance.com/network/library/146542138', + 'info_dict': { + 'id': '146542138', + 'ext': 'mp4', + 'title': 'Sound Rush [LIVE] | Defqon.1 Weekend Festival 2022 | Friday | RED', + 'display_id': 'sound-rush-live-v3-defqon-1-weekend-festival-2022-friday-red', + 'description': 'Relive Defqon.1 - Primal Energy 2022 with the sounds of Sound Rush LIVE at the RED on Friday! 🔥', + 'season': 'Defqon.1 Weekend Festival 2022', + 'season_id': '31840632', + 'series': 'Defqon.1', + 'series_id': '31840378', + 'thumbnail': 'https://images.q-dance.network/1674829540-20220624171509-220624171509_delio_dn201093-2.jpg', + 'availability': 'premium_only', + 'duration': 1829, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'livestream', + 'url': 'https://www.q-dance.com/network/live/149170353', + 'info_dict': { + 'id': '149170353', + 'ext': 'mp4', + 'title': r're:^Defqon\.1 2023 - Friday - RED', + 'display_id': 'defqon-1-2023-friday-red', + 'description': 'md5:3c73fbbd4044e578e696adfc64019163', + 'season': 'Defqon.1 Weekend Festival 2023', + 'season_id': '141735599', + 'series': 'Defqon.1', + 'series_id': '31840378', + 'thumbnail': 'https://images.q-dance.network/1686849069-area-thumbs_red.png', + 'availability': 'subscriber_only', + 'live_status': 'is_live', + 'channel_id': 'qdancenetwork.video_149170353', + }, + 'skip': 'Completed livestream', + }] + + _access_token = None + _refresh_token = None + + def _call_login_api(self, data, note='Logging in'): + login = self._download_json( + 'https://members.id-t.com/api/auth/login', None, note, headers={ + 'content-type': 'application/json', + 'brand': 'qdance', + 'origin': 'https://www.q-dance.com', + 'referer': 'https://www.q-dance.com/', + }, data=json.dumps(data, separators=(',', ':')).encode(), + expected_status=lambda x: True) + + tokens = traverse_obj(login, ('data', { + '_id-t-accounts-token': ('accessToken', {str}), + '_id-t-accounts-refresh': ('refreshToken', {str}), + '_id-t-accounts-id-token': ('idToken', {str}), + })) + + if not tokens.get('_id-t-accounts-token'): + error = ': '.join(traverse_obj(login, ('error', ('code', 'message'), {str}))) + if 'validation_error' not in error: + raise ExtractorError(f'Q-Dance API said "{error}"') + msg = 'Invalid username or password' if 'email' in data else 'Refresh token has expired' + raise ExtractorError(msg, expected=True) + + for name, value in tokens.items(): + self._set_cookie('.q-dance.com', name, value) + + def _perform_login(self, username, password): + self._call_login_api({'email': username, 'password': password}) + + def _real_initialize(self): + cookies = self._get_cookies('https://www.q-dance.com/') + self._refresh_token = try_call(lambda: cookies['_id-t-accounts-refresh'].value) + self._access_token = try_call(lambda: cookies['_id-t-accounts-token'].value) + if not self._access_token: + self.raise_login_required() + + def _get_auth(self): + if (try_call(lambda: jwt_decode_hs256(self._access_token)['exp']) or 0) <= int(time.time() - 120): + if not self._refresh_token: + raise ExtractorError( + 'Cannot refresh access token, login with hypervideo or refresh cookies in browser') + self._call_login_api({'refreshToken': self._refresh_token}, note='Refreshing access token') + self._real_initialize() + + return {'Authorization': self._access_token} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, 
video_id)
+        data = self._search_nuxt_data(webpage, video_id, traverse=('data', 0, 'data'))
+
+        def extract_availability(level):
+            level = int_or_none(level) or 0
+            return self._availability(
+                needs_premium=(level >= 20), needs_subscription=(level >= 15), needs_auth=True)
+
+        info = traverse_obj(data, {
+            'title': ('title', {str.strip}),
+            'description': ('description', {str.strip}),
+            'display_id': ('slug', {str}),
+            'thumbnail': ('thumbnail', {url_or_none}),
+            'duration': ('durationInSeconds', {int_or_none}, {lambda x: x or None}),
+            'availability': ('subscription', 'level', {extract_availability}),
+            'is_live': ('type', {lambda x: x.lower() == 'live'}),
+            'artist': ('acts', ..., {str}),
+            'series': ('event', 'title', {str.strip}),
+            'series_id': ('event', 'id', {str_or_none}),
+            'season': ('eventEdition', 'title', {str.strip}),
+            'season_id': ('eventEdition', 'id', {str_or_none}),
+            'channel_id': ('pubnub', 'channelName', {str}),
+        })
+
+        stream = self._download_json(
+            f'https://dc9h6qmsoymbq.cloudfront.net/api/content/videos/{video_id}/url',
+            video_id, headers=self._get_auth(), expected_status=401)
+
+        m3u8_url = traverse_obj(stream, ('data', 'url', {url_or_none}))
+        if not m3u8_url and traverse_obj(stream, ('error', 'code')) == 'unauthorized':
+            raise ExtractorError('Your account does not have access to this content', expected=True)
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, fatal=False, live=True) if m3u8_url else []
+        if not formats:
+            self.raise_no_formats('No active streams found', expected=bool(info.get('is_live')))
+
+        return {
+            **info,
+            'id': video_id,
+            'formats': formats,
+        }
diff --git a/hypervideo_dl/extractor/radiko.py b/hypervideo_dl/extractor/radiko.py
index f102922..cef68eb 100644
--- a/hypervideo_dl/extractor/radiko.py
+++ b/hypervideo_dl/extractor/radiko.py
@@ -1,5 +1,4 @@
 import base64
-import re
 import urllib.parse
 
 from .common import InfoExtractor
@@ -15,6 +14,23 @@ from ..utils import (
 
 class RadikoBaseIE(InfoExtractor):
     _FULL_KEY = None
+    _HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED = (
+        'https://c-rpaa.smartstream.ne.jp',
+        'https://si-c-radiko.smartstream.ne.jp',
+        'https://tf-f-rpaa-radiko.smartstream.ne.jp',
+        'https://tf-c-rpaa-radiko.smartstream.ne.jp',
+        'https://si-f-radiko.smartstream.ne.jp',
+        'https://rpaa.smartstream.ne.jp',
+    )
+    _HOSTS_FOR_TIME_FREE_FFMPEG_SUPPORTED = (
+        'https://rd-wowza-radiko.radiko-cf.com',
+        'https://radiko.jp',
+        'https://f-radiko.smartstream.ne.jp',
+    )
+    # The following host forcibly serves the Live stream even when Time Free is requested
+    _HOSTS_FOR_LIVE = (
+        'https://c-radiko.smartstream.ne.jp',
+    )
 
     def _auth_client(self):
         _, auth1_handle = self._download_webpage_handle(
@@ -25,7 +41,7 @@ class RadikoBaseIE(InfoExtractor):
                 'x-radiko-device': 'pc',
                 'x-radiko-user': 'dummy_user',
             })
-        auth1_header = auth1_handle.info()
+        auth1_header = auth1_handle.headers
 
         auth_token = auth1_header['X-Radiko-AuthToken']
         kl = int(auth1_header['X-Radiko-KeyLength'])
@@ -92,9 +108,9 @@ class RadikoBaseIE(InfoExtractor):
         formats = []
         found = set()
         for url_tag in m3u8_urls:
-            pcu = url_tag.find('playlist_create_url')
+            pcu = url_tag.find('playlist_create_url').text
             url_attrib = url_tag.attrib
-            playlist_url = update_url_query(pcu.text, {
+            playlist_url = update_url_query(pcu, {
                 'station_id': station,
                 **query,
                 'l': '15',
@@ -118,9 +134,10 @@ class RadikoBaseIE(InfoExtractor):
                 'X-Radiko-AuthToken': auth_token,
             })
             for sf in subformats:
-                if re.fullmatch(r'[cf]-radiko\.smartstream\.ne\.jp', domain):
-                    # Prioritize live radio vs playback based on extractor
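A standalone sketch of the preference rule this hunk introduces: a subformat is demoted when the requested mode (live vs. time-free) disagrees with what the host serves, or when a time-free host is known not to work with ffmpeg. Local constants stand in for the class attributes above, trimmed to one host each:

HOSTS_FOR_LIVE = ('https://c-radiko.smartstream.ne.jp',)
HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED = ('https://rpaa.smartstream.ne.jp',)

def is_demoted(pcu, is_onair):
    # XOR: live extractor + non-live host, or time-free extractor + live host
    mode_mismatch = is_onair ^ pcu.startswith(HOSTS_FOR_LIVE)
    # time-free playback from these hosts is known not to work with ffmpeg
    ffmpeg_unsupported = not is_onair and pcu.startswith(HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED)
    return mode_mismatch or ffmpeg_unsupported

assert is_demoted('https://c-radiko.smartstream.ne.jp/a.m3u8', is_onair=False)
assert not is_demoted('https://c-radiko.smartstream.ne.jp/a.m3u8', is_onair=True)
assert is_demoted('https://rpaa.smartstream.ne.jp/a.m3u8', is_onair=False)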
- sf['preference'] = 100 if is_onair else -100 + if (is_onair ^ pcu.startswith(self._HOSTS_FOR_LIVE)) or ( + not is_onair and pcu.startswith(self._HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED)): + sf['preference'] = -100 + sf['format_note'] = 'not preferred' if not is_onair and url_attrib['timefree'] == '1' and time_to_skip: sf['downloader_options'] = {'ffmpeg_args': ['-ss', time_to_skip]} formats.extend(subformats) diff --git a/hypervideo_dl/extractor/radiocanada.py b/hypervideo_dl/extractor/radiocanada.py index 72c21d5..1a5a635 100644 --- a/hypervideo_dl/extractor/radiocanada.py +++ b/hypervideo_dl/extractor/radiocanada.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -74,8 +74,8 @@ class RadioCanadaIE(InfoExtractor): return self._download_json( 'https://services.radio-canada.ca/media/' + path, video_id, query=query) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422): - data = self._parse_json(e.cause.read().decode(), None) + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 422): + data = self._parse_json(e.cause.response.read().decode(), None) error = data.get('error_description') or data['errorMessage']['text'] raise ExtractorError(error, expected=True) raise diff --git a/hypervideo_dl/extractor/rai.py b/hypervideo_dl/extractor/rai.py index cab12cc..df4102a 100644 --- a/hypervideo_dl/extractor/rai.py +++ b/hypervideo_dl/extractor/rai.py @@ -1,19 +1,12 @@ import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) from ..utils import ( clean_html, determine_ext, ExtractorError, filter_dict, - find_xpath_attr, - fix_xml_ampersands, GeoRestrictedError, - HEADRequest, int_or_none, join_nonempty, parse_duration, @@ -35,82 +28,70 @@ class RaiBaseIE(InfoExtractor): _GEO_BYPASS = False def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): + def fix_cdata(s): + # remove \r\n\t before and after <![CDATA[ ]]> to avoid + # polluted text with xpath_text + s = re.sub(r'(\]\]>)[\r\n\t]+(</)', '\\1\\2', s) + return re.sub(r'(>)[\r\n\t]+(<!\[CDATA\[)', '\\1\\2', s) + if not re.match(r'https?://', relinker_url): return {'formats': [{'url': relinker_url}]} - formats = [] - geoprotection = None - is_live = None - duration = None - - for platform in ('mon', 'flash', 'native'): - relinker = self._download_xml( - relinker_url, video_id, - note=f'Downloading XML metadata for platform {platform}', - transform_source=fix_xml_ampersands, - query={'output': 45, 'pl': platform}, - headers=self.geo_verification_headers()) - - if xpath_text(relinker, './license_url', default='{}') != '{}': - self.report_drm(video_id) + # set User-Agent to generic 'Rai' to avoid quality filtering from + # the media server and get the maximum qualities available + relinker = self._download_xml( + relinker_url, video_id, note='Downloading XML metadata', + transform_source=fix_cdata, query={'output': 64}, + headers={**self.geo_verification_headers(), 'User-Agent': 'Rai'}) - if not geoprotection: - geoprotection = xpath_text( - relinker, './geoprotection', default=None) == 'Y' + if xpath_text(relinker, './license_url', default='{}') != '{}': + self.report_drm(video_id) - if not is_live: - is_live = xpath_text( - relinker, './is_live', default=None) == 'Y' - if not duration: - duration = parse_duration(xpath_text( - relinker, './duration', default=None)) + is_live = 
xpath_text(relinker, './is_live', default='N') == 'Y' + duration = parse_duration(xpath_text(relinker, './duration', default=None)) + media_url = xpath_text(relinker, './url[@type="content"]', default=None) - url_elem = find_xpath_attr(relinker, './url', 'type', 'content') - if url_elem is None: - continue + if not media_url: + self.raise_no_formats('The relinker returned no media url') - media_url = url_elem.text + # geo flag is a bit unreliable and not properly set all the time + geoprotection = xpath_text(relinker, './geoprotection', default='N') == 'Y' - # This does not imply geo restriction (e.g. - # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if '/video_no_available.mp4' in media_url: - continue - - ext = determine_ext(media_url) - if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): - continue + ext = determine_ext(media_url) + formats = [] - if ext == 'mp3': - formats.append({ - 'url': media_url, - 'vcodec': 'none', - 'acodec': 'mp3', - 'format_id': 'http-mp3', - }) - break - elif ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m' or platform == 'flash': - manifest_url = update_url_query( - media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), - {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) - formats.extend(self._extract_f4m_formats( - manifest_url, video_id, f4m_id='hds', fatal=False)) - else: - bitrate = int_or_none(xpath_text(relinker, 'bitrate')) - formats.append({ - 'url': media_url, - 'tbr': bitrate if bitrate > 0 else None, - 'format_id': f'http-{bitrate if bitrate > 0 else "http"}', - }) + if ext == 'mp3': + formats.append({ + 'url': media_url, + 'vcodec': 'none', + 'acodec': 'mp3', + 'format_id': 'https-mp3', + }) + elif ext == 'm3u8' or 'format=m3u8' in media_url: + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + # very likely no longer needed. Cannot find any url that uses it. + manifest_url = update_url_query( + media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), + {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + elif ext == 'mp4': + bitrate = int_or_none(xpath_text(relinker, './bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': join_nonempty('https', bitrate, delim='-'), + }) + else: + raise ExtractorError('Unrecognized media file found') - if not formats and geoprotection is True: + if (not formats and geoprotection is True) or '/video_no_available.mp4' in media_url: self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) - if not audio_only: - formats.extend(self._create_http_urls(relinker_url, formats)) + if not audio_only and not is_live: + formats.extend(self._create_http_urls(media_url, relinker_url, formats)) return filter_dict({ 'is_live': is_live, @@ -118,38 +99,31 @@ class RaiBaseIE(InfoExtractor): 'formats': formats, }) - def _create_http_urls(self, relinker_url, fmts): - _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' 
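The rewritten _create_http_urls below no longer probes the server with HEAD requests; it reads the available MP4 qualities directly out of the HLS manifest URL. A minimal sketch of that parsing step, reusing the same regex — the sample URL is an invented stand-in for a real manifest:

import re

_MANIFEST_REG = r'/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8'

sample = 'https://example.invalid/i/video/abc123_250,800,1800,.mp4.csmil/playlist.m3u8'
mobj = re.search(_MANIFEST_REG, sample)
qualities = [q for q in (mobj.group('quality') or '').split(',') if q] if mobj else []
print(qualities)  # ['250', '800', '1800']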
+    def _create_http_urls(self, manifest_url, relinker_url, fmts):
+        _MANIFEST_REG = r'/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4)?(?:\.csmil)?/playlist\.m3u8'
         _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s'
         _QUALITY = {
             # tbr: w, h
-            '250': [352, 198],
-            '400': [512, 288],
-            '700': [512, 288],
-            '800': [700, 394],
-            '1200': [736, 414],
-            '1800': [1024, 576],
-            '2400': [1280, 720],
-            '3200': [1440, 810],
-            '3600': [1440, 810],
-            '5000': [1920, 1080],
-            '10000': [1920, 1080],
+            250: [352, 198],
+            400: [512, 288],
+            600: [512, 288],
+            700: [512, 288],
+            800: [700, 394],
+            1200: [736, 414],
+            1500: [920, 518],
+            1800: [1024, 576],
+            2400: [1280, 720],
+            3200: [1440, 810],
+            3600: [1440, 810],
+            5000: [1920, 1080],
+            10000: [1920, 1080],
         }
 
-        def test_url(url):
-            resp = self._request_webpage(
-                HEADRequest(url), None, headers={'User-Agent': 'Rai'},
-                fatal=False, errnote=False, note=False)
-
-            if resp is False:
+        def percentage(number, target, pc=20, roof=125):
+            '''check if the target is in the range of number +/- percent'''
+            if not number or number < 0:
                 return False
-
-            if resp.code == 200:
-                return False if resp.url == url else resp.url
-            return None
-
-        # filter out audio-only formats
-        fmts = [f for f in fmts if not f.get('vcodec') == 'none']
+            return abs(target - number) < min(float(number) * float(pc) / 100.0, roof)
 
         def get_format_info(tbr):
             import math
@@ -157,67 +131,78 @@ class RaiBaseIE(InfoExtractor):
             if len(fmts) == 1 and not br:
                 br = fmts[0].get('tbr')
             if br and br > 300:
-                tbr = compat_str(math.floor(br / 100) * 100)
+                tbr = math.floor(br / 100) * 100
             else:
-                tbr = '250'
+                tbr = 250
 
             # try extracting info from available m3u8 formats
-            format_copy = None
+            format_copy = [None, None]
             for f in fmts:
                 if f.get('tbr'):
-                    br_limit = math.floor(br / 100)
-                    if br_limit - 1 <= math.floor(f['tbr'] / 100) <= br_limit + 1:
-                        format_copy = f.copy()
+                    if percentage(tbr, f['tbr']):
+                        format_copy[0] = f.copy()
+                if [f.get('width'), f.get('height')] == _QUALITY.get(tbr):
+                    format_copy[1] = f.copy()
+                    format_copy[1]['tbr'] = tbr
+
+            # prefer the format with similar bitrate because there might be
+            # multiple videos with the same resolution but different bitrate
+            format_copy = format_copy[0] or format_copy[1] or {}
             return {
+                'format_id': f'https-{tbr}',
                 'width': format_copy.get('width'),
                 'height': format_copy.get('height'),
                 'tbr': format_copy.get('tbr'),
                 'vcodec': format_copy.get('vcodec'),
                 'acodec': format_copy.get('acodec'),
                 'fps': format_copy.get('fps'),
-                'format_id': f'https-{tbr}',
             } if format_copy else {
+                'format_id': f'https-{tbr}',
                 'width': _QUALITY[tbr][0],
                 'height': _QUALITY[tbr][1],
-                'format_id': f'https-{tbr}',
-                'tbr': int(tbr),
+                'tbr': tbr,
+                'vcodec': 'avc1',
+                'acodec': 'mp4a',
+                'fps': 25,
             }
 
-        loc = test_url(_MP4_TMPL % (relinker_url, '*'))
-        if not isinstance(loc, compat_str):
-            return []
+        # filter out single-stream formats
+        fmts = [f for f in fmts
+                if not f.get('vcodec') == 'none' and not f.get('acodec') == 'none']
 
-        mobj = re.match(
-            _RELINKER_REG,
-            test_url(relinker_url) or '')
+        mobj = re.search(_MANIFEST_REG, manifest_url)
         if not mobj:
             return []
-
         available_qualities = mobj.group('quality').split(',') if mobj.group('quality') else ['*']
-        available_qualities = [i for i in available_qualities if i]
 
         formats = []
-        for q in available_qualities:
-            fmt = {
+        for q in filter(None, available_qualities):
+            self.write_debug(f'Creating https format for quality {q}')
+            formats.append({
                 'url': _MP4_TMPL % (relinker_url, q),
                 'protocol': 'https',
                 'ext': 'mp4',
                 **get_format_info(q)
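The percentage() helper above replaces the old ±100 kbps band check when pairing a target bitrate with an actual m3u8 format. A runnable illustration of the tolerance it applies — a near-verbatim copy of the helper, with invented sample values:

def percentage(number, target, pc=20, roof=125):
    # True when target lies within pc% of number, capped at `roof` kbps
    if not number or number < 0:
        return False
    return abs(target - number) < min(float(number) * float(pc) / 100.0, roof)

print(percentage(1800, 1750))  # True: off by 50, cap is 125
print(percentage(1800, 1600))  # False: off by 200, above the 125 cap
print(percentage(250, 280))    # True: off by 30, 20% tolerance is 50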
- formats.append(fmt) + }) return formats @staticmethod + def _get_thumbnails_list(thumbs, url): + return [{ + 'url': urljoin(url, thumb_url), + } for thumb_url in (thumbs or {}).values() if thumb_url] + + @staticmethod def _extract_subtitles(url, video_data): STL_EXT = 'stl' SRT_EXT = 'srt' subtitles = {} - subtitles_array = video_data.get('subtitlesArray') or [] + subtitles_array = video_data.get('subtitlesArray') or video_data.get('subtitleList') or [] for k in ('subtitles', 'subtitlesUrl'): subtitles_array.append({'url': video_data.get(k)}) for subtitle in subtitles_array: sub_url = subtitle.get('url') - if sub_url and isinstance(sub_url, compat_str): + if sub_url and isinstance(sub_url, str): sub_lang = subtitle.get('language') or 'it' sub_url = urljoin(url, sub_url) sub_ext = determine_ext(sub_url, SRT_EXT) @@ -236,7 +221,7 @@ class RaiBaseIE(InfoExtractor): class RaiPlayIE(RaiBaseIE): _VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)' _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'url': 'https://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', @@ -244,22 +229,20 @@ class RaiPlayIE(RaiBaseIE): 'title': 'Report del 07/04/2014', 'alt_title': 'St 2013/14 - Report - Espresso nel caffè - 07/04/2014', 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai Gulp', + 'thumbnail': r're:^https?://www\.raiplay\.it/.+\.jpg', + 'uploader': 'Rai 3', + 'creator': 'Rai 3', 'duration': 6160, 'series': 'Report', 'season': '2013/14', - 'subtitles': { - 'it': 'count:4', - }, + 'subtitles': {'it': 'count:4'}, 'release_year': 2022, 'episode': 'Espresso nel caffè - 07/04/2014', 'timestamp': 1396919880, 'upload_date': '20140408', + 'formats': 'count:4', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': True}, }, { # 1080p direct mp4 url 'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html', @@ -270,8 +253,9 @@ class RaiPlayIE(RaiBaseIE): 'title': 'Blanca - S1E1 - Senza occhi', 'alt_title': 'St 1 Ep 1 - Blanca - Senza occhi', 'description': 'md5:75f95d5c030ec8bac263b1212322e28c', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 1', + 'thumbnail': r're:^https://www\.raiplay\.it/dl/img/.+\.jpg', + 'uploader': 'Rai Premium', + 'creator': 'Rai Fiction', 'duration': 6493, 'series': 'Blanca', 'season': 'Season 1', @@ -281,6 +265,30 @@ class RaiPlayIE(RaiBaseIE): 'episode': 'Senza occhi', 'timestamp': 1637318940, 'upload_date': '20211119', + 'formats': 'count:12', + }, + 'params': {'skip_download': True}, + 'expected_warnings': ['Video not available. 
Likely due to geo-restriction.'] + }, { + # 1500 quality + 'url': 'https://www.raiplay.it/video/2012/09/S1E11---Tutto-cio-che-luccica-0cab3323-732e-45d6-8e86-7704acab6598.html', + 'md5': 'a634d20e8ab2d43724c273563f6bf87a', + 'info_dict': { + 'id': '0cab3323-732e-45d6-8e86-7704acab6598', + 'ext': 'mp4', + 'title': 'Mia and Me - S1E11 - Tutto ciò che luccica', + 'alt_title': 'St 1 Ep 11 - Mia and Me - Tutto ciò che luccica', + 'description': 'md5:4969e594184b1920c4c1f2b704da9dea', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai Gulp', + 'series': 'Mia and Me', + 'season': 'Season 1', + 'episode_number': 11, + 'release_year': 2015, + 'season_number': 1, + 'episode': 'Tutto ciò che luccica', + 'timestamp': 1348495020, + 'upload_date': '20120924', }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', @@ -299,57 +307,40 @@ class RaiPlayIE(RaiBaseIE): base, video_id = self._match_valid_url(url).groups() media = self._download_json( - base + '.json', video_id, 'Downloading video JSON') + f'{base}.json', video_id, 'Downloading video JSON') if not self.get_param('allow_unplayable_formats'): - if try_get( - media, - (lambda x: x['rights_management']['rights']['drm'], - lambda x: x['program_info']['rights_management']['rights']['drm']), - dict): + if traverse_obj(media, (('program_info', None), 'rights_management', 'rights', 'drm')): self.report_drm(video_id) - title = media['name'] video = media['video'] - relinker_info = self._extract_relinker_info(video['content_url'], video_id) - - thumbnails = [] - for _, value in media.get('images', {}).items(): - if value: - thumbnails.append({ - 'url': urljoin(url, value), - }) - - date_published = media.get('date_published') - time_published = media.get('time_published') - if date_published and time_published: - date_published += ' ' + time_published - - subtitles = self._extract_subtitles(url, video) - - program_info = media.get('program_info') or {} + date_published = join_nonempty( + media.get('date_published'), media.get('time_published'), delim=' ') season = media.get('season') - alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ') return { 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, 'display_id': video_id, - 'title': title, + 'title': media.get('name'), 'alt_title': strip_or_none(alt_title or None), 'description': media.get('description'), - 'uploader': strip_or_none(media.get('channel') or None), - 'creator': strip_or_none(media.get('editor') or None), + 'uploader': strip_or_none( + traverse_obj(media, ('program_info', 'channel')) + or media.get('channel') or None), + 'creator': strip_or_none( + traverse_obj(media, ('program_info', 'editor')) + or media.get('editor') or None), 'duration': parse_duration(video.get('duration')), 'timestamp': unified_timestamp(date_published), - 'thumbnails': thumbnails, - 'series': program_info.get('name'), + 'thumbnails': self._get_thumbnails_list(media.get('images'), url), + 'series': traverse_obj(media, ('program_info', 'name')), 'season_number': int_or_none(season), 'season': season if (season and not season.isdigit()) else None, 'episode': media.get('episode_title'), 'episode_number': int_or_none(media.get('episode')), - 'subtitles': subtitles, + 'subtitles': self._extract_subtitles(url, video), 'release_year': int_or_none(traverse_obj(media, ('track_info', 'edit_year'))), **relinker_info } @@ -371,38 +362,39 @@ class RaiPlayLiveIE(RaiPlayIE): # XXX: Do not subclass from concrete IE 
'live_status': 'is_live', 'upload_date': '20090502', 'timestamp': 1241276220, + 'formats': 'count:3', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': True}, }] class RaiPlayPlaylistIE(InfoExtractor): _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?' _TESTS = [{ + # entire series episodes + extras... 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, - 'playlist_mincount': 12, + 'playlist_mincount': 30, }, { + # single season 'url': 'https://www.raiplay.it/programmi/nondirloalmiocapo/episodi/stagione-2/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo - Stagione 2', 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, - 'playlist_mincount': 12, + 'playlist_count': 12, }] def _real_extract(self, url): base, playlist_id, extra_id = self._match_valid_url(url).groups() program = self._download_json( - base + '.json', playlist_id, 'Downloading program JSON') + f'{base}.json', playlist_id, 'Downloading program JSON') if extra_id: extra_id = extra_id.upper().rstrip('/') @@ -450,7 +442,7 @@ class RaiPlaySoundIE(RaiBaseIE): 'title': 'Il Ruggito del Coniglio del 10/12/2021', 'alt_title': 'md5:0e6476cd57858bb0f3fcc835d305b455', 'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.+\.jpg$', 'uploader': 'rai radio 2', 'duration': 5685, 'series': 'Il Ruggito del Coniglio', @@ -459,9 +451,7 @@ class RaiPlaySoundIE(RaiBaseIE): 'timestamp': 1638346620, 'upload_date': '20211201', }, - 'params': { - 'skip_download': True, - }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): @@ -480,9 +470,6 @@ class RaiPlaySoundIE(RaiBaseIE): lambda x: x['live']['create_date'])) podcast_info = traverse_obj(media, 'podcast_info', ('live', 'cards', 0)) or {} - thumbnails = [{ - 'url': urljoin(url, thumb_url), - } for thumb_url in (podcast_info.get('images') or {}).values() if thumb_url] return { **info, @@ -494,7 +481,7 @@ class RaiPlaySoundIE(RaiBaseIE): 'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none), 'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none), 'timestamp': unified_timestamp(date_published), - 'thumbnails': thumbnails, + 'thumbnails': self._get_thumbnails_list(podcast_info.get('images'), url), 'series': podcast_info.get('title'), 'season_number': int_or_none(media.get('season')), 'episode': media.get('episode_title'), @@ -512,30 +499,30 @@ class RaiPlaySoundLiveIE(RaiPlaySoundIE): # XXX: Do not subclass from concrete 'display_id': 'radio2', 'ext': 'mp4', 'title': r're:Rai Radio 2 \d+-\d+-\d+ \d+:\d+', - 'thumbnail': r're:https://www.raiplaysound.it/dl/img/.+?png', + 'thumbnail': r're:^https://www\.raiplaysound\.it/dl/img/.+\.png', 'uploader': 'rai radio 2', 'series': 'Rai Radio 2', 'creator': 'raiplaysound', 'is_live': True, 'live_status': 'is_live', }, - 'params': { - 'skip_download': 'live', - }, + 'params': {'skip_download': True}, }] class RaiPlaySoundPlaylistIE(InfoExtractor): _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?' 
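The optional extra_id group in the _VALID_URL above selects a single season or tab. A quick sketch of what the pattern captures on one of the season URLs tested below (pattern copied verbatim, sample URL taken from those tests):

import re

pattern = (r'(?P<base>https?://(?:www\.)?raiplaysound\.it/'
           r'(?:programmi|playlist|audiolibri)/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?')
m = re.match(pattern, 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995')
print(m.group('base'))      # https://www.raiplaysound.it/programmi/ilruggitodelconiglio
print(m.group('id'))        # ilruggitodelconiglio
print(m.group('extra_id'))  # puntate/prima-stagione-1995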
_TESTS = [{ + # entire show 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio', 'info_dict': { 'id': 'ilruggitodelconiglio', 'title': 'Il Ruggito del Coniglio', - 'description': 'md5:1bbaf631245a7ab1ec4d9fbb3c7aa8f3', + 'description': 'md5:48cff6972435964284614d70474132e6', }, 'playlist_mincount': 65, }, { + # single season 'url': 'https://www.raiplaysound.it/programmi/ilruggitodelconiglio/puntate/prima-stagione-1995', 'info_dict': { 'id': 'ilruggitodelconiglio_puntate_prima-stagione-1995', @@ -568,22 +555,19 @@ class RaiPlaySoundPlaylistIE(InfoExtractor): class RaiIE(RaiBaseIE): _VALID_URL = rf'https?://[^/]+\.(?:rai\.(?:it|tv))/.+?-(?P<id>{RaiBaseIE._UUID_RE})(?:-.+?)?\.html' _TESTS = [{ - # var uniquename = "ContentItem-..." - # data-id="ContentItem-..." 'url': 'https://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'mp4', 'title': 'TG PRIMO TEMPO', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 1758, 'upload_date': '20140612', }, - 'skip': 'This content is available only in Italy', + 'params': {'skip_download': True}, + 'expected_warnings': ['Video not available. Likely due to geo-restriction.'] }, { - # with ContentItem in og:url 'url': 'https://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '06345bd97c932f19ffb129973d07a020', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', @@ -592,123 +576,51 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2214, 'upload_date': '20161103' - } + }, + 'params': {'skip_download': True}, }, { - # Direct MMS URL + # Direct MMS: Media URL no longer works. 
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', 'only_matching': True, }] - def _extract_from_content_id(self, content_id, url): + def _real_extract(self, url): + content_id = self._match_id(url) media = self._download_json( f'https://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-{content_id}.html?json', - content_id, 'Downloading video JSON') + content_id, 'Downloading video JSON', fatal=False, expected_status=404) - title = media['name'].strip() + if media is None: + return None - media_type = media['type'] - if 'Audio' in media_type: + if 'Audio' in media['type']: relinker_info = { 'formats': [{ - 'format_id': media.get('formatoAudio'), + 'format_id': join_nonempty('https', media.get('formatoAudio'), delim='-'), 'url': media['audioUrl'], 'ext': media.get('formatoAudio'), + 'vcodec': 'none', + 'acodec': media.get('formatoAudio'), }] } - elif 'Video' in media_type: + elif 'Video' in media['type']: relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) else: raise ExtractorError('not a media file') - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - thumbnails.append({ - 'url': compat_urlparse.urljoin(url, thumbnail_url), - }) - - subtitles = self._extract_subtitles(url, media) + thumbnails = self._get_thumbnails_list( + {image_type: media.get(image_type) for image_type in ( + 'image', 'image_medium', 'image_300')}, url) return { 'id': content_id, - 'title': title, - 'description': strip_or_none(media.get('desc') or None), + 'title': strip_or_none(media.get('name') or media.get('title')), + 'description': strip_or_none(media.get('desc')) or None, 'thumbnails': thumbnails, - 'uploader': strip_or_none(media.get('author') or None), + 'uploader': strip_or_none(media.get('author')) or None, 'upload_date': unified_strdate(media.get('date')), 'duration': parse_duration(media.get('length')), - 'subtitles': subtitles, - **relinker_info - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - content_item_id = None - - content_item_url = self._html_search_meta( - ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', - 'twitter:player', 'jsonlink'), webpage, default=None) - if content_item_url: - content_item_id = self._search_regex( - rf'ContentItem-({self._UUID_RE})', content_item_url, - 'content item id', default=None) - - if not content_item_id: - content_item_id = self._search_regex( - rf'''(?x) - (?: - (?:initEdizione|drawMediaRaiTV)\(| - <(?:[^>]+\bdata-id|var\s+uniquename)=| - <iframe[^>]+\bsrc= - ) - (["\']) - (?:(?!\1).)*\bContentItem-(?P<id>{self._UUID_RE}) - ''', - webpage, 'content item id', default=None, group='id') - - content_item_ids = set() - if content_item_id: - content_item_ids.add(content_item_id) - if video_id not in content_item_ids: - content_item_ids.add(video_id) - - for content_item_id in content_item_ids: - try: - return self._extract_from_content_id(content_item_id, url) - except GeoRestrictedError: - raise - except ExtractorError: - pass - - relinker_url = self._proto_relative_url(self._search_regex( - r'''(?x) - (?: - var\s+videoURL| - mediaInfo\.mediaUri - )\s*=\s* - ([\'"]) - (?P<url> - (?:https?:)? - //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? 
- (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 - ''', - webpage, 'relinker URL', group='url')) - - relinker_info = self._extract_relinker_info( - urljoin(url, relinker_url), video_id) - - title = self._search_regex( - r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', - webpage, 'title', group='title', - default=None) or self._og_search_title(webpage) - - return { - 'id': video_id, - 'title': title, + 'subtitles': self._extract_subtitles(url, media), **relinker_info } @@ -726,7 +638,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE 'duration': 1589, 'upload_date': '20220529', 'uploader': 'rainews', - } + }, + 'params': {'skip_download': True}, }, { # old content with fallback method to extract media urls 'url': 'https://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -739,12 +652,14 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE 'duration': 833, 'upload_date': '20161103' }, + 'params': {'skip_download': True}, 'expected_warnings': ['unable to extract player_data'], }, { # iframe + drm 'url': 'https://www.rainews.it/iframe/video/2022/07/euro2022-europei-calcio-femminile-italia-belgio-gol-0-1-video-4de06a69-de75-4e32-a657-02f0885f8118.html', 'only_matching': True, }] + _PLAYER_TAG = 'news' def _real_extract(self, url): video_id = self._match_id(url) @@ -752,8 +667,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE webpage = self._download_webpage(url, video_id) player_data = self._search_json( - r'<rainews-player\s*data=\'', webpage, 'player_data', video_id, - transform_source=clean_html, fatal=False) + rf'<rai{self._PLAYER_TAG}-player\s*data=\'', webpage, 'player_data', video_id, + transform_source=clean_html, default={}) track_info = player_data.get('track_info') relinker_url = traverse_obj(player_data, 'mediapolis', 'content_url') @@ -770,16 +685,36 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE return { 'id': video_id, - 'title': track_info.get('title') or self._og_search_title(webpage), + 'title': player_data.get('title') or track_info.get('title') or self._og_search_title(webpage), 'upload_date': unified_strdate(track_info.get('date')), 'uploader': strip_or_none(track_info.get('editor') or None), **relinker_info } +class RaiCulturaIE(RaiNewsIE): # XXX: Do not subclass from concrete IE + _VALID_URL = rf'https?://(www\.)?raicultura\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' + _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)'] + _TESTS = [{ + 'url': 'https://www.raicultura.it/letteratura/articoli/2018/12/Alberto-Asor-Rosa-Letteratura-e-potere-05ba8775-82b5-45c5-a89d-dd955fbde1fb.html', + 'info_dict': { + 'id': '05ba8775-82b5-45c5-a89d-dd955fbde1fb', + 'ext': 'mp4', + 'title': 'Alberto Asor Rosa: Letteratura e potere', + 'duration': 1756, + 'upload_date': '20181206', + 'uploader': 'raicultura', + 'formats': 'count:2', + }, + 'params': {'skip_download': True}, + }] + _PLAYER_TAG = 'cultura' + + class RaiSudtirolIE(RaiBaseIE): - _VALID_URL = r'https?://raisudtirol\.rai\.it/.+?media=(?P<id>[TP]tv\d+)' + _VALID_URL = r'https?://raisudtirol\.rai\.it/.+media=(?P<id>\w+)' _TESTS = [{ + # mp4 file 'url': 'https://raisudtirol.rai.it/la/index.php?media=Ptv1619729460', 'info_dict': { 'id': 'Ptv1619729460', @@ -787,34 +722,62 @@ class RaiSudtirolIE(RaiBaseIE): 'title': 'Euro: trasmisciun d\'economia - 29-04-2021 20:51', 'series': 'Euro: 
trasmisciun d\'economia', 'upload_date': '20210429', - 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+?\.jpg', + 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+\.jpg', 'uploader': 'raisudtirol', - } + 'formats': 'count:1', + }, + 'params': {'skip_download': True}, + }, { + # m3u manifest + 'url': 'https://raisudtirol.rai.it/it/kidsplayer.php?lang=it&media=GUGGUG_P1.smil', + 'info_dict': { + 'id': 'GUGGUG_P1', + 'ext': 'mp4', + 'title': 'GUGGUG! La Prospettiva - Die Perspektive', + 'uploader': 'raisudtirol', + 'formats': 'count:6', + }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_date = self._html_search_regex(r'<span class="med_data">(.+?)</span>', webpage, 'video_date', fatal=False) - video_title = self._html_search_regex(r'<span class="med_title">(.+?)</span>', webpage, 'video_title', fatal=False) - video_url = self._html_search_regex(r'sources:\s*\[\{file:\s*"(.+?)"\}\]', webpage, 'video_url') - video_thumb = self._html_search_regex(r'image: \'(.+?)\'', webpage, 'video_thumb', fatal=False) + video_date = self._html_search_regex( + r'<span class="med_data">(.+?)</span>', webpage, 'video_date', default=None) + video_title = self._html_search_regex([ + r'<span class="med_title">(.+?)</span>', r'title: \'(.+?)\','], + webpage, 'video_title', default=None) + video_url = self._html_search_regex([ + r'sources:\s*\[\{file:\s*"(.+?)"\}\]', + r'<source\s+src="(.+?)"\s+type="application/x-mpegURL"'], + webpage, 'video_url', default=None) + + ext = determine_ext(video_url) + if ext == 'm3u8': + formats = self._extract_m3u8_formats(video_url, video_id) + elif ext == 'mp4': + formats = [{ + 'format_id': 'https-mp4', + 'url': self._proto_relative_url(video_url), + 'width': 1024, + 'height': 576, + 'fps': 25, + 'vcodec': 'avc1', + 'acodec': 'mp4a', + }] + else: + formats = [] + self.raise_no_formats(f'Unrecognized media file: {video_url}') return { 'id': video_id, 'title': join_nonempty(video_title, video_date, delim=' - '), - 'series': video_title, + 'series': video_title if video_date else None, 'upload_date': unified_strdate(video_date), - 'thumbnail': urljoin('https://raisudtirol.rai.it/', video_thumb), + 'thumbnail': urljoin('https://raisudtirol.rai.it/', self._html_search_regex( + r'image: \'(.+?)\'', webpage, 'video_thumb', default=None)), 'uploader': 'raisudtirol', - 'formats': [{ - 'format_id': 'https-mp4', - 'url': self._proto_relative_url(video_url), - 'width': 1024, - 'height': 576, - 'fps': 25, - 'vcodec': 'h264', - 'acodec': 'aac', - }], + 'formats': formats, } diff --git a/hypervideo_dl/extractor/rbgtum.py b/hypervideo_dl/extractor/rbgtum.py new file mode 100644 index 0000000..47649cf --- /dev/null +++ b/hypervideo_dl/extractor/rbgtum.py @@ -0,0 +1,93 @@ +import re + +from .common import InfoExtractor + + +class RbgTumIE(InfoExtractor): + _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)' + _TESTS = [{ + # Combined view + 'url': 'https://live.rbg.tum.de/w/cpp/22128', + 'md5': '53a5e7b3e07128e33bbf36687fe1c08f', + 'info_dict': { + 'id': 'cpp/22128', + 'ext': 'mp4', + 'title': 'Lecture: October 18. 
2022', + 'series': 'Concepts of C++ programming (IN2377)', + } + }, { + # Presentation only + 'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES', + 'md5': '36c584272179f3e56b0db5d880639cba', + 'info_dict': { + 'id': 'I2DL/12349/PRES', + 'ext': 'mp4', + 'title': 'Lecture 3: Introduction to Neural Networks', + 'series': 'Introduction to Deep Learning (IN2346)', + } + }, { + # Camera only + 'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM', + 'md5': 'e04189d92ff2f56aedf5cede65d37aad', + 'info_dict': { + 'id': 'fvv-info/16130/CAM', + 'ext': 'mp4', + 'title': 'Fachschaftsvollversammlung', + 'series': 'Fachschaftsvollversammlung Informatik', + } + }, ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8') + lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title') + lecture_series_title = self._html_search_regex( + r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>', webpage, 'series') + + formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + + return { + 'id': video_id, + 'title': lecture_title, + 'series': lecture_series_title, + 'formats': formats, + } + + +class RbgTumCourseIE(InfoExtractor): + _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)' + _TESTS = [{ + 'url': 'https://live.rbg.tum.de/course/2022/S/fpv', + 'info_dict': { + 'title': 'Funktionale Programmierung und Verifikation (IN0003)', + 'id': '2022/S/fpv', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 13, + }, { + 'url': 'https://live.rbg.tum.de/course/2022/W/set', + 'info_dict': { + 'title': 'SET FSMPIC', + 'id': '2022/W/set', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 6, + }, ] + + def _real_extract(self, url): + course_id = self._match_id(url) + webpage = self._download_webpage(url, course_id) + + lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title') + + lecture_urls = [] + for lecture_url in re.findall(r'(?i)href="/w/(.+)(?<!/cam)(?<!/pres)(?<!/chat)"', webpage): + lecture_urls.append(self.url_result('https://live.rbg.tum.de/w/' + lecture_url, ie=RbgTumIE.ie_key())) + + return self.playlist_result(lecture_urls, course_id, lecture_series_title) diff --git a/hypervideo_dl/extractor/rcs.py b/hypervideo_dl/extractor/rcs.py index b905f8d..0fd3ca7 100644 --- a/hypervideo_dl/extractor/rcs.py +++ b/hypervideo_dl/extractor/rcs.py @@ -1,11 +1,20 @@ import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( ExtractorError, base_url, clean_html, + extract_attributes, + get_element_html_by_class, + get_element_html_by_id, + int_or_none, js_to_json, + mimetype2ext, + sanitize_url, + traverse_obj, + try_call, url_basename, urljoin, ) @@ -15,41 +24,8 @@ class RCSBaseIE(InfoExtractor): # based on VideoPlayerLoader.prototype.getVideoSrc # and VideoPlayerLoader.prototype.transformSrc from # https://js2.corriereobjects.it/includes2013/LIBS/js/corriere_video.sjs - _ALL_REPLACE = { - 'media2vam.corriere.it.edgesuite.net': - 'media2vam-corriere-it.akamaized.net', - 'media.youreporter.it.edgesuite.net': - 'media-youreporter-it.akamaized.net', - 'corrierepmd.corriere.it.edgesuite.net': - 'corrierepmd-corriere-it.akamaized.net', - 'media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/': - 'video.corriere.it/vr360/videos/', - '.net//': '.net/', - } - _MP4_REPLACE = { - 
'media2vam.corbologna.corriere.it.edgesuite.net': - 'media2vam-bologna-corriere-it.akamaized.net', - 'media2vam.corfiorentino.corriere.it.edgesuite.net': - 'media2vam-fiorentino-corriere-it.akamaized.net', - 'media2vam.cormezzogiorno.corriere.it.edgesuite.net': - 'media2vam-mezzogiorno-corriere-it.akamaized.net', - 'media2vam.corveneto.corriere.it.edgesuite.net': - 'media2vam-veneto-corriere-it.akamaized.net', - 'media2.oggi.it.edgesuite.net': - 'media2-oggi-it.akamaized.net', - 'media2.quimamme.it.edgesuite.net': - 'media2-quimamme-it.akamaized.net', - 'media2.amica.it.edgesuite.net': - 'media2-amica-it.akamaized.net', - 'media2.living.corriere.it.edgesuite.net': - 'media2-living-corriere-it.akamaized.net', - 'media2.style.corriere.it.edgesuite.net': - 'media2-style-corriere-it.akamaized.net', - 'media2.iodonna.it.edgesuite.net': - 'media2-iodonna-it.akamaized.net', - 'media2.leitv.it.edgesuite.net': - 'media2-leitv-it.akamaized.net', - } + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _RCS_ID_RE = r'[\w-]+-\d{10}' _MIGRATION_MAP = { 'videoamica-vh.akamaihd': 'amica', 'media2-amica-it.akamaized': 'amica', @@ -90,183 +66,140 @@ class RCSBaseIE(InfoExtractor): 'vivimilano-vh.akamaihd': 'vivimilano', 'media2-youreporter-it.akamaized': 'youreporter' } - _MIGRATION_MEDIA = { - 'advrcs-vh.akamaihd': '', - 'corriere-f.akamaihd': '', - 'corrierepmd-corriere-it.akamaized': '', - 'corrprotetto-vh.akamaihd': '', - 'gazzetta-f.akamaihd': '', - 'gazzettapmd-gazzetta-it.akamaized': '', - 'gazzprotetto-vh.akamaihd': '', - 'periodici-f.akamaihd': '', - 'periodicisecure-vh.akamaihd': '', - 'videocoracademy-vh.akamaihd': '' - } def _get_video_src(self, video): - mediaFiles = video.get('mediaProfile').get('mediaFile') - src = {} - # audio - if video.get('mediaType') == 'AUDIO': - for aud in mediaFiles: - # todo: check - src['mp3'] = aud.get('value') - # video - else: - for vid in mediaFiles: - if vid.get('mimeType') == 'application/vnd.apple.mpegurl': - src['m3u8'] = vid.get('value') - if vid.get('mimeType') == 'video/mp4': - src['mp4'] = vid.get('value') + for source in traverse_obj(video, ( + 'mediaProfile', 'mediaFile', lambda _, v: v.get('mimeType'))): + url = source['value'] + for s, r in ( + ('media2vam.corriere.it.edgesuite.net', 'media2vam-corriere-it.akamaized.net'), + ('media.youreporter.it.edgesuite.net', 'media-youreporter-it.akamaized.net'), + ('corrierepmd.corriere.it.edgesuite.net', 'corrierepmd-corriere-it.akamaized.net'), + ('media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/', 'video.corriere.it/vr360/videos/'), + ('http://', 'https://'), + ): + url = url.replace(s, r) - # replace host - for t in src: - for s, r in self._ALL_REPLACE.items(): - src[t] = src[t].replace(s, r) - for s, r in self._MP4_REPLACE.items(): - src[t] = src[t].replace(s, r) + type_ = mimetype2ext(source['mimeType']) + if type_ == 'm3u8' and '-vh.akamaihd' in url: + # still needed for some old content: see _TESTS #3 + matches = re.search(r'(?:https?:)?//(?P<host>[\w\.\-]+)\.net/i(?P<path>.+)$', url) + if matches: + url = f'https://vod.rcsobjects.it/hls/{self._MIGRATION_MAP[matches.group("host")]}{matches.group("path")}' + if traverse_obj(video, ('mediaProfile', 'geoblocking')) or ( + type_ == 'm3u8' and 'fcs.quotidiani_!' 
in url): + url = url.replace('vod.rcsobjects', 'vod-it.rcsobjects') + if type_ == 'm3u8' and 'vod' in url: + url = url.replace('.csmil', '.urlset') + if type_ == 'mp3': + url = url.replace('media2vam-corriere-it.akamaized.net', 'vod.rcsobjects.it/corriere') - # switch cdn - if 'mp4' in src and 'm3u8' in src: - if ('-lh.akamaihd' not in src.get('m3u8') - and 'akamai' in src.get('mp4')): - if 'm3u8' in src: - matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('m3u8')) - src['m3u8'] = 'https://vod.rcsobjects.it/hls/%s%s' % ( - self._MIGRATION_MAP[matches.group('host')], - matches.group('path').replace( - '///', '/').replace( - '//', '/').replace( - '.csmil', '.urlset' - ) - ) - if 'mp4' in src: - matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('mp4')) - if matches: - if matches.group('host') in self._MIGRATION_MEDIA: - vh_stream = 'https://media2.corriereobjects.it' - if src.get('mp4').find('fcs.quotidiani_!'): - vh_stream = 'https://media2-it.corriereobjects.it' - src['mp4'] = '%s%s' % ( - vh_stream, - matches.group('path').replace( - '///', '/').replace( - '//', '/').replace( - '/fcs.quotidiani/mediacenter', '').replace( - '/fcs.quotidiani_!/mediacenter', '').replace( - 'corriere/content/mediacenter/', '').replace( - 'gazzetta/content/mediacenter/', '') - ) - else: - src['mp4'] = 'https://vod.rcsobjects.it/%s%s' % ( - self._MIGRATION_MAP[matches.group('host')], - matches.group('path').replace('///', '/').replace('//', '/') - ) - - if 'mp3' in src: - src['mp3'] = src.get('mp3').replace( - 'media2vam-corriere-it.akamaized.net', - 'vod.rcsobjects.it/corriere') - if 'mp4' in src: - if src.get('mp4').find('fcs.quotidiani_!'): - src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects') - if 'm3u8' in src: - if src.get('m3u8').find('fcs.quotidiani_!'): - src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects') + yield { + 'type': type_, + 'url': url, + 'bitrate': source.get('bitrate') + } - if 'geoblocking' in video.get('mediaProfile'): - if 'm3u8' in src: - src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects') - if 'mp4' in src: - src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects') - if 'm3u8' in src: - if src.get('m3u8').find('csmil') and src.get('m3u8').find('vod'): - src['m3u8'] = src.get('m3u8').replace('.csmil', '.urlset') + def _create_http_formats(self, m3u8_formats, video_id): + for f in m3u8_formats: + if f['vcodec'] == 'none': + continue + http_url = re.sub(r'(https?://[^/]+)/hls/([^?#]+?\.mp4).+', r'\g<1>/\g<2>', f['url']) + if http_url == f['url']: + continue - return src + http_f = f.copy() + del http_f['manifest_url'] + format_id = try_call(lambda: http_f['format_id'].replace('hls-', 'https-')) + urlh = self._request_webpage(HEADRequest(http_url), video_id, fatal=False, + note=f'Check filesize for {format_id}') + if not urlh: + continue - def _create_formats(self, urls, video_id): - formats = [] - formats = self._extract_m3u8_formats( - urls.get('m3u8'), video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - - if urls.get('mp4'): - formats.append({ - 'format_id': 'http-mp4', - 'url': urls['mp4'] + http_f.update({ + 'format_id': format_id, + 'url': http_url, + 'protocol': 'https', + 'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)), }) - return formats + yield http_f + + def _create_formats(self, sources, video_id): + for source in sources: + if source['type'] == 'm3u8': + m3u8_formats = 
self._extract_m3u8_formats( + source['url'], video_id, 'mp4', m3u8_id='hls', fatal=False) + yield from m3u8_formats + yield from self._create_http_formats(m3u8_formats, video_id) + elif source['type'] == 'mp3': + yield { + 'format_id': 'https-mp3', + 'ext': 'mp3', + 'acodec': 'mp3', + 'vcodec': 'none', + 'abr': source.get('bitrate'), + 'url': source['url'], + } def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') + cdn, video_id = self._match_valid_url(url).group('cdn', 'id') + display_id, video_data = None, None - if 'cdn' not in mobj.groupdict(): - raise ExtractorError('CDN not found in url: %s' % url) + if re.match(self._UUID_RE, video_id) or re.match(self._RCS_ID_RE, video_id): + url = f'https://video.{cdn}/video-json/{video_id}' + else: + webpage = self._download_webpage(url, video_id) + data_config = get_element_html_by_id('divVideoPlayer', webpage) or get_element_html_by_class('divVideoPlayer', webpage) - # for leitv/youreporter/viaggi don't use the embed page - if ((mobj.group('cdn') not in ['leitv.it', 'youreporter.it']) - and (mobj.group('vid') == 'video')): - url = 'https://video.%s/video-embed/%s' % (mobj.group('cdn'), video_id) + if data_config: + data_config = self._parse_json( + extract_attributes(data_config).get('data-config'), + video_id, fatal=False) or {} + if data_config.get('newspaper'): + cdn = f'{data_config["newspaper"]}.it' + display_id, video_id = video_id, data_config.get('uuid') or video_id + url = f'https://video.{cdn}/video-json/{video_id}' + else: + json_url = self._search_regex( + r'''(?x)url\s*=\s*(["']) + (?P<url> + (?:https?:)?//video\.rcs\.it + /fragment-includes/video-includes/[^"']+?\.json + )\1;''', + webpage, video_id, group='url', default=None) + if json_url: + video_data = self._download_json(sanitize_url(json_url, scheme='https'), video_id) + display_id, video_id = video_id, video_data.get('id') or video_id - page = self._download_webpage(url, video_id) + if not video_data: + webpage = self._download_webpage(url, video_id) - video_data = None - # look for json video data url - json = self._search_regex( - r'''(?x)url\s*=\s*(["']) - (?P<url> - (?:https?:)?//video\.rcs\.it - /fragment-includes/video-includes/.+?\.json - )\1;''', - page, video_id, group='url', default=None) - if json: - if json.startswith('//'): - json = 'https:%s' % json - video_data = self._download_json(json, video_id) + video_data = self._search_json( + '##start-video##', webpage, 'video data', video_id, default=None, + end_pattern='##end-video##', transform_source=js_to_json) - # if json url not found, look for json video data directly in the page - else: - # RCS normal pages and most of the embeds - json = self._search_regex( - r'[\s;]video\s*=\s*({[\s\S]+?})(?:;|,playlist=)', - page, video_id, default=None) - if not json and 'video-embed' in url: - page = self._download_webpage(url.replace('video-embed', 'video-json'), video_id) - json = self._search_regex( - r'##start-video##({[\s\S]+?})##end-video##', - page, video_id, default=None) - if not json: - # if no video data found try search for iframes - emb = RCSEmbedsIE._extract_url(page) + if not video_data: + # try search for iframes + emb = RCSEmbedsIE._extract_url(webpage) if emb: return { '_type': 'url_transparent', 'url': emb, 'ie_key': RCSEmbedsIE.ie_key() } - if json: - video_data = self._parse_json( - json, video_id, transform_source=js_to_json) if not video_data: raise ExtractorError('Video data not found in the page') - formats = self._create_formats( - 
self._get_video_src(video_data), video_id) - - description = (video_data.get('description') - or clean_html(video_data.get('htmlDescription')) - or self._html_search_meta('description', page)) - uploader = video_data.get('provider') or mobj.group('cdn') - return { 'id': video_id, + 'display_id': display_id, 'title': video_data.get('title'), - 'description': description, - 'uploader': uploader, - 'formats': formats + 'description': (clean_html(video_data.get('description')) + or clean_html(video_data.get('htmlDescription')) + or self._html_search_meta('description', webpage)), + 'uploader': video_data.get('provider') or cdn, + 'formats': list(self._create_formats(self._get_video_src(video_data), video_id)), } @@ -296,7 +229,7 @@ class RCSEmbedsIE(RCSBaseIE): \1'''] _TESTS = [{ 'url': 'https://video.rcs.it/video-embed/iodonna-0001585037', - 'md5': '623ecc8ffe7299b2d0c1046d8331a9df', + 'md5': '0faca97df525032bb9847f690bc3720c', 'info_dict': { 'id': 'iodonna-0001585037', 'ext': 'mp4', @@ -305,38 +238,31 @@ class RCSEmbedsIE(RCSBaseIE): 'uploader': 'rcs.it', } }, { - # redownload the page changing 'video-embed' in 'video-json' 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789', - 'md5': 'a043e3fecbe4d9ed7fc5d888652a5440', - 'info_dict': { - 'id': 'gazzanet-mo05-0000260789', - 'ext': 'mp4', - 'title': 'Valentino Rossi e papà Graziano si divertono col drifting', - 'description': 'md5:a8bf90d6adafd9815f70fc74c0fc370a', - 'uploader': 'rcd', - } - }, { - 'url': 'https://video.corriere.it/video-embed/b727632a-f9d0-11ea-91b0-38d50a849abb?player', 'match_only': True }, { 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140', 'match_only': True }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.iodonna.it/video-iodonna/personaggi-video/monica-bellucci-piu-del-lavoro-oggi-per-me-sono-importanti-lamicizia-e-la-famiglia/', + 'info_dict': { + 'id': 'iodonna-0002033648', + 'ext': 'mp4', + 'title': 'Monica Bellucci: «Più del lavoro, oggi per me sono importanti l\'amicizia e la famiglia»', + 'description': 'md5:daea6d9837351e56b1ab615c06bebac1', + 'uploader': 'rcs.it', + } + }] @staticmethod - def _sanitize_urls(urls): - # add protocol if missing - for i, e in enumerate(urls): - if e.startswith('//'): - urls[i] = 'https:%s' % e - # clean iframes urls - for i, e in enumerate(urls): - urls[i] = urljoin(base_url(e), url_basename(e)) - return urls + def _sanitize_url(url): + url = sanitize_url(url, scheme='https') + return urljoin(base_url(url), url_basename(url)) @classmethod def _extract_embed_urls(cls, url, webpage): - return cls._sanitize_urls(list(super()._extract_embed_urls(url, webpage))) + return map(cls._sanitize_url, super()._extract_embed_urls(url, webpage)) class RCSIE(RCSBaseIE): @@ -349,37 +275,53 @@ class RCSIE(RCSBaseIE): |corrierefiorentino\. 
)?corriere\.it |(?:gazzanet\.)?gazzetta\.it) - /(?!video-embed/).+?/(?P<id>[^/\?]+)(?=\?|/$|$)''' + /(?!video-embed/)[^?#]+?/(?P<id>[^/\?]+)(?=\?|/$|$)''' _TESTS = [{ + # json iframe directly from id 'url': 'https://video.corriere.it/sport/formula-1/vettel-guida-ferrari-sf90-mugello-suo-fianco-c-elecrerc-bendato-video-esilarante/b727632a-f9d0-11ea-91b0-38d50a849abb', - 'md5': '0f4ededc202b0f00b6e509d831e2dcda', + 'md5': '14946840dec46ecfddf66ba4eea7d2b2', 'info_dict': { 'id': 'b727632a-f9d0-11ea-91b0-38d50a849abb', 'ext': 'mp4', 'title': 'Vettel guida la Ferrari SF90 al Mugello e al suo fianco c\'è Leclerc (bendato): il video è esilarante', - 'description': 'md5:93b51c9161ac8a64fb2f997b054d0152', + 'description': 'md5:3915ce5ebb3d2571deb69a5eb85ac9b5', 'uploader': 'Corriere Tv', } }, { - # video data inside iframe + # search for video id inside the page 'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/', - 'md5': 'da378e4918d2afbf7d61c35abb948d4c', + 'md5': 'f22a92d9e666e80f2fffbf2825359c81', 'info_dict': { 'id': '5b7cd134-e2c1-11ea-89b3-b56dd0df2aa2', + 'display_id': 'norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen', 'ext': 'mp4', 'title': 'La nuova spettacolare attrazione in Norvegia: il ponte sopra Vøringsfossen', 'description': 'md5:18b35a291f6746c0c8dacd16e5f5f4f8', 'uploader': 'DOVE Viaggi', } }, { - 'url': 'https://video.gazzetta.it/video-motogp-catalogna-cadute-dovizioso-vale-rossi/49612410-00ca-11eb-bcd8-30d4253e0140?vclk=Videobar', - 'md5': 'eedc1b5defd18e67383afef51ff7bdf9', + # only audio format https://github.com/hypervideo/hypervideo/issues/5683 + 'url': 'https://video.corriere.it/cronaca/audio-telefonata-il-papa-becciu-santita-lettera-che-mi-ha-inviato-condanna/b94c0d20-70c2-11ed-9572-e4b947a0ebd2', + 'md5': 'aaffb08d02f2ce4292a4654694c78150', + 'info_dict': { + 'id': 'b94c0d20-70c2-11ed-9572-e4b947a0ebd2', + 'ext': 'mp3', + 'title': 'L\'audio della telefonata tra il Papa e Becciu: «Santità, la lettera che mi ha inviato è una condanna»', + 'description': 'md5:c0ddb61bd94a8d4e0d4bb9cda50a689b', + 'uploader': 'Corriere Tv', + 'formats': [{'format_id': 'https-mp3', 'ext': 'mp3'}], + } + }, { + # old content still needs cdn migration + 'url': 'https://viaggi.corriere.it/video/milano-varallo-sesia-sul-treno-a-vapore/', + 'md5': '2dfdce7af249654ad27eeba03fe1e08d', 'info_dict': { - 'id': '49612410-00ca-11eb-bcd8-30d4253e0140', + 'id': 'd8f6c8d0-f7d7-11e8-bfca-f74cf4634191', + 'display_id': 'milano-varallo-sesia-sul-treno-a-vapore', 'ext': 'mp4', - 'title': 'Dovizioso, il contatto con Zarco e la caduta. E anche Vale finisce a terra', - 'description': 'md5:8c6e905dc3b9413218beca11ebd69778', - 'uploader': 'AMorici', + 'title': 'Milano-Varallo Sesia sul treno a vapore', + 'description': 'md5:6348f47aac230397fe341a74f7678d53', + 'uploader': 'DOVE Viaggi', } }, { 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945', @@ -391,13 +333,15 @@ class RCSVariousIE(RCSBaseIE): _VALID_URL = r'''(?x)https?://www\. 
                    (?P<cdn>
                        leitv\.it|
-                       youreporter\.it
+                       youreporter\.it|
+                       amica\.it
                    )/(?:[^/]+/)?(?P<id>[^/]+?)(?:$|\?|/)'''
     _TESTS = [{
-        'url': 'https://www.leitv.it/benessere/mal-di-testa-come-combatterlo-ed-evitarne-la-comparsa/',
-        'md5': '92b4e63667b8f95acb0a04da25ae28a1',
+        'url': 'https://www.leitv.it/benessere/mal-di-testa/',
+        'md5': '3b7a683d105a7313ec7513b014443631',
         'info_dict': {
-            'id': 'mal-di-testa-come-combatterlo-ed-evitarne-la-comparsa',
+            'id': 'leitv-0000125151',
+            'display_id': 'mal-di-testa',
             'ext': 'mp4',
             'title': 'Cervicalgia e mal di testa, il video con i suggerimenti dell\'esperto',
             'description': 'md5:ae21418f34cee0b8d02a487f55bcabb5',
@@ -405,12 +349,24 @@ class RCSVariousIE(RCSBaseIE):
         }
     }, {
         'url': 'https://www.youreporter.it/fiume-sesia-3-ottobre-2020/',
-        'md5': '8dccd436b47a830bab5b4a88232f391a',
+        'md5': '3989b6d603482611a2abd2f32b79f739',
         'info_dict': {
-            'id': 'fiume-sesia-3-ottobre-2020',
+            'id': 'youreporter-0000332574',
+            'display_id': 'fiume-sesia-3-ottobre-2020',
             'ext': 'mp4',
             'title': 'Fiume Sesia 3 ottobre 2020',
             'description': 'md5:0070eef1cc884d13c970a4125063de55',
             'uploader': 'youreporter.it',
         }
+    }, {
+        'url': 'https://www.amica.it/video-post/saint-omer-al-cinema-il-film-leone-dargento-che-ribalta-gli-stereotipi/',
+        'md5': '187cce524dfd0343c95646c047375fc4',
+        'info_dict': {
+            'id': 'amica-0001225365',
+            'display_id': 'saint-omer-al-cinema-il-film-leone-dargento-che-ribalta-gli-stereotipi',
+            'ext': 'mp4',
+            'title': '"Saint Omer": al cinema il film Leone d\'argento che ribalta gli stereotipi',
+            'description': 'md5:b1c8869c2dcfd6073a2a311ba0008aa8',
+            'uploader': 'rcs.it',
+        }
     }]
diff --git a/hypervideo_dl/extractor/rcti.py b/hypervideo_dl/extractor/rcti.py
index 27b4ad7..79d9c8e 100644
--- a/hypervideo_dl/extractor/rcti.py
+++ b/hypervideo_dl/extractor/rcti.py
@@ -3,7 +3,7 @@ import random
 import time
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     dict_get,
     ExtractorError,
@@ -186,7 +186,7 @@ class RCTIPlusIE(RCTIPlusBaseIE):
         try:
             formats = self._extract_m3u8_formats(video_url, display_id, 'mp4', headers={'Referer': 'https://www.rctiplus.com/'})
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                 self.raise_geo_restricted(countries=['ID'], metadata_available=True)
             else:
                 raise e
diff --git a/hypervideo_dl/extractor/recurbate.py b/hypervideo_dl/extractor/recurbate.py
new file mode 100644
index 0000000..d7294cb
--- /dev/null
+++ b/hypervideo_dl/extractor/recurbate.py
@@ -0,0 +1,42 @@
+from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
+from ..utils import ExtractorError, merge_dicts
+
+
+class RecurbateIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?recurbate\.com/play\.php\?video=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://recurbate.com/play.php?video=39161415',
+        'md5': 'dd2b4ec57aa3e3572cb5cf0997fca99f',
+        'info_dict': {
+            'id': '39161415',
+            'ext': 'mp4',
+            'description': 'md5:db48d09e4d93fc715f47fd3d6b7edd51',
+            'title': 'Performer zsnicole33 show on 2022-10-25 20:23, Chaturbate Archive – Recurbate',
+            'age_limit': 18,
+        },
+        'skip': 'Website require membership.',
+    }]
+
+    def _real_extract(self, url):
+        SUBSCRIPTION_MISSING_MESSAGE = 'This video is only available for registered users; Set your authenticated browser user agent via the --user-agent parameter.'
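        # A hedged usage sketch for the subscription flow implemented below
        # (the executable name and the --cookies/--user-agent option spellings
        # are assumptions carried over from yt-dlp):
        #
        #   hypervideo --cookies cookies.txt \
        #       --user-agent '<same User-Agent as the logged-in browser>' \
        #       'https://recurbate.com/play.php?video=39161415'
        #
        # raise_login_required(..., method='cookies') below steers users to
        # export their authenticated browser session; the error message above
        # suggests the site validates that session against the browser's
        # User-Agent, hence the --user-agent hint.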
+ video_id = self._match_id(url) + try: + webpage = self._download_webpage(url, video_id) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies') + raise + token = self._html_search_regex(r'data-token="([^"]+)"', webpage, 'token') + video_url = f'https://recurbate.com/api/get.php?video={video_id}&token={token}' + + video_webpage = self._download_webpage(video_url, video_id) + if video_webpage == 'shall_subscribe': + self.raise_login_required(msg=SUBSCRIPTION_MISSING_MESSAGE, method='cookies') + entries = self._parse_html5_media_entries(video_url, video_webpage, video_id) + return merge_dicts({ + 'id': video_id, + 'title': self._html_extract_title(webpage, 'title'), + 'description': self._og_search_description(webpage), + 'age_limit': self._rta_search(webpage), + }, entries[0]) diff --git a/hypervideo_dl/extractor/redbulltv.py b/hypervideo_dl/extractor/redbulltv.py index a01bc84..d1de249 100644 --- a/hypervideo_dl/extractor/redbulltv.py +++ b/hypervideo_dl/extractor/redbulltv.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( float_or_none, ExtractorError, @@ -68,9 +68,9 @@ class RedBullTVIE(InfoExtractor): headers={'Authorization': token} ) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: error_message = self._parse_json( - e.cause.read().decode(), video_id)['error'] + e.cause.response.read().decode(), video_id)['error'] raise ExtractorError('%s said: %s' % ( self.IE_NAME, error_message), expected=True) raise diff --git a/hypervideo_dl/extractor/reddit.py b/hypervideo_dl/extractor/reddit.py index f1a5c85..813e628 100644 --- a/hypervideo_dl/extractor/reddit.py +++ b/hypervideo_dl/extractor/reddit.py @@ -1,4 +1,3 @@ -import random import urllib.parse from .common import InfoExtractor @@ -9,12 +8,14 @@ from ..utils import ( traverse_obj, try_get, unescapeHTML, + urlencode_postdata, url_or_none, ) class RedditIE(InfoExtractor): - _VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))' + _NETRC_MACHINE = 'reddit' + _VALID_URL = r'https?://(?P<host>(?:\w+\.)?reddit(?:media)?\.com)/(?P<slug>(?:(?:r|user)/[^/]+/)?comments/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', 'info_dict': { @@ -32,6 +33,7 @@ class RedditIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'age_limit': 0, + 'channel_id': 'videos', }, 'params': { 'skip_download': True, @@ -55,6 +57,30 @@ class RedditIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'age_limit': 0, + 'channel_id': 'aww', + }, + }, { + # User post + 'url': 'https://www.reddit.com/user/creepyt0es/comments/nip71r/i_plan_to_make_more_stickers_and_prints_check/', + 'info_dict': { + 'id': 'zasobba6wp071', + 'ext': 'mp4', + 'display_id': 'nip71r', + 'title': 'I plan to make more stickers and prints! Check them out on my Etsy! Or get them through my Patreon. 
Links below.', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:5', + 'timestamp': 1621709093, + 'upload_date': '20210522', + 'uploader': 'creepyt0es', + 'duration': 6, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + 'channel_id': 'u_creepyt0es', + }, + 'params': { + 'skip_download': True, }, }, { # videos embedded in reddit text post @@ -65,6 +91,66 @@ class RedditIE(InfoExtractor): 'title': 'md5:72d3d19402aa11eff5bd32fc96369b37', }, }, { + # crossposted reddit-hosted media + 'url': 'https://www.reddit.com/r/dumbfuckers_club/comments/zjjw82/cringe/', + 'md5': '746180895c7b75a9d6b05341f507699a', + 'info_dict': { + 'id': 'a1oneun6pa5a1', + 'ext': 'mp4', + 'display_id': 'zjjw82', + 'title': 'Cringe', + 'uploader': 'Otaku-senpai69420', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'upload_date': '20221212', + 'timestamp': 1670812309, + 'duration': 16, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + 'channel_id': 'dumbfuckers_club', + }, + }, { + # post link without subreddit + 'url': 'https://www.reddit.com/comments/124pp33', + 'md5': '15eec9d828adcef4468b741a7e45a395', + 'info_dict': { + 'id': 'antsenjc2jqa1', + 'ext': 'mp4', + 'display_id': '124pp33', + 'title': 'Harmless prank of some old friends', + 'uploader': 'Dudezila', + 'channel_id': 'ContagiousLaughter', + 'duration': 17, + 'upload_date': '20230328', + 'timestamp': 1680012043, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'age_limit': 0, + 'comment_count': int, + 'dislike_count': int, + 'like_count': int, + }, + }, { + # quarantined subreddit post + 'url': 'https://old.reddit.com/r/GenZedong/comments/12fujy3/based_hasan/', + 'md5': '3156ea69e3c1f1b6259683c5abd36e71', + 'info_dict': { + 'id': '8bwtclfggpsa1', + 'ext': 'mp4', + 'display_id': '12fujy3', + 'title': 'Based Hasan?', + 'uploader': 'KingNigelXLII', + 'channel_id': 'GenZedong', + 'duration': 16, + 'upload_date': '20230408', + 'timestamp': 1680979138, + 'age_limit': 0, + 'comment_count': int, + 'dislike_count': int, + 'like_count': int, + }, + 'skip': 'Requires account that has opted-in to the GenZedong subreddit', + }, { 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', 'only_matching': True, }, { @@ -92,21 +178,45 @@ class RedditIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _gen_session_id(): - id_length = 16 - rand_max = 1 << (id_length * 4) - return '%0.*x' % (id_length, random.randrange(rand_max)) + def _perform_login(self, username, password): + captcha = self._download_json( + 'https://www.reddit.com/api/requires_captcha/login.json', None, + 'Checking login requirement')['required'] + if captcha: + raise ExtractorError('Reddit is requiring captcha before login', expected=True) + login = self._download_json( + f'https://www.reddit.com/api/login/{username}', None, data=urlencode_postdata({ + 'op': 'login-main', + 'user': username, + 'passwd': password, + 'api_type': 'json', + }), note='Logging in', errnote='Login request failed') + errors = '; '.join(traverse_obj(login, ('json', 'errors', ..., 1))) + if errors: + raise ExtractorError(f'Unable to login, Reddit API says {errors}', expected=True) + elif not traverse_obj(login, ('json', 'data', 'cookie', {str})): + raise ExtractorError('Unable to login, no cookie was returned') def _real_extract(self, url): - subdomain, slug, video_id = self._match_valid_url(url).group('subdomain', 'slug', 'id') + host, slug, video_id = self._match_valid_url(url).group('host', 'slug', 'id') - 
self._set_cookie('.reddit.com', 'reddit_session', self._gen_session_id()) - self._set_cookie('.reddit.com', '_options', '%7B%22pref_quarantine_optin%22%3A%20true%7D') - data = self._download_json(f'https://{subdomain}reddit.com/r/{slug}/.json', video_id, fatal=False) + data = self._download_json( + f'https://{host}/{slug}/.json', video_id, fatal=False, expected_status=403) if not data: - # Fall back to old.reddit.com in case the requested subdomain fails - data = self._download_json(f'https://old.reddit.com/r/{slug}/.json', video_id) + fallback_host = 'old.reddit.com' if host != 'old.reddit.com' else 'www.reddit.com' + self.to_screen(f'{host} request failed, retrying with {fallback_host}') + data = self._download_json( + f'https://{fallback_host}/{slug}/.json', video_id, expected_status=403) + + if traverse_obj(data, 'error') == 403: + reason = data.get('reason') + if reason == 'quarantined': + self.raise_login_required('Quarantined subreddit; an account that has opted in is required') + elif reason == 'private': + self.raise_login_required('Private subreddit; an account that has been approved is required') + else: + raise ExtractorError(f'HTTP Error 403 Forbidden; reason given: {reason}') + data = data[0]['data']['children'][0]['data'] video_url = data['url'] @@ -130,6 +240,7 @@ class RedditIE(InfoExtractor): 'url': unescapeHTML(thumbnail_url), 'width': int_or_none(src.get('width')), 'height': int_or_none(src.get('height')), + 'http_headers': {'Accept': '*/*'}, }) for image in try_get(data, lambda x: x['preview']['images']) or []: @@ -146,6 +257,7 @@ class RedditIE(InfoExtractor): 'thumbnails': thumbnails, 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), + 'channel_id': data.get('subreddit'), 'like_count': int_or_none(data.get('ups')), 'dislike_count': int_or_none(data.get('downs')), 'comment_count': int_or_none(data.get('num_comments')), @@ -179,7 +291,8 @@ class RedditIE(InfoExtractor): raise ExtractorError('No media found', expected=True) # Check if media is hosted on reddit: - reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False) + reddit_video = traverse_obj(data, ( + (None, ('crosspost_parent_list', ...)), ('secure_media', 'media'), 'reddit_video'), get_all=False) if reddit_video: playlist_urls = [ try_get(reddit_video, lambda x: unescapeHTML(x[y])) diff --git a/hypervideo_dl/extractor/redgifs.py b/hypervideo_dl/extractor/redgifs.py index 098fb81..f945320 100644 --- a/hypervideo_dl/extractor/redgifs.py +++ b/hypervideo_dl/extractor/redgifs.py @@ -1,8 +1,8 @@ import functools -import urllib from .common import InfoExtractor from ..compat import compat_parse_qs +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -82,7 +82,7 @@ class RedGifsBaseInfoExtractor(InfoExtractor): f'https://api.redgifs.com/v2/{ep}', video_id, headers=headers, *args, **kwargs) break except ExtractorError as e: - if first_attempt and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + if first_attempt and isinstance(e.cause, HTTPError) and e.cause.status == 401: del self._API_HEADERS['authorization'] # refresh the token continue raise diff --git a/hypervideo_dl/extractor/regiotv.py b/hypervideo_dl/extractor/regiotv.py index 6114841..edb6ae5 100644 --- a/hypervideo_dl/extractor/regiotv.py +++ b/hypervideo_dl/extractor/regiotv.py @@ -1,10 +1,6 @@ from .common import InfoExtractor - -from ..utils import ( - sanitized_Request, - xpath_text, - xpath_with_ns, -) +from 
..networking import Request +from ..utils import xpath_text, xpath_with_ns class RegioTVIE(InfoExtractor): @@ -33,7 +29,7 @@ class RegioTVIE(InfoExtractor): SOAP_TEMPLATE = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><{0} xmlns="http://v.telvi.de/"><key xsi:type="xsd:string">{1}</key></{0}></soap:Body></soap:Envelope>' - request = sanitized_Request( + request = Request( 'http://v.telvi.de/', SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8')) video_data = self._download_xml(request, video_id, 'Downloading video XML') diff --git a/hypervideo_dl/extractor/rheinmaintv.py b/hypervideo_dl/extractor/rheinmaintv.py new file mode 100644 index 0000000..c3b352d --- /dev/null +++ b/hypervideo_dl/extractor/rheinmaintv.py @@ -0,0 +1,94 @@ +from .common import InfoExtractor +from ..utils import extract_attributes, merge_dicts, remove_end + + +class RheinMainTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rheinmaintv\.de/sendungen/(?:[\w-]+/)*(?P<video_id>(?P<display_id>[\w-]+)/vom-\d{2}\.\d{2}\.\d{4}(?:/\d+)?)' + _TESTS = [{ + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/auf-dem-weg-zur-deutschen-meisterschaft/vom-07.11.2022/', + 'info_dict': { + 'id': 'auf-dem-weg-zur-deutschen-meisterschaft-vom-07.11.2022', + 'ext': 'ismv', # ismv+isma will be merged into mp4 + 'alt_title': 'Auf dem Weg zur Deutschen Meisterschaft', + 'title': 'Auf dem Weg zur Deutschen Meisterschaft', + 'upload_date': '20221108', + 'view_count': int, + 'display_id': 'auf-dem-weg-zur-deutschen-meisterschaft', + 'thumbnail': r're:^https://.+\.jpg', + 'description': 'md5:48c59b74192bc819a9b34af1d5ed1eb9', + 'timestamp': 1667933057, + 'duration': 243.0, + }, + 'params': {'skip_download': 'ism'}, + }, { + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften/vom-14.11.2022/', + 'info_dict': { + 'id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften-vom-14.11.2022', + 'ext': 'ismv', + 'title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften', + 'timestamp': 1668526214, + 'display_id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften', + 'alt_title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften', + 'view_count': int, + 'thumbnail': r're:^https://.+\.jpg', + 'duration': 345.0, + 'description': 'md5:9370ba29526984006c2cba1372e5c5a0', + 'upload_date': '20221115', + }, + 'params': {'skip_download': 'ism'}, + }, { + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/casino-mainz-bei-den-deutschen-meisterschaften/vom-14.11.2022/', + 'info_dict': { + 'id': 'casino-mainz-bei-den-deutschen-meisterschaften-vom-14.11.2022', + 'ext': 'ismv', + 'title': 'Casino Mainz bei den Deutschen Meisterschaften', + 'view_count': int, + 'timestamp': 1668527402, + 'alt_title': 'Casino Mainz bei den Deutschen Meisterschaften', + 'upload_date': '20221115', + 'display_id': 'casino-mainz-bei-den-deutschen-meisterschaften', + 'duration': 348.0, + 'thumbnail': r're:^https://.+\.jpg', + 'description': 'md5:70fc1660eeba96da17199e5bdff4c0aa', + }, + 'params': {'skip_download': 'ism'}, + }, { + 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/bricks4kids/vom-22.06.2022/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = self._match_valid_url(url) + display_id = 
mobj.group('display_id') + video_id = mobj.group('video_id').replace('/', '-') + webpage = self._download_webpage(url, video_id) + + source, img = self._search_regex(r'(?s)(?P<source><source[^>]*>)(?P<img><img[^>]*>)', + webpage, 'video', group=('source', 'img')) + source = extract_attributes(source) + img = extract_attributes(img) + + raw_json_ld = list(self._yield_json_ld(webpage, video_id)) + json_ld = self._json_ld(raw_json_ld, video_id) + json_ld.pop('url', None) + + ism_manifest_url = ( + source.get('src') + or next(json_ld.get('embedUrl') for json_ld in raw_json_ld if json_ld.get('@type') == 'VideoObject') + ) + formats, subtitles = self._extract_ism_formats_and_subtitles(ism_manifest_url, video_id) + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'title': + self._html_search_regex(r'<h1><span class="title">([^<]*)</span>', + webpage, 'headline', default=None) + or img.get('title') or json_ld.get('title') or self._og_search_title(webpage) + or remove_end(self._html_extract_title(webpage), ' -'), + 'alt_title': img.get('alt'), + 'description': json_ld.get('description') or self._og_search_description(webpage), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': [{'url': img['src']}] if 'src' in img else json_ld.get('thumbnails'), + }, json_ld) diff --git a/hypervideo_dl/extractor/rokfin.py b/hypervideo_dl/extractor/rokfin.py index ade3cd0..4a4d40b 100644 --- a/hypervideo_dl/extractor/rokfin.py +++ b/hypervideo_dl/extractor/rokfin.py @@ -45,6 +45,7 @@ class RokfinIE(InfoExtractor): 'live_status': 'not_live', 'dislike_count': int, 'like_count': int, + 'duration': 213, } }, { 'url': 'https://rokfin.com/post/223/Julian-Assange-Arrested-Streaming-In-Real-Time', @@ -72,7 +73,7 @@ class RokfinIE(InfoExtractor): 'title': '"It\'s A Crazy Mess" Regional Director Blows Whistle On Pfizer\'s Vaccine Trial Data', 'thumbnail': r're:https://img\.production\.rokfin\.com/.+', 'description': 'md5:324ce2d3e3b62e659506409e458b9d8e', - 'channel': 'Ryan Cristián', + 'channel': 'TLAVagabond', 'channel_id': 53856, 'channel_url': 'https://rokfin.com/TLAVagabond', 'availability': 'public', @@ -86,6 +87,47 @@ class RokfinIE(InfoExtractor): 'dislike_count': int, 'like_count': int, 'tags': ['FreeThinkingMedia^'], + 'duration': None, + } + }, { + 'url': 'https://rokfin.com/post/126703/Brave-New-World--Aldous-Huxley-DEEPDIVE--Chpts-13--Quite-Frankly--Jay-Dyer', + 'info_dict': { + 'id': 'post/126703', + 'ext': 'mp4', + 'title': 'Brave New World - Aldous Huxley DEEPDIVE! 
(Chpts 1-3) - Quite Frankly & Jay Dyer', + 'thumbnail': r're:https://img\.production\.rokfin\.com/.+', + 'channel': 'Jay Dyer', + 'channel_id': 186881, + 'channel_url': 'https://rokfin.com/jaydyer', + 'availability': 'premium_only', + 'live_status': 'not_live', + 'dislike_count': int, + 'like_count': int, + 'timestamp': 1678213357, + 'upload_date': '20230307', + 'tags': ['FreeThinkingMedia^', 'OpenMind^'], + 'description': 'md5:cb04e32e68326c9b2b251b297bacff35', + 'duration': 3100, + } + }, { + 'url': 'https://rokfin.com/stream/31332/The-Grayzone-live-on-Nordstream-blame-game', + 'info_dict': { + 'id': 'stream/31332', + 'ext': 'mp4', + 'title': 'The Grayzone live on Nordstream blame game', + 'thumbnail': r're:https://image\.v\.rokfin\.com/.+', + 'channel': 'Max Blumenthal', + 'channel_id': 248902, + 'channel_url': 'https://rokfin.com/MaxBlumenthal', + 'availability': 'premium_only', + 'live_status': 'was_live', + 'dislike_count': int, + 'like_count': int, + 'timestamp': 1678475166, + 'release_timestamp': 1678475166.0, + 'release_date': '20230310', + 'upload_date': '20230310', + 'tags': ['FreeThinkingMedia^'], } }] @@ -100,6 +142,12 @@ class RokfinIE(InfoExtractor): else 'not_live') video_url = traverse_obj(metadata, 'url', ('content', 'contentUrl'), expected_type=url_or_none) + if video_url in (None, 'fake.m3u8'): + video_url = format_field(self._search_regex( + r'https?://[^/]+/([^/]+)/storyboard.vtt', + traverse_obj(metadata, 'timelineUrl', ('content', 'timelineUrl'), expected_type=url_or_none), + video_id, default=None), None, 'https://stream.v.rokfin.com/%s.m3u8') + formats, subtitles = [{'url': video_url}] if video_url else [], {} if determine_ext(video_url) == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles( @@ -197,7 +245,7 @@ class RokfinIE(InfoExtractor): f'{self._AUTH_BASE}/token', None, note='getting access credentials', errnote='error getting access credentials', data=urlencode_postdata({ - 'code': urllib.parse.parse_qs(urllib.parse.urldefrag(urlh.geturl()).fragment).get('code')[0], + 'code': urllib.parse.parse_qs(urllib.parse.urldefrag(urlh.url).fragment).get('code')[0], 'client_id': 'web', 'grant_type': 'authorization_code', 'redirect_uri': 'https://rokfin.com/silent-check-sso.html' @@ -221,7 +269,7 @@ class RokfinIE(InfoExtractor): json_string, urlh = self._download_webpage_handle( url_or_request, video_id, headers=headers, query=query, expected_status=401) - if not auth_token or urlh.code != 401 or refresh_token is None: + if not auth_token or urlh.status != 401 or refresh_token is None: return self._parse_json(json_string, video_id) self._access_mgmt_tokens = self._download_json( diff --git a/hypervideo_dl/extractor/roosterteeth.py b/hypervideo_dl/extractor/roosterteeth.py index 776fbfb..94e673b 100644 --- a/hypervideo_dl/extractor/roosterteeth.py +++ b/hypervideo_dl/extractor/roosterteeth.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -35,8 +35,8 @@ class RoosterTeethBaseIE(InfoExtractor): })) except ExtractorError as e: msg = 'Unable to login' - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - resp = self._parse_json(e.cause.read().decode(), None, fatal=False) + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + resp = self._parse_json(e.cause.response.read().decode(), None, fatal=False) if resp: error = resp.get('extra_info') or resp.get('error_description') or 
resp.get('error') if error: @@ -138,8 +138,8 @@ class RoosterTeethIE(RoosterTeethBaseIE): m3u8_url = video_data['attributes']['url'] # XXX: additional URL at video_data['links']['download'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - if self._parse_json(e.cause.read().decode(), display_id).get('access') is False: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + if self._parse_json(e.cause.response.read().decode(), display_id).get('access') is False: self.raise_login_required( '%s is only available for FIRST members' % display_id) raise diff --git a/hypervideo_dl/extractor/rottentomatoes.py b/hypervideo_dl/extractor/rottentomatoes.py index f133c85..e357175 100644 --- a/hypervideo_dl/extractor/rottentomatoes.py +++ b/hypervideo_dl/extractor/rottentomatoes.py @@ -1,30 +1,80 @@ from .common import InfoExtractor -from .internetvideoarchive import InternetVideoArchiveIE +from ..utils import ( + ExtractorError, + clean_html, + float_or_none, + get_element_by_class, + join_nonempty, + traverse_obj, + url_or_none, +) class RottenTomatoesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/(?P<playlist>[^/]+)(?:/(?P<tr>trailers)(?:/(?P<id>\w+))?)?' - _TEST = { + _TESTS = [{ 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', 'info_dict': { 'id': '11028566', 'ext': 'mp4', 'title': 'Toy Story 3', - 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.' + }, + 'skip': 'No longer available', + }, { + 'url': 'https://www.rottentomatoes.com/m/toy_story_3/trailers/VycaVoBKhGuk', + 'info_dict': { + 'id': 'VycaVoBKhGuk', + 'ext': 'mp4', + 'title': 'Toy Story 3: Trailer 2', + 'description': '', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 149.941 + }, + }, { + 'url': 'http://www.rottentomatoes.com/m/toy_story_3', + 'info_dict': { + 'id': 'toy_story_3', + 'title': 'Toy Story 3', + }, + 'playlist_mincount': 4, + }, { + 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers', + 'info_dict': { + 'id': 'toy_story_3-trailers', }, - } + 'playlist_mincount': 5, + }] + + def _extract_videos(self, data, display_id): + for video in traverse_obj(data, (lambda _, v: v['publicId'] and v['file'] and v['type'] == 'hls')): + yield { + 'formats': self._extract_m3u8_formats( + video['file'], display_id, 'mp4', m3u8_id='hls', fatal=False), + **traverse_obj(video, { + 'id': 'publicId', + 'title': 'title', + 'description': 'description', + 'duration': ('durationInSeconds', {float_or_none}), + 'thumbnail': ('image', {url_or_none}), + }), + } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id') + playlist_id, trailers, video_id = self._match_valid_url(url).group('playlist', 'tr', 'id') + playlist_id = join_nonempty(playlist_id, trailers) + webpage = self._download_webpage(url, playlist_id) + data = self._search_json( + r'<script[^>]+\bid=["\'](?:heroV|v)ideos["\'][^>]*>', webpage, + 'data', playlist_id, contains_pattern=r'\[{(?s:.+)}\]') + + if video_id: + video_data = traverse_obj(data, lambda _, v: v['publicId'] == video_id) + if not video_data: + raise 
ExtractorError('Unable to extract video from webpage') + return next(self._extract_videos(video_data, video_id)) - return { - '_type': 'url_transparent', - 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id, - 'ie_key': InternetVideoArchiveIE.ie_key(), - 'id': video_id, - 'title': self._og_search_title(webpage), - } + return self.playlist_result( + self._extract_videos(data, playlist_id), playlist_id, + clean_html(get_element_by_class('scoreboard__title', webpage))) diff --git a/hypervideo_dl/extractor/rozhlas.py b/hypervideo_dl/extractor/rozhlas.py index a818967..6313432 100644 --- a/hypervideo_dl/extractor/rozhlas.py +++ b/hypervideo_dl/extractor/rozhlas.py @@ -1,7 +1,16 @@ +import itertools + from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, + extract_attributes, int_or_none, remove_start, + str_or_none, + traverse_obj, + unified_timestamp, + url_or_none, ) @@ -45,3 +54,290 @@ class RozhlasIE(InfoExtractor): 'duration': duration, 'vcodec': 'none', } + + +class RozhlasBaseIE(InfoExtractor): + def _extract_formats(self, entry, audio_id): + formats = [] + for audio in traverse_obj(entry, ('audioLinks', lambda _, v: url_or_none(v['url']))): + ext = audio.get('variant') + for retry in self.RetryManager(): + if retry.attempt > 1: + self._sleep(1, audio_id) + try: + if ext == 'dash': + formats.extend(self._extract_mpd_formats( + audio['url'], audio_id, mpd_id=ext)) + elif ext == 'hls': + formats.extend(self._extract_m3u8_formats( + audio['url'], audio_id, 'm4a', m3u8_id=ext)) + else: + formats.append({ + 'url': audio['url'], + 'ext': ext, + 'format_id': ext, + 'abr': int_or_none(audio.get('bitrate')), + 'acodec': ext, + 'vcodec': 'none', + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 429: + retry.error = e.cause + else: + self.report_warning(e.msg) + + return formats + + +class RozhlasVltavaIE(RozhlasBaseIE): + _VALID_URL = r'https?://(?:\w+\.rozhlas|english\.radio)\.cz/[\w-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://wave.rozhlas.cz/papej-masicko-porcujeme-a-bilancujeme-filmy-a-serialy-ktere-letos-zabily-8891337', + 'md5': 'ba2fdbc1242fc16771c7695d271ec355', + 'info_dict': { + 'id': '8891337', + 'title': 'md5:21f99739d04ab49d8c189ec711eef4ec', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': 'ba2fdbc1242fc16771c7695d271ec355', + 'info_dict': { + 'id': '10520988', + 'ext': 'mp3', + 'title': 'Papej masíčko! Porcujeme a bilancujeme filmy a seriály, které to letos zabily', + 'description': 'md5:1c6d29fb9564e1f17fc1bb83ae7da0bc', + 'duration': 1574, + 'artist': 'Aleš Stuchlý', + 'channel_id': 'radio-wave', + }, + }] + }, { + 'url': 'https://wave.rozhlas.cz/poslechnete-si-neklid-podcastovy-thriller-o-vine-strachu-a-vztahu-ktery-zasel-8554744', + 'info_dict': { + 'id': '8554744', + 'title': 'Poslechněte si Neklid. Podcastový thriller o vině, strachu a vztahu, který zašel příliš daleko', + }, + 'playlist_count': 5, + 'playlist': [{ + 'md5': '93d4109cf8f40523699ae9c1d4600bdd', + 'info_dict': { + 'id': '9890713', + 'ext': 'mp3', + 'title': 'Neklid #1', + 'description': '1. díl: Neklid: 1. díl', + 'duration': 1025, + 'artist': 'Josef Kokta', + 'channel_id': 'radio-wave', + 'chapter': 'Neklid #1', + 'chapter_number': 1, + }, + }, { + 'md5': 'e9763235be4a6dcf94bc8a5bac1ca126', + 'info_dict': { + 'id': '9890716', + 'ext': 'mp3', + 'title': 'Neklid #2', + 'description': '2. 
díl: Neklid: 2. díl', + 'duration': 768, + 'artist': 'Josef Kokta', + 'channel_id': 'radio-wave', + 'chapter': 'Neklid #2', + 'chapter_number': 2, + }, + }, { + 'md5': '00b642ea94b78cc949ac84da09f87895', + 'info_dict': { + 'id': '9890722', + 'ext': 'mp3', + 'title': 'Neklid #3', + 'description': '3. díl: Neklid: 3. díl', + 'duration': 607, + 'artist': 'Josef Kokta', + 'channel_id': 'radio-wave', + 'chapter': 'Neklid #3', + 'chapter_number': 3, + }, + }, { + 'md5': 'faef97b1b49da7df874740f118c19dea', + 'info_dict': { + 'id': '9890728', + 'ext': 'mp3', + 'title': 'Neklid #4', + 'description': '4. díl: Neklid: 4. díl', + 'duration': 621, + 'artist': 'Josef Kokta', + 'channel_id': 'radio-wave', + 'chapter': 'Neklid #4', + 'chapter_number': 4, + }, + }, { + 'md5': '6e729fa39b647325b868d419c76f3efa', + 'info_dict': { + 'id': '9890734', + 'ext': 'mp3', + 'title': 'Neklid #5', + 'description': '5. díl: Neklid: 5. díl', + 'duration': 908, + 'artist': 'Josef Kokta', + 'channel_id': 'radio-wave', + 'chapter': 'Neklid #5', + 'chapter_number': 5, + }, + }] + }, { + 'url': 'https://dvojka.rozhlas.cz/karel-siktanc-cerny-jezdec-bily-kun-napinava-pohadka-o-tajemnem-prizraku-8946969', + 'info_dict': { + 'id': '8946969', + 'title': 'Karel Šiktanc: Černý jezdec, bílý kůň. Napínavá pohádka o tajemném přízraku', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '10631121', + 'ext': 'm4a', + 'title': 'Karel Šiktanc: Černý jezdec, bílý kůň. Napínavá pohádka o tajemném přízraku', + 'description': 'Karel Šiktanc: Černý jezdec, bílý kůň', + 'duration': 2656, + 'artist': 'Tvůrčí skupina Drama a literatura', + 'channel_id': 'dvojka', + }, + }], + 'params': {'skip_download': 'dash'}, + }] + + def _extract_video(self, entry): + audio_id = entry['meta']['ga']['contentId'] + chapter_number = traverse_obj(entry, ('meta', 'ga', 'contentSerialPart', {int_or_none})) + + return { + 'id': audio_id, + 'chapter': traverse_obj(entry, ('meta', 'ga', 'contentNameShort')) if chapter_number else None, + 'chapter_number': chapter_number, + 'formats': self._extract_formats(entry, audio_id), + **traverse_obj(entry, { + 'title': ('meta', 'ga', 'contentName'), + 'description': 'title', + 'duration': ('duration', {int_or_none}), + 'artist': ('meta', 'ga', 'contentAuthor'), + 'channel_id': ('meta', 'ga', 'contentCreator'), + }) + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # FIXME: Use get_element_text_and_html_by_tag when it accepts less strict html + data = self._parse_json(extract_attributes(self._search_regex( + r'(<div class="mujRozhlasPlayer" data-player=\'[^\']+\'>)', + webpage, 'player'))['data-player'], video_id)['data'] + + return { + '_type': 'playlist', + 'id': str_or_none(data.get('embedId')) or video_id, + 'title': traverse_obj(data, ('series', 'title')), + 'entries': map(self._extract_video, data['playlist']), + } + + +class MujRozhlasIE(RozhlasBaseIE): + _VALID_URL = r'https?://(?:www\.)?mujrozhlas\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + # single episode extraction + 'url': 'https://www.mujrozhlas.cz/vykopavky/ach-jo-zase-teleci-rizek-je-mnohem-min-cesky-nez-jsme-si-mysleli', + 'md5': '6f8fd68663e64936623e67c152a669e0', + 'info_dict': { + 'id': '10739193', + 'ext': 'mp3', + 'title': 'Ach jo, zase to telecí! 
Řízek je mnohem míň český, než jsme si mysleli', + 'description': 'md5:db7141e9caaedc9041ec7cefb9a62908', + 'timestamp': 1684915200, + 'modified_timestamp': 1684922446, + 'series': 'Vykopávky', + 'thumbnail': 'https://portal.rozhlas.cz/sites/default/files/images/84377046610af6ddc54d910b1dd7a22b.jpg', + 'channel_id': 'radio-wave', + 'upload_date': '20230524', + 'modified_date': '20230524', + }, + }, { + # serial extraction + 'url': 'https://www.mujrozhlas.cz/radiokniha/jaroslava-janackova-pribeh-tajemneho-psani-o-pramenech-genezi-babicky', + 'playlist_mincount': 7, + 'info_dict': { + 'id': 'bb2b5f4e-ffb4-35a6-a34a-046aa62d6f6b', + 'title': 'Jaroslava Janáčková: Příběh tajemného psaní. O pramenech a genezi Babičky', + 'description': 'md5:7434d8fac39ac9fee6df098e11dfb1be', + }, + }, { + # show extraction + 'url': 'https://www.mujrozhlas.cz/nespavci', + 'playlist_mincount': 14, + 'info_dict': { + 'id': '09db9b37-d0f4-368c-986a-d3439f741f08', + 'title': 'Nespavci', + 'description': 'md5:c430adcbf9e2b9eac88b745881e814dc', + }, + }] + + def _call_api(self, path, item_id, msg='API JSON'): + return self._download_json( + f'https://api.mujrozhlas.cz/{path}/{item_id}', item_id, + note=f'Downloading {msg}', errnote=f'Failed to download {msg}')['data'] + + def _extract_audio_entry(self, entry): + audio_id = entry['meta']['ga']['contentId'] + + return { + 'id': audio_id, + 'formats': self._extract_formats(entry['attributes'], audio_id), + **traverse_obj(entry, { + 'title': ('attributes', 'title'), + 'description': ('attributes', 'description'), + 'episode_number': ('attributes', 'part'), + 'series': ('attributes', 'mirroredShow', 'title'), + 'chapter': ('attributes', 'mirroredSerial', 'title'), + 'artist': ('meta', 'ga', 'contentAuthor'), + 'channel_id': ('meta', 'ga', 'contentCreator'), + 'timestamp': ('attributes', 'since', {unified_timestamp}), + 'modified_timestamp': ('attributes', 'updated', {unified_timestamp}), + 'thumbnail': ('attributes', 'asset', 'url', {url_or_none}), + }) + } + + def _entries(self, api_url, playlist_id): + for page in itertools.count(1): + episodes = self._download_json( + api_url, playlist_id, note=f'Downloading episodes page {page}', + errnote=f'Failed to download episodes page {page}', fatal=False) + for episode in traverse_obj(episodes, ('data', lambda _, v: v['meta']['ga']['contentId'])): + yield self._extract_audio_entry(episode) + api_url = traverse_obj(episodes, ('links', 'next', {url_or_none})) + if not api_url: + break + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + info = self._search_json(r'\bvar\s+dl\s*=', webpage, 'info json', display_id) + + entity = info['siteEntityBundle'] + + if entity == 'episode': + return self._extract_audio_entry(self._call_api( + 'episodes', info['contentId'], 'episode info API JSON')) + + elif entity in ('show', 'serial'): + playlist_id = info['contentShow'].split(':')[0] if entity == 'show' else info['contentId'] + data = self._call_api(f'{entity}s', playlist_id, f'{entity} playlist JSON') + api_url = data['relationships']['episodes']['links']['related'] + return self.playlist_result( + self._entries(api_url, playlist_id), playlist_id, + **traverse_obj(data, ('attributes', { + 'title': 'title', + 'description': 'description', + }))) + + else: + # `entity == 'person'` not implemented yet by API, ref: + # https://api.mujrozhlas.cz/persons/8367e456-2a57-379a-91bb-e699619bea49/participation + raise ExtractorError(f'Unsupported entity type "{entity}"') diff --git 
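
The MujRozhlasIE pager above follows the JSON:API convention of api.mujrozhlas.cz: each page lists its items under `data` and carries the absolute URL of the next page under `links.next`, and the final page omits that link, which ends the walk. A minimal standalone sketch of the same cursor-following pattern, using `requests` as a stand-in for `_download_json` (function and variable names hypothetical):

    import requests  # assumption: plain HTTP client instead of InfoExtractor._download_json

    def iter_episodes(api_url):
        # Yield each page's items, then hop to links.next;
        # the last page has no 'next' link, so the loop ends.
        while api_url:
            page = requests.get(api_url, timeout=30).json()
            yield from page.get('data') or []
            api_url = (page.get('links') or {}).get('next')
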
a/hypervideo_dl/extractor/rte.py b/hypervideo_dl/extractor/rte.py index aedaa5b..7ba80d4 100644 --- a/hypervideo_dl/extractor/rte.py +++ b/hypervideo_dl/extractor/rte.py @@ -1,7 +1,7 @@ import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( float_or_none, parse_iso8601, @@ -31,8 +31,8 @@ class RteBaseIE(InfoExtractor): except ExtractorError as ee: if num < len(ENDPOINTS) or formats: continue - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: - error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) + if isinstance(ee.cause, HTTPError) and ee.cause.status == 404: + error_info = self._parse_json(ee.cause.response.read().decode(), item_id, fatal=False) if error_info: raise ExtractorError( '%s said: %s' % (self.IE_NAME, error_info['message']), diff --git a/hypervideo_dl/extractor/rts.py b/hypervideo_dl/extractor/rts.py index 81c4d7c..9f73d18 100644 --- a/hypervideo_dl/extractor/rts.py +++ b/hypervideo_dl/extractor/rts.py @@ -136,8 +136,8 @@ class RTSIE(SRGSSRIE): # XXX: Do not subclass from concrete IE if not entries: page, urlh = self._download_webpage_handle(url, display_id) - if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id: - return self.url_result(urlh.geturl(), 'RTS') + if re.match(self._VALID_URL, urlh.url).group('id') != media_id: + return self.url_result(urlh.url, 'RTS') # article with videos on rhs videos = re.findall( diff --git a/hypervideo_dl/extractor/rtvcplay.py b/hypervideo_dl/extractor/rtvcplay.py new file mode 100644 index 0000000..741c472 --- /dev/null +++ b/hypervideo_dl/extractor/rtvcplay.py @@ -0,0 +1,285 @@ +import re + +from .common import InfoExtractor, ExtractorError +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + float_or_none, + js_to_json, + mimetype2ext, + traverse_obj, + urljoin, + url_or_none, +) + + +class RTVCPlayBaseIE(InfoExtractor): + _BASE_VALID_URL = r'https?://(?:www\.)?rtvcplay\.co' + + def _extract_player_config(self, webpage, video_id): + return self._search_json( + r'<script\b[^>]*>[^<]*(?:var|let|const)\s+config\s*=', re.sub(r'"\s*\+\s*"', '', webpage), + 'player_config', video_id, transform_source=js_to_json) + + def _extract_formats_and_subtitles_player_config(self, player_config, video_id): + formats, subtitles = [], {} + for source in traverse_obj(player_config, ('sources', ..., lambda _, v: url_or_none(v['url']))): + ext = mimetype2ext(source.get('mimetype'), default=determine_ext(source['url'])) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + source['url'], video_id, 'mp4', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': source['url'], + 'ext': ext, + }) + + return formats, subtitles + + +class RTVCPlayIE(RTVCPlayBaseIE): + _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/(?P<category>(?!embed)[^/]+)/(?:[^?#]+/)?(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.rtvcplay.co/en-vivo/canal-institucional', + 'info_dict': { + 'id': 'canal-institucional', + 'title': r're:^Canal Institucional', + 'description': 'md5:eff9e548394175928059320c006031ea', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.rtvcplay.co/en-vivo/senal-colombia', + 'info_dict': { + 'id': 'senal-colombia', + 'title': r're:^Señal Colombia', + 'description': 
'md5:799f16a401d97f40c33a2c6a3e2a507b', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.rtvcplay.co/en-vivo/radio-nacional', + 'info_dict': { + 'id': 'radio-nacional', + 'title': r're:^Radio Nacional', + 'description': 'md5:5de009bc6a9fa79d2a6cf0b73f977d53', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.rtvcplay.co/peliculas-ficcion/senoritas', + 'md5': '1288ee6f6d1330d880f98bff2ed710a3', + 'info_dict': { + 'id': 'senoritas', + 'title': 'Señoritas', + 'description': 'md5:f095a2bb52cb6cf279daf6302f86fb32', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa/james-regresa-clases-28022022', + 'md5': 'f040a7380a269ad633cf837384d5e9fc', + 'info_dict': { + 'id': 'james-regresa-clases-28022022', + 'title': 'James regresa a clases - 28/02/2022', + 'description': 'md5:c5dcdf757c7ab29305e8763c6007e675', + 'ext': 'mp4', + }, + }, { + 'url': 'https://www.rtvcplay.co/peliculas-documentales/llinas-el-cerebro-y-el-universo', + 'info_dict': { + 'id': 'llinas-el-cerebro-y-el-universo', + 'title': 'Llinás, el cerebro y el universo', + 'description': 'md5:add875bf2309bb52b3e8b9b06116d9b0', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa', + 'info_dict': { + 'id': 'profe-en-tu-casa', + 'title': 'Profe en tu casa', + 'description': 'md5:47dbe20e263194413b1db2a2805a4f2e', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 537, + }, { + 'url': 'https://www.rtvcplay.co/series-al-oido/relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura', + 'info_dict': { + 'id': 'relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura', + 'title': 'Relato de un náufrago: una travesía del periodismo a la literatura', + 'description': 'md5:6da28fdca4a5a568ea47ef65ef775603', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://www.rtvcplay.co/series-al-oido/diez-versiones', + 'info_dict': { + 'id': 'diez-versiones', + 'title': 'Diez versiones', + 'description': 'md5:997471ed971cb3fd8e41969457675306', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 20, + }] + + def _real_extract(self, url): + video_id, category = self._match_valid_url(url).group('id', 'category') + webpage = self._download_webpage(url, video_id) + + hydration = self._search_json( + r'window\.__RTVCPLAY_STATE__\s*=', webpage, 'hydration', + video_id, transform_source=js_to_json)['content']['currentContent'] + + asset_id = traverse_obj(hydration, ('video', 'assetid')) + if asset_id: + hls_url = hydration['base_url_hls'].replace('[node:field_asset_id]', asset_id) + else: + hls_url = traverse_obj(hydration, ('channel', 'hls')) + + metadata = traverse_obj(hydration, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ((('channel', 'image', 'logo'), ('resource', 'image', 'cover_desktop')), 'path'), + }, get_all=False) + + # Probably it's a program's page + if not hls_url: + seasons = traverse_obj( + hydration, ('widgets', lambda _, y: y['type'] == 'seasonList', 'contents'), + get_all=False) + if not seasons: + 
podcast_episodes = hydration.get('audios') + if not podcast_episodes: + raise ExtractorError('Could not find asset_id nor program playlist nor podcast episodes') + + return self.playlist_result([ + self.url_result(episode['file'], url_transparent=True, **traverse_obj(episode, { + 'title': 'title', + 'description': ('description', {clean_html}), + 'episode_number': ('chapter_number', {float_or_none}, {int_or_none}), + 'season_number': ('season', {int_or_none}), + })) for episode in podcast_episodes], video_id, **metadata) + + entries = [self.url_result( + urljoin(url, episode['slug']), url_transparent=True, + **traverse_obj(season, { + 'season': 'title', + 'season_number': ('season', {int_or_none}), + }), **traverse_obj(episode, { + 'title': 'title', + 'thumbnail': ('image', 'cover', 'path'), + 'episode_number': ('chapter_number', {int_or_none}), + })) for season in seasons for episode in traverse_obj(season, ('contents', ...))] + + return self.playlist_result(entries, video_id, **metadata) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id, 'mp4') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': category == 'en-vivo', + **metadata, + } + + +class RTVCPlayEmbedIE(RTVCPlayBaseIE): + _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/embed/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.rtvcplay.co/embed/72b0e699-248b-4929-a4a8-3782702fa7f9', + 'md5': 'ed529aeaee7aa2a72afe91ac7d1177a8', + 'info_dict': { + 'id': '72b0e699-248b-4929-a4a8-3782702fa7f9', + 'title': 'Tráiler: Señoritas', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'ext': 'mp4', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + player_config = self._extract_player_config(webpage, video_id) + formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id) + + asset_id = traverse_obj(player_config, ('rtvcplay', 'assetid')) + metadata = {} if not asset_id else self._download_json( + f'https://cms.rtvcplay.co/api/v1/video/asset-id/{asset_id}', video_id, fatal=False) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ('image', ..., 'thumbnail', 'path'), + }, get_all=False) + } + + +class RTVCKalturaIE(RTVCPlayBaseIE): + _VALID_URL = r'https?://media\.rtvc\.gov\.co/kalturartvc/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://media.rtvc.gov.co/kalturartvc/indexSC.html', + 'info_dict': { + 'id': 'indexSC', + 'title': r're:^Señal Colombia', + 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + player_config = self._extract_player_config(webpage, video_id) + formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id) + + channel_id = traverse_obj(player_config, ('rtvcplay', 'channelId')) + metadata = {} if not channel_id else self._download_json( + f'https://cms.rtvcplay.co/api/v1/taxonomy_term/streaming/{channel_id}', video_id, fatal=False) + + fmts, subs = self._extract_m3u8_formats_and_subtitles( + traverse_obj(metadata, ('channel', 'hls')), video_id, 'mp4', fatal=False) + formats.extend(fmts) + 
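# subtitles from the channel's HLS manifest are merged below into those
+ # already collected from the embedded player config, so both the CMS
+ # stream and the player config contribute tracks
+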
self._merge_subtitles(subs, target=subtitles) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + **traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'thumbnail': ('channel', 'image', 'logo', 'path'), + }) + } diff --git a/hypervideo_dl/extractor/rumble.py b/hypervideo_dl/extractor/rumble.py index 102615c..f8bf4a1 100644 --- a/hypervideo_dl/extractor/rumble.py +++ b/hypervideo_dl/extractor/rumble.py @@ -2,13 +2,20 @@ import itertools import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, + UnsupportedError, + clean_html, + determine_ext, + format_field, + get_element_by_class, int_or_none, + join_nonempty, + parse_count, parse_iso8601, traverse_obj, unescapeHTML, - ExtractorError, ) @@ -112,24 +119,6 @@ class RumbleEmbedIE(InfoExtractor): _WEBPAGE_TESTS = [ { - 'note': 'Rumble embed', - 'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html', - 'md5': '53af34098a7f92c4e51cf0bd1c33f009', - 'info_dict': { - 'id': 'vb0ofn', - 'ext': 'mp4', - 'timestamp': 1612662578, - 'uploader': 'LovingMontana', - 'channel': 'LovingMontana', - 'upload_date': '20210207', - 'title': 'Winter-loving dog helps girls dig a snow fort ', - 'channel_url': 'https://rumble.com/c/c-546523', - 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg', - 'duration': 103, - 'live_status': 'not_live', - } - }, - { 'note': 'Rumble JS embed', 'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it', 'md5': '4701209ac99095592e73dbba21889690', @@ -155,7 +144,7 @@ class RumbleEmbedIE(InfoExtractor): if embeds: return embeds return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( - r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] + r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{\s*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] def _real_extract(self, url): video_id = self._match_id(url) @@ -178,7 +167,13 @@ class RumbleEmbedIE(InfoExtractor): formats = [] for ext, ext_info in (video.get('ua') or {}).items(): - for height, video_info in (ext_info or {}).items(): + if isinstance(ext_info, dict): + for height, video_info in ext_info.items(): + if not traverse_obj(video_info, ('meta', 'h', {int_or_none})): + video_info.setdefault('meta', {})['h'] = height + ext_info = ext_info.values() + + for video_info in ext_info: meta = video_info.get('meta') or {} if not video_info.get('url'): continue @@ -189,18 +184,22 @@ class RumbleEmbedIE(InfoExtractor): video_info['url'], video_id, ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live')) continue + timeline = ext == 'timeline' + if timeline: + ext = determine_ext(video_info['url']) formats.append({ 'ext': ext, + 'acodec': 'none' if timeline else None, 'url': video_info['url'], - 'format_id': '%s-%sp' % (ext, height), - 'height': int_or_none(height), - 'fps': video.get('fps'), + 'format_id': join_nonempty(ext, format_field(meta, 'h', '%sp')), + 'format_note': 'Timeline' if timeline else None, + 'fps': None if timeline else video.get('fps'), **traverse_obj(meta, { 'tbr': 'bitrate', 'filesize': 'size', 'width': 'w', 'height': 'h', - }, default={}) + }, expected_type=lambda x: int(x) or None) }) subtitles = { @@ -235,6 +234,121 @@ class 
RumbleEmbedIE(InfoExtractor): } +class RumbleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/(?P<id>v(?!ideos)[\w.-]+)[^/]*$' + _EMBED_REGEX = [r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>'] + _TESTS = [{ + 'add_ie': ['RumbleEmbed'], + 'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html', + 'md5': '53af34098a7f92c4e51cf0bd1c33f009', + 'info_dict': { + 'id': 'vb0ofn', + 'ext': 'mp4', + 'timestamp': 1612662578, + 'uploader': 'LovingMontana', + 'channel': 'LovingMontana', + 'upload_date': '20210207', + 'title': 'Winter-loving dog helps girls dig a snow fort ', + 'description': 'Moose the dog is more than happy to help with digging out this epic snow fort. Great job, Moose!', + 'channel_url': 'https://rumble.com/c/c-546523', + 'thumbnail': r're:https://.+\.jpg', + 'duration': 103, + 'like_count': int, + 'view_count': int, + 'live_status': 'not_live', + } + }, { + 'url': 'http://www.rumble.com/vDMUM1?key=value', + 'only_matching': True, + }, { + 'note': 'timeline format', + 'url': 'https://rumble.com/v2ea9qb-the-u.s.-cannot-hide-this-in-ukraine-anymore-redacted-with-natali-and-clayt.html', + 'md5': '40d61fec6c0945bca3d0e1dc1aa53d79', + 'params': {'format': 'wv'}, + 'info_dict': { + 'id': 'v2bou5f', + 'ext': 'mp4', + 'uploader': 'Redacted News', + 'upload_date': '20230322', + 'timestamp': 1679445010, + 'title': 'The U.S. CANNOT hide this in Ukraine anymore | Redacted with Natali and Clayton Morris', + 'duration': 892, + 'channel': 'Redacted News', + 'description': 'md5:aaad0c5c3426d7a361c29bdaaced7c42', + 'channel_url': 'https://rumble.com/c/Redacted', + 'live_status': 'not_live', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/d/x/2/O/dx2Oi.qR4e-small-The-U.S.-CANNOT-hide-this-i.jpg', + }, + }, { + 'url': 'https://rumble.com/v2e7fju-the-covid-twitter-files-drop-protecting-fauci-while-censoring-the-truth-wma.html', + 'info_dict': { + 'id': 'v2blzyy', + 'ext': 'mp4', + 'live_status': 'was_live', + 'release_timestamp': 1679446804, + 'description': 'md5:2ac4908ccfecfb921f8ffa4b30c1e636', + 'release_date': '20230322', + 'timestamp': 1679445692, + 'duration': 4435, + 'upload_date': '20230322', + 'title': 'The Covid Twitter Files Drop: Protecting Fauci While Censoring The Truth w/Matt Taibbi', + 'uploader': 'Kim Iversen', + 'channel_url': 'https://rumble.com/c/KimIversen', + 'channel': 'Kim Iversen', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/6/b/w/O/6bwOi.qR4e-small-The-Covid-Twitter-Files-Dro.jpg', + }, + }] + + _WEBPAGE_TESTS = [{ + 'url': 'https://rumble.com/videos?page=2', + 'playlist_count': 25, + 'info_dict': { + 'id': 'videos?page=2', + 'title': 'All videos', + 'description': 'Browse videos uploaded to Rumble.com', + 'age_limit': 0, + }, + }, { + 'url': 'https://rumble.com/live-videos', + 'playlist_mincount': 19, + 'info_dict': { + 'id': 'live-videos', + 'title': 'Live Videos', + 'description': 'Live videos on Rumble.com', + 'age_limit': 0, + }, + }, { + 'url': 'https://rumble.com/search/video?q=rumble&sort=views', + 'playlist_count': 24, + 'info_dict': { + 'id': 'video?q=rumble&sort=views', + 'title': 'Search results for: rumble', + 'age_limit': 0, + }, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + url_info = next(RumbleEmbedIE.extract_from_webpage(self._downloader, url, webpage), None) + if not url_info: + raise UnsupportedError(url) + + release_ts_str = self._search_regex( + r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)', + webpage, 'release date', 
fatal=False, default=None) + view_count_str = self._search_regex(r'<span class="media-heading-info">([\d,]+) Views', + webpage, 'view count', fatal=False, default=None) + + return self.url_result( + url_info['url'], ie_key=url_info['ie_key'], url_transparent=True, + view_count=parse_count(view_count_str), + release_timestamp=parse_iso8601(release_ts_str), + like_count=parse_count(get_element_by_class('rumbles-count', webpage)), + description=clean_html(get_element_by_class('media-description', webpage)), + ) + + class RumbleChannelIE(InfoExtractor): _VALID_URL = r'(?P<url>https?://(?:www\.)?rumble\.com/(?:c|user)/(?P<id>[^&?#$/]+))' @@ -257,7 +371,7 @@ class RumbleChannelIE(InfoExtractor): try: webpage = self._download_webpage(f'{url}?page={page}', playlist_id, note='Downloading page %d' % page) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if isinstance(e.cause, HTTPError) and e.cause.status == 404: break raise for video_url in re.findall(r'class=video-item--a\s?href=([^>]+\.html)', webpage): diff --git a/hypervideo_dl/extractor/rutube.py b/hypervideo_dl/extractor/rutube.py index 5a4fd97..08d9b92 100644 --- a/hypervideo_dl/extractor/rutube.py +++ b/hypervideo_dl/extractor/rutube.py @@ -25,8 +25,7 @@ class RutubeBaseIE(InfoExtractor): video_id, 'Downloading video JSON', 'Unable to download video JSON', query=query) - @staticmethod - def _extract_info(video, video_id=None, require_title=True): + def _extract_info(self, video, video_id=None, require_title=True): title = video['title'] if require_title else video.get('title') age_limit = video.get('is_adult') @@ -35,13 +34,15 @@ class RutubeBaseIE(InfoExtractor): uploader_id = try_get(video, lambda x: x['author']['id']) category = try_get(video, lambda x: x['category']['name']) + description = video.get('description') + duration = int_or_none(video.get('duration')) return { 'id': video.get('id') or video_id if video_id else video['id'], 'title': title, - 'description': video.get('description'), + 'description': description, 'thumbnail': video.get('thumbnail_url'), - 'duration': int_or_none(video.get('duration')), + 'duration': duration, 'uploader': try_get(video, lambda x: x['author']['name']), 'uploader_id': compat_str(uploader_id) if uploader_id else None, 'timestamp': unified_timestamp(video.get('created_ts')), @@ -50,6 +51,7 @@ class RutubeBaseIE(InfoExtractor): 'view_count': int_or_none(video.get('hits')), 'comment_count': int_or_none(video.get('comments_count')), 'is_live': bool_or_none(video.get('is_livestream')), + 'chapters': self._extract_chapters_from_description(description, duration), } def _download_and_extract_info(self, video_id, query=None): @@ -91,12 +93,12 @@ class RutubeBaseIE(InfoExtractor): class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' + _VALID_URL = r'https?://rutube\.ru/(?:video(?:/private)?|(?:play/)?embed)/(?P<id>[\da-z]{32})' _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1'] _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', - 'md5': '1d24f180fac7a02f3900712e5a5764d6', + 'md5': 'e33ac625efca66aba86cbec9851f2692', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', 'ext': 'mp4', @@ -108,7 +110,12 @@ class RutubeIE(RutubeBaseIE): 'timestamp': 1381943602, 'upload_date': '20131016', 'age_limit': 0, + 'view_count': int, + 'thumbnail': 
'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg', + 'category': ['Новости и СМИ'], + 'chapters': [], }, + 'expected_warnings': ['Unable to download f4m'], }, { 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, @@ -121,6 +128,45 @@ class RutubeIE(RutubeBaseIE): }, { 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/private/884fb55f07a97ab673c7d654553e0f48/?p=x2QojCumHTS3rsKHWXN8Lg', + 'md5': 'd106225f15d625538fe22971158e896f', + 'info_dict': { + 'id': '884fb55f07a97ab673c7d654553e0f48', + 'ext': 'mp4', + 'title': 'Яцуноками, Nioh2', + 'description': 'Nioh2: финал сражения с боссом Яцуноками', + 'duration': 15, + 'uploader': 'mexus', + 'uploader_id': '24222106', + 'timestamp': 1670646232, + 'upload_date': '20221210', + 'age_limit': 0, + 'view_count': int, + 'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg', + 'category': ['Видеоигры'], + 'chapters': [], + }, + 'expected_warnings': ['Unable to download f4m'], + }, { + 'url': 'https://rutube.ru/video/c65b465ad0c98c89f3b25cb03dcc87c6/', + 'info_dict': { + 'id': 'c65b465ad0c98c89f3b25cb03dcc87c6', + 'ext': 'mp4', + 'chapters': 'count:4', + 'category': ['Бизнес и предпринимательство'], + 'description': 'md5:252feac1305257d8c1bab215cedde75d', + 'thumbnail': 'http://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png', + 'duration': 782, + 'age_limit': 0, + 'uploader_id': '23491359', + 'timestamp': 1677153329, + 'view_count': int, + 'upload_date': '20230223', + 'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании', + 'uploader': 'Стас Быков', + }, + 'expected_warnings': ['Unable to download f4m'], }] @classmethod @@ -129,8 +175,9 @@ class RutubeIE(RutubeBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - info = self._download_and_extract_info(video_id) - info['formats'] = self._download_and_extract_formats(video_id) + query = parse_qs(url) + info = self._download_and_extract_info(video_id, query) + info['formats'] = self._download_and_extract_formats(video_id, query) return info diff --git a/hypervideo_dl/extractor/s4c.py b/hypervideo_dl/extractor/s4c.py new file mode 100644 index 0000000..38a9058 --- /dev/null +++ b/hypervideo_dl/extractor/s4c.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class S4CIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/programme/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.s4c.cymru/clic/programme/861362209', + 'info_dict': { + 'id': '861362209', + 'ext': 'mp4', + 'title': 'Y Swn', + 'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0', + 'duration': 5340 + }, + }, { + 'url': 'https://www.s4c.cymru/clic/programme/856636948', + 'info_dict': { + 'id': '856636948', + 'ext': 'mp4', + 'title': 'Am Dro', + 'duration': 2880, + 'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + details = self._download_json( + f'https://www.s4c.cymru/df/full_prog_details?lang=e&programme_id={video_id}', + video_id, fatal=False) + + filename = self._download_json( + 'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={ + 'programme_id': video_id, + 'signed': '0', + 'lang': 'en', + 'mode': 'od', + 'appId': 'clic', + 'streamName': '', + }, note='Downloading player config JSON')['filename'] + 
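# Two-step flow: the player-configuration endpoint above returns an
+ # opaque 'filename', which the streaming-urls endpoint below exchanges
+ # for the actual HLS master playlist URL
+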
m3u8_url = self._download_json( + 'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={ + 'mode': 'od', + 'application': 'clic', + 'region': 'WW', + 'extra': 'false', + 'thirdParty': 'false', + 'filename': filename, + }, note='Downloading streaming urls JSON')['hls'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(details, ('full_prog_details', 0, { + 'title': (('programme_title', 'series_title'), {str}), + 'description': ('full_billing', {str.strip}), + 'duration': ('duration', {lambda x: int(x) * 60}), + }), get_all=False), + } diff --git a/hypervideo_dl/extractor/safari.py b/hypervideo_dl/extractor/safari.py index 450a661..8d322d7 100644 --- a/hypervideo_dl/extractor/safari.py +++ b/hypervideo_dl/extractor/safari.py @@ -28,13 +28,13 @@ class SafariBaseIE(InfoExtractor): 'Downloading login page') def is_logged(urlh): - return 'learning.oreilly.com/home/' in urlh.geturl() + return 'learning.oreilly.com/home/' in urlh.url if is_logged(urlh): self.LOGGED_IN = True return - redirect_url = urlh.geturl() + redirect_url = urlh.url parsed_url = compat_urlparse.urlparse(redirect_url) qs = compat_parse_qs(parsed_url.query) next_uri = compat_urlparse.urljoin( @@ -129,7 +129,7 @@ class SafariIE(SafariBaseIE): webpage, urlh = self._download_webpage_handle(url, video_id) - mobj = re.match(self._VALID_URL, urlh.geturl()) + mobj = re.match(self._VALID_URL, urlh.url) reference_id = mobj.group('reference_id') if not reference_id: reference_id = self._search_regex( diff --git a/hypervideo_dl/extractor/sbs.py b/hypervideo_dl/extractor/sbs.py index 4532033..7a91150 100644 --- a/hypervideo_dl/extractor/sbs.py +++ b/hypervideo_dl/extractor/sbs.py @@ -1,7 +1,13 @@ from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( - smuggle_url, - ExtractorError, + float_or_none, + int_or_none, + parse_duration, + parse_iso8601, + traverse_obj, + update_url_query, + url_or_none, ) @@ -11,7 +17,7 @@ class SBSIE(InfoExtractor): https?://(?:www\.)?sbs\.com\.au/(?: ondemand(?: /video/(?:single/)?| - /movie/[^/]+/| + /(?:movie|tv-program)/[^/]+/| /(?:tv|news)-series/(?:[^/]+/){3}| .*?\bplay=|/watch/ )|news/(?:embeds/)?video/ @@ -27,18 +33,21 @@ class SBSIE(InfoExtractor): # Original URL is handled by the generic IE which finds the iframe: # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation 'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed', - 'md5': '3150cf278965eeabb5b4cea1c963fe0a', + 'md5': '31f84a7a19b53635db63c73f8ab0c4a7', 'info_dict': { - 'id': '_rFBPRPO4pMR', + 'id': '320403011771', # '_rFBPRPO4pMR', 'ext': 'mp4', 'title': 'Dingo Conservation (The Feed)', 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'duration': 308, 'timestamp': 1408613220, 'upload_date': '20140821', 'uploader': 'SBSC', + 'tags': None, + 'categories': None, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', 'only_matching': True, @@ -70,34 +79,80 @@ class SBSIE(InfoExtractor): }, { 'url': 'https://www.sbs.com.au/ondemand/tv-series/the-handmaids-tale/season-5/the-handmaids-tale-s5-ep1/2065631811776', 'only_matching': True, + }, { + 'url': 
'https://www.sbs.com.au/ondemand/tv-program/autun-romes-forgotten-sister/2116212803602', + 'only_matching': True, }] + _GEO_COUNTRIES = ['AU'] + _AUS_TV_PARENTAL_GUIDELINES = { + 'P': 0, + 'C': 7, + 'G': 0, + 'PG': 0, + 'M': 14, + 'MA15+': 15, + 'MAV15+': 15, + 'R18+': 18, + } + _PLAYER_API = 'https://www.sbs.com.au/api/v3' + def _real_extract(self, url): video_id = self._match_id(url) - player_params = self._download_json( - 'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id) + formats, subtitles = self._extract_smil_formats_and_subtitles( + update_url_query(f'{self._PLAYER_API}/video_smil', {'id': video_id}), video_id) - error = player_params.get('error') - if error: - error_message = 'Sorry, The video you are looking for does not exist.' - video_data = error.get('results') or {} - error_code = error.get('errorCode') - if error_code == 'ComingSoon': - error_message = '%s is not yet available.' % video_data.get('title', '') - elif error_code in ('Forbidden', 'intranetAccessOnly'): - error_message = 'Sorry, This video cannot be accessed via this website' - elif error_code == 'Expired': - error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '') - raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + if not formats: + urlh = self._request_webpage( + HEADRequest('https://sbs-vod-prod-01.akamaized.net/'), video_id, + note='Checking geo-restriction', fatal=False, expected_status=403) + if urlh: + error_reasons = urlh.headers.get_all('x-error-reason') or [] + if 'geo-blocked' in error_reasons: + self.raise_geo_restricted(countries=['AU']) + self.raise_no_formats('No formats are available', video_id=video_id) - urls = player_params['releaseUrls'] - theplatform_url = (urls.get('progressive') or urls.get('html') - or urls.get('standard') or player_params['relatedItemsURL']) + media = traverse_obj(self._download_json( + f'{self._PLAYER_API}/video_stream', video_id, fatal=False, + query={'id': video_id, 'context': 'tv'}), ('video_object', {dict})) or {} + + media.update(self._download_json( + f'https://catalogue.pr.sbsod.com/mpx-media/{video_id}', + video_id, fatal=not media) or {}) + + # For named episodes, use the catalogue's title to set episode, rather than generic 'Episode N'. 
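+ # ('partOfSeries' is apparently only a dict for episodic content, so
+ # standalone programmes keep their original title untouched.)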
+ if traverse_obj(media, ('partOfSeries', {dict})): + media['epName'] = traverse_obj(media, ('title', {str})) return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', 'id': video_id, - 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), - 'is_live': player_params.get('streamType') == 'live', + **traverse_obj(media, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'channel': ('taxonomy', 'channel', 'name', {str}), + 'series': ((('partOfSeries', 'name'), 'seriesTitle'), {str}), + 'series_id': ((('partOfSeries', 'uuid'), 'seriesID'), {str}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode': ('epName', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'timestamp': (('datePublished', ('publication', 'startDate')), {parse_iso8601}), + 'release_year': ('releaseYear', {int_or_none}), + 'duration': ('duration', ({float_or_none}, {parse_duration})), + 'is_live': ('liveStream', {bool}), + 'age_limit': (('classificationID', 'contentRating'), {str.upper}, { + lambda x: self._AUS_TV_PARENTAL_GUIDELINES.get(x)}), # dict.get is unhashable in py3.7 + }, get_all=False), + **traverse_obj(media, { + 'categories': (('genres', ...), ('taxonomy', ('genre', 'subgenre'), 'name'), {str}), + 'tags': (('consumerAdviceTexts', ('sbsSubCertification', 'consumerAdvice')), ..., {str}), + 'thumbnails': ('thumbnails', lambda _, v: url_or_none(v['contentUrl']), { + 'id': ('name', {str}), + 'url': 'contentUrl', + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + }), + }), + 'formats': formats, + 'subtitles': subtitles, + 'uploader': 'SBSC', } diff --git a/hypervideo_dl/extractor/scrippsnetworks.py b/hypervideo_dl/extractor/scrippsnetworks.py index c3cee6e..adfd7e5 100644 --- a/hypervideo_dl/extractor/scrippsnetworks.py +++ b/hypervideo_dl/extractor/scrippsnetworks.py @@ -115,6 +115,7 @@ class ScrippsNetworksIE(InfoExtractor): 'uploader': 'SCNI-SCND', }, 'add_ie': ['ThePlatform'], + 'expected_warnings': ['No HLS formats found'], }, { 'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790', 'only_matching': True, diff --git a/hypervideo_dl/extractor/senalcolombia.py b/hypervideo_dl/extractor/senalcolombia.py new file mode 100644 index 0000000..f3c066d --- /dev/null +++ b/hypervideo_dl/extractor/senalcolombia.py @@ -0,0 +1,31 @@ +from .common import InfoExtractor +from .rtvcplay import RTVCKalturaIE + + +class SenalColombiaLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?senalcolombia\.tv/(?P<id>senal-en-vivo)' + + _TESTS = [{ + 'url': 'https://www.senalcolombia.tv/senal-en-vivo', + 'info_dict': { + 'id': 'indexSC', + 'title': 're:^Señal Colombia', + 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + hydration = self._search_json( + r'<script\b[^>]*data-drupal-selector\s*=\s*"[^"]*drupal-settings-json[^"]*"[^>]*>', + webpage, 'hydration', display_id) + + return self.url_result(hydration['envivosrc'], RTVCKalturaIE, display_id) diff --git a/hypervideo_dl/extractor/servus.py b/hypervideo_dl/extractor/servus.py index 490d562..dda1958 100644 --- a/hypervideo_dl/extractor/servus.py +++ b/hypervideo_dl/extractor/servus.py @@ -1,11 +1,13 @@ from .common import InfoExtractor from ..utils import ( - 
determine_ext, + ExtractorError, float_or_none, + format_field, int_or_none, + join_nonempty, + traverse_obj, + unescapeHTML, unified_timestamp, - urlencode_postdata, - url_or_none, ) @@ -15,32 +17,41 @@ class ServusIE(InfoExtractor): (?:www\.)? (?: servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| - (?:servustv|pm-wissen)\.com/videos + (?:servustv|pm-wissen)\.com/(?:[^/]+/)?v(?:ideos)? ) - /(?P<id>[aA]{2}-\w+|\d+-\d+) + /(?P<id>[aA]{2}-?\w+|\d+-\d+) ''' _TESTS = [{ - # new URL schema - 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', - 'md5': '60474d4c21f3eb148838f215c37f02b9', + # URL schema v3 + 'url': 'https://www.servustv.com/natur/v/aa-28bycqnh92111/', 'info_dict': { - 'id': 'AA-1T6VBU5PW1W12', + 'id': 'AA-28BYCQNH92111', 'ext': 'mp4', - 'title': 'Die Grünen aus Sicht des Volkes', - 'alt_title': 'Talk im Hangar-7 Voxpops Gruene', - 'description': 'md5:1247204d85783afe3682644398ff2ec4', + 'title': 'Klettersteige in den Alpen', + 'description': 'md5:25e47ddd83a009a0f9789ba18f2850ce', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 62.442, - 'timestamp': 1605193976, - 'upload_date': '20201112', - 'series': 'Talk im Hangar-7', - 'season': 'Season 9', - 'season_number': 9, - 'episode': 'Episode 31 - September 14', - 'episode_number': 31, - } + 'duration': 2823, + 'timestamp': 1655752333, + 'upload_date': '20220620', + 'series': 'Bergwelten', + 'season': 'Season 11', + 'season_number': 11, + 'episode': 'Episode 8 - Vie Ferrate – Klettersteige in den Alpen', + 'episode_number': 8, + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://www.servustv.com/natur/v/aa-1xg5xwmgw2112/', + 'only_matching': True, + }, { + 'url': 'https://www.servustv.com/natur/v/aansszcx3yi9jmlmhdc1/', + 'only_matching': True, }, { - # old URL schema + # URL schema v2 + 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', + 'only_matching': True, + }, { + # URL schema v1 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/', 'only_matching': True, }, { @@ -60,85 +71,65 @@ class ServusIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url).upper() - token = self._download_json( - 'https://auth.redbullmediahouse.com/token', video_id, - 'Downloading token', data=urlencode_postdata({ - 'grant_type': 'client_credentials', - }), headers={ - 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==', - }) - access_token = token['access_token'] - token_type = token.get('token_type', 'Bearer') - video = self._download_json( - 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id, - video_id, 'Downloading video JSON', headers={ - 'Authorization': '%s %s' % (token_type, access_token), - }) + 'https://api-player.redbull.com/stv/servus-tv?timeZone=Europe/Berlin', + video_id, 'Downloading video JSON', query={'videoId': video_id}) + if not video.get('videoUrl'): + self._report_errors(video) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + video['videoUrl'], video_id, 'mp4', m3u8_id='hls') - formats = [] - thumbnail = None - for resource in video['resources']: - if not isinstance(resource, dict): - continue - format_url = url_or_none(resource.get('url')) - if not format_url: - continue - extension = resource.get('extension') - type_ = resource.get('type') - if extension == 'jpg' or type_ == 'reference_keyframe': - thumbnail = format_url - continue - ext = determine_ext(format_url) - if type_ == 'dash' or ext == 'mpd': - 
formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - elif type_ == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif extension == 'mp4' or ext == 'mp4': - formats.append({ - 'url': format_url, - 'format_id': type_, - 'width': int_or_none(resource.get('width')), - 'height': int_or_none(resource.get('height')), - }) - - attrs = {} - for attribute in video['attributes']: - if not isinstance(attribute, dict): - continue - key = attribute.get('fieldKey') - value = attribute.get('fieldValue') - if not key or not value: - continue - attrs[key] = value - - title = attrs.get('title_stv') or video_id - alt_title = attrs.get('title') - description = attrs.get('long_description') or attrs.get('short_description') - series = attrs.get('label') - season = attrs.get('season') - episode = attrs.get('chapter') - duration = float_or_none(attrs.get('duration'), scale=1000) + season = video.get('season') season_number = int_or_none(self._search_regex( r'Season (\d+)', season or '', 'season number', default=None)) + episode = video.get('chapter') episode_number = int_or_none(self._search_regex( r'Episode (\d+)', episode or '', 'episode number', default=None)) return { 'id': video_id, - 'title': title, - 'alt_title': alt_title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': unified_timestamp(video.get('lastPublished')), - 'series': series, + 'title': video.get('title'), + 'description': self._get_description(video_id) or video.get('description'), + 'thumbnail': video.get('poster'), + 'duration': float_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('currentSunrise')), + 'series': video.get('label'), 'season': season, 'season_number': season_number, 'episode': episode, 'episode_number': episode_number, 'formats': formats, + 'subtitles': subtitles, } + + def _get_description(self, video_id): + info = self._download_json( + f'https://backend.servustv.com/wp-json/rbmh/v2/media_asset/aa_id/{video_id}?fieldset=page', + video_id, fatal=False) + + return join_nonempty(*traverse_obj(info, ( + ('stv_short_description', 'stv_long_description'), + {lambda x: unescapeHTML(x.replace('\n\n', '\n'))})), delim='\n\n') + + def _report_errors(self, video): + playability_errors = traverse_obj(video, ('playabilityErrors', ...)) + if not playability_errors: + raise ExtractorError('No videoUrl and no information about errors') + + elif 'FSK_BLOCKED' in playability_errors: + details = traverse_obj(video, ('playabilityErrorDetails', 'FSK_BLOCKED'), expected_type=dict) + message = format_field(''.join(( + format_field(details, 'minEveningHour', ' from %02d:00'), + format_field(details, 'maxMorningHour', ' to %02d:00'), + format_field(details, 'minAge', ' (Minimum age %d)'), + )), None, 'Only available%s') or 'Blocked by FSK with unknown availability' + + elif 'NOT_YET_AVAILABLE' in playability_errors: + message = format_field( + video, (('playabilityErrorDetails', 'NOT_YET_AVAILABLE', 'availableFrom'), 'currentSunrise'), + 'Only available from %s') or 'Video not yet available with unknown availability' + + else: + message = f'Video unavailable: {", ".join(playability_errors)}' + + raise ExtractorError(message, expected=True) diff --git a/hypervideo_dl/extractor/sevenplus.py b/hypervideo_dl/extractor/sevenplus.py index 222bf6c..6c688d1 100644 --- a/hypervideo_dl/extractor/sevenplus.py +++ 
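
The hunks below apply to sevenplus and shahid the same migration from `compat_HTTPError` to `..networking.exceptions.HTTPError` seen in roosterteeth, rte and rumble above: the status code moves from `e.cause.code` to `e.cause.status`, and the error body is read via `e.cause.response.read()` instead of `e.cause.read()`. A sketch of the new pattern as it would sit inside a hypothetical extractor:

    from .common import InfoExtractor
    from ..networking.exceptions import HTTPError
    from ..utils import ExtractorError

    class SomeSiteIE(InfoExtractor):  # hypothetical extractor
        def _call_api(self, url, video_id):
            try:
                return self._download_json(url, video_id)
            except ExtractorError as e:
                # e.cause is now a networking HTTPError: .status replaces
                # .code, and the response body lives on .response
                if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                    err = self._parse_json(
                        e.cause.response.read().decode(), video_id, fatal=False)
                    raise ExtractorError(str(err), expected=True)
                raise
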
b/hypervideo_dl/extractor/sevenplus.py @@ -2,10 +2,8 @@ import json import re from .brightcove import BrightcoveNewBaseIE -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, try_get, @@ -97,9 +95,9 @@ class SevenPlusIE(BrightcoveNewBaseIE): 'videoType': 'vod', }, headers=headers)['media'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: raise ExtractorError(self._parse_json( - e.cause.read().decode(), episode_id)[0]['error_code'], expected=True) + e.cause.response.read().decode(), episode_id)[0]['error_code'], expected=True) raise for source in media.get('sources', {}): diff --git a/hypervideo_dl/extractor/shahid.py b/hypervideo_dl/extractor/shahid.py index 26a0bff..d509e88 100644 --- a/hypervideo_dl/extractor/shahid.py +++ b/hypervideo_dl/extractor/shahid.py @@ -3,7 +3,7 @@ import math import re from .aws import AWSIE -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, ExtractorError, @@ -22,7 +22,7 @@ class ShahidBaseIE(AWSIE): def _handle_error(self, e): fail_data = self._parse_json( - e.cause.read().decode('utf-8'), None, fatal=False) + e.cause.response.read().decode('utf-8'), None, fatal=False) if fail_data: faults = fail_data.get('faults', []) faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')]) @@ -40,7 +40,7 @@ class ShahidBaseIE(AWSIE): 'secret_key': '4WUUJWuFvtTkXbhaWTDv7MhO+0LqoYDWfEnUXoWn', }, video_id, query) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): + if isinstance(e.cause, HTTPError): self._handle_error(e) raise @@ -88,7 +88,7 @@ class ShahidIE(ShahidBaseIE): 'Content-Type': 'application/json; charset=UTF-8', })['user'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): + if isinstance(e.cause, HTTPError): self._handle_error(e) raise diff --git a/hypervideo_dl/extractor/shemaroome.py b/hypervideo_dl/extractor/shemaroome.py index 7a78c6e..ec9938b 100644 --- a/hypervideo_dl/extractor/shemaroome.py +++ b/hypervideo_dl/extractor/shemaroome.py @@ -73,7 +73,10 @@ class ShemarooMeIE(InfoExtractor): key = bytes_to_intlist(compat_b64decode(data_json['key'])) iv = [0] * 16 m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii') - formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']}) + headers = {'stream_key': data_json['stream_key']} + formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers=headers) + for fmt in formats: + fmt['http_headers'] = headers release_date = self._html_search_regex( (r'itemprop="uploadDate">\s*([\d-]+)', r'id="release_date" value="([\d-]+)'), diff --git a/hypervideo_dl/extractor/sibnet.py b/hypervideo_dl/extractor/sibnet.py new file mode 100644 index 0000000..73bb75d --- /dev/null +++ b/hypervideo_dl/extractor/sibnet.py @@ -0,0 +1,17 @@ +from .common import InfoExtractor + + +class SibnetEmbedIE(InfoExtractor): + # Ref: https://help.sibnet.ru/?sibnet_video_embed + _VALID_URL = False + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1'] + _WEBPAGE_TESTS = [{ + 'url': 
'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', + 'info_dict': { + 'id': 'shell', # FIXME? + 'ext': 'mp4', + 'age_limit': 0, + 'thumbnail': 'https://video.sibnet.ru/upload/cover/video_1887072_0.jpg', + 'title': 'КВН Москва не сразу строилась - Девушка впервые играет в Mortal Kombat', + } + }] diff --git a/hypervideo_dl/extractor/sina.py b/hypervideo_dl/extractor/sina.py index aeba4e3..9842811 100644 --- a/hypervideo_dl/extractor/sina.py +++ b/hypervideo_dl/extractor/sina.py @@ -1,12 +1,12 @@ from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( - HEADRequest, ExtractorError, + clean_html, + get_element_by_attribute, int_or_none, - update_url_query, qualities, - get_element_by_attribute, - clean_html, + update_url_query, ) @@ -60,7 +60,7 @@ class SinaIE(InfoExtractor): self.to_screen('Getting video id') request = HEADRequest(url) _, urlh = self._download_webpage_handle(request, 'NA', False) - return self._real_extract(urlh.geturl()) + return self._real_extract(urlh.url) else: pseudo_id = mobj.group('pseudo_id') webpage = self._download_webpage(url, pseudo_id) diff --git a/hypervideo_dl/extractor/sixplay.py b/hypervideo_dl/extractor/sixplay.py index a6fb6c1..ef93b92 100644 --- a/hypervideo_dl/extractor/sixplay.py +++ b/hypervideo_dl/extractor/sixplay.py @@ -79,7 +79,7 @@ class SixPlayIE(InfoExtractor): headers=self.geo_verification_headers()) if not urlh: continue - asset_url = urlh.geturl() + asset_url = urlh.url asset_url = asset_url.replace('_drmnp.ism/', '_unpnp.ism/') for i in range(3, 0, -1): asset_url = asset_url = asset_url.replace('_sd1/', '_sd%d/' % i) diff --git a/hypervideo_dl/extractor/slideslive.py b/hypervideo_dl/extractor/slideslive.py index 9a60a79..25f867a 100644 --- a/hypervideo_dl/extractor/slideslive.py +++ b/hypervideo_dl/extractor/slideslive.py @@ -1,103 +1,567 @@ +import re +import urllib.parse + from .common import InfoExtractor from ..utils import ( - bool_or_none, + ExtractorError, + int_or_none, + parse_qs, smuggle_url, - try_get, + traverse_obj, + unified_timestamp, + update_url_query, url_or_none, + xpath_text, ) class SlidesLiveIE(InfoExtractor): - _VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)' - _WORKING = False + _VALID_URL = r'https?://slideslive\.com/(?:embed/(?:presentation/)?)?(?P<id>[0-9]+)' _TESTS = [{ - # video_service_name = YOUTUBE + # service_name = yoda, only XML slides info 'url': 'https://slideslive.com/38902413/gcc-ia16-backend', - 'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f', 'info_dict': { - 'id': 'LMtgR8ba0b0', + 'id': '38902413', 'ext': 'mp4', 'title': 'GCC IA16 backend', - 'description': 'Watch full version of this video at https://slideslive.com/38902413.', - 'uploader': 'SlidesLive Videos - A', - 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', - 'timestamp': 1597615266, - 'upload_date': '20170925', - } + 'timestamp': 1648189972, + 'upload_date': '20220325', + 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:42', + 'chapters': 'count:41', + 'duration': 1638, + }, + 'params': { + 'skip_download': 'm3u8', + }, }, { - # video_service_name = yoda + # service_name = yoda, /v7/ slides 'url': 'https://slideslive.com/38935785', - 'md5': '575cd7a6c0acc6e28422fe76dd4bcb1a', 'info_dict': { - 'id': 'RMraDYN5ozA_', + 'id': '38935785', 'ext': 'mp4', 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', + 'upload_date': '20211115', + 'timestamp': 1636996003, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:640', + 'chapters': 'count:639', + 
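# note: 639 chapters against 640 slide thumbnails; chapters appear to
+ # be derived from the intervals between consecutive slides
+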
'duration': 9832, + }, + 'params': { + 'skip_download': 'm3u8', }, }, { - # video_service_name = youtube + # service_name = yoda, /v1/ slides + 'url': 'https://slideslive.com/38973182/how-should-a-machine-learning-researcher-think-about-ai-ethics', + 'info_dict': { + 'id': '38973182', + 'ext': 'mp4', + 'title': 'How Should a Machine Learning Researcher Think About AI Ethics?', + 'upload_date': '20220201', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1643728135, + 'thumbnails': 'count:3', + 'chapters': 'count:2', + 'duration': 5889, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # service_name = youtube, only XML slides info + 'url': 'https://slideslive.com/38897546/special-metaprednaska-petra-ludwiga-hodnoty-pro-lepsi-spolecnost', + 'md5': '8a79b5e3d700837f40bd2afca3c8fa01', + 'info_dict': { + 'id': 'jmg02wCJD5M', + 'display_id': '38897546', + 'ext': 'mp4', + 'title': 'SPECIÁL: Meta-přednáška Petra Ludwiga - Hodnoty pro lepší společnost', + 'description': 'Watch full version of this video at https://slideslive.com/38897546.', + 'channel_url': 'https://www.youtube.com/channel/UCZWdAkNYFncuX0khyvhqnxw', + 'channel': 'SlidesLive Videos - G1', + 'channel_id': 'UCZWdAkNYFncuX0khyvhqnxw', + 'uploader_id': 'UCZWdAkNYFncuX0khyvhqnxw', + 'uploader': 'SlidesLive Videos - G1', + 'uploader_url': 'http://www.youtube.com/channel/UCZWdAkNYFncuX0khyvhqnxw', + 'live_status': 'not_live', + 'upload_date': '20160710', + 'timestamp': 1618786715, + 'duration': 6827, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'age_limit': 0, + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:169', + 'playable_in_embed': True, + 'availability': 'unlisted', + 'tags': [], + 'categories': ['People & Blogs'], + 'chapters': 'count:168', + }, + }, { + # embed-only presentation, only XML slides info + 'url': 'https://slideslive.com/embed/presentation/38925850', + 'info_dict': { + 'id': '38925850', + 'ext': 'mp4', + 'title': 'Towards a Deep Network Architecture for Structured Smoothness', + 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:8', + 'timestamp': 1629671508, + 'upload_date': '20210822', + 'chapters': 'count:7', + 'duration': 326, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # embed-only presentation, only JSON slides info, /v5/ slides (.png) + 'url': 'https://slideslive.com/38979920/', + 'info_dict': { + 'id': '38979920', + 'ext': 'mp4', + 'title': 'MoReL: Multi-omics Relational Learning', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:7', + 'timestamp': 1654714970, + 'upload_date': '20220608', + 'chapters': 'count:6', + 'duration': 171, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v2/ slides (.jpg) + 'url': 'https://slideslive.com/38954074', + 'info_dict': { + 'id': '38954074', + 'ext': 'mp4', + 'title': 'Decentralized Attribution of Generative Models', + 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:16', + 'timestamp': 1622806321, + 'upload_date': '20210604', + 'chapters': 'count:15', + 'duration': 306, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v4/ slides (.png) + 'url': 'https://slideslive.com/38979570/', + 'info_dict': { + 'id': '38979570', + 'ext': 'mp4', + 'title': 'Efficient Active Search for Combinatorial Optimization Problems', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:9', + 'timestamp': 1654714896, + 'upload_date': '20220608', + 'chapters': 'count:8', + 'duration': 295, + }, + 
'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v10/ slides + 'url': 'https://slideslive.com/embed/presentation/38979880?embed_parent_url=https%3A%2F%2Fedit.videoken.com%2F', + 'info_dict': { + 'id': '38979880', + 'ext': 'mp4', + 'title': 'The Representation Power of Neural Networks', + 'timestamp': 1654714962, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:22', + 'upload_date': '20220608', + 'chapters': 'count:21', + 'duration': 294, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v7/ slides, 2 video slides + 'url': 'https://slideslive.com/embed/presentation/38979682?embed_container_origin=https%3A%2F%2Fedit.videoken.com', + 'playlist_count': 3, + 'info_dict': { + 'id': '38979682-playlist', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models', + }, + 'playlist': [{ + 'info_dict': { + 'id': '38979682', + 'ext': 'mp4', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models', + 'timestamp': 1654714920, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:30', + 'upload_date': '20220608', + 'chapters': 'count:31', + 'duration': 272, + }, + }, { + 'info_dict': { + 'id': '38979682-021', + 'ext': 'mp4', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 021', + 'duration': 3, + 'timestamp': 1654714920, + 'upload_date': '20220608', + }, + }, { + 'info_dict': { + 'id': '38979682-024', + 'ext': 'mp4', + 'title': 'LoRA: Low-Rank Adaptation of Large Language Models - Slide 024', + 'duration': 4, + 'timestamp': 1654714920, + 'upload_date': '20220608', + }, + }], + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v6/ slides, 1 video slide, edit.videoken.com embed + 'url': 'https://slideslive.com/38979481/', + 'playlist_count': 2, + 'info_dict': { + 'id': '38979481-playlist', + 'title': 'How to Train Your MAML to Excel in Few-Shot Classification', + }, + 'playlist': [{ + 'info_dict': { + 'id': '38979481', + 'ext': 'mp4', + 'title': 'How to Train Your MAML to Excel in Few-Shot Classification', + 'timestamp': 1654714877, + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:43', + 'upload_date': '20220608', + 'chapters': 'count:43', + 'duration': 315, + }, + }, { + 'info_dict': { + 'id': '38979481-013', + 'ext': 'mp4', + 'title': 'How to Train Your MAML to Excel in Few-Shot Classification - Slide 013', + 'duration': 3, + 'timestamp': 1654714877, + 'upload_date': '20220608', + }, + }], + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v3/ slides, .jpg and .png, service_name = youtube + 'url': 'https://slideslive.com/embed/38932460/', + 'info_dict': { + 'id': 'RTPdrgkyTiE', + 'display_id': '38932460', + 'ext': 'mp4', + 'title': 'Active Learning for Hierarchical Multi-Label Classification', + 'description': 'Watch full version of this video at https://slideslive.com/38932460.', + 'channel': 'SlidesLive Videos - A', + 'channel_id': 'UC62SdArr41t_-_fX40QCLRw', + 'channel_url': 'https://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw', + 'uploader': 'SlidesLive Videos - A', + 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', + 'uploader_url': 'http://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw', + 'upload_date': '20200903', + 'timestamp': 1602599092, + 'duration': 942, + 'age_limit': 0, + 'live_status': 'not_live', + 'playable_in_embed': True, + 'availability': 'unlisted', + 'categories': ['People & Blogs'], + 'tags': [], + 'channel_follower_count': int, + 'like_count': int, + 'view_count': int, + 'thumbnail': r're:^https?://.*\.(?:jpg|png|webp)', + 'thumbnails': 'count:21', + 
'chapters': 'count:20', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # /v3/ slides, .png only, service_name = yoda + 'url': 'https://slideslive.com/38983994', + 'info_dict': { + 'id': '38983994', + 'ext': 'mp4', + 'title': 'Zero-Shot AutoML with Pretrained Models', + 'timestamp': 1662384834, + 'upload_date': '20220905', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:23', + 'chapters': 'count:22', + 'duration': 295, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # service_name = yoda 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', 'only_matching': True, }, { - # video_service_name = url + # dead link, service_name = url 'url': 'https://slideslive.com/38922070/learning-transferable-skills-1', 'only_matching': True, }, { - # video_service_name = vimeo + # dead link, service_name = vimeo 'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + # only XML slides info + 'url': 'https://iclr.cc/virtual_2020/poster_Hklr204Fvr.html', + 'info_dict': { + 'id': '38925850', + 'ext': 'mp4', + 'title': 'Towards a Deep Network Architecture for Structured Smoothness', + 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnails': 'count:8', + 'timestamp': 1629671508, + 'upload_date': '20210822', + 'chapters': 'count:7', + 'duration': 326, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Reference: https://slideslive.com/embed_presentation.js + for embed_id in re.findall(r'(?s)new\s+SlidesLiveEmbed\s*\([^)]+\bpresentationId:\s*["\'](\d+)["\']', webpage): + url_parsed = urllib.parse.urlparse(url) + origin = f'{url_parsed.scheme}://{url_parsed.netloc}' + yield update_url_query( + f'https://slideslive.com/embed/presentation/{embed_id}', { + 'embed_parent_url': url, + 'embed_container_origin': origin, + }) + + def _download_embed_webpage_handle(self, video_id, headers): + return self._download_webpage_handle( + f'https://slideslive.com/embed/presentation/{video_id}', video_id, + headers=headers, query=traverse_obj(headers, { + 'embed_parent_url': 'Referer', + 'embed_container_origin': 'Origin', + })) + + def _extract_custom_m3u8_info(self, m3u8_data): + m3u8_dict = {} + + lookup = { + 'PRESENTATION-TITLE': 'title', + 'PRESENTATION-UPDATED-AT': 'timestamp', + 'PRESENTATION-THUMBNAIL': 'thumbnail', + 'PLAYLIST-TYPE': 'playlist_type', + 'VOD-VIDEO-SERVICE-NAME': 'service_name', + 'VOD-VIDEO-ID': 'service_id', + 'VOD-VIDEO-SERVERS': 'video_servers', + 'VOD-SUBTITLES': 'subtitles', + 'VOD-SLIDES-JSON-URL': 'slides_json_url', + 'VOD-SLIDES-XML-URL': 'slides_xml_url', + } + + for line in m3u8_data.splitlines(): + if not line.startswith('#EXT-SL-'): + continue + tag, _, value = line.partition(':') + key = lookup.get(tag.lstrip('#EXT-SL-')) + if not key: + continue + m3u8_dict[key] = value + + # Some values are stringified JSON arrays + for key in ('video_servers', 'subtitles'): + if key in m3u8_dict: + m3u8_dict[key] = self._parse_json(m3u8_dict[key], None, fatal=False) or [] + + return m3u8_dict + + def _extract_formats_and_duration(self, cdn_hostname, path, video_id, skip_duration=False): + formats, duration = [], None + + hls_formats = self._extract_m3u8_formats( + f'https://{cdn_hostname}/{path}/master.m3u8', + video_id, 'mp4', m3u8_id='hls', fatal=False, live=True) + if hls_formats: + if not skip_duration: + duration = self._extract_m3u8_vod_duration( + 
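# NOTE: a minimal illustrative sketch (not part of the patch). SlidesLive now
# ships player metadata as custom `#EXT-SL-` comment lines in an m3u8-style
# document, which `_extract_custom_m3u8_info()` above maps through a lookup
# table (JSON-decoding the list-valued fields). A stand-alone parse of a
# hypothetical two-tag payload:
m3u8_data = '#EXT-SL-VOD-VIDEO-SERVICE-NAME:yoda\n#EXT-SL-VOD-VIDEO-ID:38902413'
parsed = dict(
    line[len('#EXT-SL-'):].partition(':')[::2]  # -> (tag, value) per line
    for line in m3u8_data.splitlines() if line.startswith('#EXT-SL-'))
assert parsed == {'VOD-VIDEO-SERVICE-NAME': 'yoda', 'VOD-VIDEO-ID': '38902413'}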
hls_formats[0]['url'], video_id, note='Extracting duration from HLS manifest') + formats.extend(hls_formats) + + dash_formats = self._extract_mpd_formats( + f'https://{cdn_hostname}/{path}/master.mpd', video_id, mpd_id='dash', fatal=False) + if dash_formats: + if not duration and not skip_duration: + duration = self._extract_mpd_vod_duration( + f'https://{cdn_hostname}/{path}/master.mpd', video_id, + note='Extracting duration from DASH manifest') + formats.extend(dash_formats) + + return formats, duration + def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://ben.slideslive.com/player/' + video_id, video_id) - service_name = video_data['video_service_name'].lower() + webpage, urlh = self._download_embed_webpage_handle( + video_id, headers=traverse_obj(parse_qs(url), { + 'Referer': ('embed_parent_url', -1), + 'Origin': ('embed_container_origin', -1)})) + redirect_url = urlh.url + if 'domain_not_allowed' in redirect_url: + domain = traverse_obj(parse_qs(redirect_url), ('allowed_domains[]', ...), get_all=False) + if not domain: + raise ExtractorError( + 'This is an embed-only presentation. Try passing --referer', expected=True) + webpage, _ = self._download_embed_webpage_handle(video_id, headers={ + 'Referer': f'https://{domain}/', + 'Origin': f'https://{domain}', + }) + + player_token = self._search_regex(r'data-player-token="([^"]+)"', webpage, 'player token') + player_data = self._download_webpage( + f'https://ben.slideslive.com/player/{video_id}', video_id, + note='Downloading player info', query={'player_token': player_token}) + player_info = self._extract_custom_m3u8_info(player_data) + + service_name = player_info['service_name'].lower() assert service_name in ('url', 'yoda', 'vimeo', 'youtube') - service_id = video_data['video_service_id'] + service_id = player_info['service_id'] + + slide_url_template = 'https://slides.slideslive.com/%s/slides/original/%s%s' + slides, slides_info = {}, [] + + if player_info.get('slides_json_url'): + slides = self._download_json( + player_info['slides_json_url'], video_id, fatal=False, + note='Downloading slides JSON', errnote=False) or {} + slide_ext_default = '.png' + slide_quality = traverse_obj(slides, ('slide_qualities', 0)) + if slide_quality: + slide_ext_default = '.jpg' + slide_url_template = f'https://cdn.slideslive.com/data/presentations/%s/slides/{slide_quality}/%s%s' + for slide_id, slide in enumerate(traverse_obj(slides, ('slides', ...), expected_type=dict), 1): + slides_info.append(( + slide_id, traverse_obj(slide, ('image', 'name')), + traverse_obj(slide, ('image', 'extname'), default=slide_ext_default), + int_or_none(slide.get('time'), scale=1000))) + + if not slides and player_info.get('slides_xml_url'): + slides = self._download_xml( + player_info['slides_xml_url'], video_id, fatal=False, + note='Downloading slides XML', errnote='Failed to download slides info') + slide_url_template = 'https://cdn.slideslive.com/data/presentations/%s/slides/big/%s%s' + for slide_id, slide in enumerate(slides.findall('./slide') if slides else [], 1): + slides_info.append(( + slide_id, xpath_text(slide, './slideName', 'name'), '.jpg', + int_or_none(xpath_text(slide, './timeSec', 'time')))) + + chapters, thumbnails = [], [] + if url_or_none(player_info.get('thumbnail')): + thumbnails.append({'id': 'cover', 'url': player_info['thumbnail']}) + for slide_id, slide_path, slide_ext, start_time in slides_info: + if slide_path: + thumbnails.append({ + 'id': f'{slide_id:03d}', + 'url': slide_url_template % 
(video_id, slide_path, slide_ext), + }) + chapters.append({ + 'title': f'Slide {slide_id:03d}', + 'start_time': start_time, + }) + subtitles = {} - for sub in try_get(video_data, lambda x: x['subtitles'], list) or []: - if not isinstance(sub, dict): - continue + for sub in traverse_obj(player_info, ('subtitles', ...), expected_type=dict): webvtt_url = url_or_none(sub.get('webvtt_url')) if not webvtt_url: continue - lang = sub.get('language') or 'en' - subtitles.setdefault(lang, []).append({ + subtitles.setdefault(sub.get('language') or 'en', []).append({ 'url': webvtt_url, + 'ext': 'vtt', }) + info = { 'id': video_id, - 'thumbnail': video_data.get('thumbnail'), - 'is_live': bool_or_none(video_data.get('is_live')), + 'title': player_info.get('title') or self._html_search_meta('title', webpage, default=''), + 'timestamp': unified_timestamp(player_info.get('timestamp')), + 'is_live': player_info.get('playlist_type') != 'vod', + 'thumbnails': thumbnails, + 'chapters': chapters, 'subtitles': subtitles, } - if service_name in ('url', 'yoda'): - info['title'] = video_data['title'] - if service_name == 'url': - info['url'] = service_id - else: - formats = [] - _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s' - # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol - formats.extend(self._extract_m3u8_formats( - _MANIFEST_PATTERN % (service_id, 'm3u8'), - service_id, 'mp4', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_mpd_formats( - _MANIFEST_PATTERN % (service_id, 'mpd'), service_id, - mpd_id='dash', fatal=False)) - info.update({ - 'id': service_id, - 'formats': formats, - }) + + if service_name == 'url': + info['url'] = service_id + elif service_name == 'yoda': + formats, duration = self._extract_formats_and_duration( + player_info['video_servers'][0], service_id, video_id) + info.update({ + 'duration': duration, + 'formats': formats, + }) else: info.update({ '_type': 'url_transparent', 'url': service_id, 'ie_key': service_name.capitalize(), - 'title': video_data.get('title'), + 'display_id': video_id, }) if service_name == 'vimeo': info['url'] = smuggle_url( - 'https://player.vimeo.com/video/' + service_id, + f'https://player.vimeo.com/video/{service_id}', {'http_headers': {'Referer': url}}) - return info + + video_slides = traverse_obj(slides, ('slides', ..., 'video', 'id')) + if not video_slides: + return info + + def entries(): + yield info + + service_data = self._download_json( + f'https://ben.slideslive.com/player/{video_id}/slides_video_service_data', + video_id, fatal=False, query={ + 'player_token': player_token, + 'videos': ','.join(video_slides), + }, note='Downloading video slides info', errnote='Failed to download video slides info') or {} + + for slide_id, slide in enumerate(traverse_obj(slides, ('slides', ...)), 1): + if not traverse_obj(slide, ('video', 'service')) == 'yoda': + continue + video_path = traverse_obj(slide, ('video', 'id')) + cdn_hostname = traverse_obj(service_data, ( + video_path, 'video_servers', ...), get_all=False) + if not cdn_hostname or not video_path: + continue + formats, _ = self._extract_formats_and_duration( + cdn_hostname, video_path, video_id, skip_duration=True) + if not formats: + continue + yield { + 'id': f'{video_id}-{slide_id:03d}', + 'title': f'{info["title"]} - Slide {slide_id:03d}', + 'timestamp': info['timestamp'], + 'duration': int_or_none(traverse_obj(slide, ('video', 'duration_ms')), scale=1000), + 'formats': formats, + } + + return 
self.playlist_result(entries(), f'{video_id}-playlist', info['title']) diff --git a/hypervideo_dl/extractor/sonyliv.py b/hypervideo_dl/extractor/sonyliv.py index aaad420..4379572 100644 --- a/hypervideo_dl/extractor/sonyliv.py +++ b/hypervideo_dl/extractor/sonyliv.py @@ -6,10 +6,12 @@ import time import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, + jwt_decode_hs256, + try_call, try_get, ) @@ -77,8 +79,10 @@ class SonyLIVIE(InfoExtractor): self._HEADERS['device_id'] = self._get_device_id() self._HEADERS['content-type'] = 'application/json' - if username.lower() == 'token' and len(password) > 1198: + if username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): self._HEADERS['authorization'] = password + self.report_login() + return elif len(username) != 10 or not username.isdigit(): raise ExtractorError(f'Invalid username/password; {self._LOGIN_HINT}') @@ -119,12 +123,12 @@ class SonyLIVIE(InfoExtractor): 'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path), video_id, headers=self._HEADERS)['resultObj'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406 and self._parse_json( - e.cause.read().decode(), video_id)['message'] == 'Please subscribe to watch this content': + if isinstance(e.cause, HTTPError) and e.cause.status == 406 and self._parse_json( + e.cause.response.read().decode(), video_id)['message'] == 'Please subscribe to watch this content': self.raise_login_required(self._LOGIN_HINT, method=None) - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: message = self._parse_json( - e.cause.read().decode(), video_id)['message'] + e.cause.response.read().decode(), video_id)['message'] if message == 'Geoblocked Country': self.raise_geo_restricted(countries=self._GEO_COUNTRIES) raise ExtractorError(message) diff --git a/hypervideo_dl/extractor/soundcloud.py b/hypervideo_dl/extractor/soundcloud.py index c2344dd..2e6d21a 100644 --- a/hypervideo_dl/extractor/soundcloud.py +++ b/hypervideo_dl/extractor/soundcloud.py @@ -7,15 +7,13 @@ from .common import ( InfoExtractor, SearchInfoExtractor ) -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_str +from ..networking import HEADRequest, Request +from ..networking.exceptions import HTTPError from ..utils import ( error_to_compat_str, ExtractorError, float_or_none, - HEADRequest, int_or_none, KNOWN_EXTENSIONS, mimetype2ext, @@ -26,7 +24,6 @@ from ..utils import ( update_url_query, url_or_none, urlhandle_detect_ext, - sanitized_Request, ) @@ -103,7 +100,7 @@ class SoundcloudBaseIE(InfoExtractor): try: return super()._download_json(*args, **kwargs) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): + if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403): self._store_client_id(None) self._update_client_id() continue @@ -123,7 +120,7 @@ class SoundcloudBaseIE(InfoExtractor): self._access_token = password query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID payload = {'session': {'access_token': self._access_token}} - token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8')) + token_verification = Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8')) response = 
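# NOTE: a minimal illustrative sketch (not part of the patch). The same
# migration recurs across the sina/sixplay/sonyliv/soundcloud hunks:
# `Request`/`HEADRequest` now come from `..networking` instead of `..utils`,
# URL handles expose `.url` instead of `.geturl()`, and HTTP errors expose
# `.status`/`.response` instead of urllib's `.code`/`.read()`. Shape of the
# new error handling, assuming it sits inside an extractor module:
from ..networking.exceptions import HTTPError

def _subscription_error_message(e):
    # `e.cause` is the transport-level error attached by the download helpers
    if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403, 406):
        return e.cause.response.read().decode()
    return None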
self._download_json(token_verification, None, note='Verifying login token...', fatal=False) if response is not False: self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} @@ -212,7 +209,7 @@ class SoundcloudBaseIE(InfoExtractor): urlh = self._request_webpage( HEADRequest(redirect_url), track_id, fatal=False) if urlh: - format_url = urlh.geturl() + format_url = urlh.url format_urls.add(format_url) formats.append({ 'format_id': 'download', @@ -669,7 +666,7 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE): except ExtractorError as e: # Downloading page may result in intermittent 502 HTTP error # See https://github.com/hypervideo/hypervideo/issues/872 - if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 502: + if not isinstance(e.cause, HTTPError) or e.cause.status != 502: raise retry.error = e continue @@ -782,6 +779,27 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): '%s (%s)' % (user['username'], resource.capitalize())) +class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'https?://api\.soundcloud\.com/users/(?P<id>\d+)' + IE_NAME = 'soundcloud:user:permalink' + _TESTS = [{ + 'url': 'https://api.soundcloud.com/users/30909869', + 'info_dict': { + 'id': '30909869', + 'title': 'neilcic', + }, + 'playlist_mincount': 23, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + user = self._download_json( + self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS) + + return self._extract_playlist( + f'{self._API_V2_BASE}stream/users/{user["id"]}', str(user['id']), user.get('username')) + + class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)' IE_NAME = 'soundcloud:trackstation' diff --git a/hypervideo_dl/extractor/spankbang.py b/hypervideo_dl/extractor/spankbang.py index f242d33..43da34a 100644 --- a/hypervideo_dl/extractor/spankbang.py +++ b/hypervideo_dl/extractor/spankbang.py @@ -177,7 +177,6 @@ class SpankBangPlaylistIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) playlist_id = mobj.group('id') - display_id = mobj.group('display_id') webpage = self._download_webpage( url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) @@ -186,11 +185,11 @@ class SpankBangPlaylistIE(InfoExtractor): urljoin(url, mobj.group('path')), ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) for mobj in re.finditer( - r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' - % re.escape(display_id), webpage)] + r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/[^"\'](?:(?!\1).)*)\1', + webpage)] title = self._html_search_regex( - r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title', + r'<em>([^<]+)</em>\s+playlist\s*<', webpage, 'playlist title', fatal=False) return self.playlist_result(entries, playlist_id, title) diff --git a/hypervideo_dl/extractor/sportdeutschland.py b/hypervideo_dl/extractor/sportdeutschland.py index 75074b3..30dbcf3 100644 --- a/hypervideo_dl/extractor/sportdeutschland.py +++ b/hypervideo_dl/extractor/sportdeutschland.py @@ -1,95 +1,142 @@ from .common import InfoExtractor from ..utils import ( - clean_html, - float_or_none, - int_or_none, - parse_iso8601, - parse_qs, + join_nonempty, strip_or_none, - try_get, + traverse_obj, + unified_timestamp, ) class SportDeutschlandIE(InfoExtractor): _VALID_URL = r'https?://sportdeutschland\.tv/(?P<id>(?:[^/]+/)?[^?#/&]+)' _TESTS = [{ - 'url': 
'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'url': 'https://sportdeutschland.tv/blauweissbuchholztanzsport/buchholzer-formationswochenende-2023-samstag-1-bundesliga-landesliga', 'info_dict': { - 'id': '5318cac0275701382770543d7edaf0a0', + 'id': '9839a5c7-0dbb-48a8-ab63-3b408adc7b54', 'ext': 'mp4', - 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1', - 'duration': 16106.36, - }, - 'params': { - 'noplaylist': True, - # m3u8 download - 'skip_download': True, - }, + 'title': 'Buchholzer Formationswochenende 2023 - Samstag - 1. Bundesliga / Landesliga', + 'display_id': 'blauweissbuchholztanzsport/buchholzer-formationswochenende-2023-samstag-1-bundesliga-landesliga', + 'description': 'md5:a288c794a5ee69e200d8f12982f81a87', + 'live_status': 'was_live', + 'channel': 'Blau-Weiss Buchholz Tanzsport', + 'channel_url': 'https://sportdeutschland.tv/blauweissbuchholztanzsport', + 'channel_id': '93ec33c9-48be-43b6-b404-e016b64fdfa3', + 'duration': 32447, + 'upload_date': '20230114', + 'timestamp': 1673733618, + } + }, { + 'url': 'https://sportdeutschland.tv/deutscherbadmintonverband/bwf-tour-1-runde-feld-1-yonex-gainward-german-open-2022-0', + 'info_dict': { + 'id': '95c80c52-6b9a-4ae9-9197-984145adfced', + 'ext': 'mp4', + 'title': 'BWF Tour: 1. Runde Feld 1 - YONEX GAINWARD German Open 2022', + 'display_id': 'deutscherbadmintonverband/bwf-tour-1-runde-feld-1-yonex-gainward-german-open-2022-0', + 'description': 'md5:2afb5996ceb9ac0b2ac81f563d3a883e', + 'live_status': 'was_live', + 'channel': 'Deutscher Badminton Verband', + 'channel_url': 'https://sportdeutschland.tv/deutscherbadmintonverband', + 'channel_id': '93ca5866-2551-49fc-8424-6db35af58920', + 'duration': 41097, + 'upload_date': '20220309', + 'timestamp': 1646860727.0, + } }, { - 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'url': 'https://sportdeutschland.tv/ggcbremen/formationswochenende-latein-2023', 'info_dict': { - 'id': 'c6e2fdd01f63013854c47054d2ab776f', - 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals', - 'description': 'md5:5263ff4c31c04bb780c9f91130b48530', - 'duration': 31397, + 'id': '9889785e-55b0-4d97-a72a-ce9a9f157cce', + 'title': 'Formationswochenende Latein 2023 - Samstag', + 'display_id': 'ggcbremen/formationswochenende-latein-2023', + 'description': 'md5:6e4060d40ff6a8f8eeb471b51a8f08b2', + 'live_status': 'was_live', + 'channel': 'Grün-Gold-Club Bremen e.V.', + 'channel_id': '9888f04e-bb46-4c7f-be47-df960a4167bb', + 'channel_url': 'https://sportdeutschland.tv/ggcbremen', }, - 'playlist_count': 2, + 'playlist_count': 3, + 'playlist': [{ + 'info_dict': { + 'id': '988e1fea-9d44-4fab-8c72-3085fb667547', + 'ext': 'mp4', + 'channel_url': 'https://sportdeutschland.tv/ggcbremen', + 'channel_id': '9888f04e-bb46-4c7f-be47-df960a4167bb', + 'channel': 'Grün-Gold-Club Bremen e.V.', + 'duration': 86, + 'title': 'Formationswochenende Latein 2023 - Samstag Part 1', + 'upload_date': '20230225', + 'timestamp': 1677349909, + 'live_status': 'was_live', + } + }] }, { - 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich', - 'only_matching': True, + 'url': 'https://sportdeutschland.tv/dtb/gymnastik-international-tag-1', + 'info_dict': { + 'id': '95d71b8a-370a-4b87-ad16-94680da18528', + 'ext': 'mp4', + 'title': r're:Gymnastik International - Tag 1 .+', + 'display_id': 'dtb/gymnastik-international-tag-1', + 'channel_id': 
'936ecef1-2f4a-4e08-be2f-68073cb7ecab', + 'channel': 'Deutscher Turner-Bund', + 'channel_url': 'https://sportdeutschland.tv/dtb', + 'description': 'md5:07a885dde5838a6f0796ee21dc3b0c52', + 'live_status': 'is_live', + }, + 'skip': 'live', }] + def _process_video(self, asset_id, video): + is_live = video['type'] == 'mux_live' + token = self._download_json( + f'https://api.sportdeutschland.tv/api/frontend/asset-token/{asset_id}', + video['id'], query={'type': video['type'], 'playback_id': video['src']})['token'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://stream.mux.com/{video["src"]}.m3u8?token={token}', video['id'], live=is_live) + + return { + 'is_live': is_live, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(video, { + 'id': 'id', + 'duration': ('duration', {lambda x: float(x) > 0 and float(x)}), + 'timestamp': ('created_at', {unified_timestamp}) + }), + } + def _real_extract(self, url): display_id = self._match_id(url) - data = self._download_json( - 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, + meta = self._download_json( + f'https://api.sportdeutschland.tv/api/stateless/frontend/assets/{display_id}', display_id, query={'access_token': 'true'}) - asset = data['asset'] - title = (asset.get('title') or asset['label']).strip() - asset_id = asset.get('id') or asset.get('uuid') + info = { - 'id': asset_id, - 'title': title, - 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), - 'duration': int_or_none(asset.get('seconds')), + 'display_id': display_id, + **traverse_obj(meta, { + 'id': (('id', 'uuid'), ), + 'title': (('title', 'name'), {strip_or_none}), + 'description': 'description', + 'channel': ('profile', 'name'), + 'channel_id': ('profile', 'id'), + 'is_live': 'currently_live', + 'was_live': 'was_live', + 'channel_url': ('profile', 'slug', {lambda x: f'https://sportdeutschland.tv/{x}'}), + }, get_all=False) } - videos = asset.get('videos') or [] - if len(videos) > 1: - playlist_id = parse_qs(url).get('playlistId', [None])[0] - if not self._yes_playlist(playlist_id, asset_id): - videos = [videos[int(playlist_id)]] - def entries(): - for i, video in enumerate(videos, 1): - video_id = video.get('uuid') - video_url = video.get('url') - if not (video_id and video_url): - continue - formats = self._extract_m3u8_formats( - video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) - if not formats and not self.get_param('ignore_no_formats'): - continue - yield { - 'id': video_id, - 'formats': formats, - 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), - 'duration': float_or_none(video.get('duration')), - } - info.update({ - '_type': 'multi_video', - 'entries': entries(), - }) - else: - formats = self._extract_m3u8_formats( - videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') - section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) - info.update({ - 'formats': formats, - 'display_id': asset.get('permalink'), - 'thumbnail': try_get(asset, lambda x: x['images'][0]), - 'categories': [section_title] if section_title else None, - 'view_count': int_or_none(asset.get('views')), - 'is_live': asset.get('is_live') is True, - 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), - }) - return info + parts = traverse_obj(meta, (('livestream', ('videos', ...)), )) + entries = [{ + 'title': join_nonempty(info.get('title'), f'Part {i}', delim=' '), + **traverse_obj(info, {'channel': 'channel', 'channel_id': 
'channel_id', + 'channel_url': 'channel_url', 'was_live': 'was_live'}), + **self._process_video(info['id'], video), + } for i, video in enumerate(parts, 1)] + + return { + '_type': 'multi_video', + **info, + 'entries': entries, + } if len(entries) > 1 else { + **info, + **entries[0], + 'title': info.get('title'), + } diff --git a/hypervideo_dl/extractor/stacommu.py b/hypervideo_dl/extractor/stacommu.py new file mode 100644 index 0000000..6f58f06 --- /dev/null +++ b/hypervideo_dl/extractor/stacommu.py @@ -0,0 +1,148 @@ +import time + +from .wrestleuniverse import WrestleUniverseBaseIE +from ..utils import ( + int_or_none, + traverse_obj, + url_or_none, +) + + +class StacommuBaseIE(WrestleUniverseBaseIE): + _NETRC_MACHINE = 'stacommu' + _API_HOST = 'api.stacommu.jp' + _LOGIN_QUERY = {'key': 'AIzaSyCR9czxhH2eWuijEhTNWBZ5MCcOYEUTAhg'} + _LOGIN_HEADERS = { + 'Accept': '*/*', + 'Content-Type': 'application/json', + 'X-Client-Version': 'Chrome/JsCore/9.9.4/FirebaseCore-web', + 'Referer': 'https://www.stacommu.jp/', + 'Origin': 'https://www.stacommu.jp', + } + + @WrestleUniverseBaseIE._TOKEN.getter + def _TOKEN(self): + if self._REAL_TOKEN and self._TOKEN_EXPIRY <= int(time.time()): + self._refresh_token() + + return self._REAL_TOKEN + + def _get_formats(self, data, path, video_id=None): + if not traverse_obj(data, path) and not data.get('canWatch') and not self._TOKEN: + self.raise_login_required(method='password') + return super()._get_formats(data, path, video_id) + + def _extract_hls_key(self, data, path, decrypt): + encryption_data = traverse_obj(data, path) + if traverse_obj(encryption_data, ('encryptType', {int})) == 0: + return None + return traverse_obj(encryption_data, {'key': ('key', {decrypt}), 'iv': ('iv', {decrypt})}) + + +class StacommuVODIE(StacommuBaseIE): + _VALID_URL = r'https?://www\.stacommu\.jp/videos/episodes/(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + # not encrypted + 'url': 'https://www.stacommu.jp/videos/episodes/aXcVKjHyAENEjard61soZZ', + 'info_dict': { + 'id': 'aXcVKjHyAENEjard61soZZ', + 'ext': 'mp4', + 'title': 'スタコミュAWARDの裏側、ほぼ全部見せます!〜晴れ舞台の直前ドキドキ編〜', + 'description': 'md5:6400275c57ae75c06da36b06f96beb1c', + 'timestamp': 1679652000, + 'upload_date': '20230324', + 'thumbnail': 'https://image.stacommu.jp/6eLobQan8PFtBoU4RL4uGg/6eLobQan8PFtBoU4RL4uGg', + 'cast': 'count:11', + 'duration': 250, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + # encrypted; requires a premium account + 'url': 'https://www.stacommu.jp/videos/episodes/3hybMByUvzMEqndSeu5LpD', + 'info_dict': { + 'id': '3hybMByUvzMEqndSeu5LpD', + 'ext': 'mp4', + 'title': 'スタプラフェス2023〜裏側ほぼ全部見せます〜#10', + 'description': 'md5:85494488ccf1dfa1934accdeadd7b340', + 'timestamp': 1682506800, + 'upload_date': '20230426', + 'thumbnail': 'https://image.stacommu.jp/eMdXtEefR4kEyJJMpAFi7x/eMdXtEefR4kEyJJMpAFi7x', + 'cast': 'count:55', + 'duration': 312, + 'hls_aes': { + 'key': '6bbaf241b8e1fd9f59ecf546a70e4ae7', + 'iv': '1fc9002a23166c3bb1d240b953d09de9', + }, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + _API_PATH = 'videoEpisodes' + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._download_metadata( + url, video_id, 'ja', ('dehydratedState', 'queries', 0, 'state', 'data')) + hls_info, decrypt = self._call_encrypted_api( + video_id, ':watch', 'stream information', data={'method': 1}) + + return { + 'id': video_id, + 'formats': self._get_formats(hls_info, ('protocolHls', 'url', {url_or_none}), video_id), + 'hls_aes': self._extract_hls_key(hls_info, 'protocolHls', 
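# NOTE: a minimal illustrative sketch (not part of the patch). When Stacommu
# reports a non-zero `encryptType`, `_extract_hls_key()` above returns the
# decrypted AES-128 key/iv pair so the HLS downloader can decrypt segments
# without fetching a key URI. The resulting `hls_aes` field has this shape
# (hex values taken from the test case above):
hls_aes = {
    'key': '6bbaf241b8e1fd9f59ecf546a70e4ae7',  # 128-bit key, hex-encoded
    'iv': '1fc9002a23166c3bb1d240b953d09de9',   # 128-bit IV, hex-encoded
}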
decrypt), + **traverse_obj(video_info, { + 'title': ('displayName', {str}), + 'description': ('description', {str}), + 'timestamp': ('watchStartTime', {int_or_none}), + 'thumbnail': ('keyVisualUrl', {url_or_none}), + 'cast': ('casts', ..., 'displayName', {str}), + 'duration': ('duration', {int}), + }), + } + + +class StacommuLiveIE(StacommuBaseIE): + _VALID_URL = r'https?://www\.stacommu\.jp/live/(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'https://www.stacommu.jp/live/d2FJ3zLnndegZJCAEzGM3m', + 'info_dict': { + 'id': 'd2FJ3zLnndegZJCAEzGM3m', + 'ext': 'mp4', + 'title': '仲村悠菜 2023/05/04', + 'timestamp': 1683195647, + 'upload_date': '20230504', + 'thumbnail': 'https://image.stacommu.jp/pHGF57SPEHE2ke83FS92FN/pHGF57SPEHE2ke83FS92FN', + 'duration': 5322, + 'hls_aes': { + 'key': 'efbb3ec0b8246f61adf1764c5a51213a', + 'iv': '80621d19a1f19167b64cedb415b05d1c', + }, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + _API_PATH = 'events' + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._call_api(video_id, msg='video information', query={'al': 'ja'}, auth=False) + hls_info, decrypt = self._call_encrypted_api( + video_id, ':watchArchive', 'stream information', data={'method': 1}) + + return { + 'id': video_id, + 'formats': self._get_formats(hls_info, ('hls', 'urls', ..., {url_or_none}), video_id), + 'hls_aes': self._extract_hls_key(hls_info, 'hls', decrypt), + **traverse_obj(video_info, { + 'title': ('displayName', {str}), + 'timestamp': ('startTime', {int_or_none}), + 'thumbnail': ('keyVisualUrl', {url_or_none}), + 'duration': ('duration', {int_or_none}), + }), + } diff --git a/hypervideo_dl/extractor/stageplus.py b/hypervideo_dl/extractor/stageplus.py new file mode 100644 index 0000000..4bed4d6 --- /dev/null +++ b/hypervideo_dl/extractor/stageplus.py @@ -0,0 +1,515 @@ +import json +import uuid + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + traverse_obj, + try_call, + unified_timestamp, + url_or_none, +) + + +class StagePlusVODConcertIE(InfoExtractor): + _NETRC_MACHINE = 'stageplus' + _VALID_URL = r'https?://(?:www\.)?stage-plus\.com/video/(?P<id>vod_concert_\w+)' + _TESTS = [{ + 'url': 'https://www.stage-plus.com/video/vod_concert_APNM8GRFDPHMASJKBSPJACG', + 'playlist_count': 6, + 'info_dict': { + 'id': 'vod_concert_APNM8GRFDPHMASJKBSPJACG', + 'title': 'Yuja Wang plays Rachmaninoff\'s Piano Concerto No. 2 – from Odeonsplatz', + 'description': 'md5:50f78ec180518c9bdb876bac550996fc', + 'artist': ['Yuja Wang', 'Lorenzo Viotti'], + 'upload_date': '20230331', + 'timestamp': 1680249600, + 'release_date': '20210709', + 'release_timestamp': 1625788800, + 'thumbnails': 'count:3', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'performance_work_A1IN4PJFE9MM2RJ3CLBMUSJBBSOJAD9O', + 'ext': 'mp4', + 'title': 'Piano Concerto No. 2 in C Minor, Op. 18', + 'description': 'md5:50f78ec180518c9bdb876bac550996fc', + 'upload_date': '20230331', + 'timestamp': 1680249600, + 'release_date': '20210709', + 'release_timestamp': 1625788800, + 'duration': 2207, + 'chapters': 'count:5', + 'artist': ['Yuja Wang'], + 'composer': ['Sergei Rachmaninoff'], + 'album': 'Yuja Wang plays Rachmaninoff\'s Piano Concerto No. 2 – from Odeonsplatz', + 'album_artist': ['Yuja Wang', 'Lorenzo Viotti'], + 'track': 'Piano Concerto No. 2 in C Minor, Op. 
18', + 'track_number': 1, + 'genre': 'Instrumental Concerto', + }, + }], + 'params': {'skip_download': 'm3u8'}, + }] + + # TODO: Prune this after livestream and/or album extractors are added + _GRAPHQL_QUERY = '''query videoDetailPage($videoId: ID!, $sliderItemsFirst: Int = 24) { + node(id: $videoId) { + __typename + ...LiveConcertFields + ... on LiveConcert { + artists { + edges { + role { + ...RoleFields + } + node { + id + name + sortName + } + } + } + isAtmos + maxResolution + groups { + id + name + typeDisplayName + } + shortDescription + performanceWorks { + ...livePerformanceWorkFields + } + totalDuration + sliders { + ...contentContainerFields + } + vodConcert { + __typename + id + } + } + ...VideoFields + ... on Video { + artists { + edges { + role { + ...RoleFields + } + node { + id + name + sortName + } + } + } + isAtmos + maxResolution + isLossless + description + productionDate + takedownDate + sliders { + ...contentContainerFields + } + } + ...VodConcertFields + ... on VodConcert { + artists { + edges { + role { + ...RoleFields + } + node { + id + name + sortName + } + } + } + isAtmos + maxResolution + groups { + id + name + typeDisplayName + } + performanceWorks { + ...PerformanceWorkFields + } + shortDescription + productionDate + takedownDate + sliders { + ...contentContainerFields + } + } + } +} + +fragment LiveConcertFields on LiveConcert { + endTime + id + pictures { + ...PictureFields + } + reruns { + ...liveConcertRerunFields + } + publicationLevel + startTime + streamStartTime + subtitle + title + typeDisplayName + stream { + ...liveStreamFields + } + trailerStream { + ...streamFields + } + geoAccessCountries + geoAccessMode +} + +fragment PictureFields on Picture { + id + url + type +} + +fragment liveConcertRerunFields on LiveConcertRerun { + streamStartTime + endTime + startTime + stream { + ...rerunStreamFields + } +} + +fragment rerunStreamFields on RerunStream { + publicationLevel + streamType + url +} + +fragment liveStreamFields on LiveStream { + publicationLevel + streamType + url +} + +fragment streamFields on Stream { + publicationLevel + streamType + url +} + +fragment RoleFields on Role { + __typename + id + type + displayName +} + +fragment livePerformanceWorkFields on LivePerformanceWork { + __typename + id + artists { + ...artistWithRoleFields + } + groups { + edges { + node { + id + name + typeDisplayName + } + } + } + work { + ...workFields + } +} + +fragment artistWithRoleFields on ArtistWithRoleConnection { + edges { + role { + ...RoleFields + } + node { + id + name + sortName + } + } +} + +fragment workFields on Work { + id + title + movements { + id + title + } + composers { + id + name + } + genre { + id + title + } +} + +fragment contentContainerFields on CuratedContentContainer { + __typename + ...SliderFields + ...BannerFields +} + +fragment SliderFields on Slider { + id + headline + items(first: $sliderItemsFirst) { + edges { + node { + id + __typename + ...AlbumFields + ...ArtistFields + ...EpochFields + ...GenreFields + ...GroupFields + ...LiveConcertFields + ...PartnerFields + ...PerformanceWorkFields + ...VideoFields + ...VodConcertFields + } + } + } +} + +fragment AlbumFields on Album { + artistAndGroupDisplayInfo + id + pictures { + ...PictureFields + } + title +} + +fragment ArtistFields on Artist { + id + name + roles { + ...RoleFields + } + pictures { + ...PictureFields + } +} + +fragment EpochFields on Epoch { + id + endYear + pictures { + ...PictureFields + } + startYear + title +} + +fragment GenreFields on Genre { + id + 
pictures { + ...PictureFields + } + title +} + +fragment GroupFields on Group { + id + name + typeDisplayName + pictures { + ...PictureFields + } +} + +fragment PartnerFields on Partner { + id + name + typeDisplayName + subtypeDisplayName + pictures { + ...PictureFields + } +} + +fragment PerformanceWorkFields on PerformanceWork { + __typename + id + artists { + ...artistWithRoleFields + } + groups { + edges { + node { + id + name + typeDisplayName + } + } + } + work { + ...workFields + } + stream { + ...streamFields + } + vodConcert { + __typename + id + } + duration + cuePoints { + mark + title + } +} + +fragment VideoFields on Video { + id + archiveReleaseDate + title + subtitle + pictures { + ...PictureFields + } + stream { + ...streamFields + } + trailerStream { + ...streamFields + } + duration + typeDisplayName + duration + geoAccessCountries + geoAccessMode + publicationLevel + takedownDate +} + +fragment VodConcertFields on VodConcert { + id + archiveReleaseDate + pictures { + ...PictureFields + } + subtitle + title + typeDisplayName + totalDuration + geoAccessCountries + geoAccessMode + trailerStream { + ...streamFields + } + publicationLevel + takedownDate +} + +fragment BannerFields on Banner { + description + link + pictures { + ...PictureFields + } + title +}''' + + _TOKEN = None + + def _perform_login(self, username, password): + auth = self._download_json('https://audience.api.stageplus.io/oauth/token', None, headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://www.stage-plus.com', + }, data=json.dumps({ + 'grant_type': 'password', + 'username': username, + 'password': password, + 'device_info': 'Chrome (Windows)', + 'client_device_id': str(uuid.uuid4()), + }, separators=(',', ':')).encode(), note='Logging in') + + if auth.get('access_token'): + self._TOKEN = auth['access_token'] + + def _real_initialize(self): + if self._TOKEN: + return + + self._TOKEN = try_call( + lambda: self._get_cookies('https://www.stage-plus.com/')['dgplus_access_token'].value) + if not self._TOKEN: + self.raise_login_required() + + def _real_extract(self, url): + concert_id = self._match_id(url) + + data = self._download_json('https://audience.api.stageplus.io/graphql', concert_id, headers={ + 'authorization': f'Bearer {self._TOKEN}', + 'content-type': 'application/json', + 'Origin': 'https://www.stage-plus.com', + }, data=json.dumps({ + 'query': self._GRAPHQL_QUERY, + 'variables': {'videoId': concert_id}, + 'operationName': 'videoDetailPage' + }, separators=(',', ':')).encode())['data']['node'] + + metadata = traverse_obj(data, { + 'title': 'title', + 'description': ('shortDescription', {str}), + 'artist': ('artists', 'edges', ..., 'node', 'name'), + 'timestamp': ('archiveReleaseDate', {unified_timestamp}), + 'release_timestamp': ('productionDate', {unified_timestamp}), + }) + + thumbnails = traverse_obj(data, ('pictures', lambda _, v: url_or_none(v['url']), { + 'id': 'name', + 'url': 'url', + })) or None + + entries = [] + for idx, video in enumerate(traverse_obj(data, ( + 'performanceWorks', lambda _, v: v['id'] and url_or_none(v['stream']['url']))), 1): + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + video['stream']['url'], video['id'], 'mp4', m3u8_id='hls', query={'token': self._TOKEN}) + entries.append({ + 'id': video['id'], + 'formats': formats, + 'subtitles': subtitles, + 'album': metadata.get('title'), + 'album_artist': metadata.get('artist'), + 'track_number': idx, + **metadata, + **traverse_obj(video, { + 'title': ('work', 'title'), + 'track': ('work', 
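# NOTE: a minimal illustrative sketch (not part of the patch). The StagePlus
# extractor boils down to one authenticated GraphQL POST; reduced here to its
# shape with only the stdlib (the long query above is elided and passed in):
import json
import urllib.request

def fetch_video_node(video_id, token, graphql_query):
    req = urllib.request.Request(
        'https://audience.api.stageplus.io/graphql',
        data=json.dumps({
            'query': graphql_query,
            'variables': {'videoId': video_id},
            'operationName': 'videoDetailPage',
        }, separators=(',', ':')).encode(),
        headers={
            'authorization': f'Bearer {token}',
            'content-type': 'application/json',
            'Origin': 'https://www.stage-plus.com',
        })
    return json.load(urllib.request.urlopen(req))['data']['node']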
'title'), + 'duration': ('duration', {float_or_none}), + 'chapters': ( + 'cuePoints', lambda _, v: float_or_none(v['mark']) is not None, { + 'title': 'title', + 'start_time': ('mark', {float_or_none}), + }), + 'artist': ('artists', 'edges', ..., 'node', 'name'), + 'composer': ('work', 'composers', ..., 'name'), + 'genre': ('work', 'genre', 'title'), + }), + }) + + return self.playlist_result(entries, concert_id, thumbnails=thumbnails, **metadata) diff --git a/hypervideo_dl/extractor/stripchat.py b/hypervideo_dl/extractor/stripchat.py index 4229a0b..b9523c8 100644 --- a/hypervideo_dl/extractor/stripchat.py +++ b/hypervideo_dl/extractor/stripchat.py @@ -1,5 +1,10 @@ from .common import InfoExtractor -from ..utils import ExtractorError, lowercase_escape, traverse_obj +from ..utils import ( + ExtractorError, + UserNotLive, + lowercase_escape, + traverse_obj +) class StripchatIE(InfoExtractor): @@ -35,16 +40,15 @@ class StripchatIE(InfoExtractor): if traverse_obj(data, ('viewCam', 'show'), expected_type=dict): raise ExtractorError('Model is in private show', expected=True) elif not traverse_obj(data, ('viewCam', 'model', 'isLive'), expected_type=bool): - raise ExtractorError('Model is offline', expected=True) + raise UserNotLive(video_id=video_id) - server = traverse_obj(data, ('viewCam', 'viewServers', 'flashphoner-hls'), expected_type=str) model_id = traverse_obj(data, ('viewCam', 'model', 'id'), expected_type=int) formats = [] - for host in traverse_obj(data, ( - 'config', 'data', (('featuresV2', 'hlsFallback', 'fallbackDomains', ...), 'hlsStreamHost'))): + for host in traverse_obj(data, ('config', 'data', ( + (('features', 'featuresV2'), 'hlsFallback', 'fallbackDomains', ...), 'hlsStreamHost'))): formats = self._extract_m3u8_formats( - f'https://b-{server}.{host}/hls/{model_id}/{model_id}.m3u8', + f'https://edge-hls.{host}/hls/{model_id}/master/{model_id}_auto.m3u8', video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True) if formats: break diff --git a/hypervideo_dl/extractor/stv.py b/hypervideo_dl/extractor/stv.py index c879fb5..8b3e635 100644 --- a/hypervideo_dl/extractor/stv.py +++ b/hypervideo_dl/extractor/stv.py @@ -73,6 +73,8 @@ class STVPlayerIE(InfoExtractor): }) programme = result.get('programme') or {} + if programme.get('drmEnabled'): + self.report_drm(video_id) return { '_type': 'url_transparent', diff --git a/hypervideo_dl/extractor/substack.py b/hypervideo_dl/extractor/substack.py index fa38263..3782cee 100644 --- a/hypervideo_dl/extractor/substack.py +++ b/hypervideo_dl/extractor/substack.py @@ -2,7 +2,7 @@ import re import urllib.parse from .common import InfoExtractor -from ..utils import str_or_none, traverse_obj +from ..utils import js_to_json, str_or_none, traverse_obj class SubstackIE(InfoExtractor): @@ -14,7 +14,7 @@ class SubstackIE(InfoExtractor): 'id': '47660949', 'ext': 'mp4', 'title': 'I MADE A VLOG', - 'description': 'md5:10c01ff93439a62e70ce963b2aa0b7f6', + 'description': 'md5:9248af9a759321e1027226f988f54d96', 'thumbnail': 'md5:bec758a34d8ee9142d43bcebdf33af18', 'uploader': 'Maybe Baby', 'uploader_id': '33628', @@ -77,7 +77,9 @@ class SubstackIE(InfoExtractor): display_id, username = self._match_valid_url(url).group('id', 'username') webpage = self._download_webpage(url, display_id) - webpage_info = self._search_json(r'<script[^>]*>\s*window\._preloads\s*=', webpage, 'preloads', display_id) + webpage_info = self._parse_json(self._search_json( + r'window\._preloads\s*=\s*JSON\.parse\(', webpage, 'json string', + display_id, transform_source=js_to_json, 
contains_pattern=r'"{(?s:.+)}"'), display_id) post_type = webpage_info['post']['type'] formats, subtitles = [], {} diff --git a/hypervideo_dl/extractor/sverigesradio.py b/hypervideo_dl/extractor/sverigesradio.py index 65da615..01a07b3 100644 --- a/hypervideo_dl/extractor/sverigesradio.py +++ b/hypervideo_dl/extractor/sverigesradio.py @@ -1,8 +1,13 @@ from .common import InfoExtractor from ..utils import ( determine_ext, + extract_attributes, + get_element_by_id, + get_element_html_by_class, int_or_none, str_or_none, + traverse_obj, + url_or_none, ) @@ -21,7 +26,15 @@ class SverigesRadioBaseIE(InfoExtractor): } def _real_extract(self, url): - audio_id = self._match_id(url) + audio_id, display_id = self._match_valid_url(url).group('id', 'slug') + if not audio_id: + webpage = self._download_webpage(url, display_id) + audio_id = ( + traverse_obj( + get_element_html_by_class('audio-button', webpage), + ({extract_attributes}, ('data-audio-id', 'data-publication-id')), get_all=False) + or self._parse_json(get_element_by_id('gtm-metadata', webpage), display_id)['pageId']) + query = { 'id': audio_id, 'type': self._AUDIO_TYPE, @@ -30,7 +43,6 @@ class SverigesRadioBaseIE(InfoExtractor): item = self._download_json( self._BASE_URL + 'audiometadata', audio_id, 'Downloading audio JSON metadata', query=query)['items'][0] - title = item['subtitle'] query['format'] = 'iis' urls = [] @@ -61,18 +73,20 @@ class SverigesRadioBaseIE(InfoExtractor): return { 'id': audio_id, - 'title': title, 'formats': formats, - 'series': item.get('title'), - 'duration': int_or_none(item.get('duration')), - 'thumbnail': item.get('displayimageurl'), - 'description': item.get('description'), + **traverse_obj(item, { + 'title': 'subtitle', + 'series': 'title', + 'duration': ('duration', {int_or_none}), + 'thumbnail': ('displayimageurl', {url_or_none}), + 'description': 'description', + }), } class SverigesRadioPublicationIE(SverigesRadioBaseIE): IE_NAME = 'sverigesradio:publication' - _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/sida/(?:artikel|gruppsida)\.aspx\?.*?\bartikel=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?(?:artikel|gruppsida)(?:\.aspx\?.*?\bartikel=(?P<id>[0-9]+)|/(?P<slug>[\w-]+))' _TESTS = [{ 'url': 'https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7038546', 'md5': '6a4917e1923fccb080e5a206a5afa542', @@ -86,6 +100,18 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE): 'thumbnail': r're:^https?://.*\.jpg', }, }, { + 'url': 'https://sverigesradio.se/artikel/tysk-fotbollsfeber-bayern-munchens-10-ariga-segersvit-kan-brytas', + 'md5': 'f8a914ad50f491bb74eed403ab4bfef6', + 'info_dict': { + 'id': '8360345', + 'ext': 'm4a', + 'title': 'Tysk fotbollsfeber när Bayern Münchens 10-åriga segersvit kan brytas', + 'series': 'Radiosporten', + 'description': 'md5:5254610e20ce527ecb3a6102a06dcc5f', + 'duration': 72, + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, { 'url': 'https://sverigesradio.se/sida/gruppsida.aspx?programid=3304&grupp=6247&artikel=7146887', 'only_matching': True, }] @@ -94,8 +120,8 @@ class SverigesRadioPublicationIE(SverigesRadioBaseIE): class SverigesRadioEpisodeIE(SverigesRadioBaseIE): IE_NAME = 'sverigesradio:episode' - _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?:(?P<id>\d+)|(?P<slug>[\w-]+))(?:$|[#?])' + _TESTS = [{ 'url': 'https://sverigesradio.se/avsnitt/1140922?programid=1300', 'md5': 
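# NOTE: a minimal illustrative sketch (not part of the patch). The Stripchat
# hunk keeps its probe-until-success loop, now over both `features` and
# `featuresV2` fallback domains and the new edge-hls URL layout. The generic
# shape of that loop, with `probe` standing in for a fatal=False
# `_extract_m3u8_formats` call:
def first_working_formats(hosts, probe):
    for host in hosts:
        formats = probe(host)  # returns [] instead of raising on failure
        if formats:
            return formats
    return []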
'20dc4d8db24228f846be390b0c59a07c', 'info_dict': { @@ -106,6 +132,18 @@ class SverigesRadioEpisodeIE(SverigesRadioBaseIE): 'title': 'Metoo och valen', 'description': 'md5:fcb5c1f667f00badcc702b196f10a27e', 'thumbnail': r're:^https?://.*\.jpg', - } - } + }, + }, { + 'url': 'https://sverigesradio.se/avsnitt/p4-live-med-first-aid-kit-scandinavium-mars-2023', + 'md5': 'ce17fb82520a8033dbb846993d5589fe', + 'info_dict': { + 'id': '2160416', + 'ext': 'm4a', + 'title': 'P4 Live med First Aid Kit', + 'description': 'md5:6d5b78eed3d2b65f6de04daa45e9285d', + 'thumbnail': r're:^https?://.*\.jpg', + 'series': 'P4 Live', + 'duration': 5640, + }, + }] _AUDIO_TYPE = 'episode' diff --git a/hypervideo_dl/extractor/svt.py b/hypervideo_dl/extractor/svt.py index 31bf7f9..18da875 100644 --- a/hypervideo_dl/extractor/svt.py +++ b/hypervideo_dl/extractor/svt.py @@ -1,3 +1,4 @@ +import json import re from .common import InfoExtractor @@ -6,10 +7,11 @@ from ..utils import ( determine_ext, dict_get, int_or_none, - unified_timestamp, str_or_none, strip_or_none, + traverse_obj, try_get, + unified_timestamp, ) @@ -163,10 +165,46 @@ class SVTPlayIE(SVTPlayBaseIE): }, }, 'params': { - # skip for now due to download test asserts that segment is > 10000 bytes and svt uses - # init segments that are smaller - # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B - 'skip_download': True, + 'skip_download': 'm3u8', + }, + 'skip': 'Episode is no longer available', + }, { + 'url': 'https://www.svtplay.se/video/emBxBQj', + 'md5': '2382036fd6f8c994856c323fe51c426e', + 'info_dict': { + 'id': 'eyBd9aj', + 'ext': 'mp4', + 'title': '1. Farlig kryssning', + 'timestamp': 1491019200, + 'upload_date': '20170401', + 'duration': 2566, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', + 'age_limit': 0, + 'episode': '1. 
Farlig kryssning', + 'series': 'Rederiet', + 'subtitles': { + 'sv': 'count:3' + }, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://www.svtplay.se/video/jz2rYz7/anders-hansen-moter/james-fallon?info=visa', + 'info_dict': { + 'id': 'jvXAGVb', + 'ext': 'mp4', + 'title': 'James Fallon', + 'timestamp': 1673917200, + 'upload_date': '20230117', + 'duration': 1081, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', + 'age_limit': 0, + 'episode': 'James Fallon', + 'series': 'Anders Hansen möter...', + }, + 'params': { + 'skip_download': 'dash', }, }, { 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', @@ -248,14 +286,15 @@ class SVTPlayIE(SVTPlayBaseIE): compat_str) if not svt_id: + nextjs_data = self._search_nextjs_data(webpage, video_id, fatal=False) + svt_id = traverse_obj(nextjs_data, ( + 'props', 'urqlState', ..., 'data', {json.loads}, 'detailsPageByPath', + 'video', 'svtId', {str}), get_all=False) + + if not svt_id: svt_id = self._search_regex( (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', - r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id), - r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', - r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', - r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', - r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/[\w-]+/[^"\']*\b(?:modalId|id)=([\w-]+)'), webpage, 'video id') info_dict = self._extract_by_video_id(svt_id, webpage) diff --git a/hypervideo_dl/extractor/tagesschau.py b/hypervideo_dl/extractor/tagesschau.py index ea0532c..e23b490 100644 --- a/hypervideo_dl/extractor/tagesschau.py +++ b/hypervideo_dl/extractor/tagesschau.py @@ -2,10 +2,12 @@ import re from .common import InfoExtractor from ..utils import ( - js_to_json, + UnsupportedError, extract_attributes, - try_get, int_or_none, + js_to_json, + parse_iso8601, + try_get, ) @@ -14,36 +16,38 @@ class TagesschauIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', - 'md5': '7a7287612fa881a1ae1d087df45c2fd6', + 'md5': 'ccb9359bf8c4795836e43759f3408a93', 'info_dict': { 'id': 'video-102143-1', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', + 'duration': 138, }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', - 'md5': '3c54c1f6243d279b706bde660ceec633', + 'md5': '5c15e8f3da049e48829ec9786d835536', 'info_dict': { 'id': 'ts-5727-1', 'ext': 'mp4', 'title': 'Ganze Sendung', + 'duration': 932, }, }, { # exclusive audio 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', - 'md5': '4cf22023c285f35e99c24d290ba58cc9', + 'md5': '4bff8f23504df56a0d86ed312d654182', 'info_dict': { 'id': 'audio-29417-1', 'ext': 'mp3', - 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt', + 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet', }, }, { 'url': 'http://www.tagesschau.de/inland/bnd-303.html', - 'md5': '12cfb212d9325b5ba0d52b625f1aa61c', + 'md5': 'f049fa1698d7564e9ca4c3325108f034', 'info_dict': { 'id': 'bnd-303-1', - 'ext': 'mp4', - 'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa', + 'ext': 'mp3', + 'title': 'Das Siegel des 
Bundesnachrichtendienstes | dpa', }, }, { 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', @@ -51,13 +55,24 @@ class TagesschauIE(InfoExtractor): 'id': 'afd-parteitag-135', 'title': 'AfD', }, - 'playlist_count': 20, + 'playlist_mincount': 15, }, { 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', 'info_dict': { 'id': 'audio-29417-1', 'ext': 'mp3', - 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt', + 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet', + }, + }, { + 'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html', + 'info_dict': { + 'id': 'podcast-11km-327', + 'ext': 'mp3', + 'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen', + 'upload_date': '20230322', + 'timestamp': 1679482808, + 'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg', + 'description': 'md5:dad059931fe4b3693e3656e93a249848', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', @@ -117,7 +132,7 @@ class TagesschauIE(InfoExtractor): formats = [] if media_url.endswith('master.m3u8'): formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls') - elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'): + elif media_url.endswith('.mp3'): formats = [{ 'url': media_url, 'vcodec': 'none', @@ -130,20 +145,19 @@ class TagesschauIE(InfoExtractor): 'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])), 'formats': formats }) + + if not entries: + raise UnsupportedError(url) + if len(entries) > 1: return self.playlist_result(entries, display_id, title) - formats = entries[0]['formats'] - video_info = self._search_json_ld(webpage, video_id) - description = video_info.get('description') - thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail') - timestamp = video_info.get('timestamp') - title = title or video_info.get('description') return { 'id': display_id, 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - 'timestamp': timestamp, - 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': entries[0]['formats'], + 'timestamp': parse_iso8601(self._html_search_meta('date', webpage)), + 'description': self._og_search_description(webpage), + 'duration': entries[0]['duration'], } diff --git a/hypervideo_dl/extractor/tbsjp.py b/hypervideo_dl/extractor/tbsjp.py new file mode 100644 index 0000000..77ddeca --- /dev/null +++ b/hypervideo_dl/extractor/tbsjp.py @@ -0,0 +1,152 @@ +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + clean_html, + get_element_text_and_html_by_tag, + int_or_none, + str_or_none, + traverse_obj, + try_call, + unified_timestamp, + urljoin, +) + + +class TBSJPEpisodeIE(InfoExtractor): + _VALID_URL = r'https?://cu\.tbs\.co\.jp/episode/(?P<id>[\d_]+)' + _GEO_BYPASS = False + _TESTS = [{ + 'url': 'https://cu.tbs.co.jp/episode/23613_2044134_1000049010', + 'skip': 'streams geo-restricted, Japan only. 
Also, will likely expire eventually', + 'info_dict': { + 'title': 'VIVANT 第三話 誤送金完結へ!絶体絶命の反撃開始', + 'id': '23613_2044134_1000049010', + 'ext': 'mp4', + 'upload_date': '20230728', + 'duration': 3517, + 'release_timestamp': 1691118230, + 'episode': '第三話 誤送金完結へ!絶体絶命の反撃開始', + 'release_date': '20230804', + 'categories': 'count:11', + 'episode_number': 3, + 'timestamp': 1690522538, + 'description': 'md5:2b796341af1ef772034133174ba4a895', + 'series': 'VIVANT', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + meta = self._search_json(r'window\.app\s*=', webpage, 'episode info', video_id, fatal=False) + episode = traverse_obj(meta, ('falcorCache', 'catalog', 'episode', video_id, 'value')) + + tf_path = self._search_regex( + r'<script[^>]+src=["\'](/assets/tf\.[^"\']+\.js)["\']', webpage, 'stream API config') + tf_js = self._download_webpage(urljoin(url, tf_path), video_id, note='Downloading stream API config') + video_url = self._search_regex(r'videoPlaybackUrl:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API url') + api_key = self._search_regex(r'api_key:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API key') + + try: + source_meta = self._download_json(f'{video_url}ref:{video_id}', video_id, + headers={'X-Streaks-Api-Key': api_key}, + note='Downloading stream metadata') + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + self.raise_geo_restricted(countries=['JP']) + raise + + formats, subtitles = [], {} + for src in traverse_obj(source_meta, ('sources', ..., 'src')): + fmts, subs = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'title': try_call(lambda: clean_html(get_element_text_and_html_by_tag('h3', webpage)[0])), + 'id': video_id, + **traverse_obj(episode, { + 'categories': ('keywords', {list}), + 'id': ('content_id', {str}), + 'description': ('description', 0, 'value'), + 'timestamp': ('created_at', {unified_timestamp}), + 'release_timestamp': ('pub_date', {unified_timestamp}), + 'duration': ('tv_episode_info', 'duration', {int_or_none}), + 'episode_number': ('tv_episode_info', 'episode_number', {int_or_none}), + 'episode': ('title', lambda _, v: not v.get('is_phonetic'), 'value'), + 'series': ('custom_data', 'program_name'), + }, get_all=False), + 'formats': formats, + 'subtitles': subtitles, + } + + +class TBSJPProgramIE(InfoExtractor): + _VALID_URL = r'https?://cu\.tbs\.co\.jp/program/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://cu.tbs.co.jp/program/23601', + 'playlist_mincount': 4, + 'info_dict': { + 'id': '23601', + 'categories': ['エンタメ', 'ミライカプセル', '会社', '働く', 'バラエティ', '動画'], + 'description': '幼少期の夢は大人になって、どう成長したのだろうか?\nそしてその夢は今後、どのように広がっていくのか?\nいま話題の会社で働く人の「夢の成長」を描く', + 'series': 'ミライカプセル -I have a dream-', + 'title': 'ミライカプセル -I have a dream-' + } + }] + + def _real_extract(self, url): + programme_id = self._match_id(url) + webpage = self._download_webpage(url, programme_id) + meta = self._search_json(r'window\.app\s*=', webpage, 'programme info', programme_id) + + programme = traverse_obj(meta, ('falcorCache', 'catalog', 'program', programme_id, 'false', 'value')) + + return { + '_type': 'playlist', + 'entries': [self.url_result(f'https://cu.tbs.co.jp/episode/{video_id}', TBSJPEpisodeIE, video_id) + for video_id in traverse_obj(programme, ('custom_data', 'seriesList', 'episodeCode', ...))], + 'id': programme_id, + **traverse_obj(programme, { + 'categories': 
('keywords', ...), + 'id': ('tv_episode_info', 'show_content_id', {str_or_none}), + 'description': ('custom_data', 'program_description'), + 'series': ('custom_data', 'program_name'), + 'title': ('custom_data', 'program_name'), + }), + } + + +class TBSJPPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://cu\.tbs\.co\.jp/playlist/(?P<id>[\da-f]+)' + _TESTS = [{ + 'url': 'https://cu.tbs.co.jp/playlist/184f9970e7ba48e4915f1b252c55015e', + 'playlist_mincount': 4, + 'info_dict': { + 'title': 'まもなく配信終了', + 'id': '184f9970e7ba48e4915f1b252c55015e', + } + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + page = self._download_webpage(url, playlist_id) + meta = self._search_json(r'window\.app\s*=', page, 'playlist info', playlist_id) + playlist = traverse_obj(meta, ('falcorCache', 'playList', playlist_id)) + + def entries(): + for entry in traverse_obj(playlist, ('catalogs', 'value', lambda _, v: v['content_id'])): + # TODO: it's likely possible to get all metadata from the playlist page json instead + content_id = entry['content_id'] + content_type = entry.get('content_type') + if content_type == 'tv_show': + yield self.url_result( + f'https://cu.tbs.co.jp/program/{content_id}', TBSJPProgramIE, content_id) + elif content_type == 'tv_episode': + yield self.url_result( + f'https://cu.tbs.co.jp/episode/{content_id}', TBSJPEpisodeIE, content_id) + else: + self.report_warning(f'Skipping "{content_id}" with unsupported content_type "{content_type}"') + + return self.playlist_result(entries(), playlist_id, traverse_obj(playlist, ('display_name', 'value'))) diff --git a/hypervideo_dl/extractor/teachable.py b/hypervideo_dl/extractor/teachable.py index c212a49..01906bd 100644 --- a/hypervideo_dl/extractor/teachable.py +++ b/hypervideo_dl/extractor/teachable.py @@ -56,7 +56,7 @@ class TeachableBaseIE(InfoExtractor): self._logged_in = True return - login_url = urlh.geturl() + login_url = urlh.url login_form = self._hidden_inputs(login_page) diff --git a/hypervideo_dl/extractor/teamcoco.py b/hypervideo_dl/extractor/teamcoco.py index a822b67..d32f812 100644 --- a/hypervideo_dl/extractor/teamcoco.py +++ b/hypervideo_dl/extractor/teamcoco.py @@ -1,57 +1,109 @@ import json +import re from .turner import TurnerBaseIE from ..utils import ( - determine_ext, ExtractorError, - int_or_none, + clean_html, + determine_ext, + make_archive_id, + merge_dicts, mimetype2ext, parse_duration, - parse_iso8601, - qualities, + parse_qs, + traverse_obj, + unified_timestamp, + urljoin, + url_or_none, ) -class TeamcocoIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:\w+\.)?teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)' +class TeamcocoBaseIE(TurnerBaseIE): + _QUALITIES = { + 'low': (480, 272), + 'sd': (640, 360), + 'hd': (1280, 720), + 'uhd': (1920, 1080), + } + + def _get_formats_and_subtitles(self, info, video_id): + formats, subtitles = [], {} + + for src in traverse_obj(info, ('src', ..., {dict})): + format_id = src.get('label') + src_url = src.get('src') + if re.match(r'https?:/[^/]', src_url): + src_url = src_url.replace(':/', '://', 1) + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + + if not format_id or not src_url: + continue + elif format_id == 'hls' or ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + elif format_id in self._QUALITIES: + if src_url.startswith('/mp4:protected/'): + # TODO: Correct extraction for these files + continue + 
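# fixed-quality progressive MP4: width/height are looked up from the label via self._QUALITIES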
formats.append({ + 'url': src_url, + 'ext': ext, + 'format_id': format_id, + 'width': self._QUALITIES[format_id][0], + 'height': self._QUALITIES[format_id][1], + }) + + return formats, subtitles + + +class TeamcocoIE(TeamcocoBaseIE): + _VALID_URL = r'https?://(?:www\.)?teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)' _TESTS = [ { 'url': 'http://teamcoco.com/video/mary-kay-remote', - 'md5': '55d532f81992f5c92046ad02fec34d7d', 'info_dict': { 'id': '80187', + 'display_id': 'video_mary-kay-remote', 'ext': 'mp4', 'title': 'Conan Becomes A Mary Kay Beauty Consultant', - 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.', - 'duration': 495.0, + 'description': 'md5:9fb64e45b5aef6b2af1b67612b36c162', + 'thumbnail': 'https://teamcoco.com/image/thumb?id=80187', 'upload_date': '20140402', - 'timestamp': 1396407600, - } + 'timestamp': 1396440000, + }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush', - 'md5': 'cde9ba0fa3506f5f017ce11ead928f9a', 'info_dict': { 'id': '19705', + 'display_id': 'video_louis-ck-interview-george-w-bush', 'ext': 'mp4', - 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', 'title': 'Louis C.K. Interview Pt. 1 11/3/11', - 'duration': 288, + 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.', + 'thumbnail': 'https://teamcoco.com/image/thumb?id=19705', 'upload_date': '20111104', - 'timestamp': 1320405840, - } + 'timestamp': 1320408000, + }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey', 'info_dict': { 'id': '88748', + 'display_id': 'video_timothy-olyphant-drinking-whiskey', 'ext': 'mp4', 'title': 'Timothy Olyphant Raises A Toast To “Justified”', 'description': 'md5:15501f23f020e793aeca761205e42c24', 'upload_date': '20150415', - 'timestamp': 1429088400, + 'timestamp': 1429099200, + 'thumbnail': 'https://teamcoco.com/image/thumb?id=88748', }, - 'params': { - 'skip_download': True, # m3u8 downloads - } }, { 'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9', 'info_dict': { @@ -60,9 +112,6 @@ class TeamcocoIE(TurnerBaseIE): 'title': 'Full Episode - Mon. 
6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett', 'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett', }, - 'params': { - 'skip_download': True, # m3u8 downloads - }, 'skip': 'This video is no longer available.', }, { 'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18', @@ -76,126 +125,156 @@ class TeamcocoIE(TurnerBaseIE): }, { 'url': 'http://teamcoco.com/israel/conan-hits-the-streets-beaches-of-tel-aviv', 'only_matching': True, - }, { - 'url': 'https://conan25.teamcoco.com/video/ice-cube-kevin-hart-conan-share-lyft', - 'only_matching': True, - } + }, ] - _RECORD_TEMPL = '''id - title - teaser - publishOn - thumb { - preview - } - tags { - name - } - duration - turnerMediaId - turnerMediaAuthToken''' - - def _graphql_call(self, query_template, object_type, object_id): - find_object = 'find' + object_type - return self._download_json( - 'https://teamcoco.com/graphql', object_id, data=json.dumps({ - 'query': query_template % (find_object, object_id) - }).encode(), headers={ - 'Content-Type': 'application/json', - })['data'][find_object] def _real_extract(self, url): - display_id = self._match_id(url) + display_id = self._match_id(url).replace('/', '_') + webpage = self._download_webpage(url, display_id) + data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['pageData'] + info = merge_dicts(*traverse_obj(data, ( + 'blocks', lambda _, v: v['name'] in ('meta-tags', 'video-player', 'video-info'), 'props', {dict}))) - response = self._graphql_call('''{ - %%s(slug: "%%s") { - ... on RecordSlug { - record { - %s - } - } - ... on PageSlug { - child { - id - } - } - ... on NotFoundSlug { - status - } - } -}''' % self._RECORD_TEMPL, 'Slug', display_id) - if response.get('status'): - raise ExtractorError('This video is no longer available.', expected=True) - - child = response.get('child') - if child: - record = self._graphql_call('''{ - %%s(id: "%%s") { - ... 
on Video { - %s - } - } -}''' % self._RECORD_TEMPL, 'Record', child['id']) - else: - record = response['record'] - video_id = record['id'] + thumbnail = traverse_obj( + info, (('image', 'poster'), {lambda x: urljoin('https://teamcoco.com/', x)}), get_all=False) + video_id = traverse_obj(parse_qs(thumbnail), ('id', 0)) or display_id - info = { + formats, subtitles = self._get_formats_and_subtitles(info, video_id) + + return { 'id': video_id, 'display_id': display_id, - 'title': record['title'], - 'thumbnail': record.get('thumb', {}).get('preview'), - 'description': record.get('teaser'), - 'duration': parse_duration(record.get('duration')), - 'timestamp': parse_iso8601(record.get('publishOn')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': thumbnail, + **traverse_obj(info, { + 'title': 'title', + 'description': (('descriptionHtml', 'description'), {clean_html}), + 'timestamp': ('publishedOn', {lambda x: f'{x} 12:00AM'}, {unified_timestamp}), + }, get_all=False), } - media_id = record.get('turnerMediaId') + +class ConanClassicIE(TeamcocoBaseIE): + _VALID_URL = r'https?://(?:(?:www\.)?conanclassic|conan25\.teamcoco)\.com/(?P<id>([^/]+/)*[^/?#]+)' + _TESTS = [{ + 'url': 'https://conanclassic.com/video/ice-cube-kevin-hart-conan-share-lyft', + 'info_dict': { + 'id': '74709', + 'ext': 'mp4', + 'title': 'Ice Cube, Kevin Hart, & Conan Share A Lyft Car', + 'display_id': 'video/ice-cube-kevin-hart-conan-share-lyft', + 'description': 'The stars of "Ride Along" teach Conan how to roll around Hollywood.', + 'thumbnail': 'http://cdn.teamcococdn.com/image/640x360/lyft-5bd75f82b616c.png', + 'duration': 570.0, + 'upload_date': '20131211', + 'timestamp': 1386721620, + '_old_archive_ids': ['teamcoco 74709'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://conan25.teamcoco.com/video/ice-cube-kevin-hart-conan-share-lyft', + 'only_matching': True, + }] + + _GRAPHQL_QUERY = '''query find($id: ID!) { + findRecord(id: $id) { + +... on MetaInterface { + id + title + teaser + publishOn + slug + thumb { + +... on FileInterface { + id + path + preview + mime +} + + } +} + +... on Video { + videoType + duration + isLive + youtubeId + turnerMediaId + turnerMediaAuthToken + airDate +} + +... 
on Episode { + airDate + seasonNumber + episodeNumber + guestNames +} + + } + findRecordVideoMetadata(id: $id) { + turnerMediaId + turnerMediaAuthToken + duration + src + } +}''' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['pageData'] + video_id = traverse_obj( + data, ('blocks', ..., 'props', 'fieldDefs', lambda _, v: v['name'] == 'incomingVideoId', 'value'), + ('blocks', ..., 'props', 'fields', 'incomingVideoRecord', 'id'), get_all=False) + if not video_id: + self.raise_no_formats('Unable to extract video ID from webpage', expected=True) + + response = self._download_json( + 'https://conanclassic.com/api/legacy/graphql', video_id, data=json.dumps({ + 'query': self._GRAPHQL_QUERY, + 'variables': {'id': video_id}, + }, separators=(',', ':')).encode(), headers={ + 'Content-Type': 'application/json', + }) + + info = traverse_obj(response, ('data', 'findRecord', { + 'title': 'title', + 'description': 'teaser', + 'thumbnail': ('thumb', 'preview', {url_or_none}), + 'duration': ('duration', {parse_duration}), + 'timestamp': ('publishOn', {unified_timestamp}), + })) + + media_id = traverse_obj( + response, ('data', ('findRecord', 'findRecordVideoMetadata'), 'turnerMediaId'), get_all=False) if media_id: + token = traverse_obj( + response, ('data', ('findRecord', 'findRecordVideoMetadata'), 'turnerMediaAuthToken'), get_all=False) + if not token: + raise ExtractorError('No Turner Media auth token found in API response') self._initialize_geo_bypass({ 'countries': ['US'], }) info.update(self._extract_ngtv_info(media_id, { - 'accessToken': record['turnerMediaAuthToken'], + 'accessToken': token, 'accessTokenType': 'jws', })) else: - video_sources = self._download_json( - 'https://teamcoco.com/_truman/d/' + video_id, - video_id)['meta']['src'] - if isinstance(video_sources, dict): - video_sources = video_sources.values() - - formats = [] - get_quality = qualities(['low', 'sd', 'hd', 'uhd']) - for src in video_sources: - if not isinstance(src, dict): - continue - src_url = src.get('src') - if not src_url: - continue - format_id = src.get('label') - ext = determine_ext(src_url, mimetype2ext(src.get('type'))) - if format_id == 'hls' or ext == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if src_url.startswith('/'): - src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url - formats.extend(self._extract_m3u8_formats( - src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) - else: - if src_url.startswith('/mp4:protected/'): - # TODO Correct extraction for these files - continue - tbr = int_or_none(self._search_regex( - r'(\d+)k\.mp4', src_url, 'tbr', default=None)) - - formats.append({ - 'url': src_url, - 'ext': ext, - 'tbr': tbr, - 'format_id': format_id, - 'quality': get_quality(format_id), - }) - info['formats'] = formats - - return info + formats, subtitles = self._get_formats_and_subtitles( + traverse_obj(response, ('data', 'findRecordVideoMetadata')), video_id) + info.update({ + 'formats': formats, + 'subtitles': subtitles, + }) + + return { + 'id': video_id, + 'display_id': display_id, + '_old_archive_ids': [make_archive_id('Teamcoco', video_id)], + **info, + } diff --git a/hypervideo_dl/extractor/telecaribe.py b/hypervideo_dl/extractor/telecaribe.py new file mode 100644 index 0000000..91118a1 --- /dev/null +++ b/hypervideo_dl/extractor/telecaribe.py @@ -0,0 +1,91 @@ +import re + +from .common import InfoExtractor +from 
..utils import traverse_obj + + +class TelecaribePlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?play\.telecaribe\.co/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.play.telecaribe.co/breicok', + 'info_dict': { + 'id': 'breicok', + 'title': 'Breicok', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.play.telecaribe.co/si-fue-gol-de-yepes', + 'info_dict': { + 'id': 'si-fue-gol-de-yepes', + 'title': 'Sí Fue Gol de Yepes', + }, + 'playlist_count': 6, + }, { + 'url': 'https://www.play.telecaribe.co/ciudad-futura', + 'info_dict': { + 'id': 'ciudad-futura', + 'title': 'Ciudad Futura', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.play.telecaribe.co/live', + 'info_dict': { + 'id': 'live', + 'title': r're:^Señal en vivo', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + } + }, { + 'url': 'https://www.play.telecaribe.co/liveplus', + 'info_dict': { + 'id': 'liveplus', + 'title': r're:^Señal en vivo Plus', + 'live_status': 'is_live', + 'ext': 'mp4', + }, + 'params': { + 'skip_download': 'Livestream', + }, + 'skip': 'Geo-restricted to Colombia', + }] + + def _download_player_webpage(self, webpage, display_id): + page_id = self._search_regex( + (r'window\.firstPageId\s*=\s*["\']([^"\']+)', r'<div[^>]+id\s*=\s*"pageBackground_([^"]+)'), + webpage, 'page_id') + + props = self._download_json(self._search_regex( + rf'<link[^>]+href\s*=\s*"([^"]+)"[^>]+id\s*=\s*"features_{page_id}"', + webpage, 'json_props_url'), display_id)['props']['render']['compProps'] + + return self._download_webpage(traverse_obj(props, (..., 'url'))[-1], display_id) + + def _get_clean_title(self, title): + return re.sub(r'\s*\|\s*Telecaribe\s*VOD', '', title or '').strip() or None + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player = self._download_player_webpage(webpage, display_id) + + livestream_url = self._search_regex( + r'(?:let|const|var)\s+source\s*=\s*["\']([^"\']+)', player, 'm3u8 url', default=None) + + if not livestream_url: + return self.playlist_from_matches( + re.findall(r'<a[^>]+href\s*=\s*"([^"]+\.mp4)', player), display_id, + self._get_clean_title(self._og_search_title(webpage))) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + livestream_url, display_id, 'mp4', live=True) + + return { + 'id': display_id, + 'title': self._get_clean_title(self._og_search_title(webpage)), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + } diff --git a/hypervideo_dl/extractor/telemundo.py b/hypervideo_dl/extractor/telemundo.py index 88f29cb..54e74a6 100644 --- a/hypervideo_dl/extractor/telemundo.py +++ b/hypervideo_dl/extractor/telemundo.py @@ -1,9 +1,6 @@ from .common import InfoExtractor -from ..utils import ( - try_get, - unified_timestamp, - HEADRequest, -) +from ..networking import HEADRequest +from ..utils import try_get, unified_timestamp class TelemundoIE(InfoExtractor): @@ -38,7 +35,7 @@ class TelemundoIE(InfoExtractor): m3u8_url = self._request_webpage(HEADRequest( redirect_url + '?format=redirect&manifest=m3u&format=redirect&Tracking=true&Embedded=true&formats=MPEG4'), - video_id, 'Processing m3u8').geturl() + video_id, 'Processing m3u8').url formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') date = unified_timestamp(try_get( metadata, lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['datePublished'].split(' ', 1)[1])) diff --git a/hypervideo_dl/extractor/tempo.py 
b/hypervideo_dl/extractor/tempo.py index 1cfb956..9318d6f 100644 --- a/hypervideo_dl/extractor/tempo.py +++ b/hypervideo_dl/extractor/tempo.py @@ -1,5 +1,81 @@ +import re + from .common import InfoExtractor -from ..utils import int_or_none, parse_iso8601, str_or_none, traverse_obj +from ..utils import ( + int_or_none, + parse_iso8601, + traverse_obj, + try_call +) + + +class IVXPlayerIE(InfoExtractor): + _VALID_URL = r'ivxplayer:(?P<video_id>\d+):(?P<player_key>\w+)' + _TESTS = [{ + 'url': 'ivxplayer:2366065:4a89dfe6bc8f002596b1dfbd600730b1', + 'info_dict': { + 'id': '2366065', + 'ext': 'mp4', + 'duration': 112, + 'upload_date': '20221204', + 'title': 'Film Indonesia di Disney Content Showcase Asia Pacific 2022', + 'timestamp': 1670151746, + 'thumbnail': 'https://ivx-image.ivideosmart.com/serve/image/video/2366065?width=300' + } + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.cantika.com/video/31737/film-indonesia-di-disney-content-showcase-asia-pacific-2022', + 'info_dict': { + 'id': '2374200', + 'ext': 'mp4', + 'duration': 110, + 'title': 'Serial Indonesia di Disney Content Showcase Asia Pacific 2022', + 'timestamp': 1670639416, + 'upload_date': '20221210', + 'thumbnail': 'https://ivx-image.ivideosmart.com/serve/image/video/2374200?width=300' + } + }, { + 'url': 'https://www.gooto.com/video/11437/wuling-suv-ramai-dikunjungi-di-giias-2018', + 'info_dict': { + 'id': '892109', + 'ext': 'mp4', + 'title': 'Wuling SUV Ramai Dikunjungi di GIIAS 2018', + 'upload_date': '20180811', + 'description': 'md5:6d901483d0aacc664aecb4489719aafa', + 'duration': 75, + 'timestamp': 1534011263, + 'thumbnail': 'https://ivx-image.ivideosmart.com/serve/image/video/892109?width=300' + } + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + # more info at https://player.ivideosmart.com/ivsplayer/v4/dist/js/loader.js + mobj = re.search( + r'<ivs-player\s*[^>]+data-ivs-key\s*=\s*"(?P<player_key>[\w]+)\s*[^>]+\bdata-ivs-vid="(?P<video_id>[\w-]+)', + webpage) + if mobj: + yield f'ivxplayer:{mobj.group("video_id")}:{mobj.group("player_key")}' + raise cls.StopExtraction() + + def _real_extract(self, url): + video_id, player_key = self._match_valid_url(url).group('video_id', 'player_key') + json_data = self._download_json( + f'https://ivxplayer.ivideosmart.com/prod/video/{video_id}?key={player_key}', video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + json_data['player']['video_url'], video_id) + + return { + 'id': str(json_data['ivx']['id']), + 'title': traverse_obj(json_data, ('ivx', 'name')), + 'description': traverse_obj(json_data, ('ivx', 'description')), + 'duration': int_or_none(traverse_obj(json_data, ('ivx', 'duration'))), + 'timestamp': parse_iso8601(traverse_obj(json_data, ('ivx', 'published_at'))), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': traverse_obj(json_data, ('ivx', 'thumbnail_url')) + } class TempoIE(InfoExtractor): @@ -7,14 +83,14 @@ class TempoIE(InfoExtractor): _TESTS = [{ 'url': 'https://video.tempo.co/read/30058/anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki', 'info_dict': { - 'id': '2144438', + 'id': '2144275', + 'display_id': 'anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki', 'ext': 'mp4', 'title': 'Anies Baswedan Ajukan Banding Putusan PTUN Batalkan UMP DKI', - 'display_id': 'anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki', - 'duration': 84, + 'duration': 85, 'description': 'md5:a6822b7c4c874fa7e5bd63e96a387b66', 'thumbnail': 'https://statik.tempo.co/data/2022/07/27/id_1128287/1128287_720.jpg', 
- 'timestamp': 1658911277, + 'timestamp': 1658907970, 'upload_date': '20220727', 'tags': ['Anies Baswedan', ' PTUN', ' PTUN | Pengadilan Tata Usaha Negara', ' PTUN Batalkan UMP DKI', ' UMP DKI'], } @@ -24,30 +100,15 @@ class TempoIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - player_key, widget_id = self._search_regex( - r'<ivs-player\s*[^>]+data-ivs-key\s*=\s*"(?P<player_key>[\w]+)[^>]+\bdata-ivs-wid="(?P<widget_id>[\w-]+)', - webpage, 'player_key, widget_id', group=('player_key', 'widget_id')) + _, video_id, player_key = next(IVXPlayerIE._extract_embed_urls(url, webpage)).split(':') json_ld_data = self._search_json_ld(webpage, display_id) - json_data = self._download_json( - f'https://ivxplayer.ivideosmart.com/prod/widget/{widget_id}', - display_id, query={'key': player_key}) - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - json_data['player']['video_url'], display_id, ext='mp4') - - return { - 'id': str(json_data['ivx']['id']), - 'display_id': display_id, - 'formats': formats, - 'subtitles': subtitles, - 'title': (self._html_search_meta('twitter:title', webpage) or self._og_search_title(webpage) - or traverse_obj(json_data, ('ivx', 'name'))), - 'duration': int_or_none(traverse_obj(json_data, ('ivx', 'duration'))), - 'thumbnail': (self._html_search_meta('twitter:image:src', webpage) or self._og_search_thumbnail(webpage) - or traverse_obj(json_data, ('ivx', 'thumbnail_url'))), - 'description': (json_ld_data.get('description') or self._html_search_meta(['description', 'twitter:description'], webpage) - or self._og_search_description(webpage)), - 'timestamp': parse_iso8601(traverse_obj(json_data, ('ivx', 'created_at'))), - 'tags': str_or_none(self._html_search_meta('keywords', webpage), '').split(','), - } + return self.url_result( + f'ivxplayer:{video_id}:{player_key}', display_id=display_id, + thumbnail=self._html_search_meta('twitter:image:src', webpage) or self._og_search_thumbnail(webpage), + tags=try_call(lambda: self._html_search_meta('keywords', webpage).split(',')), + description=(json_ld_data.get('description') + or self._html_search_meta(('description', 'twitter:description'), webpage) + or self._og_search_description(webpage)), + url_transparent=True) diff --git a/hypervideo_dl/extractor/tencent.py b/hypervideo_dl/extractor/tencent.py index ff8bf99..6618ea4 100644 --- a/hypervideo_dl/extractor/tencent.py +++ b/hypervideo_dl/extractor/tencent.py @@ -8,6 +8,7 @@ from .common import InfoExtractor from ..aes import aes_cbc_encrypt_bytes from ..utils import ( ExtractorError, + float_or_none, determine_ext, int_or_none, js_to_json, @@ -19,6 +20,16 @@ from ..utils import ( class TencentBaseIE(InfoExtractor): """Subclasses must set _API_URL, _APP_VERSION, _PLATFORM, _HOST, _REFERER""" + def _check_api_response(self, api_response): + msg = api_response.get('msg') + if api_response.get('code') != '0.0' and msg is not None: + if msg in ( + '您所在区域暂无此内容版权(如设置VPN请关闭后重试)', + 'This content is not available in your area due to copyright restrictions. Please choose other videos.' 
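+ # both messages indicate the same copyright geo-restriction, in Chinese and English variants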
+ ): + self.raise_geo_restricted() + raise ExtractorError(f'Tencent said: {msg}') + def _get_ckey(self, video_id, url, guid): ua = self.get_param('http_headers')['User-Agent'] @@ -32,7 +43,7 @@ class TencentBaseIE(InfoExtractor): padding_mode='whitespace').hex().upper() def _get_video_api_response(self, video_url, video_id, series_id, subtitle_format, video_format, video_quality): - guid = ''.join([random.choice(string.digits + string.ascii_lowercase) for _ in range(16)]) + guid = ''.join(random.choices(string.digits + string.ascii_lowercase, k=16)) ckey = self._get_ckey(video_id, video_url, guid) query = { 'vid': video_id, @@ -47,6 +58,11 @@ class TencentBaseIE(InfoExtractor): 'sphttps': '1', # Enable HTTPS 'otype': 'json', 'spwm': '1', + 'hevclv': '28', # Enable HEVC + 'drm': '40', # Enable DRM + # For HDR + 'spvideo': '4', + 'spsfrhdr': '100', # For SHD 'host': self._HOST, 'referer': self._REFERER, @@ -55,7 +71,7 @@ class TencentBaseIE(InfoExtractor): 'platform': self._PLATFORM, # For VQQ 'guid': guid, - 'flowid': ''.join(random.choice(string.digits + string.ascii_lowercase) for _ in range(32)), + 'flowid': ''.join(random.choices(string.digits + string.ascii_lowercase, k=32)), } return self._search_json(r'QZOutputJson=', self._download_webpage( @@ -63,7 +79,6 @@ class TencentBaseIE(InfoExtractor): def _extract_video_formats_and_subtitles(self, api_response, video_id): video_response = api_response['vl']['vi'][0] - video_width, video_height = video_response.get('vw'), video_response.get('vh') formats, subtitles = [], {} for video_format in video_response['ul']['ui']: @@ -71,47 +86,61 @@ class TencentBaseIE(InfoExtractor): fmts, subs = self._extract_m3u8_formats_and_subtitles( video_format['url'] + traverse_obj(video_format, ('hls', 'pt'), default=''), video_id, 'mp4', fatal=False) - for f in fmts: - f.update({'width': video_width, 'height': video_height}) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) else: formats.append({ 'url': f'{video_format["url"]}{video_response["fn"]}?vkey={video_response["fvkey"]}', - 'width': video_width, - 'height': video_height, 'ext': 'mp4', }) + identifier = video_response.get('br') + format_response = traverse_obj( + api_response, ('fl', 'fi', lambda _, v: v['br'] == identifier), + expected_type=dict, get_all=False) or {} + common_info = { + 'width': video_response.get('vw'), + 'height': video_response.get('vh'), + 'abr': float_or_none(format_response.get('audiobandwidth'), scale=1000), + 'vbr': float_or_none(format_response.get('bandwidth'), scale=1000), + 'fps': format_response.get('vfps'), + 'format': format_response.get('sname'), + 'format_id': format_response.get('name'), + 'format_note': format_response.get('resolution'), + 'dynamic_range': {'hdr10': 'hdr10'}.get(format_response.get('name'), 'sdr'), + 'has_drm': format_response.get('drm', 0) != 0, + } + for f in formats: + f.update(common_info) + return formats, subtitles - def _extract_video_native_subtitles(self, api_response, subtitles_format): + def _extract_video_native_subtitles(self, api_response): subtitles = {} for subtitle in traverse_obj(api_response, ('sfl', 'fi')) or (): subtitles.setdefault(subtitle['lang'].lower(), []).append({ 'url': subtitle['url'], - 'ext': subtitles_format, + 'ext': 'srt' if subtitle.get('captionType') == 1 else 'vtt', 'protocol': 'm3u8_native' if determine_ext(subtitle['url']) == 'm3u8' else 'http', }) return subtitles def _extract_all_video_formats_and_subtitles(self, url, video_id, series_id): - formats, subtitles = [], {} - for video_format, 
subtitle_format, video_quality in ( - # '': 480p, 'shd': 720p, 'fhd': 1080p - ('mp4', 'srt', ''), ('hls', 'vtt', 'shd'), ('hls', 'vtt', 'fhd')): - api_response = self._get_video_api_response( - url, video_id, series_id, subtitle_format, video_format, video_quality) - - if api_response.get('em') != 0 and api_response.get('exem') != 0: - if '您所在区域暂无此内容版权' in api_response.get('msg'): - self.raise_geo_restricted() - raise ExtractorError(f'Tencent said: {api_response.get("msg")}') + api_responses = [self._get_video_api_response(url, video_id, series_id, 'srt', 'hls', 'hd')] + self._check_api_response(api_responses[0]) + qualities = traverse_obj(api_responses, (0, 'fl', 'fi', ..., 'name')) or ('shd', 'fhd') + for q in qualities: + if q not in ('ld', 'sd', 'hd'): + api_responses.append(self._get_video_api_response( + url, video_id, series_id, 'vtt', 'hls', q)) + self._check_api_response(api_responses[-1]) + formats, subtitles = [], {} + for api_response in api_responses: fmts, subs = self._extract_video_formats_and_subtitles(api_response, video_id) - native_subtitles = self._extract_video_native_subtitles(api_response, subtitle_format) + native_subtitles = self._extract_video_native_subtitles(api_response) formats.extend(fmts) self._merge_subtitles(subs, native_subtitles, target=subtitles) @@ -120,7 +149,7 @@ class TencentBaseIE(InfoExtractor): def _get_clean_title(self, title): return re.sub( - r'\s*[_\-]\s*(?:Watch online|腾讯视频|(?:高清)?1080P在线观看平台).*?$', + r'\s*[_\-]\s*(?:Watch online|Watch HD Video Online|WeTV|腾讯视频|(?:高清)?1080P在线观看平台).*?$', '', title or '').strip() or None @@ -134,11 +163,9 @@ class VQQBaseIE(TencentBaseIE): _REFERER = 'v.qq.com' def _get_webpage_metadata(self, webpage, video_id): - return self._parse_json( - self._search_regex( - r'(?s)<script[^>]*>[^<]*window\.__pinia\s*=\s*([^<]+)</script>', - webpage, 'pinia data', fatal=False), - video_id, transform_source=js_to_json, fatal=False) + return self._search_json( + r'<script[^>]*>[^<]*window\.__(?:pinia|PINIA__)\s*=', + webpage, 'pinia data', video_id, transform_source=js_to_json, fatal=False) class VQQVideoIE(VQQBaseIE): @@ -147,27 +174,29 @@ class VQQVideoIE(VQQBaseIE): _TESTS = [{ 'url': 'https://v.qq.com/x/page/q326831cny0.html', - 'md5': '826ef93682df09e3deac4a6e6e8cdb6e', + 'md5': 'b11c9cb781df710d686b950376676e2a', 'info_dict': { 'id': 'q326831cny0', 'ext': 'mp4', 'title': '我是选手:雷霆裂阵,终极时刻', 'description': 'md5:e7ed70be89244017dac2a835a10aeb1e', 'thumbnail': r're:^https?://[^?#]+q326831cny0', + 'format_id': r're:^shd', }, }, { 'url': 'https://v.qq.com/x/page/o3013za7cse.html', - 'md5': 'b91cbbeada22ef8cc4b06df53e36fa21', + 'md5': 'a1bcf42c6d28c189bd2fe2d468abb287', 'info_dict': { 'id': 'o3013za7cse', 'ext': 'mp4', 'title': '欧阳娜娜VLOG', 'description': 'md5:29fe847497a98e04a8c3826e499edd2e', 'thumbnail': r're:^https?://[^?#]+o3013za7cse', + 'format_id': r're:^shd', }, }, { 'url': 'https://v.qq.com/x/cover/7ce5noezvafma27/a00269ix3l8.html', - 'md5': '71459c5375c617c265a22f083facce67', + 'md5': '87968df6238a65d2478f19c25adf850b', 'info_dict': { 'id': 'a00269ix3l8', 'ext': 'mp4', @@ -175,10 +204,12 @@ class VQQVideoIE(VQQBaseIE): 'description': 'md5:8cae3534327315b3872fbef5e51b5c5b', 'thumbnail': r're:^https?://[^?#]+7ce5noezvafma27', 'series': '鸡毛飞上天', + 'format_id': r're:^shd', }, + 'skip': '404', }, { 'url': 'https://v.qq.com/x/cover/mzc00200p29k31e/s0043cwsgj0.html', - 'md5': '96b9fd4a189fdd4078c111f21d7ac1bc', + 'md5': 'fadd10bf88aec3420f06f19ee1d24c5b', 'info_dict': { 'id': 's0043cwsgj0', 'ext': 'mp4', @@ -186,7 +217,9 @@ 
class VQQVideoIE(VQQBaseIE): 'description': 'md5:1d8c3a0b8729ae3827fa5b2d3ebd5213', 'thumbnail': r're:^https?://[^?#]+s0043cwsgj0', 'series': '青年理工工作者生活研究所', + 'format_id': r're:^shd', }, + 'params': {'skip_download': 'm3u8'}, }, { # Geo-restricted to China 'url': 'https://v.qq.com/x/cover/mcv8hkc8zk8lnov/x0036x5qqsr.html', @@ -319,6 +352,7 @@ class WeTvEpisodeIE(WeTvBaseIE): 'episode': 'Episode 1', 'episode_number': 1, 'duration': 2835, + 'format_id': r're:^shd', }, }, { 'url': 'https://wetv.vip/en/play/u37kgfnfzs73kiu/p0039b9nvik', @@ -333,6 +367,7 @@ class WeTvEpisodeIE(WeTvBaseIE): 'episode': 'Episode 1', 'episode_number': 1, 'duration': 2454, + 'format_id': r're:^shd', }, }, { 'url': 'https://wetv.vip/en/play/lcxgwod5hapghvw-WeTV-PICK-A-BOO/i0042y00lxp-Zhao-Lusi-Describes-The-First-Experiences-She-Had-In-Who-Rules-The-World-%7C-WeTV-PICK-A-BOO', @@ -342,11 +377,12 @@ class WeTvEpisodeIE(WeTvBaseIE): 'ext': 'mp4', 'title': 'md5:f7a0857dbe5fbbe2e7ad630b92b54e6a', 'description': 'md5:76260cb9cdc0ef76826d7ca9d92fadfa', - 'thumbnail': r're:^https?://[^?#]+lcxgwod5hapghvw', + 'thumbnail': r're:^https?://[^?#]+i0042y00lxp', 'series': 'WeTV PICK-A-BOO', 'episode': 'Episode 0', 'episode_number': 0, 'duration': 442, + 'format_id': r're:^shd', }, }] @@ -406,6 +442,7 @@ class IflixEpisodeIE(IflixBaseIE): 'episode': 'Episode 1', 'episode_number': 1, 'duration': 2639, + 'format_id': r're:^shd', }, }, { 'url': 'https://www.iflix.com/en/play/fvvrcc3ra9lbtt1-Take-My-Brother-Away/i0029sd3gm1-EP1%EF%BC%9ATake-My-Brother-Away', @@ -420,6 +457,7 @@ class IflixEpisodeIE(IflixBaseIE): 'episode': 'Episode 1', 'episode_number': 1, 'duration': 228, + 'format_id': r're:^shd', }, }] diff --git a/hypervideo_dl/extractor/tennistv.py b/hypervideo_dl/extractor/tennistv.py index bc64226..c1b4a33 100644 --- a/hypervideo_dl/extractor/tennistv.py +++ b/hypervideo_dl/extractor/tennistv.py @@ -86,7 +86,7 @@ class TennisTVIE(InfoExtractor): }) self.get_token(None, { - 'code': urllib.parse.parse_qs(handle.geturl())['code'][-1], + 'code': urllib.parse.parse_qs(handle.url)['code'][-1], 'grant_type': 'authorization_code', 'client_id': 'tennis-tv-web', 'redirect_uri': 'https://www.tennistv.com/resources/v1.1.10/html/silent-check-sso.html' diff --git a/hypervideo_dl/extractor/tenplay.py b/hypervideo_dl/extractor/tenplay.py index 633032e..c7097cf 100644 --- a/hypervideo_dl/extractor/tenplay.py +++ b/hypervideo_dl/extractor/tenplay.py @@ -2,11 +2,8 @@ from datetime import datetime import base64 from .common import InfoExtractor -from ..utils import ( - HEADRequest, - int_or_none, - urlencode_postdata, -) +from ..networking import HEADRequest +from ..utils import int_or_none, urlencode_postdata class TenPlayIE(InfoExtractor): @@ -94,7 +91,7 @@ class TenPlayIE(InfoExtractor): data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', headers=headers).get('source') m3u8_url = self._request_webpage(HEADRequest( - _video_url), content_id).geturl() + _video_url), content_id).url if '10play-not-in-oz' in m3u8_url: self.raise_geo_restricted(countries=['AU']) formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') diff --git a/hypervideo_dl/extractor/testurl.py b/hypervideo_dl/extractor/testurl.py index dccca10..3cf0017 100644 --- a/hypervideo_dl/extractor/testurl.py +++ b/hypervideo_dl/extractor/testurl.py @@ -8,7 +8,7 @@ class TestURLIE(InfoExtractor): """ Allows addressing of the test cases as test:yout.*be_1 """ IE_DESC = False # Do not list - _VALID_URL = r'test(?:url)?:(?P<extractor>.*?)(?:_(?P<num>[0-9]+))?$' + 
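# <num> may now also be 'all', which expands to a playlist of every test case (handled below)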
_VALID_URL = r'test(?:url)?:(?P<extractor>.*?)(?:_(?P<num>\d+|all))?$' def _real_extract(self, url): from . import gen_extractor_classes @@ -23,11 +23,12 @@ class TestURLIE(InfoExtractor): if len(matching_extractors) == 0: raise ExtractorError(f'No extractors matching {extractor_id!r} found', expected=True) elif len(matching_extractors) > 1: - try: # Check for exact match - extractor = next( - ie for ie in matching_extractors - if ie.IE_NAME.lower() == extractor_id.lower()) - except StopIteration: + extractor = next(( # Check for exact match + ie for ie in matching_extractors if ie.IE_NAME.lower() == extractor_id.lower() + ), None) or next(( # Check for exact match without plugin suffix + ie for ie in matching_extractors if ie.IE_NAME.split('+')[0].lower() == extractor_id.lower() + ), None) + if not extractor: raise ExtractorError( 'Found multiple matching extractors: %s' % ' '.join(ie.IE_NAME for ie in matching_extractors), expected=True) @@ -35,6 +36,10 @@ class TestURLIE(InfoExtractor): extractor = matching_extractors[0] testcases = tuple(extractor.get_testcases(True)) + if num == 'all': + return self.playlist_result( + [self.url_result(tc['url'], extractor) for tc in testcases], + url, f'{extractor.IE_NAME} tests') try: tc = testcases[int(num or 0)] except IndexError: @@ -42,4 +47,4 @@ class TestURLIE(InfoExtractor): f'Test case {num or 0} not found, got only {len(testcases)} tests', expected=True) self.to_screen(f'Test URL: {tc["url"]}') - return self.url_result(tc['url']) + return self.url_result(tc['url'], extractor) diff --git a/hypervideo_dl/extractor/tf1.py b/hypervideo_dl/extractor/tf1.py index 4cf0322..aba4927 100644 --- a/hypervideo_dl/extractor/tf1.py +++ b/hypervideo_dl/extractor/tf1.py @@ -28,6 +28,25 @@ class TF1IE(InfoExtractor): 'skip_download': True, }, }, { + 'url': 'https://www.tf1.fr/tmc/burger-quiz/videos/burger-quiz-du-19-aout-2023-s03-episode-21-85585666.html', + 'info_dict': { + 'id': '14010600', + 'ext': 'mp4', + 'title': 'Burger Quiz - S03 EP21 avec Eye Haidara, Anne Depétrini, Jonathan Zaccaï et Pio Marmaï', + 'thumbnail': 'https://photos.tf1.fr/1280/720/burger-quiz-11-9adb79-0@1x.jpg', + 'description': 'Manu Payet recevra Eye Haidara, Anne Depétrini, Jonathan Zaccaï et Pio Marmaï.', + 'upload_date': '20230819', + 'timestamp': 1692469471, + 'season_number': 3, + 'series': 'Burger Quiz', + 'episode_number': 21, + 'season': 'Season 3', + 'tags': 'count:13', + 'episode': 'Episode 21', + 'duration': 2312 + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', 'only_matching': True, }, { diff --git a/hypervideo_dl/extractor/tfo.py b/hypervideo_dl/extractor/tfo.py index a24789c..d417f50 100644 --- a/hypervideo_dl/extractor/tfo.py +++ b/hypervideo_dl/extractor/tfo.py @@ -1,12 +1,8 @@ import json from .common import InfoExtractor -from ..utils import ( - HEADRequest, - ExtractorError, - int_or_none, - clean_html, -) +from ..networking import HEADRequest +from ..utils import ExtractorError, clean_html, int_or_none class TFOIE(InfoExtractor): diff --git a/hypervideo_dl/extractor/theplatform.py b/hypervideo_dl/extractor/theplatform.py index e659b8e..99caeb5 100644 --- a/hypervideo_dl/extractor/theplatform.py +++ b/hypervideo_dl/extractor/theplatform.py @@ -7,19 +7,23 @@ import hashlib from .once import OnceIE from .adobepass import AdobePassIE +from ..networking import Request from ..utils import ( determine_ext, ExtractorError, float_or_none, int_or_none, parse_qs, - sanitized_Request, 
unsmuggle_url, update_url_query, xpath_with_ns, mimetype2ext, find_xpath_attr, + traverse_obj, + update_url, + urlhandle_detect_ext, ) +from ..networking import HEADRequest default_ns = 'http://www.w3.org/2005/SMIL21/Language' _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) @@ -45,7 +49,7 @@ class ThePlatformBaseIE(OnceIE): raise ExtractorError( error_element.attrib['abstract'], expected=True) - smil_formats = self._parse_smil_formats( + smil_formats, subtitles = self._parse_smil_formats_and_subtitles( meta, smil_url, video_id, namespace=default_ns, # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com @@ -65,8 +69,6 @@ class ThePlatformBaseIE(OnceIE): formats.append(_format) - subtitles = self._parse_smil_subtitles(meta, default_ns) - return formats, subtitles def _download_theplatform_metadata(self, path, video_id): @@ -164,7 +166,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': '404 Not Found', }, { 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD', 'info_dict': { @@ -173,7 +176,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'description': 'md5:644ad9188d655b742f942bf2e06b002d', 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', 'uploader': 'EGSM', - } + }, + 'skip': '404 Not Found', }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', 'only_matching': True, @@ -191,6 +195,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'upload_date': '20150701', 'uploader': 'NBCU-NEWS', }, + 'skip': '404 Not Found', }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 # geo-restricted (US), HLS encrypted with AES-128 @@ -270,7 +275,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): source_url = smuggled_data.get('source_url') if source_url: headers['Referer'] = source_url - request = sanitized_Request(url, headers=headers) + request = Request(url, headers=headers) webpage = self._download_webpage(request, video_id) smil_url = self._search_regex( r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml', @@ -297,6 +302,17 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) + # With some sites, manifest URL must be forced to extract HLS formats + if not traverse_obj(formats, lambda _, v: v['format_id'].startswith('hls')): + m3u8_url = update_url(url, query='mbr=true&manifest=m3u', fragment=None) + urlh = self._request_webpage( + HEADRequest(m3u8_url), video_id, 'Checking for HLS formats', 'No HLS formats found', fatal=False) + if urlh and urlhandle_detect_ext(urlh) == 'm3u8': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, m3u8_id='hls', fatal=False) + formats.extend(m3u8_fmts) + self._merge_subtitles(m3u8_subs, target=subtitles) + ret = self._extract_theplatform_metadata(path, video_id) combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) ret.update({ diff --git a/hypervideo_dl/extractor/thesun.py b/hypervideo_dl/extractor/thesun.py index ba58482..5edcf1c 100644 --- a/hypervideo_dl/extractor/thesun.py +++ b/hypervideo_dl/extractor/thesun.py @@ -5,15 +5,22 @@ from ..utils import extract_attributes class TheSunIE(InfoExtractor): - _VALID_URL = r'https://(?:www\.)?thesun\.co\.uk/[^/]+/(?P<id>\d+)' - _TEST = { + _VALID_URL = 
r'https?://(?:www\.)?the-?sun(\.co\.uk|\.com)/[^/]+/(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.thesun.co.uk/tvandshowbiz/2261604/orlando-bloom-and-katy-perry-post-adorable-instagram-video-together-celebrating-thanksgiving-after-split-rumours/', 'info_dict': { 'id': '2261604', 'title': 'md5:cba22f48bad9218b64d5bbe0e16afddf', }, 'playlist_count': 2, - } + }, { + 'url': 'https://www.the-sun.com/entertainment/7611415/1000lb-sisters-fans-rip-amy-dangerous-health-decision/', + 'info_dict': { + 'id': '7611415', + 'title': 'md5:e0b9b976f79dc770e5c80f22f40bb844', + }, + 'playlist_count': 1, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/thisoldhouse.py b/hypervideo_dl/extractor/thisoldhouse.py index 55b6413..cc7beee 100644 --- a/hypervideo_dl/extractor/thisoldhouse.py +++ b/hypervideo_dl/extractor/thisoldhouse.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import HEADRequest +from ..networking import HEADRequest class ThisOldHouseIE(InfoExtractor): @@ -50,6 +50,6 @@ class ThisOldHouseIE(InfoExtractor): r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]', webpage, 'video url') if 'subscription_required=true' in video_url or 'c-entry-group-labels__image' in webpage: - return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).geturl(), 'Zype', display_id) + return self.url_result(self._request_webpage(HEADRequest(video_url), display_id).url, 'Zype', display_id) video_id = self._search_regex(r'(?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})', video_url, 'video id') return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id) diff --git a/hypervideo_dl/extractor/thisvid.py b/hypervideo_dl/extractor/thisvid.py new file mode 100644 index 0000000..9d3368e --- /dev/null +++ b/hypervideo_dl/extractor/thisvid.py @@ -0,0 +1,226 @@ +import itertools +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_class, + int_or_none, + url_or_none, + urljoin, +) + + +class ThisVidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)' + _TESTS = [{ + 'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/', + 'md5': '839becb572995687e11a69dc4358a386', + 'info_dict': { + 'id': '3533241', + 'ext': 'mp4', + 'title': 'Sitting on ball tight jeans', + 'description': 'md5:372353bb995883d1b65fddf507489acd', + 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg', + 'uploader_id': '150629', + 'uploader': 'jeanslevisjeans', + 'display_id': 'sitting-on-ball-tight-jeans', + 'age_limit': 18, + } + }, { + 'url': 'https://thisvid.com/embed/3533241/', + 'md5': '839becb572995687e11a69dc4358a386', + 'info_dict': { + 'id': '3533241', + 'ext': 'mp4', + 'title': 'Sitting on ball tight jeans', + 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg', + 'uploader_id': '150629', + 'uploader': 'jeanslevisjeans', + 'display_id': 'sitting-on-ball-tight-jeans', + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type') + webpage = self._download_webpage(url, main_id) + + title = self._html_search_regex( + r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>', + webpage, 'title') + + if 
type_ == 'embed': + # look for more metadata + video_alt_url = url_or_none(self._search_regex( + rf'''video_alt_url\s*:\s+'({self._VALID_URL}/)',''', + webpage, 'video_alt_url', default=None)) + if video_alt_url and video_alt_url != url: + webpage = self._download_webpage( + video_alt_url, main_id, + note='Redirecting embed to main page', fatal=False) or webpage + + video_holder = get_element_by_class('video-holder', webpage) or '' + if '>This video is a private video' in video_holder: + self.raise_login_required( + (clean_html(video_holder) or 'Private video').partition('\n')[0]) + + uploader = self._html_search_regex( + r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''', + webpage, 'uploader', default='') + uploader = re.split(r'''/["'][^>]*>\s*''', uploader) + if len(uploader) == 2: + # id must be non-empty, uploader could be '' + uploader_id, uploader = uploader + uploader = uploader or None + else: + uploader_id = uploader = None + + return self.url_result( + url, ie='Generic', url_transparent=True, + title=title, + age_limit=18, + uploader=uploader, + uploader_id=uploader_id) + + +class ThisVidPlaylistBaseIE(InfoExtractor): + _PLAYLIST_URL_RE = None + + @classmethod + def _find_urls(cls, html): + for m in re.finditer(rf'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>{cls._PLAYLIST_URL_RE}\b)[^>]+>''', html): + yield m.group('url') + + def _generate_playlist_entries(self, url, playlist_id, html=None): + page_url = url + for page in itertools.count(1): + if not html: + html = self._download_webpage( + page_url, playlist_id, note=f'Downloading page {page}', + fatal=False) or '' + + yield from self._find_urls(html) + + next_page = get_element_by_class('pagination-next', html) or '' + if next_page: + # member list page + next_page = urljoin(url, self._search_regex( + r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''', + next_page, 'next page link', group='url', default=None)) + + # in case a member page should have pagination-next with empty link, not just `else:` + if next_page is None: + # playlist page + parsed_url = urllib.parse.urlparse(page_url) + base_path, _, num = parsed_url.path.rpartition('/') + num = int_or_none(num) + if num is None: + base_path, num = parsed_url.path.rstrip('/'), 1 + parsed_url = parsed_url._replace(path=f'{base_path}/{num + 1}') + next_page = urllib.parse.urlunparse(parsed_url) + if page_url == next_page: + next_page = None + + if not next_page: + return + page_url, html = next_page, None + + def _make_playlist_result(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + title = re.split( + r'(?i)\s*\|\s*ThisVid\.com\s*$', + self._og_search_title(webpage, default=None) + or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or '', 1)[0] or None + + return self.playlist_from_matches( + self._generate_playlist_entries(url, playlist_id, webpage), + playlist_id=playlist_id, playlist_title=title, ie=ThisVidIE) + + +class ThisVidMemberIE(ThisVidPlaylistBaseIE): + _VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://thisvid.com/members/2140501/', + 'info_dict': { + 'id': '2140501', + 'title': 'Rafflesia\'s Profile', + }, + 'playlist_mincount': 16, + }, { + 'url': 'https://thisvid.com/members/2140501/favourite_videos/', + 'info_dict': { + 'id': '2140501', + 'title': 'Rafflesia\'s Favourite Videos', + }, + 'playlist_mincount': 15, + }, { + 
'url': 'https://thisvid.com/members/636468/public_videos/', + 'info_dict': { + 'id': '636468', + 'title': 'Happymouth\'s Public Videos', + }, + 'playlist_mincount': 196, + }] + _PLAYLIST_URL_RE = ThisVidIE._VALID_URL + + def _real_extract(self, url): + return self._make_playlist_result(url) + + +class ThisVidPlaylistIE(ThisVidPlaylistBaseIE): + _VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)' + _TESTS = [{ + 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/', + 'info_dict': { + 'id': '6615', + 'title': 'Underwear Stuff', + }, + 'playlist_mincount': 200, + }, { + 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/', + 'info_dict': { + 'id': '1072387', + 'ext': 'mp4', + 'title': 'Big Italian Booty 28', + 'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2', + 'uploader_id': '367912', + 'uploader': 'Jcmusclefun', + 'age_limit': 18, + 'display_id': 'big-italian-booty-28', + 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+1072387/preview\.jpg', + }, + 'params': { + 'noplaylist': True, + }, + }] + _PLAYLIST_URL_RE = _VALID_URL + + def _generate_playlist_entries(self, url, playlist_id, html=None): + for wrapped_url in super()._generate_playlist_entries(url, playlist_id, html): + video_id = re.match(self._VALID_URL, wrapped_url).group('video_id') + yield urljoin(url, f'/videos/{video_id}/') + + def _real_extract(self, url): + playlist_id, video_id = self._match_valid_url(url).group('id', 'video_id') + + if not self._yes_playlist(playlist_id, video_id): + redirect_url = urljoin(url, f'/videos/{video_id}/') + return self.url_result(redirect_url, ThisVidIE) + + result = self._make_playlist_result(url) + + # Fix duplicated title (`the title - the title` => `the title`) + title = result['title'] + t_len = len(title) + if t_len > 5 and t_len % 2 != 0: + t_len = t_len // 2 + if title[t_len] == '-': + first, second = map(str.strip, (title[:t_len], title[t_len + 1:])) + if first and first == second: + result['title'] = first + + return result diff --git a/hypervideo_dl/extractor/threeqsdn.py b/hypervideo_dl/extractor/threeqsdn.py index b104190..7841f8d 100644 --- a/hypervideo_dl/extractor/threeqsdn.py +++ b/hypervideo_dl/extractor/threeqsdn.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -90,7 +90,7 @@ class ThreeQSDNIE(InfoExtractor): config = self._download_json( url.replace('://playout.3qsdn.com/', '://playout.3qsdn.com/config/'), video_id) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: self.raise_geo_restricted() raise diff --git a/hypervideo_dl/extractor/tiktok.py b/hypervideo_dl/extractor/tiktok.py index 1bbf884..f14c4f9 100644 --- a/hypervideo_dl/extractor/tiktok.py +++ b/hypervideo_dl/extractor/tiktok.py @@ -1,25 +1,31 @@ import itertools import json import random +import re import string import time from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse +from ..networking import HEADRequest from ..utils import ( ExtractorError, - HEADRequest, LazyList, UnsupportedError, + UserNotLive, + determine_ext, + format_field, get_element_by_id, get_first, int_or_none, join_nonempty, + merge_dicts, qualities, remove_start, srt_subtitles_timecode, str_or_none, traverse_obj, + try_call, try_get, url_or_none, 
) @@ -30,11 +36,15 @@ class TikTokBaseIE(InfoExtractor): _WORKING_APP_VERSION = None _APP_NAME = 'trill' _AID = 1180 - _API_HOSTNAME = 'api-h2.tiktokv.com' _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s' _WEBPAGE_HOST = 'https://www.tiktok.com/' QUALITIES = ('360p', '540p', '720p', '1080p') + @property + def _API_HOSTNAME(self): + return self._configuration_arg( + 'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0] + @staticmethod def _create_url(user_id, video_id): return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}' @@ -45,14 +55,14 @@ class TikTokBaseIE(InfoExtractor): def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): - self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160))) + self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160))) webpage_cookies = self._get_cookies(self._WEBPAGE_HOST) if webpage_cookies.get('sid_tt'): self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value) return self._download_json( 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ - 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)', + 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)', 'Accept': 'application/json', }, query=query) @@ -64,16 +74,16 @@ class TikTokBaseIE(InfoExtractor): 'build_number': app_version, 'manifest_version_code': manifest_app_version, 'update_version_code': manifest_app_version, - 'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)), - 'uuid': ''.join([random.choice(string.digits) for _ in range(16)]), + 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), + 'uuid': ''.join(random.choices(string.digits, k=16)), '_rticket': int(time.time() * 1000), 'ts': int(time.time()), 'device_brand': 'Google', - 'device_type': 'Pixel 4', + 'device_type': 'Pixel 7', 'device_platform': 'android', - 'resolution': '1080*1920', + 'resolution': '1080*2400', 'dpi': 420, - 'os_version': '10', + 'os_version': '13', 'os_api': '29', 'carrier_region': 'US', 'sys_region': 'US', @@ -195,11 +205,22 @@ class TikTokBaseIE(InfoExtractor): known_resolutions = {} + def audio_meta(url): + ext = determine_ext(url, default_ext='m4a') + return { + 'format_note': 'Music track', + 'ext': ext, + 'acodec': 'aac' if ext == 'm4a' else ext, + 'vcodec': 'none', + 'width': None, + 'height': None, + } if ext == 'mp3' or '-music-' in url else {} + def extract_addr(addr, add_meta={}): parsed_meta, res = parse_url_key(addr.get('url_key', '')) if res: - known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height')) - known_resolutions[res].setdefault('width', add_meta.get('width')) + known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height') or addr.get('height')) + known_resolutions[res].setdefault('width', add_meta.get('width') or addr.get('width')) parsed_meta.update(known_resolutions.get(res, {})) add_meta.setdefault('height', int_or_none(res[:-1])) return [{ @@ -210,7 +231,8 @@ class TikTokBaseIE(InfoExtractor): 'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked **add_meta, 
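# keys from parsed_meta (parse_url_key / known_resolutions) override duplicates supplied via add_meta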
**parsed_meta, 'format_note': join_nonempty( - add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' ') + add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' '), + **audio_meta(url), } for url in addr.get('url_list') or []] # Hack: Add direct video links first to prioritize them when removing duplicate formats @@ -266,21 +288,19 @@ class TikTokBaseIE(InfoExtractor): thumbnails = [] for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak', 'origin_cover', 'dynamic_cover'): - cover = video_info.get(cover_id) - if cover: - for cover_url in cover['url_list']: - thumbnails.append({ - 'id': cover_id, - 'url': cover_url, - }) - - stats_info = aweme_detail.get('statistics', {}) - author_info = aweme_detail.get('author', {}) - music_info = aweme_detail.get('music', {}) + for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)): + thumbnails.append({ + 'id': cover_id, + 'url': cover_url, + }) + + stats_info = aweme_detail.get('statistics') or {} + author_info = aweme_detail.get('author') or {} + music_info = aweme_detail.get('music') or {} user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info, 'sec_uid', 'id', 'uid', 'unique_id', expected_type=str_or_none, get_all=False)) - labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str, default=[]) + labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str) contained_music_track = traverse_obj( music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str) @@ -298,20 +318,27 @@ class TikTokBaseIE(InfoExtractor): 'extractor_key': TikTokIE.ie_key(), 'extractor': TikTokIE.IE_NAME, 'webpage_url': self._create_url(author_info.get('uid'), aweme_id), - 'title': aweme_detail.get('desc'), - 'description': aweme_detail.get('desc'), - 'view_count': int_or_none(stats_info.get('play_count')), - 'like_count': int_or_none(stats_info.get('digg_count')), - 'repost_count': int_or_none(stats_info.get('share_count')), - 'comment_count': int_or_none(stats_info.get('comment_count')), - 'uploader': str_or_none(author_info.get('unique_id')), - 'creator': str_or_none(author_info.get('nickname')), - 'uploader_id': str_or_none(author_info.get('uid')), + **traverse_obj(aweme_detail, { + 'title': ('desc', {str}), + 'description': ('desc', {str}), + 'timestamp': ('create_time', {int_or_none}), + }), + **traverse_obj(stats_info, { + 'view_count': 'play_count', + 'like_count': 'digg_count', + 'repost_count': 'share_count', + 'comment_count': 'comment_count', + }, expected_type=int_or_none), + **traverse_obj(author_info, { + 'uploader': 'unique_id', + 'uploader_id': 'uid', + 'creator': 'nickname', + 'channel_id': 'sec_uid', + }, expected_type=str_or_none), 'uploader_url': user_url, 'track': music_track, 'album': str_or_none(music_info.get('album')) or None, 'artist': music_author or None, - 'timestamp': int_or_none(aweme_detail.get('create_time')), 'formats': formats, 'subtitles': self.extract_subtitles(aweme_detail, aweme_id), 'thumbnails': thumbnails, @@ -323,37 +350,27 @@ class TikTokBaseIE(InfoExtractor): '_format_sort_fields': ('quality', 'codec', 'size', 'br'), } - def _parse_aweme_video_web(self, aweme_detail, webpage_url): + def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id): video_info = aweme_detail['video'] author_info = traverse_obj(aweme_detail, 'authorInfo', 'author', expected_type=dict, default={}) music_info = aweme_detail.get('music') or {} stats_info = 
aweme_detail.get('stats') or {} - user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info, - 'secUid', 'id', 'uid', 'uniqueId', - expected_type=str_or_none, get_all=False) - or aweme_detail.get('authorSecId')) + channel_id = traverse_obj(author_info or aweme_detail, (('authorSecId', 'secUid'), {str}), get_all=False) + user_url = self._UPLOADER_URL_FORMAT % channel_id if channel_id else None formats = [] - play_url = video_info.get('playAddr') - width = video_info.get('width') - height = video_info.get('height') - if isinstance(play_url, str): - formats = [{ + width = int_or_none(video_info.get('width')) + height = int_or_none(video_info.get('height')) + + for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})): + formats.append({ 'url': self._proto_relative_url(play_url), 'ext': 'mp4', 'width': width, 'height': height, - }] - elif isinstance(play_url, list): - formats = [{ - 'url': self._proto_relative_url(url), - 'ext': 'mp4', - 'width': width, - 'height': height, - } for url in traverse_obj(play_url, (..., 'src'), expected_type=url_or_none, default=[]) if url] + }) - download_url = url_or_none(video_info.get('downloadAddr')) or traverse_obj(video_info, ('download', 'url'), expected_type=url_or_none) - if download_url: + for download_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})): formats.append({ 'format_id': 'download', 'url': self._proto_relative_url(download_url), @@ -361,44 +378,54 @@ class TikTokBaseIE(InfoExtractor): 'width': width, 'height': height, }) + self._remove_duplicate_formats(formats) thumbnails = [] - for thumbnail_name in ('thumbnail', 'cover', 'dynamicCover', 'originCover'): - if aweme_detail.get(thumbnail_name): - thumbnails = [{ - 'url': self._proto_relative_url(aweme_detail[thumbnail_name]), - 'width': width, - 'height': height - }] + for thumb_url in traverse_obj(aweme_detail, ( + (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {url_or_none})): + thumbnails.append({ + 'url': self._proto_relative_url(thumb_url), + 'width': width, + 'height': height, + }) return { - 'id': traverse_obj(aweme_detail, 'id', 'awemeId', expected_type=str_or_none), - 'title': aweme_detail.get('desc'), - 'duration': try_get(aweme_detail, lambda x: x['video']['duration'], int), - 'view_count': int_or_none(stats_info.get('playCount')), - 'like_count': int_or_none(stats_info.get('diggCount')), - 'repost_count': int_or_none(stats_info.get('shareCount')), - 'comment_count': int_or_none(stats_info.get('commentCount')), - 'timestamp': int_or_none(aweme_detail.get('createTime')), - 'creator': str_or_none(author_info.get('nickname')), - 'uploader': str_or_none(author_info.get('uniqueId') or aweme_detail.get('author')), - 'uploader_id': str_or_none(traverse_obj(author_info, 'id', 'uid', 'authorId')), + 'id': video_id, + **traverse_obj(aweme_detail, { + 'title': ('desc', {str}), + 'description': ('desc', {str}), + 'duration': ('video', 'duration', {int_or_none}), + 'timestamp': ('createTime', {int_or_none}), + }), + **traverse_obj(author_info or aweme_detail, { + 'creator': ('nickname', {str}), + 'uploader': (('uniqueId', 'author'), {str}), + 'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}), + }, get_all=False), + **traverse_obj(stats_info, { + 'view_count': 'playCount', + 'like_count': 'diggCount', + 'repost_count': 'shareCount', + 'comment_count': 'commentCount', + }, expected_type=int_or_none), + **traverse_obj(music_info, { + 'track': 'title', + 'album': ('album', {lambda x: x or 
None}), + 'artist': 'authorName', + }, expected_type=str), + 'channel_id': channel_id, 'uploader_url': user_url, - 'track': str_or_none(music_info.get('title')), - 'album': str_or_none(music_info.get('album')) or None, - 'artist': str_or_none(music_info.get('authorName')), 'formats': formats, 'thumbnails': thumbnails, - 'description': str_or_none(aweme_detail.get('desc')), 'http_headers': { - 'Referer': webpage_url + 'Referer': webpage_url, } } class TikTokIE(TikTokBaseIE): - _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)/video)/(?P<id>\d+)' + _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)?/video)/(?P<id>\d+)' _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})'] _TESTS = [{ @@ -426,7 +453,8 @@ class TikTokIE(TikTokBaseIE): 'artist': 'Ysrbeats', 'album': 'Lehanga', 'track': 'Lehanga', - } + }, + 'skip': '404 Not Found', }, { 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en', 'md5': '6f3cf8cdd9b28cb8363fe0a9a160695b', @@ -441,6 +469,7 @@ class TikTokIE(TikTokBaseIE): 'uploader': 'patrox', 'uploader_id': '18702747', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws', + 'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws', 'creator': 'patroX', 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', 'upload_date': '20190930', @@ -451,7 +480,7 @@ class TikTokIE(TikTokBaseIE): 'comment_count': int, 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson', 'track': 'Big Fun', - } + }, }, { # Banned audio, only available on the app 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402', @@ -464,6 +493,7 @@ class TikTokIE(TikTokBaseIE): 'creator': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', 'uploader_id': '6974687867511718913', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d', + 'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d', 'track': 'Boka Dance', 'artist': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6', 'timestamp': 1626121503, @@ -474,7 +504,7 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, }, { # Sponsored video, only available with feed workaround 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561', @@ -487,6 +517,7 @@ class TikTokIE(TikTokBaseIE): 'creator': 'Slap And Run', 'uploader_id': '7036055384943690754', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_', + 'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_', 'track': 'Promoted Music', 'timestamp': 1639754738, 'duration': 30, @@ -497,7 +528,7 @@ class TikTokIE(TikTokBaseIE): 'repost_count': int, 'comment_count': int, }, - 'expected_warnings': ['trying with webpage', 'Unable to find video in feed'] + 'params': {'skip_download': True}, # XXX: unable to download video data: HTTP Error 403: Forbidden }, { # Video without title and description 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694', @@ -510,6 +541,7 @@ class TikTokIE(TikTokBaseIE): 'creator': 'Pokemon', 'uploader_id': '6820838815978423302', 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W', + 'channel_id': 
'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W', 'track': 'original sound', 'timestamp': 1643714123, 'duration': 6, @@ -545,6 +577,107 @@ class TikTokIE(TikTokBaseIE): }, 'skip': 'This video is unavailable', }, { + # slideshow audio-only mp3 format + 'url': 'https://www.tiktok.com/@_le_cannibale_/video/7139980461132074283', + 'info_dict': { + 'id': '7139980461132074283', + 'ext': 'mp3', + 'title': 'TikTok video #7139980461132074283', + 'description': '', + 'creator': 'Antaura', + 'uploader': '_le_cannibale_', + 'uploader_id': '6604511138619654149', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP', + 'channel_id': 'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP', + 'artist': 'nathan !', + 'track': 'grahamscott canon', + 'upload_date': '20220905', + 'timestamp': 1662406249, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'thumbnail': r're:^https://.+\.webp', + }, + }, { + # only available via web + 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662', + 'md5': '6aba7fad816e8709ff2c149679ace165', + 'info_dict': { + 'id': '7206382937372134662', + 'ext': 'mp4', + 'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a', + 'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a', + 'creator': 'MoxyPatch', + 'uploader': 'moxypatch', + 'uploader_id': '7039142049363379205', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V', + 'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V', + 'artist': 'your worst nightmare', + 'track': 'original sound', + 'upload_date': '20230303', + 'timestamp': 1677866781, + 'duration': 10, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'thumbnail': r're:^https://.+', + 'thumbnails': 'count:3', + }, + 'expected_warnings': ['Unable to find video in feed'], + }, { + # 1080p format + 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', + 'md5': '982512017a8a917124d5a08c8ae79621', + 'info_dict': { + 'id': '7107337212743830830', + 'ext': 'mp4', + 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok', + 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! 
@musicontiktok', + 'uploader': 'tatemcrae', + 'uploader_id': '86328792343818240', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', + 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd', + 'creator': 'tate mcrae', + 'artist': 'tate mcrae', + 'track': 'original sound', + 'upload_date': '20220609', + 'timestamp': 1654805899, + 'duration': 150, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'thumbnail': r're:^https://.+\.webp', + }, + 'params': {'format': 'bytevc1_1080p_808907-0'}, + }, { + # Slideshow, audio-only m4a format + 'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594', + 'md5': '2ff8fe0174db2dbf49c597a7bef4e47d', + 'info_dict': { + 'id': '7253412088251534594', + 'ext': 'm4a', + 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ', + 'description': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ', + 'uploader': 'hara_yoimiya', + 'uploader_id': '6582536342634676230', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB', + 'channel_id': 'MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB', + 'creator': 'лампочка', + 'artist': 'Øneheart', + 'album': 'watching the stars', + 'track': 'watching the stars', + 'upload_date': '20230708', + 'timestamp': 1688816612, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + 'thumbnail': r're:^https://.+\.webp', + }, + }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', 'only_matching': True @@ -558,7 +691,7 @@ class TikTokIE(TikTokBaseIE): self.report_warning(f'{e}; trying with webpage') url = self._create_url(user_id, video_id) - webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'User-Agent:Mozilla/5.0'}) + webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}) next_data = self._search_nextjs_data(webpage, video_id, default='{}') if next_data: status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0 @@ -569,7 +702,7 @@ class TikTokIE(TikTokBaseIE): video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict) if status == 0: - return self._parse_aweme_video_web(video_data, url) + return self._parse_aweme_video_web(video_data, url, video_id) elif status == 10216: raise ExtractorError('This video is private', expected=True) raise ExtractorError('Video not available', video_id=video_id) @@ -634,7 +767,7 @@ class TikTokUserIE(TikTokBaseIE): 'max_cursor': 0, 'min_cursor': 0, 'retry_type': 'no_retry', - 'device_id': ''.join(random.choice(string.digits) for _ in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. + 'device_id': ''.join(random.choices(string.digits, k=19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. 
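# --- Editorial aside (not part of the patch) ---------------------------------
# The TikTok hunks above repeatedly swap ''.join(random.choice(s) for _ in range(n))
# for the equivalent, more idiomatic ''.join(random.choices(s, k=n)) when
# fabricating odin_tt, openudid, uuid and device_id values. A minimal
# standalone sketch of the pattern; `random_digits` is our own illustrative
# name, not a helper from this codebase:

import random
import string

def random_digits(length):
    """Return a pseudo-random digit string, e.g. for a throwaway device_id."""
    return ''.join(random.choices(string.digits, k=length))

# random.choices() samples with replacement in a single call, so the result
# distribution matches the old generator-expression form, e.g.:
#     random_digits(19)  ->  '8201554376091283347'  (19 pseudo-random digits)
# ------------------------------------------------------------------------------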
} for page in itertools.count(1): @@ -682,7 +815,7 @@ class TikTokBaseListIE(TikTokBaseIE): # XXX: Conventionally, base classes shoul 'cursor': 0, 'count': 20, 'type': 5, - 'device_id': ''.join(random.choice(string.digits) for i in range(19)) + 'device_id': ''.join(random.choices(string.digits, k=19)) } for page in itertools.count(1): @@ -796,6 +929,7 @@ class DouyinIE(TikTokBaseIE): 'description': '#杨超越 小小水手带你去远航❤️', 'uploader_id': '110403406559', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'creator': '杨超越', 'duration': 19782, 'timestamp': 1620905839, @@ -805,6 +939,7 @@ class DouyinIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'thumbnail': r're:https?://.+\.jpe?g', }, }, { 'url': 'https://www.douyin.com/video/6982497745948921092', @@ -816,8 +951,9 @@ class DouyinIE(TikTokBaseIE): 'description': '这个夏日和小羊@杨超越 一起遇见白色幻想', 'uploader_id': '408654318141572', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA', + 'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA', 'creator': '杨超越工作室', - 'duration': 42608, + 'duration': 42479, 'timestamp': 1625739481, 'upload_date': '20210708', 'track': '@杨超越工作室创作的原声', @@ -825,6 +961,7 @@ class DouyinIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'thumbnail': r're:https?://.+\.jpe?g', }, }, { 'url': 'https://www.douyin.com/video/6953975910773099811', @@ -836,8 +973,9 @@ class DouyinIE(TikTokBaseIE): 'description': '#一起看海 出现在你的夏日里', 'uploader_id': '110403406559', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'creator': '杨超越', - 'duration': 17228, + 'duration': 17343, 'timestamp': 1619098692, 'upload_date': '20210422', 'track': '@杨超越创作的原声', @@ -845,6 +983,7 @@ class DouyinIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'thumbnail': r're:https?://.+\.jpe?g', }, }, { 'url': 'https://www.douyin.com/video/6950251282489675042', @@ -873,6 +1012,7 @@ class DouyinIE(TikTokBaseIE): 'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈', 'uploader_id': '110403406559', 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', 'creator': '杨超越', 'duration': 15115, 'timestamp': 1621261163, @@ -882,6 +1022,7 @@ class DouyinIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'thumbnail': r're:https?://.+\.jpe?g', }, }] _APP_VERSIONS = [('23.3.0', '230300')] @@ -901,19 +1042,17 @@ class DouyinIE(TikTokBaseIE): self.to_screen(f'{e}; trying with webpage') webpage = self._download_webpage(url, video_id) - render_data_json = self._search_regex( - r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>(%7B.+%7D)</script>', - webpage, 'render data', default=None) - if not render_data_json: + render_data = self._search_json( + r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>', webpage, 'render data', video_id, + contains_pattern=r'%7B(?s:.+)%7D', fatal=False, transform_source=compat_urllib_parse_unquote) + if not render_data: # TODO: Run verification challenge code to generate signature cookies cookies = self._get_cookies(self._WEBPAGE_HOST) expected = not cookies.get('s_v_web_id') or not cookies.get('ttwid') raise ExtractorError( 
'Fresh cookies (not necessarily logged in) are needed', expected=expected) - render_data = self._parse_json( - render_data_json, video_id, transform_source=compat_urllib_parse_unquote) - return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url) + return self._parse_aweme_video_web(get_first(render_data, ('aweme', 'detail')), url, video_id) class TikTokVMIE(InfoExtractor): @@ -944,8 +1083,27 @@ class TikTokVMIE(InfoExtractor): 'creator': 'SigmaChad', }, }, { - 'url': 'https://vm.tiktok.com/ZSe4FqkKd', - 'only_matching': True, + 'url': 'https://vm.tiktok.com/ZTR45GpSF/', + 'info_dict': { + 'id': '7106798200794926362', + 'ext': 'mp4', + 'title': 'md5:edc3e7ea587847f8537468f2fe51d074', + 'uploader_id': '6997695878846268418', + 'upload_date': '20220608', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'thumbnail': r're:https://.+\.webp.*', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAdZ_NcPPgMneaGrW0hN8O_J_bwLshwNNERRF5DxOw2HKIzk0kdlLrR8RkVl1ksrMO', + 'duration': 29, + 'timestamp': 1654680400, + 'repost_count': int, + 'artist': 'Akihitoko', + 'track': 'original sound', + 'description': 'md5:edc3e7ea587847f8537468f2fe51d074', + 'uploader': 'akihitoko1', + 'creator': 'Akihitoko', + }, }, { 'url': 'https://vt.tiktok.com/ZSe4FqkKd', 'only_matching': True, @@ -953,7 +1111,179 @@ class TikTokVMIE(InfoExtractor): def _real_extract(self, url): new_url = self._request_webpage( - HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).geturl() + HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).url if self.suitable(new_url): # Prevent infinite loop in case redirect fails raise UnsupportedError(new_url) return self.url_result(new_url) + + +class TikTokLiveIE(TikTokBaseIE): + _VALID_URL = r'''(?x)https?://(?: + (?:www\.)?tiktok\.com/@(?P<uploader>[\w.-]+)/live| + m\.tiktok\.com/share/live/(?P<id>\d+) + )''' + IE_NAME = 'tiktok:live' + + _TESTS = [{ + 'url': 'https://www.tiktok.com/@weathernewslive/live', + 'info_dict': { + 'id': '7210809319192726273', + 'ext': 'mp4', + 'title': r're:ウェザーニュースLiVE[\d\s:-]*', + 'creator': 'ウェザーニュースLiVE', + 'uploader': 'weathernewslive', + 'uploader_id': '6621496731283095554', + 'uploader_url': 'https://www.tiktok.com/@weathernewslive', + 'live_status': 'is_live', + 'concurrent_view_count': int, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.tiktok.com/@pilarmagenta/live', + 'info_dict': { + 'id': '7209423610325322522', + 'ext': 'mp4', + 'title': str, + 'creator': 'Pilarmagenta', + 'uploader': 'pilarmagenta', + 'uploader_id': '6624846890674683909', + 'uploader_url': 'https://www.tiktok.com/@pilarmagenta', + 'live_status': 'is_live', + 'concurrent_view_count': int, + }, + 'skip': 'Livestream', + }, { + 'url': 'https://m.tiktok.com/share/live/7209423610325322522/?language=en', + 'only_matching': True, + }, { + 'url': 'https://www.tiktok.com/@iris04201/live', + 'only_matching': True, + }] + + def _call_api(self, url, param, room_id, uploader, key=None): + response = traverse_obj(self._download_json( + url, room_id, fatal=False, query={ + 'aid': '1988', + param: room_id, + }), (key, {dict}), default={}) + + # status == 2 if live else 4 + if int_or_none(response.get('status')) == 2: + return response + # If room_id is obtained via mobile share URL and cannot be refreshed, do not wait for live + elif not uploader: + raise ExtractorError('This livestream has ended', expected=True) + raise UserNotLive(video_id=uploader) + + def 
_real_extract(self, url): + uploader, room_id = self._match_valid_url(url).group('uploader', 'id') + webpage = self._download_webpage( + url, uploader or room_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=not room_id) + + if webpage: + data = try_call(lambda: self._get_sigi_state(webpage, uploader or room_id)) + room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False) + or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None) + or room_id) + uploader = uploader or traverse_obj( + data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'), + ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str) + + if not room_id: + raise UserNotLive(video_id=uploader) + + formats = [] + live_info = self._call_api( + 'https://webcast.tiktok.com/webcast/room/info', 'room_id', room_id, uploader, key='data') + + get_quality = qualities(('SD1', 'ld', 'SD2', 'sd', 'HD1', 'hd', 'FULL_HD1', 'uhd', 'ORIGION', 'origin')) + parse_inner = lambda x: self._parse_json(x, None) + + for quality, stream in traverse_obj(live_info, ( + 'stream_url', 'live_core_sdk_data', 'pull_data', 'stream_data', + {parse_inner}, 'data', {dict}), default={}).items(): + + sdk_params = traverse_obj(stream, ('main', 'sdk_params', {parse_inner}, { + 'vcodec': ('VCodec', {str}), + 'tbr': ('vbitrate', {lambda x: int_or_none(x, 1000)}), + 'resolution': ('resolution', {lambda x: re.match(r'(?i)\d+x\d+|\d+p', x).group().lower()}), + })) + + flv_url = traverse_obj(stream, ('main', 'flv', {url_or_none})) + if flv_url: + formats.append({ + 'url': flv_url, + 'ext': 'flv', + 'format_id': f'flv-{quality}', + 'quality': get_quality(quality), + **sdk_params, + }) + + hls_url = traverse_obj(stream, ('main', 'hls', {url_or_none})) + if hls_url: + formats.append({ + 'url': hls_url, + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'format_id': f'hls-{quality}', + 'quality': get_quality(quality), + **sdk_params, + }) + + def get_vcodec(*keys): + return traverse_obj(live_info, ( + 'stream_url', *keys, {parse_inner}, 'VCodec', {str})) + + for stream in ('hls', 'rtmp'): + stream_url = traverse_obj(live_info, ('stream_url', f'{stream}_pull_url', {url_or_none})) + if stream_url: + formats.append({ + 'url': stream_url, + 'ext': 'mp4' if stream == 'hls' else 'flv', + 'protocol': 'm3u8_native' if stream == 'hls' else 'https', + 'format_id': f'{stream}-pull', + 'vcodec': get_vcodec(f'{stream}_pull_url_params'), + 'quality': get_quality('ORIGION'), + }) + + for f_id, f_url in traverse_obj(live_info, ('stream_url', 'flv_pull_url', {dict}), default={}).items(): + if not url_or_none(f_url): + continue + formats.append({ + 'url': f_url, + 'ext': 'flv', + 'format_id': f'flv-{f_id}'.lower(), + 'vcodec': get_vcodec('flv_pull_url_params', f_id), + 'quality': get_quality(f_id), + }) + + # If uploader is a guest on another's livestream, primary endpoint will not have m3u8 URLs + if not traverse_obj(formats, lambda _, v: v['ext'] == 'mp4'): + live_info = merge_dicts(live_info, self._call_api( + 'https://www.tiktok.com/api/live/detail/', 'roomID', room_id, uploader, key='LiveRoomInfo')) + if url_or_none(live_info.get('liveUrl')): + formats.append({ + 'url': live_info['liveUrl'], + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'format_id': 'hls-fallback', + 'vcodec': 'h264', + 'quality': get_quality('origin'), + }) + + uploader = uploader or traverse_obj(live_info, ('ownerInfo', 'uniqueId'), ('owner', 'display_id')) + + return { + 'id': room_id, + 'uploader': uploader, + 'uploader_url': 
format_field(uploader, None, self._UPLOADER_URL_FORMAT) or None, + 'is_live': True, + 'formats': formats, + '_format_sort_fields': ('quality', 'ext'), + **traverse_obj(live_info, { + 'title': 'title', + 'uploader_id': (('ownerInfo', 'owner'), 'id', {str_or_none}), + 'creator': (('ownerInfo', 'owner'), 'nickname'), + 'concurrent_view_count': (('user_count', ('liveRoomStats', 'userCount')), {int_or_none}), + }, get_all=False), + } diff --git a/hypervideo_dl/extractor/tnaflix.py b/hypervideo_dl/extractor/tnaflix.py index 4482c84..b2baf2e 100644 --- a/hypervideo_dl/extractor/tnaflix.py +++ b/hypervideo_dl/extractor/tnaflix.py @@ -81,26 +81,27 @@ class TNAFlixNetworkBaseIE(InfoExtractor): display_id = video_id webpage = self._download_webpage(url, display_id) + inputs = self._hidden_inputs(webpage) + query = {} # check for MovieFap-style config cfg_url = self._proto_relative_url(self._html_search_regex( self._CONFIG_REGEX, webpage, 'flashvars.config', default=None, group='url'), 'http:') - query = {} - # check for TNAFlix-style config if not cfg_url: - inputs = self._hidden_inputs(webpage) - if inputs.get('vkey') and inputs.get('nkey'): - cfg_url = f'https://www.{host}.com/cdn/cdn.php' - query.update({ - 'file': inputs['vkey'], - 'key': inputs['nkey'], - 'VID': video_id, - 'premium': '1', - 'vip': '1', - 'alpha': '', - }) + cfg_url = inputs.get('config') + + # check for TNAFlix-style config + if not cfg_url and inputs.get('vkey') and inputs.get('nkey'): + cfg_url = f'http://cdn-fck.{host}.com/{host}/{inputs["vkey"]}.fid' + query.update({ + 'key': inputs['nkey'], + 'VID': video_id, + 'premium': '1', + 'vip': '1', + 'alpha': '', + }) formats, json_ld = [], {} diff --git a/hypervideo_dl/extractor/toutv.py b/hypervideo_dl/extractor/toutv.py index f60c199..ced1224 100644 --- a/hypervideo_dl/extractor/toutv.py +++ b/hypervideo_dl/extractor/toutv.py @@ -1,7 +1,7 @@ import json from .radiocanada import RadioCanadaIE -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -52,8 +52,8 @@ class TouTvIE(RadioCanadaIE): # XXX: Do not subclass from concrete IE 'Content-Type': 'application/json;charset=utf-8', })['access_token'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - error = self._parse_json(e.cause.read().decode(), None)['Message'] + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + error = self._parse_json(e.cause.response.read().decode(), None)['Message'] raise ExtractorError(error, expected=True) raise self._claims = self._call_api('validation/v2/getClaims')['claims'] diff --git a/hypervideo_dl/extractor/triller.py b/hypervideo_dl/extractor/triller.py index acd9e68..56e51fe 100644 --- a/hypervideo_dl/extractor/triller.py +++ b/hypervideo_dl/extractor/triller.py @@ -1,15 +1,21 @@ import itertools import json +import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( ExtractorError, + UnsupportedError, + determine_ext, int_or_none, + parse_resolution, str_or_none, traverse_obj, - unified_strdate, unified_timestamp, url_basename, + urljoin, + url_or_none, ) @@ -22,25 +28,22 @@ class TrillerBaseIE(InfoExtractor): if self._API_HEADERS.get('Authorization'): return - user_check = self._download_json( + headers = {**self._API_HEADERS, 'Content-Type': 'application/json'} + user_check = traverse_obj(self._download_json( f'{self._API_BASE_URL}/api/user/is-valid-username', None, note='Checking username', - fatal=False, 
expected_status=400, headers={ - 'Content-Type': 'application/json', - 'Origin': 'https://triller.co', - }, data=json.dumps({'username': username}, separators=(',', ':')).encode('utf-8')) - if user_check.get('status'): # endpoint returns "status":false if username exists + fatal=False, expected_status=400, headers=headers, + data=json.dumps({'username': username}, separators=(',', ':')).encode()), 'status') + + if user_check: # endpoint returns `"status":false` if username exists raise ExtractorError('Unable to login: Invalid username', expected=True) - credentials = { - 'username': username, - 'password': password, - } login = self._download_json( - f'{self._API_BASE_URL}/user/auth', None, note='Logging in', - fatal=False, expected_status=400, headers={ - 'Content-Type': 'application/json', - 'Origin': 'https://triller.co', - }, data=json.dumps(credentials, separators=(',', ':')).encode('utf-8')) + f'{self._API_BASE_URL}/user/auth', None, note='Logging in', fatal=False, + expected_status=400, headers=headers, data=json.dumps({ + 'username': username, + 'password': password, + }, separators=(',', ':')).encode()) or {} + if not login.get('auth_token'): if login.get('error') == 1008: raise ExtractorError('Unable to login: Incorrect password', expected=True) @@ -55,100 +58,93 @@ class TrillerBaseIE(InfoExtractor): headers=self._API_HEADERS, query={'limit': limit}) or {} if not comment_info.get('comments'): return - for comment_dict in comment_info['comments']: - yield { - 'author': traverse_obj(comment_dict, ('author', 'username')), - 'author_id': traverse_obj(comment_dict, ('author', 'user_id')), - 'id': comment_dict.get('id'), - 'text': comment_dict.get('body'), - 'timestamp': unified_timestamp(comment_dict.get('timestamp')), - } + yield from traverse_obj(comment_info, ('comments', ..., { + 'id': ('id', {str_or_none}), + 'text': 'body', + 'author': ('author', 'username'), + 'author_id': ('author', 'user_id'), + 'timestamp': ('timestamp', {unified_timestamp}), + })) - def _check_user_info(self, user_info): - if not user_info: - self.report_warning('Unable to extract user info') - elif user_info.get('private') and not user_info.get('followed_by_me'): - raise ExtractorError('This video is private', expected=True) - elif traverse_obj(user_info, 'blocked_by_user', 'blocking_user'): - raise ExtractorError('The author of the video is blocked', expected=True) - return user_info + def _parse_video_info(self, video_info, username, user_id, display_id=None): + video_id = str(video_info['id']) + display_id = display_id or video_info.get('video_uuid') + + if traverse_obj(video_info, ( + None, ('transcoded_url', 'video_url', 'stream_url', 'audio_url'), + {lambda x: re.search(r'/copyright/', x)}), get_all=False): + self.raise_no_formats('This video has been removed due to licensing restrictions', expected=True) - def _parse_video_info(self, video_info, username, user_info=None): - video_uuid = video_info.get('video_uuid') - video_id = video_info.get('id') + def format_info(url): + return { + 'url': url, + 'ext': determine_ext(url), + 'format_id': url_basename(url).split('.')[0], + } formats = [] - video_url = traverse_obj(video_info, 'video_url', 'stream_url') - if video_url: - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'vcodec': 'h264', - 'width': video_info.get('width'), - 'height': video_info.get('height'), - 'format_id': url_basename(video_url).split('.')[0], - 'filesize': video_info.get('filesize'), - }) - video_set = video_info.get('video_set') or [] - for video in video_set: - 
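# --- Editorial aside (not part of the patch) ---------------------------------
# The rewritten Triller format code below drops the manual 'WxH' string
# splitting in favour of the parse_resolution() helper imported at the top of
# this file. A rough, simplified re-implementation of that behaviour for
# illustration (`parse_resolution_sketch` is our own name; the real helper
# also understands forms such as '1080p'):

import re

def parse_resolution_sketch(s):
    """Map a '1920x1080'-style string to {'width': 1920, 'height': 1080}."""
    mobj = re.search(r'(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)', s or '')
    if not mobj:
        return {}
    return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

# parse_resolution_sketch('720x1280') -> {'width': 720, 'height': 1280}
# parse_resolution_sketch(None)       -> {}   (missing metadata degrades gracefully)
# ------------------------------------------------------------------------------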
resolution = video.get('resolution') or '' + + if determine_ext(video_info.get('transcoded_url')) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_info['transcoded_url'], video_id, 'mp4', m3u8_id='hls', fatal=False)) + + for video in traverse_obj(video_info, ('video_set', lambda _, v: url_or_none(v['url']))): formats.append({ - 'url': video['url'], - 'ext': 'mp4', + **format_info(video['url']), + **parse_resolution(video.get('resolution')), 'vcodec': video.get('codec'), 'vbr': int_or_none(video.get('bitrate'), 1000), - 'width': int_or_none(resolution.split('x')[0]), - 'height': int_or_none(resolution.split('x')[1]), - 'format_id': url_basename(video['url']).split('.')[0], }) - audio_url = video_info.get('audio_url') - if audio_url: + + video_url = traverse_obj(video_info, 'video_url', 'stream_url', expected_type=url_or_none) + if video_url: formats.append({ - 'url': audio_url, - 'ext': 'm4a', - 'format_id': url_basename(audio_url).split('.')[0], + **format_info(video_url), + 'vcodec': 'h264', + **traverse_obj(video_info, { + 'width': 'width', + 'height': 'height', + 'filesize': 'filesize', + }, expected_type=int_or_none), }) - manifest_url = video_info.get('transcoded_url') - if manifest_url: - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - - comment_count = int_or_none(video_info.get('comment_count')) + audio_url = url_or_none(video_info.get('audio_url')) + if audio_url: + formats.append(format_info(audio_url)) - user_info = user_info or traverse_obj(video_info, 'user', default={}) + comment_count = traverse_obj(video_info, ('comment_count', {int_or_none})) return { - 'id': str_or_none(video_id) or video_uuid, - 'title': video_info.get('description') or f'Video by {username}', - 'thumbnail': video_info.get('thumbnail_url'), - 'description': video_info.get('description'), - 'uploader': str_or_none(username), - 'uploader_id': str_or_none(user_info.get('user_id')), - 'creator': str_or_none(user_info.get('name')), - 'timestamp': unified_timestamp(video_info.get('timestamp')), - 'upload_date': unified_strdate(video_info.get('timestamp')), - 'duration': int_or_none(video_info.get('duration')), - 'view_count': int_or_none(video_info.get('play_count')), - 'like_count': int_or_none(video_info.get('likes_count')), - 'artist': str_or_none(video_info.get('song_artist')), - 'track': str_or_none(video_info.get('song_title')), - 'webpage_url': f'https://triller.co/@{username}/video/{video_uuid}', + 'id': video_id, + 'display_id': display_id, + 'uploader': username, + 'uploader_id': user_id or traverse_obj(video_info, ('user', 'user_id', {str_or_none})), + 'webpage_url': urljoin(f'https://triller.co/@{username}/video/', display_id), 'uploader_url': f'https://triller.co/@{username}', 'extractor_key': TrillerIE.ie_key(), 'extractor': TrillerIE.IE_NAME, 'formats': formats, 'comment_count': comment_count, '__post_extractor': self.extract_comments(video_id, comment_count), + **traverse_obj(video_info, { + 'title': ('description', {lambda x: x.replace('\r\n', ' ')}), + 'description': 'description', + 'creator': ((('user'), ('users', lambda _, v: str(v['user_id']) == user_id)), 'name'), + 'thumbnail': ('thumbnail_url', {url_or_none}), + 'timestamp': ('timestamp', {unified_timestamp}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('play_count', {int_or_none}), + 'like_count': ('likes_count', {int_or_none}), + 'artist': 'song_artist', + 'track': 'song_title', + }, get_all=False), } class 
TrillerIE(TrillerBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?triller\.co/ - @(?P<username>[\w\._]+)/video/ - (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + @(?P<username>[\w.]+)/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}) ''' _TESTS = [{ 'url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf', @@ -165,16 +161,14 @@ class TrillerIE(TrillerBaseIE): 'timestamp': 1660598222, 'upload_date': '20220815', 'duration': 47, - 'height': 3840, - 'width': 2160, 'view_count': int, 'like_count': int, 'artist': 'Megan Thee Stallion', 'track': 'Her', - 'webpage_url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf', 'uploader_url': 'https://triller.co/@theestallion', 'comment_count': int, - } + }, + 'skip': 'This video has been removed due to licensing restrictions', }, { 'url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc', 'md5': '874055f462af5b0699b9dbb527a505a0', @@ -182,6 +176,7 @@ class TrillerIE(TrillerBaseIE): 'id': '71621339', 'ext': 'mp4', 'title': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc', + 'display_id': '46c6fcfa-aa9e-4503-a50c-68444f44cddc', 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', 'description': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc', 'uploader': 'charlidamelio', @@ -190,59 +185,73 @@ class TrillerIE(TrillerBaseIE): 'timestamp': 1660773354, 'upload_date': '20220817', 'duration': 16, - 'height': 1920, - 'width': 1080, 'view_count': int, 'like_count': int, 'artist': 'Dixie', 'track': 'Someone to Blame', - 'webpage_url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc', 'uploader_url': 'https://triller.co/@charlidamelio', 'comment_count': int, - } + }, + }, { + 'url': 'https://triller.co/@theestallion/video/07f35f38-1f51-48e2-8c5f-f7a8e829988f', + 'md5': 'af7b3553e4b8bfca507636471ee2eb41', + 'info_dict': { + 'id': '71837829', + 'ext': 'mp4', + 'title': 'UNGRATEFUL VIDEO OUT NOW 👏🏾👏🏾👏🏾 💙💙 link my bio #womeninhiphop', + 'display_id': '07f35f38-1f51-48e2-8c5f-f7a8e829988f', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + 'description': 'UNGRATEFUL VIDEO OUT NOW 👏🏾👏🏾👏🏾 💙💙 link my bio\r\n #womeninhiphop', + 'uploader': 'theestallion', + 'uploader_id': '18992236', + 'creator': 'Megan Thee Stallion', + 'timestamp': 1662486178, + 'upload_date': '20220906', + 'duration': 30, + 'view_count': int, + 'like_count': int, + 'artist': 'Unknown', + 'track': 'Unknown', + 'uploader_url': 'https://triller.co/@theestallion', + 'comment_count': int, + }, }] def _real_extract(self, url): - username, video_uuid = self._match_valid_url(url).group('username', 'id') + username, display_id = self._match_valid_url(url).group('username', 'id') - video_info = traverse_obj(self._download_json( - f'{self._API_BASE_URL}/api/videos/{video_uuid}', - video_uuid, note='Downloading video info API JSON', - errnote='Unable to download video info API JSON', - headers=self._API_HEADERS), ('videos', 0)) - if not video_info: - raise ExtractorError('No video info found in API response') + video_info = self._download_json( + f'{self._API_BASE_URL}/api/videos/{display_id}', display_id, + headers=self._API_HEADERS)['videos'][0] - user_info = self._check_user_info(video_info.get('user') or {}) - return self._parse_video_info(video_info, username, user_info) + return self._parse_video_info(video_info, username, None, display_id) class TrillerUserIE(TrillerBaseIE): - _VALID_URL = r'https?://(?:www\.)?triller\.co/@(?P<id>[\w\._]+)/?(?:$|[#?])' + 
_VALID_URL = r'https?://(?:www\.)?triller\.co/@(?P<id>[\w.]+)/?(?:$|[#?])' _TESTS = [{ - # first videos request only returns 2 videos 'url': 'https://triller.co/@theestallion', - 'playlist_mincount': 9, + 'playlist_mincount': 12, 'info_dict': { 'id': '18992236', 'title': 'theestallion', 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', - } + }, }, { 'url': 'https://triller.co/@charlidamelio', - 'playlist_mincount': 25, + 'playlist_mincount': 150, 'info_dict': { 'id': '1875551', 'title': 'charlidamelio', 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', - } + }, }] def _real_initialize(self): if not self._API_HEADERS.get('Authorization'): guest = self._download_json( - f'{self._API_BASE_URL}/user/create_guest', - None, note='Creating guest session', data=b'', headers=self._API_HEADERS, query={ + f'{self._API_BASE_URL}/user/create_guest', None, + note='Creating guest session', data=b'', headers=self._API_HEADERS, query={ 'platform': 'Web', 'app_version': '', }) @@ -251,44 +260,70 @@ class TrillerUserIE(TrillerBaseIE): self._API_HEADERS['Authorization'] = f'Bearer {guest["auth_token"]}' - def _extract_video_list(self, username, user_id, limit=6): - query = { - 'limit': limit, - } + def _entries(self, username, user_id, limit=6): + query = {'limit': limit} for page in itertools.count(1): - for retry in self.RetryManager(): - try: - video_list = self._download_json( - f'{self._API_BASE_URL}/api/users/{user_id}/videos', - username, note=f'Downloading user video list page {page}', - errnote='Unable to download user video list', headers=self._API_HEADERS, - query=query) - except ExtractorError as e: - if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: - retry.error = e - continue - raise - if not video_list.get('videos'): - break - yield from video_list['videos'] - query['before_time'] = traverse_obj(video_list, ('videos', -1, 'timestamp')) + videos = self._download_json( + f'{self._API_BASE_URL}/api/users/{user_id}/videos', + username, note=f'Downloading user video list page {page}', + headers=self._API_HEADERS, query=query) + + for video in traverse_obj(videos, ('videos', ...)): + yield self._parse_video_info(video, username, user_id) + + query['before_time'] = traverse_obj(videos, ('videos', -1, 'timestamp')) if not query['before_time']: break - def _entries(self, videos, username, user_info): - for video in videos: - yield self._parse_video_info(video, username, user_info) - def _real_extract(self, url): username = self._match_id(url) - user_info = self._check_user_info(self._download_json( + + user_info = traverse_obj(self._download_json( f'{self._API_BASE_URL}/api/users/by_username/{username}', - username, note='Downloading user info', - errnote='Failed to download user info', headers=self._API_HEADERS).get('user', {})) + username, note='Downloading user info', headers=self._API_HEADERS), ('user', {dict})) or {} + + if user_info.get('private') and user_info.get('followed_by_me') not in (True, 'true'): + raise ExtractorError('This user profile is private', expected=True) + elif traverse_obj(user_info, (('blocked_by_user', 'blocking_user'), {bool}), get_all=False): + raise ExtractorError('The author of the video is blocked', expected=True) user_id = str_or_none(user_info.get('user_id')) - videos = self._extract_video_list(username, user_id) - thumbnail = user_info.get('avatar_url') + if not user_id: + raise ExtractorError('Unable to extract user ID') return self.playlist_result( - self._entries(videos, username, user_info), user_id, username, 
thumbnail=thumbnail) + self._entries(username, user_id), user_id, username, thumbnail=user_info.get('avatar_url')) + + +class TrillerShortIE(InfoExtractor): + _VALID_URL = r'https?://v\.triller\.co/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://v.triller.co/WWZNWk', + 'md5': '5eb8dc2c971bd8cd794ec9e8d5e9d101', + 'info_dict': { + 'id': '66210052', + 'ext': 'mp4', + 'title': 'md5:2dfc89d154cd91a4a18cd9582ba03e16', + 'display_id': 'f4480e1f-fb4e-45b9-a44c-9e6c679ce7eb', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + 'description': 'md5:2dfc89d154cd91a4a18cd9582ba03e16', + 'uploader': 'statefairent', + 'uploader_id': '487545193', + 'creator': 'Official Summer Fair of LA', + 'timestamp': 1629655457, + 'upload_date': '20210822', + 'duration': 19, + 'view_count': int, + 'like_count': int, + 'artist': 'Unknown', + 'track': 'Unknown', + 'uploader_url': 'https://triller.co/@statefairent', + 'comment_count': int, + }, + }] + + def _real_extract(self, url): + real_url = self._request_webpage(HEADRequest(url), self._match_id(url)).url + if self.suitable(real_url): # Prevent infinite loop in case redirect fails + raise UnsupportedError(real_url) + return self.url_result(real_url) diff --git a/hypervideo_dl/extractor/trtcocuk.py b/hypervideo_dl/extractor/trtcocuk.py new file mode 100644 index 0000000..f27f5a1 --- /dev/null +++ b/hypervideo_dl/extractor/trtcocuk.py @@ -0,0 +1,48 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, int_or_none, parse_iso8601, traverse_obj + + +class TrtCocukVideoIE(InfoExtractor): + _VALID_URL = r'https?://www\.trtcocuk\.net\.tr/video/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.trtcocuk.net.tr/video/kaptan-pengu-ve-arkadaslari-1', + 'info_dict': { + 'id': '3789738', + 'ext': 'mp4', + 'season_number': 1, + 'series': '"Kaptan Pengu ve Arkadaşları"', + 'season': 'Season 1', + 'title': 'Kaptan Pengu ve Arkadaşları 1 Bölüm İzle TRT Çocuk', + 'release_date': '20201209', + 'release_timestamp': 1607513774, + } + }, { + 'url': 'https://www.trtcocuk.net.tr/video/sef-rokanin-lezzet-dunyasi-17', + 'info_dict': { + 'id': '10260842', + 'ext': 'mp4', + 'series': '"Şef Roka\'nın Lezzet Dünyası"', + 'title': 'Şef Roka\'nın Lezzet Dünyası 17 Bölüm İzle TRT Çocuk', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + nuxtjs_data = self._search_nuxt_data(webpage, display_id)['data'] + + try: + video_url = self._parse_json(nuxtjs_data['video'], display_id) + except ExtractorError: + video_url = nuxtjs_data['video'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id) + + return { + 'id': str(nuxtjs_data['id']), + 'formats': formats, + 'subtitles': subtitles, + 'season_number': int_or_none(nuxtjs_data.get('season')), + 'release_timestamp': parse_iso8601(nuxtjs_data.get('publishedDate')), + 'series': traverse_obj(nuxtjs_data, ('show', 0, 'title')), + 'title': self._html_extract_title(webpage) # TODO: get better title + } diff --git a/hypervideo_dl/extractor/trueid.py b/hypervideo_dl/extractor/trueid.py index 6963436..86f0990 100644 --- a/hypervideo_dl/extractor/trueid.py +++ b/hypervideo_dl/extractor/trueid.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( determine_ext, ExtractorError, @@ -88,9 +88,9 @@ class TrueIDIE(InfoExtractor): stream_data = self._download_json( 
f'https://{domain}/cmsPostProxy/contents/video/{video_id}/streamer?os=android', video_id, data=b'')['data'] except ExtractorError as e: - if not isinstance(e.cause, compat_HTTPError): + if not isinstance(e.cause, HTTPError): raise e - errmsg = self._parse_json(e.cause.read().decode(), video_id)['meta']['message'] + errmsg = self._parse_json(e.cause.response.read().decode(), video_id)['meta']['message'] if 'country' in errmsg: self.raise_geo_restricted( errmsg, [initial_data['display_country']] if initial_data.get('display_country') else None, True) diff --git a/hypervideo_dl/extractor/tubetugraz.py b/hypervideo_dl/extractor/tubetugraz.py index ebabedc..a351e4e 100644 --- a/hypervideo_dl/extractor/tubetugraz.py +++ b/hypervideo_dl/extractor/tubetugraz.py @@ -21,17 +21,36 @@ class TubeTuGrazBaseIE(InfoExtractor): if not urlh: return - urlh = self._request_webpage( - urlh.geturl(), None, fatal=False, headers={'referer': urlh.geturl()}, - note='logging in', errnote='unable to log in', data=urlencode_postdata({ + content, urlh = self._download_webpage_handle( + urlh.url, None, fatal=False, headers={'referer': urlh.url}, + note='logging in', errnote='unable to log in', + data=urlencode_postdata({ 'lang': 'de', '_eventId_proceed': '', 'j_username': username, 'j_password': password })) + if not urlh or urlh.url == 'https://tube.tugraz.at/paella/ui/index.html': + return - if urlh and urlh.geturl() != 'https://tube.tugraz.at/paella/ui/index.html': + if not self._html_search_regex( + r'<p\b[^>]*>(Bitte geben Sie einen OTP-Wert ein:)</p>', + content, 'TFA prompt', default=None): self.report_warning('unable to login: incorrect password') + return + + content, urlh = self._download_webpage_handle( + urlh.url, None, fatal=False, headers={'referer': urlh.url}, + note='logging in with TFA', errnote='unable to log in with TFA', + data=urlencode_postdata({ + 'lang': 'de', + '_eventId_proceed': '', + 'j_tokenNumber': self._get_tfa_info(), + })) + if not urlh or urlh.url == 'https://tube.tugraz.at/paella/ui/index.html': + return + + self.report_warning('unable to login: incorrect TFA code') def _extract_episode(self, episode_info): id = episode_info.get('id') diff --git a/hypervideo_dl/extractor/tubitv.py b/hypervideo_dl/extractor/tubitv.py index de8b5da..bd46bc3 100644 --- a/hypervideo_dl/extractor/tubitv.py +++ b/hypervideo_dl/extractor/tubitv.py @@ -1,13 +1,13 @@ import re from .common import InfoExtractor +from ..networking import Request from ..utils import ( ExtractorError, int_or_none, js_to_json, - sanitized_Request, - urlencode_postdata, traverse_obj, + urlencode_postdata, ) @@ -72,8 +72,8 @@ class TubiTvIE(InfoExtractor): 'password': password, } payload = urlencode_postdata(form_data) - request = sanitized_Request(self._LOGIN_URL, payload) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') + request = Request(self._LOGIN_URL, payload) + request.headers['Content-Type'] = 'application/x-www-form-urlencoded' login_page = self._download_webpage( request, None, False, 'Wrong login info') if not re.search(r'id="tubi-logout"', login_page): diff --git a/hypervideo_dl/extractor/tumblr.py b/hypervideo_dl/extractor/tumblr.py index 88d4ae3..a26bdca 100644 --- a/hypervideo_dl/extractor/tumblr.py +++ b/hypervideo_dl/extractor/tumblr.py @@ -274,7 +274,7 @@ class TumblrIE(InfoExtractor): url = f'http://{blog}.tumblr.com/post/{video_id}/' webpage, urlh = self._download_webpage_handle(url, video_id) - redirect_url = urlh.geturl() + redirect_url = urlh.url api_only = bool(self._search_regex( 
r'(tumblr.com|^)/(safe-mode|login_required|blog/view)', diff --git a/hypervideo_dl/extractor/tunein.py b/hypervideo_dl/extractor/tunein.py index 43b4f67..fd2fe13 100644 --- a/hypervideo_dl/extractor/tunein.py +++ b/hypervideo_dl/extractor/tunein.py @@ -1,149 +1,201 @@ -import re +import urllib.parse from .common import InfoExtractor -from ..utils import ExtractorError -from ..compat import compat_urlparse +from ..utils import ( + OnDemandPagedList, + determine_ext, + parse_iso8601, + traverse_obj, +) class TuneInBaseIE(InfoExtractor): - _API_BASE_URL = 'http://tunein.com/tuner/tune/' + _VALID_URL_BASE = r'https?://(?:www\.)?tunein\.com' - def _real_extract(self, url): - content_id = self._match_id(url) - - content_info = self._download_json( - self._API_BASE_URL + self._API_URL_QUERY % content_id, - content_id, note='Downloading JSON metadata') - - title = content_info['Title'] - thumbnail = content_info.get('Logo') - location = content_info.get('Location') - streams_url = content_info.get('StreamUrl') - if not streams_url: - raise ExtractorError('No downloadable streams found', expected=True) - if not streams_url.startswith('http://'): - streams_url = compat_urlparse.urljoin(url, streams_url) + def _extract_metadata(self, webpage, content_id): + return self._search_json(r'window.INITIAL_STATE=', webpage, 'hydration', content_id, fatal=False) + def _extract_formats_and_subtitles(self, content_id): streams = self._download_json( - streams_url, content_id, note='Downloading stream data', - transform_source=lambda s: re.sub(r'^\s*\((.*)\);\s*$', r'\1', s))['Streams'] + f'https://opml.radiotime.com/Tune.ashx?render=json&formats=mp3,aac,ogg,flash,hls&id={content_id}', + content_id)['body'] - is_live = None - formats = [] + formats, subtitles = [], {} for stream in streams: - if stream.get('Type') == 'Live': - is_live = True - reliability = stream.get('Reliability') - format_note = ( - 'Reliability: %d%%' % reliability - if reliability is not None else None) - formats.append({ - 'preference': ( - 0 if reliability is None or reliability > 90 - else 1), - 'abr': stream.get('Bandwidth'), - 'ext': stream.get('MediaType').lower(), - 'acodec': stream.get('MediaType'), - 'vcodec': 'none', - 'url': stream.get('Url'), - 'source_preference': reliability, - 'format_note': format_note, - }) - - return { - 'id': content_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'location': location, - 'is_live': is_live, - } - - -class TuneInClipIE(TuneInBaseIE): - IE_NAME = 'tunein:clip' - _VALID_URL = r'https?://(?:www\.)?tunein\.com/station/.*?audioClipId\=(?P<id>\d+)' - _API_URL_QUERY = '?tuneType=AudioClip&audioclipId=%s' - - _TESTS = [{ - 'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816', - 'md5': '99f00d772db70efc804385c6b47f4e77', - 'info_dict': { - 'id': '816', - 'title': '32m', - 'ext': 'mp3', - }, - }] + if stream.get('media_type') == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(stream['url'], content_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif determine_ext(stream['url']) == 'pls': + playlist_content = self._download_webpage(stream['url'], content_id) + formats.append({ + 'url': self._search_regex(r'File1=(.*)', playlist_content, 'url', fatal=False), + 'abr': stream.get('bitrate'), + 'ext': stream.get('media_type'), + }) + else: + formats.append({ + 'url': stream['url'], + 'abr': stream.get('bitrate'), + 'ext': stream.get('media_type'), + }) + + return formats, subtitles class 
TuneInStationIE(TuneInBaseIE): - IE_NAME = 'tunein:station' - _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId=|embed/player/s)(?P<id>\d+)' - _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/[pst]\d+)'] - _API_URL_QUERY = '?tuneType=Station&stationId=%s' - - @classmethod - def suitable(cls, url): - return False if TuneInClipIE.suitable(url) else super(TuneInStationIE, cls).suitable(url) + _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'(?:/radio/[^?#]+-|/embed/player/)(?P<id>s\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/s\d+)'] _TESTS = [{ - 'url': 'http://tunein.com/radio/Jazz24-885-s34682/', + 'url': 'https://tunein.com/radio/Jazz24-885-s34682/', 'info_dict': { - 'id': '34682', - 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', + 'id': 's34682', + 'title': 're:^Jazz24', + 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', + 'thumbnail': 're:^https?://[^?&]+/s34682', + 'location': 'Seattle-Tacoma, US', 'ext': 'mp3', - 'location': 'Tacoma, WA', + 'live_status': 'is_live', }, 'params': { - 'skip_download': True, # live stream + 'skip_download': True, }, }, { - 'url': 'http://tunein.com/embed/player/s6404/', + 'url': 'https://tunein.com/embed/player/s6404/', 'only_matching': True, + }, { + 'url': 'https://tunein.com/radio/BBC-Radio-1-988-s24939/', + 'info_dict': { + 'id': 's24939', + 'title': 're:^BBC Radio 1', + 'description': 'md5:f3f75f7423398d87119043c26e7bfb84', + 'thumbnail': 're:^https?://[^?&]+/s24939', + 'location': 'London, UK', + 'ext': 'mp3', + 'live_status': 'is_live', + }, + 'params': { + 'skip_download': True, + }, }] + def _real_extract(self, url): + station_id = self._match_id(url) + + webpage = self._download_webpage(url, station_id) + metadata = self._extract_metadata(webpage, station_id) + + formats, subtitles = self._extract_formats_and_subtitles(station_id) + return { + 'id': station_id, + 'title': traverse_obj(metadata, ('profiles', station_id, 'title')), + 'description': traverse_obj(metadata, ('profiles', station_id, 'description')), + 'thumbnail': traverse_obj(metadata, ('profiles', station_id, 'image')), + 'timestamp': parse_iso8601( + traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'publishTime'))), + 'location': traverse_obj( + metadata, ('profiles', station_id, 'metadata', 'properties', 'location', 'displayName'), + ('profiles', station_id, 'properties', 'location', 'displayName')), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'isLive')), + } + -class TuneInProgramIE(TuneInBaseIE): - IE_NAME = 'tunein:program' - _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId=|embed/player/p)(?P<id>\d+)' - _API_URL_QUERY = '?tuneType=Program&programId=%s' +class TuneInPodcastIE(TuneInBaseIE): + _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/(?:podcasts/[^?#]+-|embed/player/)(?P<id>p\d+)/?(?:#|$)' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/p\d+)'] _TESTS = [{ - 'url': 'http://tunein.com/radio/Jazz-24-p2506/', + 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019', 'info_dict': { - 'id': '2506', - 'title': 'Jazz 24 on 91.3 WUKY-HD3', - 'ext': 'mp3', - 'location': 'Lexington, KY', - }, - 'params': { - 'skip_download': True, # live stream + 'id': 'p1153019', + 'title': 'Lex Fridman Podcast', + 'description': 'md5:bedc4e5f1c94f7dec6e4317b5654b00d', }, + 
+        'playlist_mincount': 200,
     }, {
-        'url': 'http://tunein.com/embed/player/p191660/',
-        'only_matching': True,
+        'url': 'https://tunein.com/embed/player/p191660/',
+        'only_matching': True
+    }, {
+        'url': 'https://tunein.com/podcasts/World-News/BBC-News-p14/',
+        'info_dict': {
+            'id': 'p14',
+            'title': 'BBC News',
+            'description': 'md5:1218e575eeaff75f48ed978261fa2068',
+        },
+        'playlist_mincount': 200,
     }]
 
+    _PAGE_SIZE = 30
+
+    def _real_extract(self, url):
+        podcast_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, podcast_id, fatal=False)
+        metadata = self._extract_metadata(webpage, podcast_id)
+
+        def page_func(page_num):
+            api_response = self._download_json(
+                f'https://api.tunein.com/profiles/{podcast_id}/contents', podcast_id,
+                note=f'Downloading page {page_num + 1}', query={
+                    'filter': 't:free',
+                    'offset': page_num * self._PAGE_SIZE,
+                    'limit': self._PAGE_SIZE,
+                })
 
-class TuneInTopicIE(TuneInBaseIE):
-    IE_NAME = 'tunein:topic'
-    _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:topic/.*?TopicId=|embed/player/t)(?P<id>\d+)'
-    _API_URL_QUERY = '?tuneType=Topic&topicId=%s'
+            return [
+                self.url_result(
+                    f'https://tunein.com/podcasts/{podcast_id}?topicId={episode["GuideId"][1:]}',
+                    TuneInPodcastEpisodeIE, title=episode.get('Title'))
+                for episode in api_response['Items']]
+
+        entries = OnDemandPagedList(page_func, self._PAGE_SIZE)
+        return self.playlist_result(
+            entries, playlist_id=podcast_id, title=traverse_obj(metadata, ('profiles', podcast_id, 'title')),
+            description=traverse_obj(metadata, ('profiles', podcast_id, 'description')))
+
+
+class TuneInPodcastEpisodeIE(TuneInBaseIE):
+    _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/podcasts/(?:[^?&]+-)?(?P<podcast_id>p\d+)/?\?topicId=(?P<id>\w\d+)'
 
     _TESTS = [{
-        'url': 'http://tunein.com/topic/?TopicId=101830576',
-        'md5': 'c31a39e6f988d188252eae7af0ef09c9',
+        'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354',
         'info_dict': {
-            'id': '101830576',
-            'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)',
+            'id': 't236404354',
+            'title': '#351 \u2013 MrBeast: Future of YouTube, Twitter, TikTok, and Instagram',
+            'description': 'md5:e1734db6f525e472c0c290d124a2ad77',
+            'thumbnail': 're:^https?://[^?&]+/p1153019',
+            'timestamp': 1673458571,
+            'upload_date': '20230111',
+            'series_id': 'p1153019',
+            'series': 'Lex Fridman Podcast',
             'ext': 'mp3',
-            'location': 'Belgium',
         },
-    }, {
-        'url': 'http://tunein.com/embed/player/t101830576/',
-        'only_matching': True,
     }]
 
+    def _real_extract(self, url):
+        podcast_id, episode_id = self._match_valid_url(url).group('podcast_id', 'id')
+        episode_id = f't{episode_id}'
+
+        webpage = self._download_webpage(url, episode_id)
+        metadata = self._extract_metadata(webpage, episode_id)
+
+        formats, subtitles = self._extract_formats_and_subtitles(episode_id)
+        return {
+            'id': episode_id,
+            'title': traverse_obj(metadata, ('profiles', episode_id, 'title')),
+            'description': traverse_obj(metadata, ('profiles', episode_id, 'description')),
+            'thumbnail': traverse_obj(metadata, ('profiles', episode_id, 'image')),
+            'timestamp': parse_iso8601(
+                traverse_obj(metadata, ('profiles', episode_id, 'actions', 'play', 'publishTime'))),
+            'series_id': podcast_id,
+            'series': traverse_obj(metadata, ('profiles', podcast_id, 'title')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
 
 class TuneInShortenerIE(InfoExtractor):
     IE_NAME = 'tunein:shortener'
@@ -154,10 +206,13 @@ class TuneInShortenerIE(InfoExtractor):
         # test redirection
         'url': 'http://tun.in/ser7s',
         'info_dict': {
-            'id': '34682',
-            'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2',
+            'id': 's34682',
+            'title': 're:^Jazz24',
+            'description': 'md5:d6d0b89063fd68d529fa7058ee98619b',
+            'thumbnail': 're:^https?://[^?&]+/s34682',
+            'location': 'Seattle-Tacoma, US',
             'ext': 'mp3',
-            'location': 'Tacoma, WA',
+            'live_status': 'is_live',
         },
         'params': {
             'skip_download': True,  # live stream
@@ -169,6 +224,11 @@ class TuneInShortenerIE(InfoExtractor):
         # The server doesn't support HEAD requests
         urlh = self._request_webpage(
             url, redirect_id, note='Downloading redirect page')
-        url = urlh.geturl()
+
+        url = urlh.url
+        url_parsed = urllib.parse.urlparse(url)
+        if url_parsed.port == 443:
+            url = url_parsed._replace(netloc=url_parsed.hostname).url
+
         self.to_screen('Following redirect: %s' % url)
         return self.url_result(url)
diff --git a/hypervideo_dl/extractor/tv2.py b/hypervideo_dl/extractor/tv2.py
index c51e633..f6b452d 100644
--- a/hypervideo_dl/extractor/tv2.py
+++ b/hypervideo_dl/extractor/tv2.py
@@ -1,7 +1,7 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -57,8 +57,8 @@ class TV2IE(InfoExtractor):
                 headers={'content-type': 'application/json'},
                 data='{"device":{"id":"1-1-1","name":"Nettleser (HTML)"}}'.encode())['playback']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
-                error = self._parse_json(e.cause.read().decode(), video_id)['error']
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+                error = self._parse_json(e.cause.response.read().decode(), video_id)['error']
                 error_code = error.get('code')
                 if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION':
                     self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
@@ -211,8 +211,8 @@ class KatsomoIE(InfoExtractor):
                 api_base + '/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % protocol,
                 video_id, 'Downloading play JSON')['playback']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
-                error = self._parse_json(e.cause.read().decode(), video_id)['error']
+            if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+                error = self._parse_json(e.cause.response.read().decode(), video_id)['error']
                 error_code = error.get('code')
                 if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION':
                     self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
diff --git a/hypervideo_dl/extractor/tv4.py b/hypervideo_dl/extractor/tv4.py
index 1378a6f..10a2fe6 100644
--- a/hypervideo_dl/extractor/tv4.py
+++ b/hypervideo_dl/extractor/tv4.py
@@ -2,8 +2,11 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    bool_or_none,
     int_or_none,
     parse_iso8601,
+    traverse_obj,
+    url_or_none,
 )
@@ -20,19 +23,25 @@ class TV4IE(InfoExtractor):
                 sport/|
             )
         )(?P<id>[0-9]+)'''
-    _GEO_COUNTRIES = ['SE']
+    _GEO_BYPASS = False
     _TESTS = [
         {
+            # not geo-restricted
            'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650',
             'md5': 'cb837212f342d77cec06e6dad190e96d',
             'info_dict': {
                 'id': '2491650',
                 'ext': 'mp4',
                 'title': 'Kalla Fakta 5 (english subtitles)',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'timestamp': int,
+                'description': '2491650',
+                'series': 'Kalla fakta',
+                'duration': 1335,
+                'thumbnail': r're:^https?://[^/?#]+/api/v2/img/',
+                'timestamp': 1385373240,
                 'upload_date': '20131125',
             },
+            'params': {'skip_download': 'm3u8'},
+            'expected_warnings': ['Unable to download f4m manifest'],
         },
         {
            'url': 'http://www.tv4play.se/iframe/video/3054113',
@@ -46,6 +55,7 @@ class TV4IE(InfoExtractor):
                 'timestamp': int,
                 'upload_date': '20150130',
             },
+            'skip': '404 Not Found',
         },
         {
             'url': 'http://www.tv4play.se/sport/3060959',
@@ -69,29 +79,28 @@ class TV4IE(InfoExtractor):
         }
     ]
 
+    def _call_api(self, endpoint, video_id, headers=None, query={}):
+        return self._download_json(
+            f'https://playback2.a2d.tv/{endpoint}/{video_id}', video_id,
+            f'Downloading {endpoint} API JSON', headers=headers, query={
+                'service': 'tv4',
+                'device': 'browser',
+                'protocol': 'hls',
+                **query,
+            })
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        info = self._download_json(
-            'https://playback-api.b17g.net/asset/%s' % video_id,
-            video_id, 'Downloading video info JSON', query={
-                'service': 'tv4',
-                'device': 'browser',
-                'protocol': 'hls,dash',
-                'drm': 'widevine',
-            })['metadata']
+        info = traverse_obj(self._call_api('asset', video_id, query={
+            'protocol': 'hls,dash',
+            'drm': 'widevine',
+        }), ('metadata', {dict})) or {}
 
-        title = info['title']
+        manifest_url = self._call_api(
+            'play', video_id, headers=self.geo_verification_headers())['playbackItem']['manifestUrl']
 
-        manifest_url = self._download_json(
-            'https://playback-api.b17g.net/media/' + video_id,
-            video_id, query={
-                'service': 'tv4',
-                'device': 'browser',
-                'protocol': 'hls',
-            })['playbackItem']['manifestUrl']
 
-        formats = []
-        subtitles = {}
+        formats, subtitles = [], {}
 
         fmts, subs = self._extract_m3u8_formats_and_subtitles(
             manifest_url, video_id, 'mp4',
@@ -117,20 +126,24 @@ class TV4IE(InfoExtractor):
         subtitles = self._merge_subtitles(subtitles, subs)
 
         if not formats and info.get('is_geo_restricted'):
-            self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
+            self.raise_geo_restricted(
+                'This video is not available from your location due to geo-restriction, or not being authenticated',
+                countries=['SE'])
 
         return {
             'id': video_id,
-            'title': title,
             'formats': formats,
             'subtitles': subtitles,
-            'description': info.get('description'),
-            'timestamp': parse_iso8601(info.get('broadcast_date_time')),
-            'duration': int_or_none(info.get('duration')),
-            'thumbnail': info.get('image'),
-            'is_live': info.get('isLive') is True,
-            'series': info.get('seriesTitle'),
-            'season_number': int_or_none(info.get('seasonNumber')),
-            'episode': info.get('episodeTitle'),
-            'episode_number': int_or_none(info.get('episodeNumber')),
+            **traverse_obj(info, {
+                'title': ('title', {str}),
+                'description': ('description', {str}),
+                'timestamp': (('broadcast_date_time', 'broadcastDateTime'), {parse_iso8601}),
+                'duration': ('duration', {int_or_none}),
+                'thumbnail': ('image', {url_or_none}),
+                'is_live': ('isLive', {bool_or_none}),
+                'series': ('seriesTitle', {str}),
+                'season_number': ('seasonNumber', {int_or_none}),
+                'episode': ('episodeTitle', {str}),
+                'episode_number': ('episodeNumber', {int_or_none}),
+            }, get_all=False),
         }
diff --git a/hypervideo_dl/extractor/tvp.py b/hypervideo_dl/extractor/tvp.py
index 8483564..2aa0dd8 100644
--- a/hypervideo_dl/extractor/tvp.py
+++ b/hypervideo_dl/extractor/tvp.py
@@ -268,8 +268,11 @@ class TVPIE(InfoExtractor):
 
 class TVPStreamIE(InfoExtractor):
     IE_NAME = 'tvp:stream'
-    _VALID_URL = r'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
+    _VALID_URL = r'(?:tvpstream:|https?://(?:tvpstream\.vod|stream)\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)'
     _TESTS = [{
+        'url': 'https://stream.tvp.pl/?channel_id=56969941',
+        'only_matching': True,
+    }, {
untestable as "video" id changes many times across a day 'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455', 'only_matching': True, @@ -285,28 +288,21 @@ class TVPStreamIE(InfoExtractor): 'only_matching': True, }] - _PLAYER_BOX_RE = r'<div\s[^>]*id\s*=\s*["\']?tvp_player_box["\']?[^>]+data-%s-id\s*=\s*["\']?(\d+)' - _BUTTON_RE = r'<div\s[^>]*data-channel-id=["\']?%s["\']?[^>]*\sdata-title=(?:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^"]*)"|\'([^\']*)\')' - def _real_extract(self, url): channel_id = self._match_id(url) - channel_url = self._proto_relative_url('//tvpstream.vod.tvp.pl/?channel_id=%s' % channel_id or 'default') - webpage = self._download_webpage(channel_url, channel_id, 'Downloading channel webpage') - if not channel_id: - channel_id = self._search_regex(self._PLAYER_BOX_RE % 'channel', - webpage, 'default channel id') - video_id = self._search_regex(self._PLAYER_BOX_RE % 'video', - webpage, 'video id') - audition_title, station_name = self._search_regex( - self._BUTTON_RE % (re.escape(channel_id)), webpage, - 'audition title and station name', - group=(1, 2)) + channel_url = self._proto_relative_url('//stream.tvp.pl/?channel_id=%s' % channel_id or 'default') + webpage = self._download_webpage(channel_url, channel_id or 'default', 'Downloading channel webpage') + channels = self._search_json( + r'window\.__channels\s*=', webpage, 'channel list', channel_id, + contains_pattern=r'\[\s*{(?s:.+)}\s*]') + channel = traverse_obj(channels, (lambda _, v: channel_id == str(v['id'])), get_all=False) if channel_id else channels[0] + audition = traverse_obj(channel, ('items', lambda _, v: v['is_live'] is True), get_all=False) return { '_type': 'url_transparent', - 'id': channel_id, - 'url': 'tvp:%s' % video_id, - 'title': audition_title, - 'alt_title': station_name, + 'id': channel_id or channel['id'], + 'url': 'tvp:%s' % audition['video_id'], + 'title': audition.get('title'), + 'alt_title': channel.get('title'), 'is_live': True, 'ie_key': 'TVPEmbed', } @@ -486,21 +482,34 @@ class TVPEmbedIE(InfoExtractor): class TVPVODBaseIE(InfoExtractor): _API_BASE_URL = 'https://vod.tvp.pl/api/products' - def _call_api(self, resource, video_id, **kwargs): - return self._download_json( + def _call_api(self, resource, video_id, query={}, **kwargs): + is_valid = lambda x: 200 <= x < 300 + document, urlh = self._download_json_handle( f'{self._API_BASE_URL}/{resource}', video_id, - query={'lang': 'pl', 'platform': 'BROWSER'}, **kwargs) - - def _parse_video(self, video): - return { - '_type': 'url', - 'url': 'tvp:' + video['externalUid'], - 'ie_key': TVPEmbedIE.ie_key(), - 'title': video.get('title'), - 'description': traverse_obj(video, ('lead', 'description')), - 'age_limit': int_or_none(video.get('rating')), - 'duration': int_or_none(video.get('duration')), - } + query={'lang': 'pl', 'platform': 'BROWSER', **query}, + expected_status=lambda x: is_valid(x) or 400 <= x < 500, **kwargs) + if is_valid(urlh.status): + return document + raise ExtractorError(f'Woronicza said: {document.get("code")} (HTTP {urlh.status})') + + def _parse_video(self, video, with_url=True): + info_dict = traverse_obj(video, { + 'id': ('id', {str_or_none}), + 'title': 'title', + 'age_limit': ('rating', {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'episode_number': ('number', {int_or_none}), + 'series': ('season', 'serial', 'title', {str_or_none}), + 'thumbnails': ('images', ..., ..., {'url': ('url', {url_or_none})}), + }) + info_dict['description'] = clean_html(dict_get(video, ('lead', 'description'))) 
+        if with_url:
+            info_dict.update({
+                '_type': 'url',
+                'url': video['webUrl'],
+                'ie_key': TVPVODVideoIE.ie_key(),
+            })
+        return info_dict
 
 
 class TVPVODVideoIE(TVPVODBaseIE):
@@ -510,37 +519,70 @@
     _TESTS = [{
         'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357',
         'info_dict': {
-            'id': '60468609',
+            'id': '311357',
             'ext': 'mp4',
-            'title': 'Laboratorium alchemika, Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24',
+            'title': 'Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24',
             'description': 'md5:1d4098d3e537092ccbac1abf49b7cd4c',
             'duration': 300,
             'episode_number': 24,
             'episode': 'Episode 24',
             'age_limit': 0,
             'series': 'Laboratorium alchemika',
-            'thumbnail': 're:https://.+',
+            'thumbnail': 're:https?://.+',
         },
+        'params': {'skip_download': 'm3u8'},
     }, {
         'url': 'https://vod.tvp.pl/filmy-dokumentalne,163/ukrainski-sluga-narodu,339667',
         'info_dict': {
-            'id': '51640077',
+            'id': '339667',
             'ext': 'mp4',
-            'title': 'Ukraiński sługa narodu, Ukraiński sługa narodu',
-            'series': 'Ukraiński sługa narodu',
             'description': 'md5:b7940c0a8e439b0c81653a986f544ef3',
             'age_limit': 12,
-            'episode': 'Episode 0',
-            'episode_number': 0,
             'duration': 3051,
-            'thumbnail': 're:https://.+',
+            'thumbnail': 're:https?://.+',
+            'subtitles': 'count:2',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'note': 'embed fails with "payment required"',
+        'url': 'https://vod.tvp.pl/seriale,18/polowanie-na-cmy-odcinki,390116/odcinek-7,S01E07,398869',
+        'info_dict': {
+            'id': '398869',
+            'ext': 'mp4',
+            'title': 'odc. 7',
+            'description': 'md5:dd2bb33f023dc5c2fbaddfbe4cb5dba0',
+            'duration': 2750,
+            'age_limit': 16,
+            'series': 'Polowanie na ćmy',
+            'episode_number': 7,
+            'episode': 'Episode 7',
+            'thumbnail': 're:https?://.+',
         },
+        'params': {'skip_download': 'm3u8'},
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        return self._parse_video(self._call_api(f'vods/{video_id}', video_id))
+        info_dict = self._parse_video(self._call_api(f'vods/{video_id}', video_id), with_url=False)
+
+        playlist = self._call_api(f'{video_id}/videos/playlist', video_id, query={'videoType': 'MOVIE'})
+
+        info_dict['formats'] = []
+        for manifest_url in traverse_obj(playlist, ('sources', 'HLS', ..., 'src')):
+            info_dict['formats'].extend(self._extract_m3u8_formats(manifest_url, video_id, fatal=False))
+        for manifest_url in traverse_obj(playlist, ('sources', 'DASH', ..., 'src')):
+            info_dict['formats'].extend(self._extract_mpd_formats(manifest_url, video_id, fatal=False))
+
+        info_dict['subtitles'] = {}
+        for sub in playlist.get('subtitles') or []:
+            info_dict['subtitles'].setdefault(sub.get('language') or 'und', []).append({
+                'url': sub['url'],
+                'ext': 'ttml',
+            })
+
+        return info_dict
 
 
 class TVPVODSeriesIE(TVPVODBaseIE):
@@ -555,7 +597,7 @@ class TVPVODSeriesIE(TVPVODBaseIE):
         'age_limit': 12,
         'categories': ['seriale'],
         },
-        'playlist_count': 129,
+        'playlist_count': 130,
     }, {
         'url': 'https://vod.tvp.pl/programy,88/rolnik-szuka-zony-odcinki,284514',
         'only_matching': True,
diff --git a/hypervideo_dl/extractor/tvplay.py b/hypervideo_dl/extractor/tvplay.py
index 9ef4f96..48a6efe 100644
--- a/hypervideo_dl/extractor/tvplay.py
+++ b/hypervideo_dl/extractor/tvplay.py
@@ -1,10 +1,8 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_urlparse,
-)
+from ..compat import compat_urlparse
+from ..networking.exceptions import HTTPError
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -30,10 +28,7 @@ class TVPlayIE(InfoExtractor):
                     (?:
                         tvplay(?:\.skaties)?\.lv(?:/parraides)?|
                         (?:tv3play|play\.tv3)\.lt(?:/programos)?|
-                        tv3play(?:\.tv3)?\.ee/sisu|
-                        (?:tv(?:3|6|8|10)play)\.se/program|
-                        (?:(?:tv3play|viasat4play|tv6play)\.no|(?:tv3play)\.dk)/programmer|
-                        play\.nova(?:tv)?\.bg/programi
+                        tv3play(?:\.tv3)?\.ee/sisu
                     )
                     /(?:[^/]+/)+
                 )
@@ -93,117 +88,6 @@ class TVPlayIE(InfoExtractor):
             },
         },
         {
-            'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true',
-            'info_dict': {
-                'id': '395385',
-                'ext': 'mp4',
-                'title': 'Husräddarna S02E07',
-                'description': 'md5:f210c6c89f42d4fc39faa551be813777',
-                'duration': 2574,
-                'timestamp': 1400596321,
-                'upload_date': '20140520',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true',
-            'info_dict': {
-                'id': '266636',
-                'ext': 'mp4',
-                'title': 'Den sista dokusåpan S01E08',
-                'description': 'md5:295be39c872520221b933830f660b110',
-                'duration': 1492,
-                'timestamp': 1330522854,
-                'upload_date': '20120229',
-                'age_limit': 18,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true',
-            'info_dict': {
-                'id': '282756',
-                'ext': 'mp4',
-                'title': 'Antikjakten S01E10',
-                'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8',
-                'duration': 2646,
-                'timestamp': 1348575868,
-                'upload_date': '20120925',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true',
-            'info_dict': {
-                'id': '230898',
-                'ext': 'mp4',
-                'title': 'Anna Anka søker assistent - Ep. 8',
-                'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474',
-                'duration': 2656,
-                'timestamp': 1277720005,
-                'upload_date': '20100628',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true',
-            'info_dict': {
-                'id': '21873',
-                'ext': 'mp4',
-                'title': 'Budbringerne program 10',
-                'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d',
-                'duration': 1297,
-                'timestamp': 1254205102,
-                'upload_date': '20090929',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true',
-            'info_dict': {
-                'id': '361883',
-                'ext': 'mp4',
-                'title': 'Hotelinspektør Alex Polizzi - Ep. 10',
-                'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81',
-                'duration': 2594,
-                'timestamp': 1393236292,
-                'upload_date': '20140224',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
-            'info_dict': {
-                'id': '624952',
-                'ext': 'flv',
-                'title': 'Здравей, България (12.06.2015 г.)',
-                'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
-                'duration': 8838,
-                'timestamp': 1434100372,
-                'upload_date': '20150612',
-            },
-            'params': {
-                # rtmp download
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true',
-            'only_matching': True,
-        },
-        {
             'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true',
             'only_matching': True,
         },
@@ -243,8 +127,8 @@ class TVPlayIE(InfoExtractor):
                 'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON')
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                msg = self._parse_json(e.cause.read().decode('utf-8'), video_id)
+            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
+                msg = self._parse_json(e.cause.response.read().decode('utf-8'), video_id)
                 raise ExtractorError(msg['msg'], expected=True)
             raise
@@ -327,103 +211,6 @@ class TVPlayIE(InfoExtractor):
         }
 
 
-class ViafreeIE(InfoExtractor):
-    _VALID_URL = r'''(?x)
-                    https?://
-                        (?:www\.)?
-                        viafree\.(?P<country>dk|no|se|fi)
-                        /(?P<id>(?:program(?:mer)?|ohjelmat)?/(?:[^/]+/)+[^/?#&]+)
-                    '''
-    _TESTS = [{
-        'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1',
-        'info_dict': {
-            'id': '757786',
-            'ext': 'mp4',
-            'title': 'Det beste vorspielet - Sesong 2 - Episode 1',
-            'description': 'md5:b632cb848331404ccacd8cd03e83b4c3',
-            'series': 'Det beste vorspielet',
-            'season_number': 2,
-            'duration': 1116,
-            'timestamp': 1471200600,
-            'upload_date': '20160814',
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        'url': 'https://www.viafree.dk/programmer/humor/comedy-central-roast-of-charlie-sheen/film/1047660',
-        'info_dict': {
-            'id': '1047660',
-            'ext': 'mp4',
-            'title': 'Comedy Central Roast of Charlie Sheen - Comedy Central Roast of Charlie Sheen',
-            'description': 'md5:ec956d941ae9fd7c65a48fd64951dc6d',
-            'series': 'Comedy Central Roast of Charlie Sheen',
-            'season_number': 1,
-            'duration': 3747,
-            'timestamp': 1608246060,
-            'upload_date': '20201217'
-        },
-        'params': {
-            'skip_download': True
-        }
-    }, {
-        # with relatedClips
-        'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1',
-        'only_matching': True,
-    }, {
-        # Different og:image URL schema
-        'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.viafree.fi/ohjelmat/entertainment/amazing-makeovers/kausi-7/jakso-2',
-        'only_matching': True,
-    }]
-    _GEO_BYPASS = False
-
-    def _real_extract(self, url):
-        country, path = self._match_valid_url(url).groups()
-        content = self._download_json(
-            'https://viafree-content.mtg-api.com/viafree-content/v1/%s/path/%s' % (country, path), path)
-        program = content['_embedded']['viafreeBlocks'][0]['_embedded']['program']
-        guid = program['guid']
-        meta = content['meta']
-        title = meta['title']
-
-        try:
-            stream_href = self._download_json(
-                program['_links']['streamLink']['href'], guid,
-                headers=self.geo_verification_headers())['embedded']['prioritizedStreams'][0]['links']['stream']['href']
-        except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                self.raise_geo_restricted(countries=[country])
-            raise
-
-        formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_href, guid, 'mp4')
-        episode = program.get('episode') or {}
-        return {
-            'id': guid,
-            'title': title,
-            'thumbnail': meta.get('image'),
-            'description': meta.get('description'),
-            'series': episode.get('seriesTitle'),
-            'subtitles': subtitles,
-            'episode_number': int_or_none(episode.get('episodeNumber')),
-            'season_number': int_or_none(episode.get('seasonNumber')),
-            'duration': int_or_none(try_get(program, lambda x: x['video']['duration']['milliseconds']), 1000),
-            'timestamp': parse_iso8601(try_get(program, lambda x: x['availability']['start'])),
-            'formats': formats,
-        }
-
-
 class TVPlayHomeIE(InfoExtractor):
     _VALID_URL = r'''(?x)
             https?://
diff --git a/hypervideo_dl/extractor/tvplayer.py b/hypervideo_dl/extractor/tvplayer.py
index b05355f..228c236 100644
--- a/hypervideo_dl/extractor/tvplayer.py
+++ b/hypervideo_dl/extractor/tvplayer.py
@@ -1,8 +1,6 @@
 from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_str,
-)
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
 from ..utils import (
     extract_attributes,
     try_get,
@@ -64,9 +62,9 @@ class TVPlayerIE(InfoExtractor):
                     'validate': validate,
                 }))['tvplayer']['response']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError):
+            if isinstance(e.cause, HTTPError):
                 response = self._parse_json(
-                    e.cause.read().decode(), resource_id)['tvplayer']['response']
+                    e.cause.response.read().decode(), resource_id)['tvplayer']['response']
                 raise ExtractorError(
                     '%s said: %s' % (self.IE_NAME, response['error']), expected=True)
             raise
diff --git a/hypervideo_dl/extractor/twitcasting.py b/hypervideo_dl/extractor/twitcasting.py
index 30bc987..ede1085 100644
--- a/hypervideo_dl/extractor/twitcasting.py
+++ b/hypervideo_dl/extractor/twitcasting.py
@@ -38,7 +38,7 @@ class TwitCastingIE(InfoExtractor):
             'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.',
             'thumbnail': r're:^https?://.*\.jpg$',
             'upload_date': '20110822',
-            'timestamp': 1314010824,
+            'timestamp': 1313978424,
             'duration': 32,
             'view_count': int,
         },
@@ -52,10 +52,10 @@ class TwitCastingIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Live playing something #3689740',
             'uploader_id': 'mttbernardini',
-            'description': 'Salve, io sono Matto (ma con la e). Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.',
+            'description': 'md5:1dc7efa2f1ab932fcd119265cebeec69',
             'thumbnail': r're:^https?://.*\.jpg$',
-            'upload_date': '20120212',
-            'timestamp': 1329028024,
+            'upload_date': '20120211',
+            'timestamp': 1328995624,
             'duration': 681,
             'view_count': int,
         },
@@ -64,15 +64,22 @@ class TwitCastingIE(InfoExtractor):
             'videopassword': 'abc',
         },
     }, {
-        'note': 'archive is split in 2 parts',
         'url': 'https://twitcasting.tv/loft_heaven/movie/685979292',
         'info_dict': {
             'id': '685979292',
             'ext': 'mp4',
-            'title': '南波一海のhear_here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”',
-            'duration': 6964.599334,
+            'title': '【無料配信】南波一海のhear/here “ナタリー望月哲さんに聞く編集と「渋谷系狂騒曲」”',
+            'uploader_id': 'loft_heaven',
+            'description': 'md5:3a0c7b53019df987ce545c935538bacf',
+            'upload_date': '20210604',
+            'timestamp': 1622802114,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 6964,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
         },
-        'playlist_mincount': 2,
     }]
 
     def _parse_data_movie_playlist(self, dmp, video_id):
@@ -88,18 +95,21 @@ class TwitCastingIE(InfoExtractor):
     def _real_extract(self, url):
         uploader_id, video_id = self._match_valid_url(url).groups()
 
+        webpage, urlh = self._download_webpage_handle(url, video_id)
         video_password = self.get_param('videopassword')
         request_data = None
         if video_password:
             request_data = urlencode_postdata({
                 'password': video_password,
+                **self._hidden_inputs(webpage),
             }, encoding='utf-8')
-        webpage, urlh = self._download_webpage_handle(
-            url, video_id, data=request_data,
-            headers={'Origin': 'https://twitcasting.tv'})
-        if urlh.geturl() != url and request_data:
+            webpage, urlh = self._download_webpage_handle(
+                url, video_id, data=request_data,
+                headers={'Origin': 'https://twitcasting.tv'},
+                note='Trying video password')
+        if urlh.url != url and request_data:
             webpage = self._download_webpage(
-                urlh.geturl(), video_id, data=request_data,
+                urlh.url, video_id, data=request_data,
                 headers={'Origin': 'https://twitcasting.tv'},
                 note='Retrying authentication')
         # has to check here as the first request can contain password input form even if the password is correct
@@ -122,7 +132,7 @@ class TwitCastingIE(InfoExtractor):
         duration = (try_get(video_js_data, lambda x: sum(float_or_none(y.get('duration')) for y in x) / 1000)
                     or parse_duration(clean_html(get_element_by_class('tw-player-duration-time', webpage))))
         view_count = str_to_int(self._search_regex(
-            (r'Total\s*:\s*([\d,]+)\s*Views', r'総視聴者\s*:\s*([\d,]+)\s*</'), webpage, 'views', None))
+            (r'Total\s*:\s*Views\s*([\d,]+)', r'総視聴者\s*:\s*([\d,]+)\s*</'), webpage, 'views', None))
         timestamp = unified_timestamp(self._search_regex(
             r'data-toggle="true"[^>]+datetime="([^"]+)"',
             webpage, 'datetime', None))
diff --git a/hypervideo_dl/extractor/twitch.py b/hypervideo_dl/extractor/twitch.py
index c59d1cf..3297ef0 100644
--- a/hypervideo_dl/extractor/twitch.py
+++ b/hypervideo_dl/extractor/twitch.py
@@ -41,23 +41,27 @@ class TwitchBaseIE(InfoExtractor):
     _USHER_BASE = 'https://usher.ttvnw.net'
     _LOGIN_FORM_URL = 'https://www.twitch.tv/login'
     _LOGIN_POST_URL = 'https://passport.twitch.tv/login'
-    _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko'
     _NETRC_MACHINE = 'twitch'
 
     _OPERATION_HASHES = {
         'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14',
         'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb',
         'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777',
-        'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84',
-        'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e',
+        'ChannelCollectionsContent': '447aec6a0cc1e8d0a8d7732d47eb0762c336a2294fdb009e9c9d854e49d484b9',
+        'StreamMetadata': 'a647c2a13599e5991e175155f798ca7f1ecddde73f7f341f39009c14dbf59962',
         'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01',
         'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11',
         'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c',
-        'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687',
+        'VideoMetadata': '49b5b8f268cdeb259d75b58dcb0c1a748e3b575003448a2333dc5cdafd49adad',
         'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41',
         'VideoPlayer_VODSeekbarPreviewVideo': '07e99e4d56c5a7c67117a154777b0baf85a5ffefa393b213f4bc712ccaf85dd6',
     }
 
+    @property
+    def _CLIENT_ID(self):
+        return self._configuration_arg(
+            'client_id', ['ue6666qo983tsx6so1t0vnawi233wa'], ie_key='Twitch', casesense=True)[0]
+
     def _perform_login(self, username, password):
         def fail(message):
             raise ExtractorError(
@@ -67,7 +71,7 @@ class TwitchBaseIE(InfoExtractor):
             form = self._hidden_inputs(page)
             form.update(data)
 
-            page_url = urlh.geturl()
+            page_url = urlh.url
             post_url = self._search_regex(
                 r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page,
                 'post url', default=self._LOGIN_POST_URL, group='url')
@@ -179,6 +183,14 @@ class TwitchBaseIE(InfoExtractor):
             video_id, ops,
             'Downloading %s access token GraphQL' % token_kind)['data'][method]
 
+    def _get_thumbnails(self, thumbnail):
+        return [{
+            'url': re.sub(r'\d+x\d+(\.\w+)($|(?=[?#]))', r'0x0\g<1>', thumbnail),
+            'preference': 1,
+        }, {
+            'url': thumbnail,
+        }] if thumbnail else None
+
 
 class TwitchVodIE(TwitchBaseIE):
     IE_NAME = 'twitch:vod'
@@ -186,7 +198,8 @@ class TwitchVodIE(TwitchBaseIE):
         https?://
             (?:
                 (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/|
-                player\.twitch\.tv/\?.*?\bvideo=v?
+                player\.twitch\.tv/\?.*?\bvideo=v?|
+                www\.twitch\.tv/[^/]+/schedule\?vodID=
             )
             (?P<id>\d+)
         '''
@@ -355,6 +368,9 @@ class TwitchVodIE(TwitchBaseIE):
             'skip_download': True
         },
         'expected_warnings': ['Unable to download JSON metadata: HTTP Error 403: Forbidden']
+    }, {
+        'url': 'https://www.twitch.tv/tangotek/schedule?vodID=1822395420',
+        'only_matching': True,
     }]
 
     def _download_info(self, item_id):
@@ -380,13 +396,14 @@ class TwitchVodIE(TwitchBaseIE):
             }],
             'Downloading stream metadata GraphQL')
 
-        video = traverse_obj(data, (0, 'data', 'video'))
-        video['moments'] = traverse_obj(data, (1, 'data', 'video', 'moments', 'edges', ..., 'node'))
-        video['storyboard'] = traverse_obj(data, (2, 'data', 'video', 'seekPreviewsURL'), expected_type=url_or_none)
-
+        video = traverse_obj(data, (..., 'data', 'video'), get_all=False)
         if video is None:
-            raise ExtractorError(
-                'Video %s does not exist' % item_id, expected=True)
+            raise ExtractorError(f'Video {item_id} does not exist', expected=True)
+
+        video['moments'] = traverse_obj(data, (..., 'data', 'video', 'moments', 'edges', ..., 'node'))
+        video['storyboard'] = traverse_obj(
+            data, (..., 'data', 'video', 'seekPreviewsURL', {url_or_none}), get_all=False)
+
         return video
 
     def _extract_info(self, info):
@@ -455,19 +472,17 @@ class TwitchVodIE(TwitchBaseIE):
         thumbnail = url_or_none(info.get('previewThumbnailURL'))
         is_live = None
         if thumbnail:
-            if thumbnail.endswith('/404_processing_{width}x{height}.png'):
+            if re.findall(r'/404_processing_[^.?#]+\.png', thumbnail):
                 is_live, thumbnail = True, None
             else:
                 is_live = False
-                for p in ('width', 'height'):
-                    thumbnail = thumbnail.replace('{%s}' % p, '0')
 
         return {
             'id': vod_id,
             'title': info.get('title') or 'Untitled Broadcast',
             'description': info.get('description'),
             'duration': int_or_none(info.get('lengthSeconds')),
-            'thumbnail': thumbnail,
+            'thumbnails': self._get_thumbnails(thumbnail),
             'uploader': try_get(info, lambda x: x['owner']['displayName'], compat_str),
             'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str),
             'timestamp': unified_timestamp(info.get('publishedAt')),
@@ -854,6 +869,13 @@ class TwitchVideosCollectionsIE(TwitchPlaylistBaseIE):
             'title': 'spamfish - Collections',
         },
         'playlist_mincount': 3,
+    }, {
+        'url': 'https://www.twitch.tv/monstercat/videos?filter=collections',
+        'info_dict': {
+            'id': 'monstercat',
+            'title': 'monstercat - Collections',
+        },
+        'playlist_mincount': 13,
     }]
 
     _OPERATION_NAME = 'ChannelCollectionsContent'
@@ -922,6 +944,7 @@ class TwitchStreamIE(TwitchBaseIE):
             # m3u8 download
             'skip_download': True,
         },
+        'skip': 'User does not exist',
     }, {
         'url': 'http://www.twitch.tv/miracle_doto#profile-0',
         'only_matching': True,
@@ -934,6 +957,25 @@ class TwitchStreamIE(TwitchBaseIE):
     }, {
         'url': 'https://m.twitch.tv/food',
         'only_matching': True,
+    }, {
+        'url': 'https://www.twitch.tv/monstercat',
+        'info_dict': {
+            'id': '40500071752',
+            'display_id': 'monstercat',
+            'title': 're:Monstercat',
+            'description': 'md5:0945ad625e615bc8f0469396537d87d9',
+            'is_live': True,
+            'timestamp': 1677107190,
+            'upload_date': '20230222',
+            'uploader': 'Monstercat',
+            'uploader_id': 'monstercat',
+            'live_status': 'is_live',
+            'thumbnail': 're:https://.*.jpg',
+            'ext': 'mp4',
+        },
+        'params': {
+            'skip_download': 'Livestream',
+        },
     }]
 
     @classmethod
@@ -1025,7 +1067,7 @@ class TwitchStreamIE(TwitchBaseIE):
             'display_id': channel_name,
             'title': title,
             'description': description,
-            'thumbnail': thumbnail,
+            'thumbnails': self._get_thumbnails(thumbnail),
             'uploader': uploader,
             'uploader_id': channel_name,
             'timestamp': timestamp,
@@ -1041,7 +1083,7 @@ class TwitchClipsIE(TwitchBaseIE):
         https?://
             (?:
                 clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|
-                (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/
+                (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/)?clip/
             )
             (?P<id>[^/?#&]+)
         '''
@@ -1077,6 +1119,9 @@ class TwitchClipsIE(TwitchBaseIE):
     }, {
         'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank',
         'only_matching': True,
+    }, {
+        'url': 'https://m.twitch.tv/clip/FaintLightGullWholeWheat',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/twitter.py b/hypervideo_dl/extractor/twitter.py
index 18ebb36..66d1eb8 100644
--- a/hypervideo_dl/extractor/twitter.py
+++ b/hypervideo_dl/extractor/twitter.py
@@ -1,10 +1,9 @@
+import functools
 import json
 import re
-import urllib.error
 
 from .common import InfoExtractor
 from .periscope import PeriscopeBaseIE, PeriscopeIE
-from ..compat import functools  # isort: split
 from ..compat import (
     compat_parse_qs,
     compat_urllib_parse_unquote,
@@ -13,10 +12,12 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     dict_get,
+    filter_dict,
     float_or_none,
     format_field,
     int_or_none,
     make_archive_id,
+    remove_end,
     str_or_none,
     strip_or_none,
     traverse_obj,
@@ -30,13 +31,67 @@ from ..utils import (
 
 class TwitterBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'twitter'
     _API_BASE = 'https://api.twitter.com/1.1/'
     _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/'
-    _TOKENS = {
-        'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA': None,
-        'AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw': None,
-    }
     _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
+    _AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
+    _LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE'
+    _flow_token = None
+
+    _LOGIN_INIT_DATA = json.dumps({
+        'input_flow_data': {
+            'flow_context': {
+                'debug_overrides': {},
+                'start_location': {
+                    'location': 'unknown'
+                }
+            }
+        },
+        'subtask_versions': {
+            'action_list': 2,
+            'alert_dialog': 1,
+            'app_download_cta': 1,
+            'check_logged_in_account': 1,
+            'choice_selection': 3,
+            'contacts_live_sync_permission_prompt': 0,
+            'cta': 7,
+            'email_verification': 2,
+            'end_flow': 1,
+            'enter_date': 1,
+            'enter_email': 2,
+            'enter_password': 5,
+            'enter_phone': 2,
+            'enter_recaptcha': 1,
+            'enter_text': 5,
+            'enter_username': 2,
+            'generic_urt': 3,
+            'in_app_notification': 1,
+            'interest_picker': 3,
+            'js_instrumentation': 1,
+            'menu_dialog': 1,
+            'notifications_permission_prompt': 2,
+            'open_account': 2,
+            'open_home_timeline': 1,
+            'open_link': 1,
+            'phone_verification': 4,
+            'privacy_options': 1,
+            'security_key': 3,
+            'select_avatar': 4,
+            'select_banner': 2,
+            'settings_list': 7,
+            'show_code': 1,
+            'sign_up': 2,
+            'sign_up_review': 4,
+            'tweet_selection_urt': 1,
+            'update_users': 1,
+            'upload_media': 1,
+            'user_recommendations_list': 4,
+            'user_recommendations_urt': 1,
+            'wait_spinner': 3,
+            'web_modal': 1
+        }
+    }, separators=(',', ':')).encode()
 
     def _extract_variant_formats(self, variant, video_id):
         variant_url = variant.get('url')
@@ -88,73 +143,179 @@ class TwitterBaseIE(InfoExtractor):
                 'height': int(m.group('height')),
             })
 
-    @functools.cached_property
+    @property
     def is_logged_in(self):
         return bool(self._get_cookies(self._API_BASE).get('auth_token'))
 
-    def _call_api(self, path, video_id, query={}, graphql=False):
-        cookies = self._get_cookies(self._API_BASE)
-        headers = {}
+    def _fetch_guest_token(self, display_id):
+        guest_token = traverse_obj(self._download_json(
+            f'{self._API_BASE}guest/activate.json', display_id, 'Downloading guest token', data=b'',
+            headers=self._set_base_headers(legacy=display_id and self._configuration_arg('legacy_api'))),
+            ('guest_token', {str}))
+        if not guest_token:
+            raise ExtractorError('Could not retrieve guest token')
+        return guest_token
+
+    def _set_base_headers(self, legacy=False):
+        bearer_token = self._LEGACY_AUTH if legacy and not self.is_logged_in else self._AUTH
+        return filter_dict({
+            'Authorization': f'Bearer {bearer_token}',
+            'x-csrf-token': try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value),
+        })
+
+    def _call_login_api(self, note, headers, query={}, data=None):
+        response = self._download_json(
+            f'{self._API_BASE}onboarding/task.json', None, note,
+            headers=headers, query=query, data=data, expected_status=400)
+        error = traverse_obj(response, ('errors', 0, 'message', {str}))
+        if error:
+            raise ExtractorError(f'Login failed, Twitter API says: {error}', expected=True)
+        elif traverse_obj(response, 'status') != 'success':
+            raise ExtractorError('Login was unsuccessful')
+
+        subtask = traverse_obj(
+            response, ('subtasks', ..., 'subtask_id', {str}), get_all=False)
+        if not subtask:
+            raise ExtractorError('Twitter API did not return next login subtask')
 
-        csrf_cookie = cookies.get('ct0')
-        if csrf_cookie:
-            headers['x-csrf-token'] = csrf_cookie.value
+        self._flow_token = response['flow_token']
+        return subtask
 
+    def _perform_login(self, username, password):
         if self.is_logged_in:
-            headers.update({
-                'x-twitter-auth-type': 'OAuth2Session',
-                'x-twitter-client-language': 'en',
-                'x-twitter-active-user': 'yes',
-            })
+            return
+
+        webpage = self._download_webpage('https://twitter.com/', None, 'Downloading login page')
+        guest_token = self._search_regex(
+            r'\.cookie\s*=\s*["\']gt=(\d+);', webpage, 'gt', default=None) or self._fetch_guest_token(None)
+        headers = {
+            **self._set_base_headers(),
+            'content-type': 'application/json',
+            'x-guest-token': guest_token,
+            'x-twitter-client-language': 'en',
+            'x-twitter-active-user': 'yes',
+            'Referer': 'https://twitter.com/',
+            'Origin': 'https://twitter.com',
+        }
 
-        last_error = None
-        for bearer_token in self._TOKENS:
-            for first_attempt in (True, False):
-                headers['Authorization'] = f'Bearer {bearer_token}'
-
-                if not self.is_logged_in:
-                    if not self._TOKENS[bearer_token]:
-                        headers.pop('x-guest-token', None)
-                        guest_token_response = self._download_json(
-                            self._API_BASE + 'guest/activate.json', video_id,
-                            'Downloading guest token', data=b'', headers=headers)
-
-                        self._TOKENS[bearer_token] = guest_token_response.get('guest_token')
-                        if not self._TOKENS[bearer_token]:
-                            raise ExtractorError('Could not retrieve guest token')
-
-                    headers['x-guest-token'] = self._TOKENS[bearer_token]
-
-                try:
-                    allowed_status = {400, 403, 404} if graphql else {403}
-                    result = self._download_json(
-                        (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path,
-                        video_id, headers=headers, query=query, expected_status=allowed_status)
-
-                except ExtractorError as e:
-                    if last_error:
-                        raise last_error
-
-                    if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code != 404:
-                        raise
-
-                    last_error = e
-                    self.report_warning(
-                        'Twitter API gave 404 response, retrying with deprecated auth token. '
-                        'Only one media item can be extracted')
-                    break  # continue outer loop with next bearer_token
-
-            if result.get('errors'):
-                errors = traverse_obj(result, ('errors', ..., 'message'), expected_type=str)
-                if first_attempt and any('bad guest token' in error.lower() for error in errors):
-                    self.to_screen('Guest token has expired. Refreshing guest token')
-                    self._TOKENS[bearer_token] = None
-                    continue
+        def build_login_json(*subtask_inputs):
+            return json.dumps({
+                'flow_token': self._flow_token,
+                'subtask_inputs': subtask_inputs
+            }, separators=(',', ':')).encode()
+
+        def input_dict(subtask_id, text):
+            return {
+                'subtask_id': subtask_id,
+                'enter_text': {
+                    'text': text,
+                    'link': 'next_link'
+                }
+            }
+
+        next_subtask = self._call_login_api(
+            'Downloading flow token', headers, query={'flow_name': 'login'}, data=self._LOGIN_INIT_DATA)
+
+        while not self.is_logged_in:
+            if next_subtask == 'LoginJsInstrumentationSubtask':
+                next_subtask = self._call_login_api(
+                    'Submitting JS instrumentation response', headers, data=build_login_json({
+                        'subtask_id': next_subtask,
+                        'js_instrumentation': {
+                            'response': '{}',
+                            'link': 'next_link'
+                        }
+                    }))
+
+            elif next_subtask == 'LoginEnterUserIdentifierSSO':
+                next_subtask = self._call_login_api(
+                    'Submitting username', headers, data=build_login_json({
+                        'subtask_id': next_subtask,
+                        'settings_list': {
+                            'setting_responses': [{
+                                'key': 'user_identifier',
+                                'response_data': {
+                                    'text_data': {
+                                        'result': username
+                                    }
+                                }
+                            }],
+                            'link': 'next_link'
+                        }
+                    }))
+
+            elif next_subtask == 'LoginEnterAlternateIdentifierSubtask':
+                next_subtask = self._call_login_api(
+                    'Submitting alternate identifier', headers,
+                    data=build_login_json(input_dict(next_subtask, self._get_tfa_info(
+                        'one of username, phone number or email that was not used as --username'))))
+
+            elif next_subtask == 'LoginEnterPassword':
+                next_subtask = self._call_login_api(
+                    'Submitting password', headers, data=build_login_json({
+                        'subtask_id': next_subtask,
+                        'enter_password': {
+                            'password': password,
+                            'link': 'next_link'
+                        }
+                    }))
+
+            elif next_subtask == 'AccountDuplicationCheck':
+                next_subtask = self._call_login_api(
+                    'Submitting account duplication check', headers, data=build_login_json({
+                        'subtask_id': next_subtask,
+                        'check_logged_in_account': {
+                            'link': 'AccountDuplicationCheck_false'
+                        }
+                    }))
+
+            elif next_subtask == 'LoginTwoFactorAuthChallenge':
+                next_subtask = self._call_login_api(
+                    'Submitting 2FA token', headers, data=build_login_json(input_dict(
+                        next_subtask, self._get_tfa_info('two-factor authentication token'))))
+
+            elif next_subtask == 'LoginAcid':
+                next_subtask = self._call_login_api(
+                    'Submitting confirmation code', headers, data=build_login_json(input_dict(
+                        next_subtask, self._get_tfa_info('confirmation code sent to your email or phone'))))
+
+            elif next_subtask == 'ArkoseLogin':
+                self.raise_login_required('Twitter is requiring captcha for this login attempt', method='cookies')
+
+            elif next_subtask == 'DenyLoginSubtask':
+                self.raise_login_required('Twitter rejected this login attempt as suspicious', method='cookies')
+
+            elif next_subtask == 'LoginSuccessSubtask':
+                raise ExtractorError('Twitter API did not grant auth token cookie')
 
-                error_message = ', '.join(set(errors)) or 'Unknown error'
-                raise ExtractorError(f'Error(s) while querying API: {error_message}', expected=True)
+            else:
+                raise ExtractorError(f'Unrecognized subtask ID "{next_subtask}"')
+
+        self.report_login()
+
+    def _call_api(self, path, video_id, query={}, graphql=False):
+        headers = self._set_base_headers(legacy=not graphql and self._configuration_arg('legacy_api'))
+        headers.update({
+            'x-twitter-auth-type': 'OAuth2Session',
+            'x-twitter-client-language': 'en',
+            'x-twitter-active-user': 'yes',
+        } if self.is_logged_in else {
+            'x-guest-token': self._fetch_guest_token(video_id)
+        })
+        allowed_status = {400, 401, 403, 404} if graphql else {403}
+        result = self._download_json(
+            (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path,
+            video_id, headers=headers, query=query, expected_status=allowed_status,
+            note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON')
 
-        return result
+        if result.get('errors'):
+            errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str}))))
+            if errors and 'not authorized' in errors:
+                self.raise_login_required(remove_end(errors, '.'))
+            raise ExtractorError(f'Error(s) while querying API: {errors or "Unknown error"}')
+
+        return result
 
     def _build_graphql_query(self, media_id):
         raise NotImplementedError('Method must be implemented to support GraphQL')
@@ -293,7 +454,7 @@ class TwitterCardIE(InfoExtractor):
 
 class TwitterIE(TwitterBaseIE):
     IE_NAME = 'twitter'
-    _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)'
+    _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)(?:/(?:video|photo)/(?P<index>\d+))?'
 
     _TESTS = [{
         'url': 'https://twitter.com/freethenipple/status/643211948184596480',
@@ -313,6 +474,7 @@ class TwitterIE(TwitterBaseIE):
             'comment_count': int,
             'repost_count': int,
             'like_count': int,
+            'view_count': int,
             'tags': [],
             'age_limit': 18,
         },
@@ -336,7 +498,7 @@ class TwitterIE(TwitterBaseIE):
             'id': '665052190608723968',
             'display_id': '665052190608723968',
             'ext': 'mp4',
-            'title': 'md5:55fef1d5b811944f1550e91b44abb82e',
+            'title': r're:Star Wars.*A new beginning is coming December 18.*',
             'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ',
             'uploader_id': 'starwars',
             'uploader': r're:Star Wars.*',
@@ -391,6 +553,7 @@ class TwitterIE(TwitterBaseIE):
             'comment_count': int,
             'repost_count': int,
             'like_count': int,
+            'view_count': int,
             'tags': ['Damndaniel'],
             'age_limit': 0,
         },
@@ -431,6 +594,7 @@ class TwitterIE(TwitterBaseIE):
             'comment_count': int,
             'repost_count': int,
             'like_count': int,
+            'view_count': int,
             'tags': [],
             'age_limit': 0,
         },
@@ -451,7 +615,7 @@ class TwitterIE(TwitterBaseIE):
         # has mp4 formats via mobile API
         'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
         'info_dict': {
-            'id': '852138619213144067',
+            'id': '852077943283097602',
             'ext': 'mp4',
             'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
             'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN',
@@ -460,8 +624,16 @@ class TwitterIE(TwitterBaseIE):
             'duration': 277.4,
             'timestamp': 1492000653,
             'upload_date': '20170412',
+            'display_id': '852138619213144067',
+            'age_limit': 0,
+            'uploader_url': 'https://twitter.com/news_al3alm',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'tags': [],
+            'repost_count': int,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
         },
-        'skip': 'Account suspended',
     }, {
         'url': 'https://twitter.com/i/web/status/910031516746514432',
         'info_dict': {
@@ -480,6 +652,7 @@ class TwitterIE(TwitterBaseIE):
             'comment_count': int,
             'repost_count': int,
             'like_count': int,
+            'view_count': int,
             'tags': ['Maria'],
             'age_limit': 0,
         },
@@ -505,6 +678,7 @@ class TwitterIE(TwitterBaseIE):
             'comment_count': int,
             'repost_count': int,
             'like_count': int,
+            'view_count': int,
             'tags': [],
             'age_limit': 0,
         },
@@ -517,18 +691,19 @@ class TwitterIE(TwitterBaseIE):
             'id': '1087791272830607360',
             'display_id': '1087791357756956680',
             'ext': 'mp4',
-            'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!',
+            'title': 'X - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!',
             'thumbnail': r're:^https?://.*\.jpg',
             'description': 'md5:6dfd341a3310fb97d80d2bf7145df976',
-            'uploader': 'Twitter',
-            'uploader_id': 'Twitter',
+            'uploader': 'X',
+            'uploader_id': 'X',
             'duration': 61.567,
             'timestamp': 1548184644,
             'upload_date': '20190122',
-            'uploader_url': 'https://twitter.com/Twitter',
+            'uploader_url': 'https://twitter.com/X',
             'comment_count': int,
             'repost_count': int,
             'like_count': int,
+            'view_count': int,
             'tags': [],
             'age_limit': 0,
         },
@@ -589,6 +764,7 @@ class TwitterIE(TwitterBaseIE):
             'comment_count': int,
             'repost_count': int,
             'like_count': int,
+            'view_count': int,
             'tags': [],
             'age_limit': 0,
         },
@@ -597,9 +773,9 @@ class TwitterIE(TwitterBaseIE):
         'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
         'info_dict': {
             'id': '1577719286659006464',
-            'title': 'Ultima | #\u0432\u029f\u043c - Test',
+            'title': 'Ultima📛 | #вʟм - Test',
             'description': 'Test https://t.co/Y3KEZD7Dad',
-            'uploader': 'Ultima | #\u0432\u029f\u043c',
+            'uploader': 'Ultima📛 | #вʟм',
             'uploader_id': 'UltimaShadowX',
             'uploader_url': 'https://twitter.com/UltimaShadowX',
             'upload_date': '20221005',
@@ -630,12 +806,12 @@ class TwitterIE(TwitterBaseIE):
             'comment_count': int,
             'repost_count': int,
             'like_count': int,
+            'view_count': int,
             'tags': ['HurricaneIan'],
             'age_limit': 0,
         },
     }, {
-        # Adult content, uses old token
-        # Fails if not logged in (GraphQL)
+        # Adult content, fails if not logged in (GraphQL)
         'url': 'https://twitter.com/Rizdraws/status/1575199173472927762',
         'info_dict': {
             'id': '1575199163847000068',
@@ -648,16 +824,16 @@ class TwitterIE(TwitterBaseIE):
             'uploader_url': 'https://twitter.com/Rizdraws',
             'upload_date': '20220928',
             'timestamp': 1664391723,
-            'thumbnail': 're:^https?://.*\\.jpg',
+            'thumbnail': r're:^https?://.+\.jpg',
             'like_count': int,
             'repost_count': int,
             'comment_count': int,
             'age_limit': 18,
             'tags': []
         },
-        'expected_warnings': ['404'],
+        'skip': 'Requires authentication',
     }, {
-        # Description is missing one https://t.co url (GraphQL)
+        # Playlist result only with auth
         'url': 'https://twitter.com/Srirachachau/status/1395079556562706435',
         'playlist_mincount': 2,
         'info_dict': {
@@ -669,14 +845,13 @@ class TwitterIE(TwitterBaseIE):
             'upload_date': '20210519',
             'age_limit': 0,
             'repost_count': int,
-            'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw https://t.co/kbXZrozlY7',
+            'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw',
             'uploader_id': 'Srirachachau',
             'comment_count': int,
             'uploader_url': 'https://twitter.com/Srirachachau',
             'timestamp': 1621447860,
         },
     }, {
-        # Description is missing one https://t.co url (GraphQL)
         'url': 'https://twitter.com/DavidToons_/status/1578353380363501568',
         'playlist_mincount': 2,
         'info_dict': {
@@ -688,7 +863,7 @@ class TwitterIE(TwitterBaseIE):
             'uploader': str,
             'timestamp': 1665143744,
             'uploader_url': 'https://twitter.com/DavidToons_',
-            'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/glfQdgfFXH https://t.co/WgJauwIW1w',
+            'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/WgJauwIW1w',
             'tags': [],
             'comment_count': int,
             'upload_date': '20221007',
@@ -722,11 +897,174 @@ class TwitterIE(TwitterBaseIE):
             'uploader': r're:Monique Camarra.+?',
             'uploader_id': 'MoniqueCamarra',
             'live_status': 'was_live',
-            'description': 'md5:acce559345fd49f129c20dbcda3f1201',
-            'timestamp': 1658407771464,
+            'release_timestamp': 1658417414,
+            'description': 'md5:4dc8e972f1d8b3c6580376fabb02a3ad',
+            'timestamp': 1658407771,
+            'release_date': '20220721',
+            'upload_date': '20220721',
         },
         'add_ie': ['TwitterSpaces'],
         'params': {'skip_download': 'm3u8'},
+        'skip': 'Requires authentication',
+    }, {
+        # URL specifies video number but --yes-playlist
+        'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1',
+        'playlist_mincount': 2,
+        'info_dict': {
+            'id': '1600649710662213632',
+            'title': 'md5:be05989b0722e114103ed3851a0ffae2',
+            'timestamp': 1670459604.0,
+            'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
+            'comment_count': int,
+            'uploader_id': 'CTVJLaidlaw',
+            'repost_count': int,
+            'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'],
+            'upload_date': '20221208',
+            'age_limit': 0,
+            'uploader': 'Jocelyn Laidlaw',
+            'uploader_url': 'https://twitter.com/CTVJLaidlaw',
+            'like_count': int,
+        },
+    }, {
+        # URL specifies video number and --no-playlist
+        'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/2',
+        'info_dict': {
+            'id': '1600649511827013632',
+            'ext': 'mp4',
+            'title': 'md5:7662a0a27ce6faa3e5b160340f3cfab1',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'timestamp': 1670459604.0,
+            'uploader_id': 'CTVJLaidlaw',
+            'uploader': 'Jocelyn Laidlaw',
+            'repost_count': int,
+            'comment_count': int,
+            'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'],
+            'duration': 102.226,
+            'uploader_url': 'https://twitter.com/CTVJLaidlaw',
+            'display_id': '1600649710662213632',
+            'like_count': int,
+            'view_count': int,
+            'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
+            'upload_date': '20221208',
+            'age_limit': 0,
+        },
+        'params': {'noplaylist': True},
+    }, {
+        # id pointing to TweetWithVisibilityResults type entity which wraps the actual Tweet over
+        # note the id different between extraction and url
+        'url': 'https://twitter.com/s2FAKER/status/1621117700482416640',
+        'info_dict': {
+            'id': '1621117577354424321',
+            'display_id': '1621117700482416640',
+            'ext': 'mp4',
+            'title': '뽀 - 아 최우제 이동속도 봐',
+            'description': '아 최우제 이동속도 봐 https://t.co/dxu2U5vXXB',
+            'duration': 24.598,
+            'uploader': '뽀',
+            'uploader_id': 's2FAKER',
+            'uploader_url': 'https://twitter.com/s2FAKER',
+            'upload_date': '20230202',
+            'timestamp': 1675339553.0,
+            'thumbnail': r're:https?://pbs\.twimg\.com/.+',
+            'age_limit': 18,
+            'tags': [],
+            'like_count': int,
+            'repost_count': int,
+            'comment_count': int,
+            'view_count': int,
+        },
+    }, {
+        'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2',
+        'info_dict': {
+            'id': '1599108643743473680',
+            'display_id': '1599108751385972737',
+            'ext': 'mp4',
+            'title': '\u06ea - \U0001F48B',
+            'uploader_url': 'https://twitter.com/hlo_again',
+            'like_count': int,
+            'uploader_id': 'hlo_again',
+            'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1599108643743473680/pu/img/UG3xjov4rgg5sbYM.jpg?name=orig',
+            'repost_count': int,
+            'duration': 9.531,
+            'comment_count': int,
+            'view_count': int,
+            'upload_date': '20221203',
+            'age_limit': 0,
+            'timestamp': 1670092210.0,
+            'tags': [],
+            'uploader': '\u06ea',
+            'description': '\U0001F48B https://t.co/bTj9Qz7vQP',
+        },
+        'params': {'noplaylist': True},
+    }, {
+        'url': 'https://twitter.com/MunTheShinobi/status/1600009574919962625',
+        'info_dict': {
+            'id': '1600009362759733248',
+            'display_id': '1600009574919962625',
+            'ext': 'mp4',
+            'uploader_url': 'https://twitter.com/MunTheShinobi',
+            'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml',
+            'view_count': int,
+            'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
+            'age_limit': 0,
+            'uploader': 'Mün The Friend Of YWAP',
+            'repost_count': int,
+            'upload_date': '20221206',
\U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', + 'comment_count': int, + 'like_count': int, + 'tags': [], + 'uploader_id': 'MunTheShinobi', + 'duration': 139.987, + 'timestamp': 1670306984.0, + }, + }, { + # url to retweet id w/ legacy api + 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', + 'info_dict': { + 'id': '1623274794488659969', + 'display_id': '1623739803874349067', + 'ext': 'mp4', + 'title': 'Johnny Bullets - Me after going viral to over 30million people: Whoopsie-daisy', + 'description': 'md5:b06864cd3dc2554821cc327f5348485a', + 'uploader': 'Johnny Bullets', + 'uploader_id': 'Johnnybull3ts', + 'uploader_url': 'https://twitter.com/Johnnybull3ts', + 'age_limit': 0, + 'tags': [], + 'duration': 8.033, + 'timestamp': 1675853859.0, + 'upload_date': '20230208', + 'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+', + 'like_count': int, + 'repost_count': int, + }, + 'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}}, + 'skip': 'Protected tweet', + }, { + # orig tweet w/ graphql + 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', + 'info_dict': { + 'id': '1623274794488659969', + 'display_id': '1623739803874349067', + 'ext': 'mp4', + 'title': '@selfisekai@hackerspace.pl 🐀 - RT @Johnnybull3ts: Me after going viral to over 30million people: Whoopsie-daisy', + 'description': 'md5:9258bdbb54793bdc124fe1cd47e96c6a', + 'uploader': '@selfisekai@hackerspace.pl 🐀', + 'uploader_id': 'liberdalau', + 'uploader_url': 'https://twitter.com/liberdalau', + 'age_limit': 0, + 'tags': [], + 'duration': 8.033, + 'timestamp': 1675964711.0, + 'upload_date': '20230209', + 'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+', + 'like_count': int, + 'view_count': int, + 'repost_count': int, + 'comment_count': int, + }, + 'skip': 'Protected tweet', }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -769,12 +1107,23 @@ class TwitterIE(TwitterBaseIE): result = traverse_obj(data, ( 'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries', lambda _, v: v['entryId'] == f'tweet-{twid}', 'content', 'itemContent', - 'tweet_results', 'result' - ), expected_type=dict, default={}, get_all=False) + 'tweet_results', 'result', ('tweet', None), {dict}, + ), default={}, get_all=False) if self.is_logged_in else traverse_obj( + data, ('tweetResult', 'result', {dict}), default={}) + + if result.get('__typename') not in ('Tweet', 'TweetTombstone', 'TweetUnavailable', None): + self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True) if 'tombstone' in result: - cause = traverse_obj(result, ('tombstone', 'text', 'text'), expected_type=str) + cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. 
Learn more') raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) + elif result.get('__typename') == 'TweetUnavailable': + reason = result.get('reason') + if reason == 'NsfwLoggedOut': + self.raise_login_required('NSFW tweet requires authentication') + elif reason == 'Protected': + self.raise_login_required('You are not authorized to view this protected tweet') + raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True) status = result.get('legacy', {}) status.update(traverse_obj(result, { @@ -786,7 +1135,7 @@ class TwitterIE(TwitterBaseIE): # extra transformation is needed since result does not match legacy format binding_values = { binding_value.get('key'): binding_value.get('value') - for binding_value in traverse_obj(status, ('card', 'binding_values', ...), expected_type=dict) + for binding_value in traverse_obj(status, ('card', 'binding_values', ..., {dict})) } if binding_values: status['card']['binding_values'] = binding_values @@ -825,25 +1174,74 @@ class TwitterIE(TwitterBaseIE): 'verified_phone_label_enabled': False, 'vibe_api_enabled': True, }, + } if self.is_logged_in else { + 'variables': { + 'tweetId': media_id, + 'withCommunity': False, + 'includePromotedContent': False, + 'withVoice': False, + }, + 'features': { + 'creator_subscriptions_tweet_preview_api_enabled': True, + 'tweetypie_unmention_optimization_enabled': True, + 'responsive_web_edit_tweet_api_enabled': True, + 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True, + 'view_counts_everywhere_api_enabled': True, + 'longform_notetweets_consumption_enabled': True, + 'responsive_web_twitter_article_tweet_consumption_enabled': False, + 'tweet_awards_web_tipping_enabled': False, + 'freedom_of_speech_not_reach_fetch_enabled': True, + 'standardized_nudges_misinfo': True, + 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': True, + 'longform_notetweets_rich_text_read_enabled': True, + 'longform_notetweets_inline_media_enabled': True, + 'responsive_web_graphql_exclude_directive_enabled': True, + 'verified_phone_label_enabled': False, + 'responsive_web_media_download_video_enabled': False, + 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, + 'responsive_web_graphql_timeline_navigation_enabled': True, + 'responsive_web_enhance_cards_enabled': False + }, + 'fieldToggles': { + 'withArticleRichContentState': False + } } - def _real_extract(self, url): - twid = self._match_id(url) - if self.is_logged_in or self._configuration_arg('force_graphql'): - self.write_debug(f'Using GraphQL API (Auth = {self.is_logged_in})') - result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid) - status = self._graphql_to_legacy(result, twid) - - else: - status = self._call_api(f'statuses/show/{twid}.json', twid, { + def _extract_status(self, twid): + if self.is_logged_in: + return self._graphql_to_legacy( + self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) + + try: + if not self._configuration_arg('legacy_api'): + return self._graphql_to_legacy( + self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid) + return traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { 'cards_platform': 'Web-12', 'include_cards': 1, 'include_reply_count': 1, 'include_user_entities': 0, 'tweet_mode': 'extended', - }) + }), 'retweeted_status', None) + + except ExtractorError as e: + if e.expected: + raise + self.report_warning( + f'{e.orig_msg}. 
Falling back to syndication endpoint; some metadata may be missing', twid) + + status = self._download_json( + 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', + headers={'User-Agent': 'Googlebot'}, query={'id': twid}) + status['extended_entities'] = {'media': status.get('mediaDetails')} + return status + + def _real_extract(self, url): + twid, selected_index = self._match_valid_url(url).group('id', 'index') + status = self._extract_status(twid) - title = description = status['full_text'].replace('\n', ' ') + title = description = traverse_obj( + status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or '' # strip 'https -_t.co_BJYgOjSeGA' junk from filenames title = re.sub(r'\s+(https?://[^ ]+)', '', title) user = status.get('user') or {} @@ -852,13 +1250,6 @@ class TwitterIE(TwitterBaseIE): title = f'{uploader} - {title}' uploader_id = user.get('screen_name') - tags = [] - for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []): - hashtag_text = hashtag.get('text') - if not hashtag_text: - continue - tags.append(hashtag_text) - info = { 'id': twid, 'title': title, @@ -871,17 +1262,19 @@ class TwitterIE(TwitterBaseIE): 'repost_count': int_or_none(status.get('retweet_count')), 'comment_count': int_or_none(status.get('reply_count')), 'age_limit': 18 if status.get('possibly_sensitive') else 0, - 'tags': tags, + 'tags': traverse_obj(status, ('entities', 'hashtags', ..., 'text')), } def extract_from_video_info(media): - media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) + media_id = traverse_obj(media, 'id_str', 'id', ( + 'video_info', 'variants', ..., 'url', + {functools.partial(re.search, r'_video/(\d+)/')}, 1 + ), get_all=False, expected_type=str_or_none) or twid self.write_debug(f'Extracting from video info: {media_id}') - video_info = media.get('video_info') or {} formats = [] subtitles = {} - for variant in video_info.get('variants', []): + for variant in traverse_obj(media, ('video_info', 'variants', ...)): fmts, subs = self._extract_variant_formats(variant, twid) subtitles = self._merge_subtitles(subtitles, subs) formats.extend(fmts) @@ -905,7 +1298,8 @@ class TwitterIE(TwitterBaseIE): 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, - 'duration': float_or_none(video_info.get('duration_millis'), 1000), + 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), + 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), # The codec of http formats are unknown '_format_sort_fields': ('res', 'br', 'size', 'proto'), } @@ -984,15 +1378,37 @@ class TwitterIE(TwitterBaseIE): 'content_duration_seconds')), } - media_path = ((None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo') - videos = map(extract_from_video_info, traverse_obj(status, media_path, expected_type=dict)) - cards = extract_from_card_info(status.get('card')) - entries = [{**info, **data, 'display_id': twid} for data in (*videos, *cards)] + videos = traverse_obj(status, ( + (None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo', {dict})) + if self._yes_playlist(twid, selected_index, video_label='URL-specified video number'): + selected_entries = (*map(extract_from_video_info, videos), *extract_from_card_info(status.get('card'))) + else: + desired_obj = traverse_obj(status, ( + (None, 'quoted_status'), 'extended_entities', 'media', int(selected_index) - 1, {dict}), get_all=False) 
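+            # `selected_index` is the 1-based /video/<N> suffix captured from the URL,
+            # hence the `int(selected_index) - 1` list index above. Illustrative
+            # traverse_obj indexing on hypothetical data:
+            #   traverse_obj({'media': ['first', 'second']}, ('media', 2 - 1)) == 'second'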
+ if not desired_obj: + raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True) + elif desired_obj.get('type') != 'video': + raise ExtractorError(f'Media #{selected_index} is not a video', expected=True) + + # Restore original archive id and video index in title + for index, entry in enumerate(videos, 1): + if entry.get('id') != desired_obj.get('id'): + continue + if index == 1: + info['_old_archive_ids'] = [make_archive_id(self, twid)] + if len(videos) != 1: + info['title'] += f' #{index}' + break + + return {**info, **extract_from_video_info(desired_obj), 'display_id': twid} + + entries = [{**info, **data, 'display_id': twid} for data in selected_entries] if not entries: expanded_url = traverse_obj(status, ('entities', 'urls', 0, 'expanded_url'), expected_type=url_or_none) if not expanded_url or expanded_url == url: - raise ExtractorError('No video could be found in this tweet', expected=True) + self.raise_no_formats('No video could be found in this tweet', expected=True) + return info return self.url_result(expanded_url, display_id=twid, **info) @@ -1116,7 +1532,42 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader': r're:Lucio Di Gaetano.*?', 'uploader_id': 'luciodigaetano', 'live_status': 'was_live', - 'timestamp': 1659877956397, + 'timestamp': 1659877956, + 'upload_date': '20220807', + 'release_timestamp': 1659904215, + 'release_date': '20220807', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # post_live/TimedOut but downloadable + 'url': 'https://twitter.com/i/spaces/1vAxRAVQWONJl', + 'info_dict': { + 'id': '1vAxRAVQWONJl', + 'ext': 'm4a', + 'title': 'Framing Up FinOps: Billing Tools', + 'description': 'Twitter Space participated by rupa, Alfonso Hernandez', + 'uploader': 'Google Cloud', + 'uploader_id': 'googlecloud', + 'live_status': 'post_live', + 'timestamp': 1681409554, + 'upload_date': '20230413', + 'release_timestamp': 1681839000, + 'release_date': '20230418', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + # Needs ffmpeg as downloader, see: https://github.com/hypervideo/hypervideo/issues/7536 + 'url': 'https://twitter.com/i/spaces/1eaKbrQbjoRKX', + 'info_dict': { + 'id': '1eaKbrQbjoRKX', + 'ext': 'm4a', + 'title': 'あ', + 'description': 'Twitter Space participated by nobody yet', + 'uploader': '息根とめる🔪Twitchで復活', + 'uploader_id': 'tomeru_ikinone', + 'live_status': 'was_live', + 'timestamp': 1685617198, + 'upload_date': '20230601', }, 'params': {'skip_download': 'm3u8'}, }] @@ -1156,32 +1607,39 @@ class TwitterSpacesIE(TwitterBaseIE): def _real_extract(self, url): space_id = self._match_id(url) + if not self.is_logged_in: + self.raise_login_required('Twitter Spaces require authentication') space_data = self._call_graphql_api('HPEisOmj1epUNLCWTYhUWw/AudioSpaceById', space_id)['audioSpace'] if not space_data: raise ExtractorError('Twitter Space not found', expected=True) metadata = space_data['metadata'] live_status = try_call(lambda: self.SPACE_STATUS[metadata['state'].lower()]) + is_live = live_status == 'is_live' formats = [] if live_status == 'is_upcoming': self.raise_no_formats('Twitter Space not started yet', expected=True) - elif live_status == 'post_live': - self.raise_no_formats('Twitter Space ended but not downloadable yet', expected=True) - else: - source = self._call_api( - f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key'])['source'] - - # XXX: Native downloader does not work - formats = self._extract_m3u8_formats( - traverse_obj(source, 'noRedirectPlaybackUrl', 'location'), - metadata['media_key'], 'm4a', 
'm3u8', live=live_status == 'is_live', - headers={'Referer': 'https://twitter.com/'}) + elif not is_live and not metadata.get('is_space_available_for_replay'): + self.raise_no_formats('Twitter Space ended and replay is disabled', expected=True) + elif metadata.get('media_key'): + source = traverse_obj( + self._call_api(f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key']), + ('source', ('noRedirectPlaybackUrl', 'location'), {url_or_none}), get_all=False) + formats = self._extract_m3u8_formats( # XXX: Some Spaces need ffmpeg as downloader + source, metadata['media_key'], 'm4a', entry_protocol='m3u8', live=is_live, + headers={'Referer': 'https://twitter.com/'}, fatal=False) if source else [] for fmt in formats: fmt.update({'vcodec': 'none', 'acodec': 'aac'}) + if not is_live: + fmt['container'] = 'm4a_dash' participants = ', '.join(traverse_obj( space_data, ('participants', 'speakers', ..., 'display_name'))) or 'nobody yet' + + if not formats and live_status == 'post_live': + self.raise_no_formats('Twitter Space ended but not downloadable yet', expected=True) + return { 'id': space_id, 'title': metadata.get('title'), @@ -1191,7 +1649,9 @@ class TwitterSpacesIE(TwitterBaseIE): 'uploader_id': traverse_obj( metadata, ('creator_results', 'result', 'legacy', 'screen_name')), 'live_status': live_status, - 'timestamp': metadata.get('created_at'), + 'release_timestamp': try_call( + lambda: int_or_none(metadata['scheduled_start'], scale=1000)), + 'timestamp': int_or_none(metadata.get('created_at'), scale=1000), 'formats': formats, } @@ -1207,7 +1667,7 @@ class TwitterShortenerIE(TwitterBaseIE): if eid: id = eid url = self._BASE_URL + id - new_url = self._request_webpage(url, id, headers={'User-Agent': 'curl'}).geturl() + new_url = self._request_webpage(url, id, headers={'User-Agent': 'curl'}).url __UNSAFE_LINK = "https://twitter.com/safety/unsafe_link_warning?unsafe_link=" if new_url.startswith(__UNSAFE_LINK): new_url = new_url.replace(__UNSAFE_LINK, "") diff --git a/hypervideo_dl/extractor/txxx.py b/hypervideo_dl/extractor/txxx.py new file mode 100644 index 0000000..fff7a5d --- /dev/null +++ b/hypervideo_dl/extractor/txxx.py @@ -0,0 +1,418 @@ +import base64 +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + merge_dicts, + parse_duration, + traverse_obj, + try_call, + urljoin, + variadic, +) + + +def decode_base64(text): + return base64.b64decode(text.translate(text.maketrans({ + '\u0405': 'S', + '\u0406': 'I', + '\u0408': 'J', + '\u0410': 'A', + '\u0412': 'B', + '\u0415': 'E', + '\u041a': 'K', + '\u041c': 'M', + '\u041d': 'H', + '\u041e': 'O', + '\u0420': 'P', + '\u0421': 'C', + '\u0425': 'X', + ',': '/', + '.': '+', + '~': '=', + }))).decode() + + +def get_formats(host, video_file): + return [{ + 'url': urljoin(f'https://{host}', decode_base64(video['video_url'])), + 'format_id': try_call(lambda: variadic(video['format'])[0].lstrip('_')), + 'quality': index, + } for index, video in enumerate(video_file) if video.get('video_url')] + + +class TxxxIE(InfoExtractor): + _DOMAINS = ( + 'hclips.com', + 'hdzog.com', + 'hdzog.tube', + 'hotmovs.com', + 'hotmovs.tube', + 'inporn.com', + 'privatehomeclips.com', + 'tubepornclassic.com', + 'txxx.com', + 'txxx.tube', + 'upornia.com', + 'upornia.tube', + 'vjav.com', + 'vjav.tube', + 'vxxx.com', + 'voyeurhit.com', + 'voyeurhit.tube', + ) + _VALID_URL = rf'''(?x) + https?://(?:www\.)?(?P<host>{"|".join(map(re.escape, _DOMAINS))})/ + 
(?:videos?[/-]|embed/)(?P<id>\d+)(?:/(?P<display_id>[^/?#]+))? + ''' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:{"|".join(map(re.escape, _DOMAINS))})/embed/[^"\']*)\1'] + _TESTS = [{ + 'url': 'https://txxx.com/videos/16574965/digital-desire-malena-morgan/', + 'md5': 'c54e4ace54320aaf8e2a72df87859391', + 'info_dict': { + 'id': '16574965', + 'display_id': 'digital-desire-malena-morgan', + 'ext': 'mp4', + 'title': 'Digital Desire - Malena Morgan', + 'uploader': 'Lois Argentum', + 'duration': 694, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://txxx.tube/videos/16574965/digital-desire-malena-morgan/', + 'md5': 'c54e4ace54320aaf8e2a72df87859391', + 'info_dict': { + 'id': '16574965', + 'display_id': 'digital-desire-malena-morgan', + 'ext': 'mp4', + 'title': 'Digital Desire - Malena Morgan', + 'uploader': 'Lois Argentum', + 'duration': 694, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://vxxx.com/video-68925/', + 'md5': '1fcff3748b0c5b41fe41d0afa22409e1', + 'info_dict': { + 'id': '68925', + 'display_id': '68925', + 'ext': 'mp4', + 'title': 'Malena Morgan', + 'uploader': 'Huge Hughes', + 'duration': 694, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://hclips.com/videos/6291073/malena-morgan-masturbates-her-sweet/', + 'md5': 'a5dd4f83363972ee043313cff85e7e26', + 'info_dict': { + 'id': '6291073', + 'display_id': 'malena-morgan-masturbates-her-sweet', + 'ext': 'mp4', + 'title': 'Malena Morgan masturbates her sweet', + 'uploader': 'John Salt', + 'duration': 426, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://hdzog.com/videos/67063/gorgeous-malena-morgan-will-seduce-you-at-the-first-glance/', + 'md5': 'f8bdedafd45d1ec2875c43fe33a846d3', + 'info_dict': { + 'id': '67063', + 'display_id': 'gorgeous-malena-morgan-will-seduce-you-at-the-first-glance', + 'ext': 'mp4', + 'title': 'Gorgeous Malena Morgan will seduce you at the first glance', + 'uploader': 'momlesson', + 'duration': 601, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://hdzog.tube/videos/67063/gorgeous-malena-morgan-will-seduce-you-at-the-first-glance/', + 'md5': 'f8bdedafd45d1ec2875c43fe33a846d3', + 'info_dict': { + 'id': '67063', + 'display_id': 'gorgeous-malena-morgan-will-seduce-you-at-the-first-glance', + 'ext': 'mp4', + 'title': 'Gorgeous Malena Morgan will seduce you at the first glance', + 'uploader': 'momlesson', + 'duration': 601, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://hotmovs.com/videos/8789287/unbelievable-malena-morgan-performing-in-incredible-masturantion/', + 'md5': '71d32c51584876472db87e561171a386', + 'info_dict': { + 'id': '8789287', + 'display_id': 'unbelievable-malena-morgan-performing-in-incredible-masturantion', + 'ext': 'mp4', + 'title': 'Unbelievable Malena Morgan performing in incredible masturantion', + 'uploader': 'Davit Sanchez', + 'duration': 940, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://hotmovs.tube/videos/8789287/unbelievable-malena-morgan-performing-in-incredible-masturantion/', + 'md5': '71d32c51584876472db87e561171a386', + 'info_dict': { + 'id': '8789287', + 'display_id': 
'unbelievable-malena-morgan-performing-in-incredible-masturantion', + 'ext': 'mp4', + 'title': 'Unbelievable Malena Morgan performing in incredible masturantion', + 'uploader': 'Davit Sanchez', + 'duration': 940, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://inporn.com/video/517897/malena-morgan-solo/', + 'md5': '344db467481edf78f193cdf5820a7cfb', + 'info_dict': { + 'id': '517897', + 'display_id': 'malena-morgan-solo', + 'ext': 'mp4', + 'title': 'Malena Morgan - Solo', + 'uploader': 'Ashley Oxy', + 'duration': 480, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://privatehomeclips.com/videos/3630599/malena-morgan-cam-show/', + 'md5': 'ea657273e352493c5fb6357fbfa4f126', + 'info_dict': { + 'id': '3630599', + 'display_id': 'malena-morgan-cam-show', + 'ext': 'mp4', + 'title': 'malena morgan cam show', + 'uploader': 'Member9915', + 'duration': 290, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://tubepornclassic.com/videos/1015455/mimi-rogers-full-body-massage-nude-compilation/', + 'md5': '2e9a6cf610c9862e86e0ce24f08f4427', + 'info_dict': { + 'id': '1015455', + 'display_id': 'mimi-rogers-full-body-massage-nude-compilation', + 'ext': 'mp4', + 'title': 'Mimi Rogers - Full Body Massage (Nude) compilation', + 'uploader': '88bhuto', + 'duration': 286, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://upornia.com/videos/1498858/twistys-malena-morgan-starring-at-dr-morgan-baller/', + 'md5': '7ff7033340bc88a173198b7c22600e4f', + 'info_dict': { + 'id': '1498858', + 'display_id': 'twistys-malena-morgan-starring-at-dr-morgan-baller', + 'ext': 'mp4', + 'title': 'Twistys - Malena Morgan starring at Dr. Morgan-Baller', + 'uploader': 'mindgeek', + 'duration': 480, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://upornia.tube/videos/1498858/twistys-malena-morgan-starring-at-dr-morgan-baller/', + 'md5': '7ff7033340bc88a173198b7c22600e4f', + 'info_dict': { + 'id': '1498858', + 'display_id': 'twistys-malena-morgan-starring-at-dr-morgan-baller', + 'ext': 'mp4', + 'title': 'Twistys - Malena Morgan starring at Dr. 
Morgan-Baller', + 'uploader': 'mindgeek', + 'duration': 480, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://vjav.com/videos/11761/yui-hatano-in-if-yui-was-my-girlfriend2/', + 'md5': '6de5bc1f13bdfc3491a77f23edb1676f', + 'info_dict': { + 'id': '11761', + 'display_id': 'yui-hatano-in-if-yui-was-my-girlfriend2', + 'ext': 'mp4', + 'title': 'Yui Hatano in If Yui Was My Girlfriend', + 'uploader': 'Matheus69', + 'duration': 3310, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://vjav.tube/videos/11761/yui-hatano-in-if-yui-was-my-girlfriend2/', + 'md5': '6de5bc1f13bdfc3491a77f23edb1676f', + 'info_dict': { + 'id': '11761', + 'display_id': 'yui-hatano-in-if-yui-was-my-girlfriend2', + 'ext': 'mp4', + 'title': 'Yui Hatano in If Yui Was My Girlfriend', + 'uploader': 'Matheus69', + 'duration': 3310, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://voyeurhit.com/videos/332875/charlotte-stokely-elle-alexandra-malena-morgan-lingerie/', + 'md5': '12b4666e9c3e60dafe9182e5d12aae33', + 'info_dict': { + 'id': '332875', + 'display_id': 'charlotte-stokely-elle-alexandra-malena-morgan-lingerie', + 'ext': 'mp4', + 'title': 'Charlotte Stokely, Elle Alexandra, Malena Morgan-Lingerie', + 'uploader': 'Kyle Roberts', + 'duration': 655, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://voyeurhit.tube/videos/332875/charlotte-stokely-elle-alexandra-malena-morgan-lingerie/', + 'md5': '12b4666e9c3e60dafe9182e5d12aae33', + 'info_dict': { + 'id': '332875', + 'display_id': 'charlotte-stokely-elle-alexandra-malena-morgan-lingerie', + 'ext': 'mp4', + 'title': 'Charlotte Stokely, Elle Alexandra, Malena Morgan-Lingerie', + 'uploader': 'Kyle Roberts', + 'duration': 655, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://pornzog.com/video/9125519/michelle-malone-dreamgirls-wild-wet-3/', + 'info_dict': { + 'id': '5119660', + 'display_id': '5119660', + 'ext': 'mp4', + 'title': 'Michelle Malone - Dreamgirls - Wild Wet 3', + 'uploader': 'FallenAngel12', + 'duration': 402, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + }] + + def _call_api(self, url, video_id, fatal=False, **kwargs): + content = self._download_json(url, video_id, fatal=fatal, **kwargs) + if traverse_obj(content, 'error'): + raise self._error_or_warning(ExtractorError( + f'Txxx said: {content["error"]}', expected=True), fatal=fatal) + return content or {} + + def _real_extract(self, url): + video_id, host, display_id = self._match_valid_url(url).group('id', 'host', 'display_id') + headers = {'Referer': url, 'X-Requested-With': 'XMLHttpRequest'} + + video_file = self._call_api( + f'https://{host}/api/videofile.php?video_id={video_id}&lifetime=8640000', + video_id, fatal=True, note='Downloading video file info', headers=headers) + + slug = f'{int(1E6 * (int(video_id) // 1E6))}/{1000 * (int(video_id) // 1000)}' + video_info = self._call_api( + f'https://{host}/api/json/video/86400/{slug}/{video_id}.json', + video_id, note='Downloading video info', headers=headers) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': traverse_obj(video_info, ('video', 'title')), + 'uploader': traverse_obj(video_info, ('video', 'user', 'username')), + 'duration': 
parse_duration(traverse_obj(video_info, ('video', 'duration'))), + 'view_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'viewed'))), + 'like_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'likes'))), + 'dislike_count': int_or_none(traverse_obj(video_info, ('video', 'statistics', 'dislikes'))), + 'age_limit': 18, + 'formats': get_formats(host, video_file), + } + + +class PornTopIE(InfoExtractor): + _VALID_URL = r'https?://(?P<host>(?:www\.)?porntop\.com)/video/(?P<id>\d+)(?:/(?P<display_id>[^/?]+))?' + _TESTS = [{ + 'url': 'https://porntop.com/video/101569/triple-threat-with-lia-lor-malena-morgan-and-dani-daniels/', + 'md5': '612ba7b3cb99455b382972948e200b08', + 'info_dict': { + 'id': '101569', + 'display_id': 'triple-threat-with-lia-lor-malena-morgan-and-dani-daniels', + 'ext': 'mp4', + 'title': 'Triple Threat With Lia Lor, Malena Morgan And Dani Daniels', + 'description': 'md5:285357d9d3a00ce5acb29f39f826dbf6', + 'uploader': 'PatrickBush', + 'duration': 480, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + 'timestamp': 1609455029, + 'upload_date': '20201231', + 'thumbnail': 'https://tn.porntop.com/media/tn/sources/101569_1.jpg', + } + }] + + def _real_extract(self, url): + video_id, host, display_id = self._match_valid_url(url).group('id', 'host', 'display_id') + webpage = self._download_webpage(url, video_id) + + json_ld = self._json_ld(self._search_json( + r'\bschemaJson\s*=', webpage, 'JSON-LD', video_id, transform_source=js_to_json, + contains_pattern='{[^<]+?VideoObject[^<]+};'), video_id, fatal=True) + + video_file = self._parse_json(decode_base64(self._search_regex( + r"window\.initPlayer\(.*}}},\s*'(?P<json_b64c>[^']+)'", + webpage, 'json_urls', group='json_b64c')), video_id) + + return merge_dicts({ + 'id': video_id, + 'display_id': display_id, + 'age_limit': 18, + 'formats': get_formats(host, video_file), + }, json_ld) diff --git a/hypervideo_dl/extractor/udemy.py b/hypervideo_dl/extractor/udemy.py index 4faad58..117acc7 100644 --- a/hypervideo_dl/extractor/udemy.py +++ b/hypervideo_dl/extractor/udemy.py @@ -1,8 +1,9 @@ import re -import urllib.request from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_str, compat_urlparse +from ..compat import compat_str, compat_urlparse +from ..networking import Request +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, determine_ext, @@ -10,9 +11,10 @@ from ..utils import ( float_or_none, int_or_none, js_to_json, - sanitized_Request, + smuggle_url, try_get, unescapeHTML, + unsmuggle_url, url_or_none, urlencode_postdata, ) @@ -106,7 +108,7 @@ class UdemyIE(InfoExtractor): % (course_id, lecture_id), lecture_id, 'Downloading lecture JSON', query={ 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data,course_is_drmed', }) def _handle_error(self, response): @@ -151,11 +153,10 @@ class UdemyIE(InfoExtractor): headers['X-Udemy-Bearer-Token'] = cookie.value headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value - if isinstance(url_or_request, urllib.request.Request): - for header, value in headers.items(): - url_or_request.add_header(header, value) + if isinstance(url_or_request, Request): + url_or_request.headers.update(headers) else: - url_or_request = sanitized_Request(url_or_request, headers=headers) + 
url_or_request = Request(url_or_request, headers=headers) response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs) self._handle_error(response) @@ -199,16 +200,19 @@ class UdemyIE(InfoExtractor): def _real_extract(self, url): lecture_id = self._match_id(url) + course_id = unsmuggle_url(url, {})[1].get('course_id') - webpage = self._download_webpage(url, lecture_id) - - course_id, _ = self._extract_course_info(webpage, lecture_id) + webpage = None + if not course_id: + webpage = self._download_webpage(url, lecture_id) + course_id, _ = self._extract_course_info(webpage, lecture_id) try: lecture = self._download_lecture(course_id, lecture_id) except ExtractorError as e: # Error could possibly mean we are not enrolled in the course - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + webpage = webpage or self._download_webpage(url, lecture_id) self._enroll_course(url, webpage, course_id) lecture = self._download_lecture(course_id, lecture_id) else: @@ -391,6 +395,9 @@ class UdemyIE(InfoExtractor): if f.get('url'): formats.append(f) + if not formats and asset.get('course_is_drmed'): + self.report_drm(video_id) + return { 'id': video_id, 'title': title, @@ -449,7 +456,9 @@ class UdemyCourseIE(UdemyIE): # XXX: Do not subclass from concrete IE if lecture_id: entry = { '_type': 'url_transparent', - 'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']), + 'url': smuggle_url( + f'https://www.udemy.com/{course_path}/learn/v4/t/lecture/{entry["id"]}', + {'course_id': course_id}), 'title': entry.get('title'), 'ie_key': UdemyIE.ie_key(), } diff --git a/hypervideo_dl/extractor/unsupported.py b/hypervideo_dl/extractor/unsupported.py index 620c025..78c2206 100644 --- a/hypervideo_dl/extractor/unsupported.py +++ b/hypervideo_dl/extractor/unsupported.py @@ -42,6 +42,12 @@ class KnownDRMIE(UnsupportedInfoExtractor): r'vootkids\.com', r'nowtv\.it/watch', r'tv\.apple\.com', + r'primevideo\.com', + r'hulu\.com', + r'resource\.inkryptvideos\.com', + r'joyn\.de', + r'amazon\.(?:\w{2}\.)?\w+/gp/video', + r'music\.amazon\.(?:\w{2}\.)?\w+', ) _TESTS = [{ @@ -111,6 +117,30 @@ class KnownDRMIE(UnsupportedInfoExtractor): # https://github.com/hypervideo/hypervideo/issues/5557 'url': 'https://tv.apple.com/it/show/loot---una-fortuna/umc.cmc.5erbujil1mpazuerhr1udnk45?ctx_brand=tvs.sbd.4000', 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/3072 + 'url': 'https://www.joyn.de/play/serien/clannad/1-1-wo-die-kirschblueten-fallen', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/7323 + 'url': 'https://music.amazon.co.jp/albums/B088Y368TK', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/7323 + 'url': 'https://www.amazon.co.jp/gp/video/detail/B09X5HBYRS/', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/6125 + 'url': 'https://www.primevideo.com/region/eu/detail/0H3DDB4KBJFNDCKKLHNRLRLVKQ/ref=atv_br_def_r_br_c_unkc_1_10', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/5740 + 'url': 'https://resource.inkryptvideos.com/v2-a83ns52/iframe/index.html#video_id=7999ea0f6e03439eb40d056258c2d736&otp=xxx', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/5767 + 'url': 'https://www.hulu.com/movie/anthem-6b25fac9-da2b-45a3-8e09-e4156b0471cc', + 'only_matching': True, }] def _real_extract(self, url): @@ 
-130,6 +160,10 @@ class KnownPiracyIE(UnsupportedInfoExtractor): URLS = ( r'dood\.(?:to|watch|so|pm|wf|re)', + # Sites youtube-dl supports, but we won't + r'viewsb\.com', + r'filemoon\.sx', + r'hentai\.animestigma\.com', ) _TESTS = [{ diff --git a/hypervideo_dl/extractor/uplynk.py b/hypervideo_dl/extractor/uplynk.py index 87c427f..e7d816e 100644 --- a/hypervideo_dl/extractor/uplynk.py +++ b/hypervideo_dl/extractor/uplynk.py @@ -2,40 +2,42 @@ import re from .common import InfoExtractor from ..utils import ( - float_or_none, ExtractorError, + float_or_none, + smuggle_url, + traverse_obj, + unsmuggle_url, + update_url_query, ) -class UplynkIE(InfoExtractor): - IE_NAME = 'uplynk' - _VALID_URL = r'https?://.*?\.uplynk\.com/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P<session_id>[^&]+))?' - _TEST = { - 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', - 'info_dict': { - 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e', - 'ext': 'mp4', - 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4', - 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } +class UplynkBaseIE(InfoExtractor): + _UPLYNK_URL_RE = r'''(?x) + https?://[\w-]+\.uplynk\.com/(?P<path> + ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)| + (?P<id>[0-9a-f]{32}) + )\.(?:m3u8|json) + (?:.*?\bpbs=(?P<session_id>[^&]+))?''' - def _extract_uplynk_info(self, uplynk_content_url): - path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups() + def _extract_uplynk_info(self, url): + uplynk_content_url, smuggled_data = unsmuggle_url(url, {}) + mobj = re.match(self._UPLYNK_URL_RE, uplynk_content_url) + if not mobj: + raise ExtractorError('Necessary parameters not found in Uplynk URL') + path, external_id, video_id, session_id = mobj.group('path', 'external_id', 'id', 'session_id') display_id = video_id or external_id + headers = traverse_obj( + smuggled_data, {'Referer': 'Referer', 'Origin': 'Origin'}, casesense=False) formats, subtitles = self._extract_m3u8_formats_and_subtitles( - 'http://content.uplynk.com/%s.m3u8' % path, - display_id, 'mp4', 'm3u8_native') + f'http://content.uplynk.com/{path}.m3u8', display_id, 'mp4', headers=headers) if session_id: for f in formats: - f['extra_param_to_segment_url'] = 'pbs=' + session_id - asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id) + f['extra_param_to_segment_url'] = f'pbs={session_id}' + asset = self._download_json( + f'http://content.uplynk.com/player/assetinfo/{path}.json', display_id) if asset.get('error') == 1: - raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True) + msg = asset.get('msg') or 'unknown error' + raise ExtractorError(f'{self.IE_NAME} said: {msg}', expected=True) return { 'id': asset['asset'], @@ -47,20 +49,40 @@ class UplynkIE(InfoExtractor): 'subtitles': subtitles, } + +class UplynkIE(UplynkBaseIE): + IE_NAME = 'uplynk' + _VALID_URL = UplynkBaseIE._UPLYNK_URL_RE + _TEST = { + 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', + 'info_dict': { + 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e', + 'ext': 'mp4', + 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4', + 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa', + 'duration': 530.2739166666679, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': 'm3u8', + }, + } + def _real_extract(self, url): return self._extract_uplynk_info(url) 
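The unsmuggle_url/traverse_obj pair in _extract_uplynk_info() above is what lets embedding extractors forward anti-hotlinking headers to the m3u8 request without changing the URL itself. A minimal sketch of the calling side, assuming a hypothetical embedder; the content hash and header values are made up, and only the Referer/Origin keys are actually consumed (case-insensitively):

    from hypervideo_dl.utils import smuggle_url

    # the smuggled dict travels inside the URL fragment and is unpacked again
    # by _extract_uplynk_info() before the m3u8 download
    embed_url = smuggle_url(
        'https://content.uplynk.com/0123456789abcdef0123456789abcdef.m3u8',
        {'Referer': 'https://example.com/player', 'Origin': 'https://example.com'})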
-class UplynkPreplayIE(UplynkIE): # XXX: Do not subclass from concrete IE +class UplynkPreplayIE(UplynkBaseIE): IE_NAME = 'uplynk:preplay' - _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json' + _VALID_URL = r'https?://[\w-]+\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json' def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) path, external_id, video_id = self._match_valid_url(url).groups() display_id = video_id or external_id preplay = self._download_json(url, display_id) - content_url = 'http://content.uplynk.com/%s.m3u8' % path + content_url = f'http://content.uplynk.com/{path}.m3u8' session_id = preplay.get('sid') if session_id: - content_url += '?pbs=' + session_id - return self._extract_uplynk_info(content_url) + content_url = update_url_query(content_url, {'pbs': session_id}) + return self._extract_uplynk_info(smuggle_url(content_url, smuggled_data)) diff --git a/hypervideo_dl/extractor/urplay.py b/hypervideo_dl/extractor/urplay.py index 0f0d659..7f97fc9 100644 --- a/hypervideo_dl/extractor/urplay.py +++ b/hypervideo_dl/extractor/urplay.py @@ -14,12 +14,13 @@ class URPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ur(?:play|skola)\.se/(?:program|Produkter)/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://urplay.se/program/203704-ur-samtiden-livet-universum-och-rymdens-markliga-musik-om-vetenskap-kritiskt-tankande-och-motstand', - 'md5': 'ff5b0c89928f8083c74bbd5099c9292d', + 'md5': '5ba36643c77cc3d34ffeadad89937d1e', 'info_dict': { 'id': '203704', 'ext': 'mp4', 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', + 'thumbnail': r're:^https?://.+\.jpg', 'timestamp': 1513292400, 'upload_date': '20171214', 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', @@ -30,18 +31,41 @@ class URPlayIE(InfoExtractor): 'age_limit': 15, }, }, { + 'url': 'https://urplay.se/program/222967-en-foralders-dagbok-mitt-barn-skadar-sig-sjalv', + 'info_dict': { + 'id': '222967', + 'ext': 'mp4', + 'title': 'En förälders dagbok : Mitt barn skadar sig själv', + 'description': 'md5:9f771eef03a732a213b367b52fe826ca', + 'thumbnail': r're:^https?://.+\.jpg', + 'timestamp': 1629676800, + 'upload_date': '20210823', + 'series': 'En förälders dagbok', + 'duration': 1740, + 'age_limit': 15, + 'episode_number': 3, + 'categories': 'count:2', + 'tags': 'count:7', + 'episode': 'Mitt barn skadar sig själv', + }, + }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', 'info_dict': { 'id': '190031', 'ext': 'mp4', 'title': 'Tripp, Trapp, Träd : Sovkudde', 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', + 'thumbnail': r're:^https?://.+\.jpg', 'timestamp': 1440086400, 'upload_date': '20150820', 'series': 'Tripp, Trapp, Träd', 'duration': 865, + 'age_limit': 1, + 'episode_number': 1, + 'categories': [], 'tags': ['Sova'], 'episode': 'Sovkudde', + 'season': 'Säsong 1', }, }, { 'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden', @@ -69,7 +93,7 @@ class URPlayIE(InfoExtractor): urplayer_streams = urplayer_data.get('streamingInfo', {}) for k, v in urplayer_streams.get('raw', {}).items(): - if not (k in ('sd', 'hd') and isinstance(v, dict)): + if not (k in ('sd', 'hd', 'mp3', 'm4a') and isinstance(v, dict)): continue file_http = v.get('location') if file_http: @@ -88,18 +112,19 @@ 
class URPlayIE(InfoExtractor): lang = ISO639Utils.short2long(lang) return lang or None - for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items(): - if (k in ('sd', 'hd') or not isinstance(v, dict)): - continue - lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) - if not sttl_url: - continue - lang = parse_lang_code(lang) - if not lang: - continue - sttl = subtitles.get(lang) or [] - sttl.append({'ext': k, 'url': sttl_url, }) - subtitles[lang] = sttl + for stream in urplayer_data['streamingInfo'].values(): + for k, v in stream.items(): + if (k in ('sd', 'hd') or not isinstance(v, dict)): + continue + lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) + if not sttl_url: + continue + lang = parse_lang_code(lang) + if not lang: + continue + sttl = subtitles.get(lang) or [] + sttl.append({'ext': k, 'url': sttl_url, }) + subtitles[lang] = sttl image = urplayer_data.get('image') or {} thumbnails = [] diff --git a/hypervideo_dl/extractor/vevo.py b/hypervideo_dl/extractor/vevo.py index da4ce49..aa40227 100644 --- a/hypervideo_dl/extractor/vevo.py +++ b/hypervideo_dl/extractor/vevo.py @@ -2,10 +2,8 @@ import re import json from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_HTTPError, -) +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -184,8 +182,8 @@ class VevoIE(VevoBaseIE): try: data = self._download_json(self._api_url_template % path, *args, **kwargs) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): - errors = self._parse_json(e.cause.read().decode(), None)['errors'] + if isinstance(e.cause, HTTPError): + errors = self._parse_json(e.cause.response.read().decode(), None)['errors'] error_message = ', '.join([error['message'] for error in errors]) raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) raise diff --git a/hypervideo_dl/extractor/vice.py b/hypervideo_dl/extractor/vice.py index d1a3b48..8a71268 100644 --- a/hypervideo_dl/extractor/vice.py +++ b/hypervideo_dl/extractor/vice.py @@ -7,10 +7,8 @@ import time from .adobepass import AdobePassIE from .common import InfoExtractor from .youtube import YoutubeIE -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, ExtractorError, @@ -140,8 +138,8 @@ class ViceIE(ViceBaseIE, AdobePassIE): 'https://vms.vice.com/%s/video/preplay/%s' % (locale, video_id), video_id, query=query) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401): - error = json.loads(e.cause.read().decode()) + if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401): + error = json.loads(e.cause.response.read().decode()) error_message = error.get('error_description') or error['details'] raise ExtractorError('%s said: %s' % ( self.IE_NAME, error_message), expected=True) diff --git a/hypervideo_dl/extractor/videa.py b/hypervideo_dl/extractor/videa.py index 52fa8fc..59ae933 100644 --- a/hypervideo_dl/extractor/videa.py +++ b/hypervideo_dl/extractor/videa.py @@ -119,7 +119,7 @@ class VideaIE(InfoExtractor): result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] query = parse_qs(player_url) - random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) + random_seed = ''.join(random.choices(string.ascii_letters + string.digits, k=8)) query['_s'] = random_seed 
query['_t'] = result[:16] diff --git a/hypervideo_dl/extractor/videocampus_sachsen.py b/hypervideo_dl/extractor/videocampus_sachsen.py index 982ab3d..37bc7d7 100644 --- a/hypervideo_dl/extractor/videocampus_sachsen.py +++ b/hypervideo_dl/extractor/videocampus_sachsen.py @@ -2,7 +2,7 @@ import functools import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ExtractorError, OnDemandPagedList, urlencode_postdata @@ -169,7 +169,7 @@ class VideocampusSachsenIE(InfoExtractor): f'https://{host}/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8', video_id, 'mp4', m3u8_id='hls', fatal=True) except ExtractorError as e: - if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (404, 500): + if not isinstance(e.cause, HTTPError) or e.cause.status not in (404, 500): raise formats.append({'url': f'https://{host}/getMedium/{video_id}.mp4'}) diff --git a/hypervideo_dl/extractor/videoken.py b/hypervideo_dl/extractor/videoken.py new file mode 100644 index 0000000..560b41a --- /dev/null +++ b/hypervideo_dl/extractor/videoken.py @@ -0,0 +1,336 @@ +import base64 +import functools +import math +import re +import time +import urllib.parse + +from .common import InfoExtractor +from .slideslive import SlidesLiveIE +from ..utils import ( + ExtractorError, + InAdvancePagedList, + int_or_none, + traverse_obj, + update_url_query, + url_or_none, +) + + +class VideoKenBaseIE(InfoExtractor): + _ORGANIZATIONS = { + 'videos.icts.res.in': 'icts', + 'videos.cncf.io': 'cncf', + 'videos.neurips.cc': 'neurips', + } + _BASE_URL_RE = rf'https?://(?P<host>{"|".join(map(re.escape, _ORGANIZATIONS))})/' + + _PAGE_SIZE = 12 + + def _get_org_id_and_api_key(self, org, video_id): + details = self._download_json( + f'https://analytics.videoken.com/api/videolake/{org}/details', video_id, + note='Downloading organization ID and API key', headers={ + 'Accept': 'application/json', + }) + return details['id'], details['apikey'] + + def _create_slideslive_url(self, video_url, video_id, referer): + if not video_url and not video_id: + return + elif not video_url or 'embed/sign-in' in video_url: + video_url = f'https://slideslive.com/embed/{video_id.lstrip("slideslive-")}' + if url_or_none(referer): + return update_url_query(video_url, { + 'embed_parent_url': referer, + 'embed_container_origin': f'https://{urllib.parse.urlparse(referer).netloc}', + }) + return video_url + + def _extract_videos(self, videos, url): + for video in traverse_obj(videos, (('videos', 'results'), ...)): + video_id = traverse_obj(video, 'youtube_id', 'videoid') + if not video_id: + continue + ie_key = None + if traverse_obj(video, 'type', 'source') == 'youtube': + video_url = video_id + ie_key = 'Youtube' + else: + video_url = traverse_obj(video, 'embed_url', 'embeddableurl') + if urllib.parse.urlparse(video_url).netloc == 'slideslive.com': + ie_key = SlidesLiveIE + video_url = self._create_slideslive_url(video_url, video_id, url) + if not video_url: + continue + yield self.url_result(video_url, ie_key, video_id) + + +class VideoKenIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:(?:topic|category)/[^/#?]+/)?video/(?P<id>[\w-]+)' + _TESTS = [{ + # neurips -> videoken -> slideslive + 'url': 'https://videos.neurips.cc/video/slideslive-38922815', + 'info_dict': { + 'id': '38922815', + 'ext': 'mp4', + 'title': 'Efficient Processing of Deep Neural Network: from Algorithms to Hardware Architectures', + 'timestamp': 1630939331, + 
'upload_date': '20210906', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:330', + 'chapters': 'count:329', + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to download VideoKen API JSON'], + }, { + # neurips -> videoken -> slideslive -> youtube + 'url': 'https://videos.neurips.cc/topic/machine%20learning/video/slideslive-38923348', + 'info_dict': { + 'id': '2Xa_dt78rJE', + 'ext': 'mp4', + 'display_id': '38923348', + 'title': 'Machine Education', + 'description': 'Watch full version of this video at https://slideslive.com/38923348.', + 'channel': 'SlidesLive Videos - G2', + 'channel_id': 'UCOExahQQ588Da8Nft_Ltb9w', + 'channel_url': 'https://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w', + 'uploader': 'SlidesLive Videos - G2', + 'uploader_id': 'UCOExahQQ588Da8Nft_Ltb9w', + 'uploader_url': 'http://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w', + 'duration': 2504, + 'timestamp': 1618922125, + 'upload_date': '20200131', + 'age_limit': 0, + 'channel_follower_count': int, + 'view_count': int, + 'availability': 'unlisted', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'categories': ['People & Blogs'], + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:78', + 'chapters': 'count:77', + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'expected_warnings': ['Failed to download VideoKen API JSON'], + }, { + # icts -> videoken -> youtube + 'url': 'https://videos.icts.res.in/topic/random%20variable/video/zysIsojYdvc', + 'info_dict': { + 'id': 'zysIsojYdvc', + 'ext': 'mp4', + 'title': 'Small-worlds, complex networks and random graphs (Lecture 3) by Remco van der Hofstad', + 'description': 'md5:87433069d79719eeadc1962cc2ace00b', + 'channel': 'International Centre for Theoretical Sciences', + 'channel_id': 'UCO3xnVTHzB7l-nc8mABUJIQ', + 'channel_url': 'https://www.youtube.com/channel/UCO3xnVTHzB7l-nc8mABUJIQ', + 'uploader': 'International Centre for Theoretical Sciences', + 'uploader_id': 'ICTStalks', + 'uploader_url': 'http://www.youtube.com/user/ICTStalks', + 'duration': 3372, + 'upload_date': '20191004', + 'age_limit': 0, + 'live_status': 'not_live', + 'availability': 'public', + 'playable_in_embed': True, + 'channel_follower_count': int, + 'like_count': int, + 'view_count': int, + 'categories': ['Science & Technology'], + 'tags': [], + 'thumbnail': r're:^https?://.*\.(?:jpg|webp)', + 'thumbnails': 'count:42', + 'chapters': 'count:20', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, { + 'url': 'https://videos.cncf.io/category/478/video/IL4nxbmUIX8', + 'only_matching': True, + }, { + 'url': 'https://videos.cncf.io/topic/kubernetes/video/YAM2d7yTrrI', + 'only_matching': True, + }, { + 'url': 'https://videos.icts.res.in/video/d7HuP_abpKU', + 'only_matching': True, + }] + + def _real_extract(self, url): + hostname, video_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], video_id) + details = self._download_json( + 'https://analytics.videoken.com/api/videoinfo_private', video_id, query={ + 'videoid': video_id, + 'org_id': org_id, + }, headers={'Accept': 'application/json'}, note='Downloading VideoKen API JSON', + errnote='Failed to download VideoKen API JSON', fatal=False) + if details: + return next(self._extract_videos({'videos': [details]}, url)) + # fallback for API error 400 response + elif video_id.startswith('slideslive-'): + return self.url_result( + self._create_slideslive_url(None, video_id, url), 
SlidesLiveIE, video_id) + elif re.match(r'^[\w-]{11}$', video_id): + return self.url_result(video_id, 'Youtube', video_id) + else: + raise ExtractorError('Unable to extract without VideoKen API response') + + +class VideoKenPlayerIE(VideoKenBaseIE): + _VALID_URL = r'https?://player\.videoken\.com/embed/slideslive-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://player.videoken.com/embed/slideslive-38968434', + 'info_dict': { + 'id': '38968434', + 'ext': 'mp4', + 'title': 'Deep Learning with Label Differential Privacy', + 'timestamp': 1643377020, + 'upload_date': '20220128', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:30', + 'chapters': 'count:29', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id) + + +class VideoKenPlaylistIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:category/\d+/)?playlist/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://videos.icts.res.in/category/1822/playlist/381', + 'playlist_mincount': 117, + 'info_dict': { + 'id': '381', + 'title': 'Cosmology - The Next Decade', + }, + }] + + def _real_extract(self, url): + hostname, playlist_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], playlist_id) + videos = self._download_json( + f'https://analytics.videoken.com/api/{org_id}/playlistitems/{playlist_id}/', + playlist_id, headers={'Accept': 'application/json'}, note='Downloading API JSON') + return self.playlist_result(self._extract_videos(videos, url), playlist_id, videos.get('title')) + + +class VideoKenCategoryIE(VideoKenBaseIE): + _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'category/(?P<id>\d+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://videos.icts.res.in/category/1822/', + 'playlist_mincount': 500, + 'info_dict': { + 'id': '1822', + 'title': 'Programs', + }, + }, { + 'url': 'https://videos.neurips.cc/category/350/', + 'playlist_mincount': 34, + 'info_dict': { + 'id': '350', + 'title': 'NeurIPS 2018', + }, + }, { + 'url': 'https://videos.cncf.io/category/479/', + 'playlist_mincount': 328, + 'info_dict': { + 'id': '479', + 'title': 'KubeCon + CloudNativeCon Europe\'19', + }, + }] + + def _get_category_page(self, category_id, org_id, page=1, note=None): + return self._download_json( + f'https://analytics.videoken.com/api/videolake/{org_id}/category_videos', category_id, + fatal=False, note=note if note else f'Downloading category page {page}', + query={ + 'category_id': category_id, + 'page_number': page, + 'length': self._PAGE_SIZE, + }, headers={'Accept': 'application/json'}) or {} + + def _entries(self, category_id, org_id, url, page): + videos = self._get_category_page(category_id, org_id, page + 1) + yield from self._extract_videos(videos, url) + + def _real_extract(self, url): + hostname, category_id = self._match_valid_url(url).group('host', 'id') + org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], category_id) + category_info = self._get_category_page(category_id, org_id, note='Downloading category info') + category = category_info['category_name'] + total_pages = math.ceil(int(category_info['recordsTotal']) / self._PAGE_SIZE) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, category_id, org_id, url), + total_pages, self._PAGE_SIZE), category_id, category) + + +class VideoKenTopicIE(VideoKenBaseIE): + _VALID_URL = 
VideoKenBaseIE._BASE_URL_RE + r'topic/(?P<id>[^/#?]+)/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://videos.neurips.cc/topic/machine%20learning/', + 'playlist_mincount': 500, + 'info_dict': { + 'id': 'machine_learning', + 'title': 'machine learning', + }, + }, { + 'url': 'https://videos.icts.res.in/topic/gravitational%20waves/', + 'playlist_mincount': 77, + 'info_dict': { + 'id': 'gravitational_waves', + 'title': 'gravitational waves' + }, + }, { + 'url': 'https://videos.cncf.io/topic/prometheus/', + 'playlist_mincount': 134, + 'info_dict': { + 'id': 'prometheus', + 'title': 'prometheus', + }, + }] + + def _get_topic_page(self, topic, org_id, search_id, api_key, page=1, note=None): + return self._download_json( + 'https://es.videoken.com/api/v1.0/get_results', topic, fatal=False, query={ + 'orgid': org_id, + 'size': self._PAGE_SIZE, + 'query': topic, + 'page': page, + 'sort': 'upload_desc', + 'filter': 'all', + 'token': api_key, + 'is_topic': 'true', + 'category': '', + 'searchid': search_id, + }, headers={'Accept': 'application/json'}, + note=note if note else f'Downloading topic page {page}') or {} + + def _entries(self, topic, org_id, search_id, api_key, url, page): + videos = self._get_topic_page(topic, org_id, search_id, api_key, page + 1) + yield from self._extract_videos(videos, url) + + def _real_extract(self, url): + hostname, topic_id = self._match_valid_url(url).group('host', 'id') + topic = urllib.parse.unquote(topic_id) + topic_id = topic.replace(' ', '_') + org_id, api_key = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], topic) + search_id = base64.b64encode(f':{topic}:{int(time.time())}:transient'.encode()).decode() + total_pages = int_or_none(self._get_topic_page( + topic, org_id, search_id, api_key, note='Downloading topic info')['total_no_of_pages']) + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, topic, org_id, search_id, api_key, url), + total_pages, self._PAGE_SIZE), topic_id, topic) diff --git a/hypervideo_dl/extractor/vidlii.py b/hypervideo_dl/extractor/vidlii.py index 5933783..44353b7 100644 --- a/hypervideo_dl/extractor/vidlii.py +++ b/hypervideo_dl/extractor/vidlii.py @@ -1,8 +1,8 @@ import re from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( - HEADRequest, format_field, float_or_none, get_element_by_id, @@ -70,6 +70,7 @@ class VidLiiIE(InfoExtractor): r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage) or []] for source in sources: + source = urljoin(url, source) height = int(self._search_regex(r'(\d+).mp4', source, 'height', default=360)) if self._request_webpage(HEADRequest(source), video_id, f'Checking {height}p url', errnote=False): formats.append({ diff --git a/hypervideo_dl/extractor/viewlift.py b/hypervideo_dl/extractor/viewlift.py index 3812601..8f686f0 100644 --- a/hypervideo_dl/extractor/viewlift.py +++ b/hypervideo_dl/extractor/viewlift.py @@ -1,7 +1,7 @@ import json from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -46,8 +46,8 @@ class ViewLiftBaseIE(InfoExtractor): return self._download_json( self._API_BASE + path, video_id, headers={'Authorization': self._TOKENS.get(site)}, query=query) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - webpage = e.cause.read().decode() + if isinstance(e.cause, HTTPError) and e.cause.status == 403: + webpage = e.cause.response.read().decode() 
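+                # the 403 body read above is parsed as JSON just below, where the
+                # errorMessage/message field carries the site's human-readable error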
try: error_message = traverse_obj(json.loads(webpage), 'errorMessage', 'message') except json.JSONDecodeError: diff --git a/hypervideo_dl/extractor/viidea.py b/hypervideo_dl/extractor/viidea.py index 4cdf267..649ffe3 100644 --- a/hypervideo_dl/extractor/viidea.py +++ b/hypervideo_dl/extractor/viidea.py @@ -2,10 +2,10 @@ import re from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_str, compat_urlparse, ) +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, js_to_json, @@ -133,9 +133,9 @@ class ViideaIE(InfoExtractor): '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, HTTPError) and e.cause.status == 403: msg = self._parse_json( - e.cause.read().decode('utf-8'), lecture_id) + e.cause.response.read().decode('utf-8'), lecture_id) raise ExtractorError(msg['detail'], expected=True) raise diff --git a/hypervideo_dl/extractor/vimeo.py b/hypervideo_dl/extractor/vimeo.py index 516b76d..c0c08e8 100644 --- a/hypervideo_dl/extractor/vimeo.py +++ b/hypervideo_dl/extractor/vimeo.py @@ -2,20 +2,16 @@ import base64 import functools import re import itertools -import urllib.error from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urlparse, -) +from ..compat import compat_str, compat_urlparse +from ..networking import HEADRequest, Request +from ..networking.exceptions import HTTPError from ..utils import ( clean_html, determine_ext, ExtractorError, get_element_by_class, - HEADRequest, js_to_json, int_or_none, merge_dicts, @@ -23,7 +19,6 @@ from ..utils import ( parse_filesize, parse_iso8601, parse_qs, - sanitized_Request, smuggle_url, str_or_none, try_get, @@ -72,7 +67,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'Referer': self._LOGIN_URL, }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 418: + if isinstance(e.cause, HTTPError) and e.cause.status == 418: raise ExtractorError( 'Unable to log in: bad username or password', expected=True) @@ -304,27 +299,33 @@ class VimeoIE(VimeoBaseInfoExtractor): # _VALID_URL matches Vimeo URLs _VALID_URL = r'''(?x) - https?:// - (?: - (?: - www| - player - ) - \. - )? - vimeo\.com/ - (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:[^/]+/)*? - (?: - (?: - play_redirect_hls| - moogaloop\.swf)\?clip_id= - )? - (?:videos?/)? - (?P<id>[0-9]+) - (?:/(?P<unlisted_hash>[\da-f]{10}))? - /?(?:[?&].*)?(?:[#].*)?$ - ''' + https?:// + (?: + (?: + www| + player + ) + \. + )? + vimeo\.com/ + (?: + (?P<u>user)| + (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) + (?:.*?/)?? + (?P<q> + (?: + play_redirect_hls| + moogaloop\.swf)\?clip_id= + )? + (?:videos?/)? + ) + (?P<id>[0-9]+) + (?(u) + /(?!videos|likes)[^/?#]+/?| + (?(q)|/(?P<unlisted_hash>[\da-f]{10}))? 
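The recurring change through viewlift, viidea and vimeo is the move from compat_HTTPError (.code, .read()) to networking.exceptions.HTTPError (.status, .response.read()). A self-contained sketch of the new pattern, with stub classes standing in for hypervideo's networking and utils objects:

import io

class Response:  # stub for the response object carried by the new HTTPError
    def __init__(self, body):
        self._buf = io.BytesIO(body)

    def read(self):
        return self._buf.read()

class HTTPError(Exception):  # stub mirroring networking.exceptions.HTTPError
    def __init__(self, status, body):
        self.status = status
        self.response = Response(body)

class ExtractorError(Exception):  # stub mirroring utils.ExtractorError
    def __init__(self, msg, cause=None):
        super().__init__(msg)
        self.cause = cause

try:
    try:
        raise HTTPError(403, b'{"errorMessage": "not available in your region"}')
    except HTTPError as err:
        raise ExtractorError('API request failed', cause=err)
except ExtractorError as e:
    if isinstance(e.cause, HTTPError) and e.cause.status == 403:
        print(e.cause.response.read().decode())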
+ ) + (?:(?(q)[&]|(?(u)|/?)[?]).*?)?(?:[#].*)?$ + ''' IE_NAME = 'vimeo' _EMBED_REGEX = [ # iframe @@ -358,7 +359,7 @@ class VimeoIE(VimeoBaseInfoExtractor): }, { 'url': 'http://player.vimeo.com/video/54469442', - 'md5': 'b3e7f4d2cbb53bd7dc3bb6ff4ed5cfbd', + 'md5': '619b811a4417aa4abe78dc653becf511', 'note': 'Videos that embed the url in the player page', 'info_dict': { 'id': '54469442', @@ -389,8 +390,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f', - 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960', + 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', + 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', 'view_count': int, 'comment_count': int, 'like_count': int, @@ -407,7 +408,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '75629013', 'ext': 'mp4', 'title': 'Key & Peele: Terrorist Interrogation', - 'description': 'md5:8678b246399b070816b12313e8b4eb5c', + 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio', 'uploader_id': 'atencio', 'uploader': 'Peter Atencio', @@ -559,8 +560,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f', - 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960', + 'description': 'md5:6173f270cd0c0119f22817204b3eb86c', + 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_1280', 'view_count': int, 'comment_count': int, 'like_count': int, @@ -705,7 +706,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, - } + }, + { + # user playlist alias -> https://vimeo.com/258705797 + 'url': 'https://vimeo.com/user26785108/newspiritualguide', + 'only_matching': True, + }, # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header ] @@ -798,7 +804,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'X-Requested-With': 'XMLHttpRequest', }) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: raise ExtractorError('Wrong password', expected=True) raise @@ -821,10 +827,10 @@ class VimeoIE(VimeoBaseInfoExtractor): # Retrieve video webpage to extract further information webpage, urlh = self._download_webpage_handle( url, video_id, headers=headers) - redirect_url = urlh.geturl() + redirect_url = urlh.url except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - errmsg = ee.cause.read() + if isinstance(ee.cause, HTTPError) and ee.cause.status == 403: + errmsg = ee.cause.response.read() if b'Because of its privacy settings, this video cannot be played here' in errmsg: raise ExtractorError( 'Cannot download embed-only video without embedding ' @@ -834,8 +840,8 @@ class VimeoIE(VimeoBaseInfoExtractor): raise if '://player.vimeo.com/video/' in url: - config = self._parse_json(self._search_regex( - r'\b(?:playerC|c)onfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) + config = self._search_json( + r'\b(?:playerC|c)onfig\s*=', webpage, 'info 
section', video_id) if config.get('view') == 4: config = self._verify_player_video_password( redirect_url, video_id, headers) @@ -1143,7 +1149,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): 'Authorization': 'jwt ' + authorization, })['data'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: return for video in videos: link = video.get('link') @@ -1185,7 +1191,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): 'X-Requested-With': 'XMLHttpRequest', })['hashed_pass'] except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: raise ExtractorError('Wrong password', expected=True) raise entries = OnDemandPagedList(functools.partial( @@ -1298,10 +1304,10 @@ class VimeoWatchLaterIE(VimeoChannelIE): # XXX: Do not subclass from concrete I def _page_url(self, base_url, pagenum): url = '%s/page:%d/' % (base_url, pagenum) - request = sanitized_Request(url) + request = Request(url) # Set the header to get a partial html page with the ids, # the normal page doesn't contain them. - request.add_header('X-Requested-With', 'XMLHttpRequest') + request.headers['X-Requested-With'] = 'XMLHttpRequest' return request def _real_extract(self, url): @@ -1421,7 +1427,7 @@ class VimeoProIE(VimeoBaseInfoExtractor): **self._hidden_inputs(password_form), }), note='Logging in with video password') except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 418: + if isinstance(e.cause, HTTPError) and e.cause.status == 418: raise ExtractorError('Wrong video password', expected=True) raise diff --git a/hypervideo_dl/extractor/viu.py b/hypervideo_dl/extractor/viu.py index b183c88..f315687 100644 --- a/hypervideo_dl/extractor/viu.py +++ b/hypervideo_dl/extractor/viu.py @@ -9,9 +9,12 @@ from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, + remove_end, strip_or_none, + traverse_obj, try_get, smuggle_url, + unified_timestamp, unsmuggle_url, url_or_none, ) @@ -251,7 +254,7 @@ class ViuOTTIE(InfoExtractor): return self._user_token def _get_token(self, country_code, video_id): - rand = ''.join(random.choice('0123456789') for _ in range(10)) + rand = ''.join(random.choices('0123456789', k=10)) return self._download_json( f'https://api-gateway-global.viu.com/api/auth/token?v={rand}000', video_id, headers={'Content-Type': 'application/json'}, note='Getting bearer token', @@ -394,3 +397,146 @@ class ViuOTTIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class ViuOTTIndonesiaBaseIE(InfoExtractor): + _BASE_QUERY = { + 'ver': 1.0, + 'fmt': 'json', + 'aver': 5.0, + 'appver': 2.0, + 'appid': 'viu_desktop', + 'platform': 'desktop', + } + + _DEVICE_ID = str(uuid.uuid4()) + _SESSION_ID = str(uuid.uuid4()) + _TOKEN = None + + _HEADERS = { + 'x-session-id': _SESSION_ID, + 'x-client': 'browser' + } + + _AGE_RATINGS_MAPPER = { + 'ADULTS': 18, + 'teens': 13 + } + + def _real_initialize(self): + ViuOTTIndonesiaBaseIE._TOKEN = self._download_json( + 'https://um.viuapi.io/user/identity', None, + headers={'Content-type': 'application/json', **self._HEADERS}, + query={**self._BASE_QUERY, 'iid': self._DEVICE_ID}, + data=json.dumps({'deviceId': self._DEVICE_ID}).encode(), + note='Downloading token information')['token'] + + +class ViuOTTIndonesiaIE(ViuOTTIndonesiaBaseIE): + _VALID_URL = r'https?://www\.viu\.com/ott/\w+/\w+/all/video-[\w-]+-(?P<id>\d+)' + _TESTS = 
[{ + 'url': 'https://www.viu.com/ott/id/id/all/video-japanese-drama-tv_shows-detective_conan_episode_793-1165863142?containerId=playlist-26271226', + 'info_dict': { + 'id': '1165863142', + 'ext': 'mp4', + 'episode_number': 793, + 'episode': 'Episode 793', + 'title': 'Detective Conan - Episode 793', + 'duration': 1476, + 'description': 'md5:b79d55345bc1e0217ece22616267c9a5', + 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1165863189/d-1', + 'upload_date': '20210101', + 'timestamp': 1609459200, + } + }, { + 'url': 'https://www.viu.com/ott/id/id/all/video-korean-reality-tv_shows-entertainment_weekly_episode_1622-1118617054', + 'info_dict': { + 'id': '1118617054', + 'ext': 'mp4', + 'episode_number': 1622, + 'episode': 'Episode 1622', + 'description': 'md5:6d68ca450004020113e9bf27ad99f0f8', + 'title': 'Entertainment Weekly - Episode 1622', + 'duration': 4729, + 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1120187848/d-1', + 'timestamp': 1420070400, + 'upload_date': '20150101', + 'cast': ['Shin Hyun-joon', 'Lee Da-Hee'] + } + }, { + # age-limit test + 'url': 'https://www.viu.com/ott/id/id/all/video-japanese-trailer-tv_shows-trailer_jujutsu_kaisen_ver_01-1166044219?containerId=playlist-26273140', + 'info_dict': { + 'id': '1166044219', + 'ext': 'mp4', + 'upload_date': '20200101', + 'timestamp': 1577836800, + 'title': 'Trailer \'Jujutsu Kaisen\' Ver.01', + 'duration': 92, + 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1166044240/d-1', + 'description': 'Trailer \'Jujutsu Kaisen\' Ver.01', + 'cast': ['Junya Enoki', ' Yûichi Nakamura', ' Yuma Uchida', 'Asami Seto'], + 'age_limit': 13, + } + }, { + # json ld metadata type equal to Movie instead of TVEpisodes + 'url': 'https://www.viu.com/ott/id/id/all/video-japanese-animation-movies-demon_slayer_kimetsu_no_yaiba_the_movie_mugen_train-1165892707?containerId=1675060691786', + 'info_dict': { + 'id': '1165892707', + 'ext': 'mp4', + 'timestamp': 1577836800, + 'upload_date': '20200101', + 'title': 'Demon Slayer - Kimetsu no Yaiba - The Movie: Mugen Train', + 'age_limit': 13, + 'cast': 'count:9', + 'thumbnail': 'https://vuclipi-a.akamaihd.net/p/cloudinary/h_171,w_304,dpr_1.5,f_auto,c_thumb,q_auto:low/1165895279/d-1', + 'description': 'md5:1ce9c35a3aeab384085533f746c87469', + 'duration': 7021, + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + video_data = self._download_json( + f'https://um.viuapi.io/drm/v1/content/{display_id}', display_id, data=b'', + headers={'Authorization': ViuOTTIndonesiaBaseIE._TOKEN, **self._HEADERS, 'ccode': 'ID'}) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_data['playUrl'], display_id) + + initial_state = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', + display_id)['content']['clipDetails'] + for key, url in initial_state.items(): + lang, ext = self._search_regex( + r'^subtitle_(?P<lang>[\w-]+)_(?P<ext>\w+)$', key, 'subtitle metadata', + default=(None, None), group=('lang', 'ext')) + if lang and ext: + subtitles.setdefault(lang, []).append({ + 'ext': ext, + 'url': url, + }) + + if ext == 'vtt': + subtitles[lang].append({ + 'ext': 'srt', + 'url': f'{remove_end(initial_state[key], "vtt")}srt', + }) + + episode = traverse_obj(list(filter( + lambda x: x.get('@type') in ('TVEpisode', 'Movie'), self._yield_json_ld(webpage, 
display_id))), 0) or {} + return { + 'id': display_id, + 'title': (traverse_obj(initial_state, 'title', 'display_title') + or episode.get('name')), + 'description': initial_state.get('description') or episode.get('description'), + 'duration': initial_state.get('duration'), + 'thumbnail': traverse_obj(episode, ('image', 'url')), + 'timestamp': unified_timestamp(episode.get('dateCreated')), + 'formats': formats, + 'subtitles': subtitles, + 'episode_number': (traverse_obj(initial_state, 'episode_no', 'episodeno', expected_type=int_or_none) + or int_or_none(episode.get('episodeNumber'))), + 'cast': traverse_obj(episode, ('actor', ..., 'name'), default=None), + 'age_limit': self._AGE_RATINGS_MAPPER.get(initial_state.get('internal_age_rating')) + } diff --git a/hypervideo_dl/extractor/vk.py b/hypervideo_dl/extractor/vk.py index 347aa38..9154228 100644 --- a/hypervideo_dl/extractor/vk.py +++ b/hypervideo_dl/extractor/vk.py @@ -6,22 +6,28 @@ from .common import InfoExtractor from .dailymotion import DailymotionIE from .odnoklassniki import OdnoklassnikiIE from .pladform import PladformIE +from .sibnet import SibnetEmbedIE from .vimeo import VimeoIE from .youtube import YoutubeIE -from ..compat import compat_urlparse from ..utils import ( ExtractorError, + UserNotLive, clean_html, get_element_by_class, + get_element_html_by_id, int_or_none, - orderedSet, + join_nonempty, + parse_resolution, str_or_none, str_to_int, + try_call, unescapeHTML, unified_timestamp, update_url_query, url_or_none, urlencode_postdata, + urljoin, + traverse_obj, ) @@ -30,7 +36,7 @@ class VKBaseIE(InfoExtractor): def _download_webpage_handle(self, url_or_request, video_id, *args, fatal=True, **kwargs): response = super()._download_webpage_handle(url_or_request, video_id, *args, fatal=fatal, **kwargs) - challenge_url, cookie = response[1].geturl() if response else '', None + challenge_url, cookie = response[1].url if response else '', None if challenge_url.startswith('https://vk.com/429.html?'): cookie = self._get_cookies(challenge_url).get('hash429') if not cookie: @@ -101,8 +107,7 @@ class VKIE(VKBaseIE): (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>([\da-f]+)|(ln-[\da-zA-Z]+)))? 
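ViuOTTIndonesiaIE reads window.__INITIAL_STATE__ with _search_json above. The same idea with plain re/json on invented markup, to show what that helper is doing under the hood:

import json
import re

webpage = '<script>window.__INITIAL_STATE__ = {"content": {"clipDetails": {"title": "Episode 793", "duration": 1476}}};</script>'
raw = re.search(r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage).group(1)
clip = json.loads(raw)['content']['clipDetails']
print(clip['title'], clip['duration'])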
) ''' - # https://help.sibnet.ru/?sibnet_video_embed - _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1'] + _TESTS = [ { 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', @@ -117,7 +122,7 @@ class VKIE(VKBaseIE): 'upload_date': '20120212', 'comment_count': int, 'like_count': int, - 'thumbnail': r're:https?://.+\.jpg$', + 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$', }, 'params': {'skip_download': 'm3u8'}, }, @@ -134,7 +139,7 @@ class VKIE(VKBaseIE): 'upload_date': '20130720', 'comment_count': int, 'like_count': int, - 'thumbnail': r're:https?://.+\.jpg$', + 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$', } }, { @@ -149,56 +154,11 @@ class VKIE(VKBaseIE): 'upload_date': '20120212', 'timestamp': 1329049880, 'uploader_id': '39545378', - 'thumbnail': r're:https?://.+\.jpg$', + 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$', }, 'params': {'skip_download': 'm3u8'}, }, { - # VIDEO NOW REMOVED - # please update if you find a video whose URL follows the same pattern - 'url': 'http://vk.com/video-8871596_164049491', - 'md5': 'a590bcaf3d543576c9bd162812387666', - 'note': 'Only available for registered users', - 'info_dict': { - 'id': '-8871596_164049491', - 'ext': 'mp4', - 'uploader': 'Триллеры', - 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', - 'duration': 8352, - 'upload_date': '20121218', - 'view_count': int, - }, - 'skip': 'Removed', - }, - { - 'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d', - 'info_dict': { - 'id': '-43215063_168067957', - 'ext': 'mp4', - 'uploader': 'Bro Mazter', - 'title': ' ', - 'duration': 7291, - 'upload_date': '20140328', - 'uploader_id': '223413403', - 'timestamp': 1396018030, - }, - 'skip': 'Requires vk account credentials', - }, - { - 'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540', - 'md5': '0c45586baa71b7cb1d0784ee3f4e00a6', - 'note': 'ivi.ru embed', - 'info_dict': { - 'id': '-43215063_169084319', - 'ext': 'mp4', - 'title': 'Книга Илая', - 'duration': 6771, - 'upload_date': '20140626', - 'view_count': int, - }, - 'skip': 'Removed', - }, - { 'url': 'https://vk.com/video-93049196_456239755?list=ln-cBjJ7S4jYYx3ADnmDT', 'info_dict': { 'id': '-93049196_456239755', @@ -211,26 +171,11 @@ class VKIE(VKBaseIE): 'timestamp': 1640162189, 'upload_date': '20211222', 'uploader_id': '-93049196', - 'thumbnail': r're:https?://.+\.jpg$', + 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$', }, }, { - # video (removed?) 
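The VK tests above replace exact thumbnail URLs with r're:...' values. In the upstream test helper, a string expectation starting with 're:' is matched as a regular expression rather than compared literally; a minimal re-implementation of that check, assuming hypervideo inherits the same convention:

import re

def expect_field(expected, got):
    if isinstance(expected, str) and expected.startswith('re:'):
        assert re.match(expected[3:], got), f'{got!r} does not match {expected[3:]!r}'
    else:
        assert expected == got

expect_field(r're:https?://.+(?:\.jpg|getVideoPreview.*)$',
             'https://sun9-1.userapi.com/video/getVideoPreview?id=1')  # invented sample URL
print('thumbnail expectation satisfied')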
only available with list id - 'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4', - 'md5': '091287af5402239a1051c37ec7b92913', - 'info_dict': { - 'id': '30481095_171201961', - 'ext': 'mp4', - 'title': 'ТюменцевВВ_09.07.2015', - 'uploader': 'Anton Ivanov', - 'duration': 109, - 'upload_date': '20150709', - 'view_count': int, - }, - 'skip': 'Removed', - }, - { - # youtube embed + 'note': 'youtube embed', 'url': 'https://vk.com/video276849682_170681728', 'info_dict': { 'id': 'V3K4mi0SYkc', @@ -254,23 +199,45 @@ class VKIE(VKBaseIE): 'start_time': 0.0, 'categories': ['Nonprofits & Activism'], 'channel_url': 'https://www.youtube.com/channel/UCgzCNQ11TmR9V97ECnhi3gw', + 'channel_follower_count': int, 'age_limit': 0, }, }, { - # dailymotion embed - 'url': 'https://vk.com/video-37468416_456239855', + 'note': 'dailymotion embed', + 'url': 'https://vk.com/video-95168827_456239103?list=cca524a0f0d5557e16', 'info_dict': { - 'id': 'k3lz2cmXyRuJQSjGHUv', + 'id': 'x8gfli0', 'ext': 'mp4', - 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f', - 'description': 'md5:424b8e88cc873217f520e582ba28bb36', - 'uploader': 'AniLibria.Tv', - 'upload_date': '20160914', - 'uploader_id': 'x1p5vl5', - 'timestamp': 1473877246, + 'title': 'md5:45410f60ccd4b2760da98cb5fc777d70', + 'description': 'md5:2e71c5c9413735cfa06cf1a166f16c84', + 'uploader': 'Movies and cinema.', + 'upload_date': '20221218', + 'uploader_id': 'x1jdavv', + 'timestamp': 1671387617, + 'age_limit': 0, + 'duration': 2918, + 'like_count': int, + 'view_count': int, + 'thumbnail': r're:https?://.+x1080$', + 'tags': list + }, + }, + { + 'url': 'https://vk.com/clips-74006511?z=clip-74006511_456247211', + 'info_dict': { + 'id': '-74006511_456247211', + 'ext': 'mp4', + 'comment_count': int, + 'duration': 9, + 'like_count': int, + 'thumbnail': r're:https?://.+(?:\.jpg|getVideoPreview.*)$', + 'timestamp': 1664995597, + 'title': 'Clip by @madempress', + 'upload_date': '20221005', + 'uploader': 'Шальная императрица', + 'uploader_id': '-74006511', }, - 'skip': 'Removed' }, { # video key is extra_data not url\d+ @@ -288,7 +255,7 @@ class VKIE(VKBaseIE): 'skip': 'Removed', }, { - # finished live stream, postlive_mp4 + 'note': 'finished live stream, postlive_mp4', 'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2', 'info_dict': { 'id': '-387766_456242764', @@ -455,7 +422,7 @@ class VKIE(VKBaseIE): if odnoklassniki_url: return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) - sibnet_url = next(self._extract_embed_urls(url, info_page), None) + sibnet_url = next(SibnetEmbedIE._extract_embed_urls(url, info_page), None) if sibnet_url: return self.url_result(sibnet_url) @@ -552,7 +519,7 @@ class VKUserVideosIE(VKBaseIE): }, { 'url': 'https://vk.com/video/playlist/-174476437_2', 'info_dict': { - 'id': '-174476437_2', + 'id': '-174476437_playlist_2', 'title': 'Анонсы' }, 'playlist_mincount': 108, @@ -595,6 +562,7 @@ class VKUserVideosIE(VKBaseIE): page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id') elif '_' in u_id: page_id, section = u_id.split('_', 1) + section = f'playlist_{section}' else: raise ExtractorError('Invalid URL', expected=True) @@ -614,13 +582,13 @@ class VKWallPostIE(VKBaseIE): 'info_dict': { 'id': '-23538238_35', 'title': 'Black Shadow - Wall post -23538238_35', - 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', + 'description': 'md5:190c78f905a53e0de793d83933c6e67f', }, 'playlist': [{ 'md5': '5ba93864ec5b85f7ce19a9af4af080f6', 'info_dict': { 'id': 
'135220665_111806521', - 'ext': 'mp4', + 'ext': 'm4a', 'title': 'Black Shadow - Слепое Верование', 'duration': 370, 'uploader': 'Black Shadow', @@ -631,7 +599,7 @@ class VKWallPostIE(VKBaseIE): 'md5': '4cc7e804579122b17ea95af7834c9233', 'info_dict': { 'id': '135220665_111802303', - 'ext': 'mp4', + 'ext': 'm4a', 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', 'duration': 423, 'uploader': 'Black Shadow', @@ -642,16 +610,15 @@ class VKWallPostIE(VKBaseIE): 'params': { 'skip_download': True, }, - 'skip': 'Requires vk account credentials', }, { - # single YouTube embed, no leading - - 'url': 'https://vk.com/wall85155021_6319', + # single YouTube embed with irrelevant reaction videos + 'url': 'https://vk.com/wall-32370614_7173954', 'info_dict': { - 'id': '85155021_6319', - 'title': 'Сергей Горбунов - Wall post 85155021_6319', + 'id': '-32370614_7173954', + 'title': 'md5:9f93c405bbc00061d34007d78c75e3bc', + 'description': 'md5:953b811f26fa9f21ee5856e2ea8e68fc', }, 'playlist_count': 1, - 'skip': 'Requires vk account credentials', }, { # wall page URL 'url': 'https://vk.com/wall-23538238_35', @@ -703,39 +670,173 @@ class VKWallPostIE(VKBaseIE): 'w': 'wall' + post_id, })[1] - description = clean_html(get_element_by_class('wall_post_text', webpage)) - uploader = clean_html(get_element_by_class('author', webpage)) + uploader = clean_html(get_element_by_class('PostHeaderTitle__authorName', webpage)) entries = [] for audio in re.findall(r'data-audio="([^"]+)', webpage): audio = self._parse_json(unescapeHTML(audio), post_id) - a = self._AUDIO._make(audio[:16]) - if not a.url: + if not audio['url']: continue - title = unescapeHTML(a.title) - performer = unescapeHTML(a.performer) + title = unescapeHTML(audio.get('title')) + artist = unescapeHTML(audio.get('artist')) entries.append({ - 'id': '%s_%s' % (a.owner_id, a.id), - 'url': self._unmask_url(a.url, a.ads['vk_id']), - 'title': '%s - %s' % (performer, title) if performer else title, - 'thumbnails': [{'url': c_url} for c_url in a.cover_url.split(',')] if a.cover_url else None, - 'duration': int_or_none(a.duration), + 'id': f'{audio["owner_id"]}_{audio["id"]}', + 'title': join_nonempty(artist, title, delim=' - '), + 'thumbnails': try_call(lambda: [{'url': u} for u in audio['coverUrl'].split(',')]), + 'duration': int_or_none(audio.get('duration')), 'uploader': uploader, - 'artist': performer, + 'artist': artist, 'track': title, - 'ext': 'mp4', - 'protocol': 'm3u8_native', + 'formats': [{ + 'url': audio['url'], + 'ext': 'm4a', + 'vcodec': 'none', + 'acodec': 'mp3', + 'container': 'm4a_dash', + }], }) - for video in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): - entries.append(self.url_result( - compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key())) - - title = 'Wall post %s' % post_id + entries.extend(self.url_result(urljoin(url, entry), VKIE) for entry in set(re.findall( + r'<a[^>]+href=(?:["\'])(/video(?:-?[\d_]+)[^"\']*)', + get_element_html_by_id('wl_post_body', webpage)))) return self.playlist_result( - orderedSet(entries), post_id, - '%s - %s' % (uploader, title) if uploader else title, - description) + entries, post_id, join_nonempty(uploader, f'Wall post {post_id}', delim=' - '), + clean_html(get_element_by_class('wall_post_text', webpage))) + + +class VKPlayBaseIE(InfoExtractor): + _RESOLUTIONS = { + 'tiny': '256x144', + 'lowest': '426x240', + 'low': '640x360', + 'medium': '852x480', + 'high': '1280x720', + 'full_hd': '1920x1080', + 'quad_hd': '2560x1440', + } + + def 
_extract_from_initial_state(self, url, video_id, path): + webpage = self._download_webpage(url, video_id) + video_info = traverse_obj(self._search_json( + r'<script[^>]+\bid="initial-state"[^>]*>', webpage, 'initial state', video_id), + path, expected_type=dict) + if not video_info: + raise ExtractorError('Unable to extract video info from html inline initial state') + return video_info + + def _extract_formats(self, stream_info, video_id): + formats = [] + for stream in traverse_obj(stream_info, ( + 'data', 0, 'playerUrls', lambda _, v: url_or_none(v['url']) and v['type'])): + url = stream['url'] + format_id = str_or_none(stream['type']) + if format_id in ('hls', 'live_hls', 'live_playback_hls') or '.m3u8' in url: + formats.extend(self._extract_m3u8_formats(url, video_id, m3u8_id=format_id, fatal=False)) + elif format_id == 'dash': + formats.extend(self._extract_mpd_formats(url, video_id, mpd_id=format_id, fatal=False)) + elif format_id in ('live_dash', 'live_playback_dash'): + self.write_debug(f'Not extracting unsupported format "{format_id}"') + else: + formats.append({ + 'url': url, + 'ext': 'mp4', + 'format_id': format_id, + **parse_resolution(self._RESOLUTIONS.get(format_id)), + }) + return formats + + def _extract_common_meta(self, stream_info): + return traverse_obj(stream_info, { + 'id': ('id', {str_or_none}), + 'title': ('title', {str}), + 'release_timestamp': ('startTime', {int_or_none}), + 'thumbnail': ('previewUrl', {url_or_none}), + 'view_count': ('count', 'views', {int_or_none}), + 'like_count': ('count', 'likes', {int_or_none}), + 'categories': ('category', 'title', {str}, {lambda x: [x] if x else None}), + 'uploader': (('user', ('blog', 'owner')), 'nick', {str}), + 'uploader_id': (('user', ('blog', 'owner')), 'id', {str_or_none}), + 'duration': ('duration', {int_or_none}), + 'is_live': ('isOnline', {bool}), + 'concurrent_view_count': ('count', 'viewers', {int_or_none}), + }, get_all=False) + + +class VKPlayIE(VKPlayBaseIE): + _VALID_URL = r'https?://vkplay\.live/(?P<username>[^/#?]+)/record/(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://vkplay.live/zitsmann/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da', + 'info_dict': { + 'id': 'f5e6e3b5-dc52-4d14-965d-0680dd2882da', + 'ext': 'mp4', + 'title': 'Atomic Heart (пробуем!) 
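VKPlayBaseIE._extract_common_meta above maps API fields declaratively. A small demonstration of that traverse_obj style, assuming the helper matches yt-dlp's semantics (hypervideo vendors the same utils, so this runs with yt-dlp installed): a dict path names the output keys, and a {callable} entry transforms the value it reaches.

from yt_dlp.utils import int_or_none, str_or_none, traverse_obj  # same helpers as ..utils above

stream_info = {'id': 42, 'title': 'test stream', 'count': {'views': '1337'}, 'duration': '120'}
meta = traverse_obj(stream_info, {
    'id': ('id', {str_or_none}),                    # 42 -> '42'
    'title': ('title', {str}),
    'view_count': ('count', 'views', {int_or_none}),
    'duration': ('duration', {int_or_none}),
})
print(meta)  # {'id': '42', 'title': 'test stream', 'view_count': 1337, 'duration': 120}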
спасибо подписчику EKZO!', + 'uploader': 'ZitsmanN', + 'uploader_id': '13159830', + 'release_timestamp': 1683461378, + 'release_date': '20230507', + 'thumbnail': r're:https://images.vkplay.live/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview\?change_time=\d+', + 'duration': 10608, + 'view_count': int, + 'like_count': int, + 'categories': ['Atomic Heart'], + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + username, video_id = self._match_valid_url(url).groups() + + record_info = traverse_obj(self._download_json( + f'https://api.vkplay.live/v1/blog/{username}/public_video_stream/record/{video_id}', video_id, fatal=False), + ('data', 'record', {dict})) + if not record_info: + record_info = self._extract_from_initial_state(url, video_id, ('record', 'currentRecord', 'data')) + + return { + **self._extract_common_meta(record_info), + 'id': video_id, + 'formats': self._extract_formats(record_info, video_id), + } + + +class VKPlayLiveIE(VKPlayBaseIE): + _VALID_URL = r'https?://vkplay\.live/(?P<id>[^/#?]+)/?(?:[#?]|$)' + _TESTS = [{ + 'url': 'https://vkplay.live/bayda', + 'info_dict': { + 'id': 'f02c321e-427b-408d-b12f-ae34e53e0ea2', + 'ext': 'mp4', + 'title': r're:эскапизм крута .*', + 'uploader': 'Bayda', + 'uploader_id': 12279401, + 'release_timestamp': 1687209962, + 'release_date': '20230619', + 'thumbnail': r're:https://images.vkplay.live/public_video_stream/12279401/preview\?change_time=\d+', + 'view_count': int, + 'concurrent_view_count': int, + 'like_count': int, + 'categories': ['EVE Online'], + 'live_status': 'is_live', + }, + 'skip': 'livestream', + 'params': {'skip_download': True}, + }] + + def _real_extract(self, url): + username = self._match_id(url) + + stream_info = self._download_json( + f'https://api.vkplay.live/v1/blog/{username}/public_video_stream', username, fatal=False) + if not stream_info: + stream_info = self._extract_from_initial_state(url, username, ('stream', 'stream', 'data', 'stream')) + + formats = self._extract_formats(stream_info, username) + if not formats and not traverse_obj(stream_info, ('isOnline', {bool})): + raise UserNotLive(video_id=username) + + return { + **self._extract_common_meta(stream_info), + 'formats': formats, + } diff --git a/hypervideo_dl/extractor/vocaroo.py b/hypervideo_dl/extractor/vocaroo.py new file mode 100644 index 0000000..d98fbfd --- /dev/null +++ b/hypervideo_dl/extractor/vocaroo.py @@ -0,0 +1,63 @@ +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import float_or_none + + +class VocarooIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:vocaroo\.com|voca\.ro)/(?:embed/)?(?P<id>\w+)' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:www\.)?vocaroo\.com/embed/.+?)\1'] + _TESTS = [ + { + 'url': 'https://vocaroo.com/1de8yA3LNe77', + 'md5': 'c557841d5e50261777a6585648adf439', + 'info_dict': { + 'id': '1de8yA3LNe77', + 'ext': 'mp3', + 'title': 'Vocaroo video #1de8yA3LNe77', + 'timestamp': 1675059800.370, + 'upload_date': '20230130', + }, + }, + { + 'url': 'https://vocaroo.com/embed/12WqtjLnpj6g?autoplay=0', + 'only_matching': True, + }, + { + 'url': 'https://voca.ro/12D52rgpzkB0', + 'only_matching': True, + }, + ] + + _WEBPAGE_TESTS = [ + { + 'url': 'https://qbnu.github.io/cool.html', + 'md5': 'f322e529275dd8a47994919eeac404a5', + 'info_dict': { + 'id': '19cgWmKO6AmC', + 'ext': 'mp3', + 'title': 'Vocaroo video #19cgWmKO6AmC', + 'timestamp': 1675093841.408, + 'upload_date': '20230130', + }, + }, + ] + + def 
_real_extract(self, url): + audio_id = self._match_id(url) + if len(audio_id) == 10 or (len(audio_id) == 12 and audio_id[0] == '1'): + media_subdomain = 'media1' + else: + media_subdomain = 'media' + + url = f'https://{media_subdomain}.vocaroo.com/mp3/{audio_id}' + http_headers = {'Referer': 'https://vocaroo.com/'} + resp = self._request_webpage(HEADRequest(url), audio_id, headers=http_headers) + return { + 'id': audio_id, + 'title': '', + 'url': url, + 'ext': 'mp3', + 'timestamp': float_or_none(resp.getheader('x-bz-upload-timestamp'), scale=1000), + 'vcodec': 'none', + 'http_headers': http_headers, + } diff --git a/hypervideo_dl/extractor/vodlocker.py b/hypervideo_dl/extractor/vodlocker.py index 1c7236e..b215d6c 100644 --- a/hypervideo_dl/extractor/vodlocker.py +++ b/hypervideo_dl/extractor/vodlocker.py @@ -1,10 +1,6 @@ from .common import InfoExtractor -from ..utils import ( - ExtractorError, - NO_DEFAULT, - sanitized_Request, - urlencode_postdata, -) +from ..networking import Request +from ..utils import NO_DEFAULT, ExtractorError, urlencode_postdata class VodlockerIE(InfoExtractor): @@ -37,8 +33,8 @@ class VodlockerIE(InfoExtractor): if fields['op'] == 'download1': self._sleep(3, video_id) # they do detect when requests happen too fast! post = urlencode_postdata(fields) - req = sanitized_Request(url, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') + req = Request(url, post) + req.headers['Content-type'] = 'application/x-www-form-urlencoded' webpage = self._download_webpage( req, video_id, 'Downloading video page') diff --git a/hypervideo_dl/extractor/volejtv.py b/hypervideo_dl/extractor/volejtv.py new file mode 100644 index 0000000..622d841 --- /dev/null +++ b/hypervideo_dl/extractor/volejtv.py @@ -0,0 +1,40 @@ +from .common import InfoExtractor + + +class VolejTVIE(InfoExtractor): + _VALID_URL = r'https?://volej\.tv/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://volej.tv/video/725742/', + 'info_dict': { + 'id': '725742', + 'ext': 'mp4', + 'description': 'Zápas VK Královo Pole vs VK Prostějov 10.12.2022 v 19:00 na Volej.TV', + 'thumbnail': 'https://volej.tv/images/og/16/17186/og.png', + 'title': 'VK Královo Pole vs VK Prostějov', + } + }, { + 'url': 'https://volej.tv/video/725605/', + 'info_dict': { + 'id': '725605', + 'ext': 'mp4', + 'thumbnail': 'https://volej.tv/images/og/15/17185/og.png', + 'title': 'VK Lvi Praha vs VK Euro Sitex Příbram', + 'description': 'Zápas VK Lvi Praha vs VK Euro Sitex Příbram 11.12.2022 v 19:00 na Volej.TV', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_data = self._search_json( + r'<\s*!\[CDATA[^=]+=', webpage, 'CDATA', video_id) + formats, subtitle = self._extract_m3u8_formats_and_subtitles( + json_data['urls']['hls'], video_id) + return { + 'id': video_id, + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage), + 'description': self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage), + 'formats': formats, + 'subtitles': subtitle, + } diff --git a/hypervideo_dl/extractor/voot.py b/hypervideo_dl/extractor/voot.py index b709b74..b19a279 100644 --- a/hypervideo_dl/extractor/voot.py +++ b/hypervideo_dl/extractor/voot.py @@ -1,14 +1,86 @@ +import json +import time +import uuid + from .common import InfoExtractor from ..compat import compat_str +from ..networking.exceptions import HTTPError from ..utils import ( 
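VocarooIE's only real logic above is picking the media host from the shape of the ID before issuing a HEADRequest. The rule restated standalone (the last sample ID is invented to show the other branch):

def vocaroo_media_url(audio_id):
    # IDs of length 10, or of length 12 starting with '1', live on media1
    subdomain = 'media1' if len(audio_id) == 10 or (len(audio_id) == 12 and audio_id[0] == '1') else 'media'
    return f'https://{subdomain}.vocaroo.com/mp3/{audio_id}'

for sample in ('1de8yA3LNe77', '12D52rgpzkB0', 'shorty99'):
    print(sample, '->', vocaroo_media_url(sample))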
ExtractorError, + float_or_none, int_or_none, + jwt_decode_hs256, + parse_age_limit, + traverse_obj, + try_call, try_get, - unified_timestamp, + unified_strdate, ) -class VootIE(InfoExtractor): +class VootBaseIE(InfoExtractor): + _NETRC_MACHINE = 'voot' + _GEO_BYPASS = False + _LOGIN_HINT = 'Log in with "-u <email_address> -p <password>", or use "-u token -p <auth_token>" to login with auth token.' + _TOKEN = None + _EXPIRY = 0 + _API_HEADERS = {'Origin': 'https://www.voot.com', 'Referer': 'https://www.voot.com/'} + + def _perform_login(self, username, password): + if self._TOKEN and self._EXPIRY: + return + + if username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)): + VootBaseIE._TOKEN = password + VootBaseIE._EXPIRY = jwt_decode_hs256(password)['exp'] + self.report_login() + + # Mobile number as username is not supported + elif not username.isdigit(): + check_username = self._download_json( + 'https://userauth.voot.com/usersV3/v3/checkUser', None, data=json.dumps({ + 'type': 'email', + 'email': username + }, separators=(',', ':')).encode(), headers={ + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + }, note='Checking username', expected_status=403) + if not traverse_obj(check_username, ('isExist', {bool})): + if traverse_obj(check_username, ('status', 'code', {int})) == 9999: + self.raise_geo_restricted(countries=['IN']) + raise ExtractorError('Incorrect username', expected=True) + auth_token = traverse_obj(self._download_json( + 'https://userauth.voot.com/usersV3/v3/login', None, data=json.dumps({ + 'type': 'traditional', + 'deviceId': str(uuid.uuid4()), + 'deviceBrand': 'PC/MAC', + 'data': { + 'email': username, + 'password': password + } + }, separators=(',', ':')).encode(), headers={ + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + }, note='Logging in', expected_status=400), ('data', 'authToken', {dict})) + if not auth_token: + raise ExtractorError('Incorrect password', expected=True) + VootBaseIE._TOKEN = auth_token['accessToken'] + VootBaseIE._EXPIRY = auth_token['expirationTime'] + + else: + raise ExtractorError(self._LOGIN_HINT, expected=True) + + def _check_token_expiry(self): + if int(time.time()) >= self._EXPIRY: + raise ExtractorError('Access token has expired', expected=True) + + def _real_initialize(self): + if not self._TOKEN: + self.raise_login_required(self._LOGIN_HINT, method=None) + self._check_token_expiry() + + +class VootIE(VootBaseIE): _VALID_URL = r'''(?x) (?: voot:| @@ -20,27 +92,25 @@ class VootIE(InfoExtractor): ) (?P<id>\d{3,}) ''' - _GEO_COUNTRIES = ['IN'] _TESTS = [{ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', 'info_dict': { - 'id': '0_8ledb18o', + 'id': '441353', 'ext': 'mp4', - 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', + 'title': 'Is this the end of Kamini?', 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', - 'timestamp': 1472162937, + 'timestamp': 1472103000, 'upload_date': '20160825', 'series': 'Ishq Ka Rang Safed', 'season_number': 1, 'episode': 'Is this the end of Kamini?', 'episode_number': 340, - 'view_count': int, - 'like_count': int, - }, - 'params': { - 'skip_download': True, + 'release_date': '20160825', + 'season': 'Season 1', + 'age_limit': 13, + 'duration': 1146.0, }, - 'expected_warnings': ['Failed to download m3u8 information'], + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925', 
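VootBaseIE above accepts "-u token -p <auth_token>" and reads the expiry straight out of the JWT via jwt_decode_hs256. As implemented upstream, that helper only base64-decodes the payload segment without verifying the signature, which is all the expiry check needs. A standalone sketch with a toy token:

import base64
import json
import time

def jwt_decode_payload(token):
    payload = token.split('.')[1]
    payload += '=' * (-len(payload) % 4)  # restore the stripped base64 padding
    return json.loads(base64.urlsafe_b64decode(payload))

# toy token with payload {"exp": 4102444800} (2100-01-01); header and signature are dummies
token = 'e30.' + base64.urlsafe_b64encode(b'{"exp": 4102444800}').decode().rstrip('=') + '.sig'
exp = jwt_decode_payload(token)['exp']
print('expired' if int(time.time()) >= exp else 'valid', '- exp:', exp)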
'only_matching': True, @@ -55,59 +125,50 @@ class VootIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) media_info = self._download_json( - 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id, - query={ - 'platform': 'Web', - 'pId': 2, - 'mediaId': video_id, - }) - - status_code = try_get(media_info, lambda x: x['status']['code'], int) - if status_code != 0: - raise ExtractorError(media_info['status']['message'], expected=True) - - media = media_info['assets'] - - entry_id = media['EntryId'] - title = media['MediaName'] - formats = self._extract_m3u8_formats( - 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, - video_id, 'mp4', m3u8_id='hls') - - description, series, season_number, episode, episode_number = [None] * 5 - - for meta in try_get(media, lambda x: x['Metas'], list) or []: - key, value = meta.get('Key'), meta.get('Value') - if not key or not value: - continue - if key == 'ContentSynopsis': - description = value - elif key == 'RefSeriesTitle': - series = value - elif key == 'RefSeriesSeason': - season_number = int_or_none(value) - elif key == 'EpisodeMainTitle': - episode = value - elif key == 'EpisodeNo': - episode_number = int_or_none(value) + 'https://psapi.voot.com/jio/voot/v1/voot-web/content/query/asset-details', video_id, + query={'ids': f'include:{video_id}', 'responseType': 'common'}, headers={'accesstoken': self._TOKEN}) + + try: + m3u8_url = self._download_json( + 'https://vootapi.media.jio.com/playback/v1/playbackrights', video_id, + 'Downloading playback JSON', data=b'{}', headers={ + **self.geo_verification_headers(), + **self._API_HEADERS, + 'Content-Type': 'application/json;charset=utf-8', + 'platform': 'androidwebdesktop', + 'vootid': video_id, + 'voottoken': self._TOKEN, + })['m3u8'] + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: + self._check_token_expiry() + raise + + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls') + self._remove_duplicate_formats(formats) + return { - 'extractor_key': 'Kaltura', - 'id': entry_id, - 'title': title, - 'description': description, - 'series': series, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'timestamp': unified_timestamp(media.get('CreationDate')), - 'duration': int_or_none(media.get('Duration')), - 'view_count': int_or_none(media.get('ViewCounter')), - 'like_count': int_or_none(media.get('like_counter')), - 'formats': formats, + 'id': video_id, + # '/_definst_/smil:vod/' m3u8 manifests claim to have 720p+ formats but max out at 480p + 'formats': traverse_obj(formats, ( + lambda _, v: '/_definst_/smil:vod/' not in v['url'] or v['height'] <= 480)), + 'http_headers': self._API_HEADERS, + **traverse_obj(media_info, ('result', 0, { + 'title': ('fullTitle', {str}), + 'description': ('fullSynopsis', {str}), + 'series': ('showName', {str}), + 'season_number': ('season', {int_or_none}), + 'episode': ('fullTitle', {str}), + 'episode_number': ('episode', {int_or_none}), + 'timestamp': ('uploadTime', {int_or_none}), + 'release_date': ('telecastDate', {unified_strdate}), + 'age_limit': ('ageNemonic', {parse_age_limit}), + 'duration': ('duration', {float_or_none}), + })), } -class VootSeriesIE(InfoExtractor): +class VootSeriesIE(VootBaseIE): _VALID_URL = r'https?://(?:www\.)?voot\.com/shows/[^/]+/(?P<id>\d{3,})' _TESTS = [{ 'url': 'https://www.voot.com/shows/chakravartin-ashoka-samrat/100002', diff --git 
a/hypervideo_dl/extractor/vrt.py b/hypervideo_dl/extractor/vrt.py index 26f48bf..497233d 100644 --- a/hypervideo_dl/extractor/vrt.py +++ b/hypervideo_dl/extractor/vrt.py @@ -1,45 +1,139 @@ -from .common import InfoExtractor +import functools +import json +import time +import urllib.parse + +from .gigya import GigyaBaseIE +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, + clean_html, extract_attributes, float_or_none, get_element_by_class, + get_element_html_by_class, + int_or_none, + join_nonempty, + jwt_encode_hs256, + make_archive_id, + parse_age_limit, + parse_iso8601, + str_or_none, strip_or_none, - unified_timestamp, + traverse_obj, + url_or_none, + urlencode_postdata, ) -class VRTIE(InfoExtractor): +class VRTBaseIE(GigyaBaseIE): + _GEO_BYPASS = False + _PLAYER_INFO = { + 'platform': 'desktop', + 'app': { + 'type': 'browser', + 'name': 'Chrome', + }, + 'device': 'undefined (undefined)', + 'os': { + 'name': 'Windows', + 'version': 'x86_64' + }, + 'player': { + 'name': 'VRT web player', + 'version': '2.7.4-prod-2023-04-19T06:05:45' + } + } + # From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.8cdb11341bcb79e4cd44.js + _JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w=' + _JWT_SIGNING_KEY = 'b5f500d55cb44715107249ccd8a5c0136cfb2788dbb71b90a4f142423bacaf38' # -dev + # player-stag.vrt.be key: d23987504521ae6fbf2716caca6700a24bb1579477b43c84e146b279de5ca595 + # player.vrt.be key: 2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae + + def _extract_formats_and_subtitles(self, data, video_id): + if traverse_obj(data, 'drm'): + self.report_drm(video_id) + + formats, subtitles = [], {} + for target in traverse_obj(data, ('targetUrls', lambda _, v: url_or_none(v['url']) and v['type'])): + format_type = target['type'].upper() + format_url = target['url'] + if format_type in ('HLS', 'HLS_AES'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format_url, video_id, 'mp4', m3u8_id=format_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif format_type == 'HDS': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_type, fatal=False)) + elif format_type == 'MPEG_DASH': + fmts, subs = self._extract_mpd_formats_and_subtitles( + format_url, video_id, mpd_id=format_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + elif format_type == 'HSS': + fmts, subs = self._extract_ism_formats_and_subtitles( + format_url, video_id, ism_id='mss', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'format_id': format_type, + 'url': format_url, + }) + + for sub in traverse_obj(data, ('subtitleUrls', lambda _, v: v['url'] and v['type'] == 'CLOSED')): + subtitles.setdefault('nl', []).append({'url': sub['url']}) + + return formats, subtitles + + def _call_api(self, video_id, client='null', id_token=None, version='v2'): + player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO} + player_token = self._download_json( + 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens', + video_id, 'Downloading player token', headers={ + **self.geo_verification_headers(), + 'Content-Type': 'application/json', + }, data=json.dumps({ + 'identityToken': id_token or {}, + 'playerInfo': jwt_encode_hs256(player_info, self._JWT_SIGNING_KEY, headers={ + 'kid': self._JWT_KEY_ID + }).decode() + }, separators=(',', 
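VRTBaseIE._call_api below signs a playerInfo claim with jwt_encode_hs256 before requesting a vrtPlayerToken. A sketch of just the signing step, using yt-dlp's helper (hypervideo ships the same one) and the key/kid quoted in the patch; _PLAYER_INFO is trimmed to two fields here:

import time

from yt_dlp.utils import jwt_encode_hs256

JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w='
JWT_SIGNING_KEY = 'b5f500d55cb44715107249ccd8a5c0136cfb2788dbb71b90a4f142423bacaf38'

player_info = {'exp': round(time.time(), 3) + 900, 'platform': 'desktop'}
token = jwt_encode_hs256(player_info, JWT_SIGNING_KEY, headers={'kid': JWT_KEY_ID}).decode()
print(token.count('.') == 2, token[:40] + '...')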
':')).encode())['vrtPlayerToken'] + + return self._download_json( + f'https://media-services-public.vrt.be/media-aggregator/{version}/media-items/{video_id}', + video_id, 'Downloading API JSON', query={ + 'vrtPlayerToken': player_token, + 'client': client, + }, expected_status=400) + + +class VRTIE(VRTBaseIE): IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' _VALID_URL = r'https?://(?:www\.)?(?P<site>vrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', - 'md5': 'e1663accf5cf13f375f3cd0d10476669', 'info_dict': { 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', 'ext': 'mp4', 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', - 'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.', - 'timestamp': 1557924660, - 'upload_date': '20190515', + 'description': 'md5:6fd85f999b2d1841aa5568f4bf02c3ff', 'duration': 31.2, + 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/2d914d61-7710-11e9-abcc-02b7b76bf47f.jpg', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', - 'md5': '910bba927566e9ab992278f647eb4b75', 'info_dict': { 'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818', 'ext': 'mp4', - 'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters', - 'timestamp': 1557923760, - 'upload_date': '20190515', + 'title': 'De Belgian Cats zijn klaar voor het EK', + 'description': 'Video: De Belgian Cats zijn klaar voor het EK mét Ann Wauters | basketbal, sport in het journaal', 'duration': 115.17, + 'thumbnail': 'https://images.vrt.be/orig/2019/05/15/11c0dba3-770e-11e9-abcc-02b7b76bf47f.jpg', }, - }, { - 'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/', - 'only_matching': True, - }, { - 'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/', - 'only_matching': True, + 'params': {'skip_download': 'm3u8'}, }] _CLIENT_MAP = { 'vrt.be/vrtnws': 'vrtnieuws', @@ -49,34 +143,285 @@ class VRTIE(InfoExtractor): def _real_extract(self, url): site, display_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, display_id) - attrs = extract_attributes(self._search_regex( - r'(<[^>]+class="vrtvideo( [^"]*)?"[^>]*>)', webpage, 'vrt video')) + attrs = extract_attributes(get_element_html_by_class('vrtvideo', webpage) or '') - asset_id = attrs['data-video-id'] - publication_id = attrs.get('data-publication-id') + asset_id = attrs.get('data-video-id') or attrs['data-videoid'] + publication_id = traverse_obj(attrs, 'data-publication-id', 'data-publicationid') if publication_id: - asset_id = publication_id + '$' + asset_id - client = attrs.get('data-client-code') or self._CLIENT_MAP[site] + asset_id = f'{publication_id}${asset_id}' + client = traverse_obj(attrs, 'data-client-code', 'data-client') or self._CLIENT_MAP[site] + + data = self._call_api(asset_id, client) + formats, subtitles = self._extract_formats_and_subtitles(data, asset_id) - title = strip_or_none(get_element_by_class( - 'vrt-title', webpage) or self._html_search_meta( - ['og:title', 'twitter:title', 'name'], webpage)) description = self._html_search_meta( ['og:description', 'twitter:description', 'description'], webpage) if description == '…': 
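The rewritten VRTIE._real_extract swaps a hand-rolled regex for get_element_html_by_class plus extract_attributes, then joins publication and asset IDs with '$'. The same steps on invented markup (helpers imported from yt-dlp, which matches hypervideo's utils):

from yt_dlp.utils import extract_attributes, get_element_html_by_class

webpage = '<div class="vrtvideo" data-video-id="vid-123" data-publication-id="pbs-pub-9"></div>'
attrs = extract_attributes(get_element_html_by_class('vrtvideo', webpage) or '')
asset_id = attrs.get('data-video-id') or attrs['data-videoid']
if attrs.get('data-publication-id'):
    asset_id = f'{attrs["data-publication-id"]}${asset_id}'
print(asset_id)  # pbs-pub-9$vid-123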
description = None - timestamp = unified_timestamp(self._html_search_meta( - 'article:published_time', webpage)) return { - '_type': 'url_transparent', 'id': asset_id, - 'display_id': display_id, - 'title': title, + 'formats': formats, + 'subtitles': subtitles, 'description': description, - 'thumbnail': attrs.get('data-posterimage'), - 'timestamp': timestamp, + 'thumbnail': url_or_none(attrs.get('data-posterimage')), 'duration': float_or_none(attrs.get('data-duration'), 1000), - 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id), - 'ie_key': 'Canvas', + '_old_archive_ids': [make_archive_id('Canvas', asset_id)], + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('shortDescription', {str}), + 'duration': ('duration', {functools.partial(float_or_none, scale=1000)}), + 'thumbnail': ('posterImageUrl', {url_or_none}), + }), + } + + +class VrtNUIE(VRTBaseIE): + IE_DESC = 'VRT MAX' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)' + _TESTS = [{ + # CONTENT_IS_AGE_RESTRICTED + 'url': 'https://www.vrt.be/vrtnu/a-z/de-ideale-wereld/2023-vj/de-ideale-wereld-d20230116/', + 'info_dict': { + 'id': 'pbs-pub-855b00a8-6ce2-4032-ac4f-1fcf3ae78524$vid-d2243aa1-ec46-4e34-a55b-92568459906f', + 'ext': 'mp4', + 'title': 'Tom Waes', + 'description': 'Satirisch actualiteitenmagazine met Ella Leyers. Tom Waes is te gast.', + 'timestamp': 1673905125, + 'release_timestamp': 1673905125, + 'series': 'De ideale wereld', + 'season_id': '1672830988794', + 'episode': 'Aflevering 1', + 'episode_number': 1, + 'episode_id': '1672830988861', + 'display_id': 'de-ideale-wereld-d20230116', + 'channel': 'VRT', + 'duration': 1939.0, + 'thumbnail': 'https://images.vrt.be/orig/2023/01/10/1bb39cb3-9115-11ed-b07d-02b7b76bf47f.jpg', + 'release_date': '20230116', + 'upload_date': '20230116', + 'age_limit': 12, + }, + }, { + 'url': 'https://www.vrt.be/vrtnu/a-z/buurman--wat-doet-u-nu-/6/buurman--wat-doet-u-nu--s6-trailer/', + 'info_dict': { + 'id': 'pbs-pub-ad4050eb-d9e5-48c2-9ec8-b6c355032361$vid-0465537a-34a8-4617-8352-4d8d983b4eee', + 'ext': 'mp4', + 'title': 'Trailer seizoen 6 \'Buurman, wat doet u nu?\'', + 'description': 'md5:197424726c61384b4e5c519f16c0cf02', + 'timestamp': 1652940000, + 'release_timestamp': 1652940000, + 'series': 'Buurman, wat doet u nu?', + 'season': 'Seizoen 6', + 'season_number': 6, + 'season_id': '1652344200907', + 'episode': 'Aflevering 0', + 'episode_number': 0, + 'episode_id': '1652951873524', + 'display_id': 'buurman--wat-doet-u-nu--s6-trailer', + 'channel': 'VRT', + 'duration': 33.13, + 'thumbnail': 'https://images.vrt.be/orig/2022/05/23/3c234d21-da83-11ec-b07d-02b7b76bf47f.jpg', + 'release_date': '20220519', + 'upload_date': '20220519', + }, + 'params': {'skip_download': 'm3u8'}, + }] + _NETRC_MACHINE = 'vrtnu' + _authenticated = False + + def _perform_login(self, username, password): + auth_info = self._gigya_login({ + 'APIKey': '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy', + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + }) + + if auth_info.get('errorDetails'): + raise ExtractorError(f'Unable to login. 
VrtNU said: {auth_info["errorDetails"]}', expected=True) + + # Sometimes authentication fails for no good reason, retry + for retry in self.RetryManager(): + if retry.attempt > 1: + self._sleep(1, None) + try: + self._request_webpage( + 'https://token.vrt.be/vrtnuinitlogin', None, note='Requesting XSRF Token', + errnote='Could not get XSRF Token', query={ + 'provider': 'site', + 'destination': 'https://www.vrt.be/vrtnu/', + }) + self._request_webpage( + 'https://login.vrt.be/perform_login', None, + note='Performing login', errnote='Login failed', + query={'client_id': 'vrtnu-site'}, data=urlencode_postdata({ + 'UID': auth_info['UID'], + 'UIDSignature': auth_info['UIDSignature'], + 'signatureTimestamp': auth_info['signatureTimestamp'], + '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, + })) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + retry.error = e + continue + raise + + self._authenticated = True + + def _real_extract(self, url): + display_id = self._match_id(url) + parsed_url = urllib.parse.urlparse(url) + details = self._download_json( + f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip("/")}.model.json', + display_id, 'Downloading asset JSON', 'Unable to download asset JSON')['details'] + + watch_info = traverse_obj(details, ( + 'actions', lambda _, v: v['type'] == 'watch-episode', {dict}), get_all=False) or {} + video_id = join_nonempty( + 'episodePublicationId', 'episodeVideoId', delim='$', from_dict=watch_info) + if '$' not in video_id: + raise ExtractorError('Unable to extract video ID') + + vrtnutoken = self._download_json( + 'https://token.vrt.be/refreshtoken', video_id, note='Retrieving vrtnutoken', + errnote='Token refresh failed')['vrtnutoken'] if self._authenticated else None + + video_info = self._call_api(video_id, 'vrtnu-web@PROD', vrtnutoken) + + if 'title' not in video_info: + code = video_info.get('code') + if code in ('AUTHENTICATION_REQUIRED', 'CONTENT_IS_AGE_RESTRICTED'): + self.raise_login_required(code, method='password') + elif code in ('INVALID_LOCATION', 'CONTENT_AVAILABLE_ONLY_IN_BE'): + self.raise_geo_restricted(countries=['BE']) + elif code == 'CONTENT_AVAILABLE_ONLY_FOR_BE_RESIDENTS_AND_EXPATS': + if not self._authenticated: + self.raise_login_required(code, method='password') + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(code, expected=True) + + formats, subtitles = self._extract_formats_and_subtitles(video_info, video_id) + + return { + **traverse_obj(details, { + 'title': 'title', + 'description': ('description', {clean_html}), + 'timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}), + 'release_timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}), + 'series': ('data', 'program', 'title'), + 'season': ('data', 'season', 'title', 'value'), + 'season_number': ('data', 'season', 'title', 'raw', {int_or_none}), + 'season_id': ('data', 'season', 'id', {str_or_none}), + 'episode': ('data', 'episode', 'number', 'value', {str_or_none}), + 'episode_number': ('data', 'episode', 'number', 'raw', {int_or_none}), + 'episode_id': ('data', 'episode', 'id', {str_or_none}), + 'age_limit': ('data', 'episode', 'age', 'raw', {parse_age_limit}), + }), + 'id': video_id, + 'display_id': display_id, + 'channel': 'VRT', + 'formats': formats, + 'duration': float_or_none(video_info.get('duration'), 1000), + 'thumbnail': url_or_none(video_info.get('posterImageUrl')), + 'subtitles': subtitles, + '_old_archive_ids': [make_archive_id('Canvas', 
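VrtNUIE builds its "publicationId$videoId" pair with join_nonempty(..., from_dict=...). How that call behaves, with IDs shortened from the test above (helper imported from yt-dlp, same as hypervideo's ..utils):

from yt_dlp.utils import join_nonempty

watch_info = {'episodePublicationId': 'pbs-pub-855b00a8', 'episodeVideoId': 'vid-d2243aa1'}
video_id = join_nonempty('episodePublicationId', 'episodeVideoId', delim='$', from_dict=watch_info)
assert '$' in video_id
print(video_id)  # pbs-pub-855b00a8$vid-d2243aa1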
video_id)], + } + + +class KetnetIE(VRTBaseIE): + _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.ketnet.be/kijken/m/meisjes/6/meisjes-s6a5', + 'info_dict': { + 'id': 'pbs-pub-39f8351c-a0a0-43e6-8394-205d597d6162$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e', + 'ext': 'mp4', + 'title': 'Meisjes', + 'episode': 'Reeks 6: Week 5', + 'season': 'Reeks 6', + 'series': 'Meisjes', + 'timestamp': 1685251800, + 'upload_date': '20230528', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + video = self._download_json( + 'https://senior-bff.ketnet.be/graphql', display_id, query={ + 'query': '''{ + video(id: "content/ketnet/nl/%s.model.json") { + description + episodeNr + imageUrl + mediaReference + programTitle + publicationDate + seasonTitle + subtitleVideodetail + titleVideodetail + } +}''' % display_id, + })['data']['video'] + + video_id = urllib.parse.unquote(video['mediaReference']) + data = self._call_api(video_id, 'ketnet@PROD', version='v1') + formats, subtitles = self._extract_formats_and_subtitles(data, video_id) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + '_old_archive_ids': [make_archive_id('Canvas', video_id)], + **traverse_obj(video, { + 'title': ('titleVideodetail', {str}), + 'description': ('description', {str}), + 'thumbnail': ('thumbnail', {url_or_none}), + 'timestamp': ('publicationDate', {parse_iso8601}), + 'series': ('programTitle', {str}), + 'season': ('seasonTitle', {str}), + 'episode': ('subtitleVideodetail', {str}), + 'episode_number': ('episodeNr', {int_or_none}), + }), + } + + +class DagelijkseKostIE(VRTBaseIE): + IE_DESC = 'dagelijksekost.een.be' + _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'display_id': 'hachis-parmentier-met-witloof', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._html_search_regex( + r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', group='id') + + data = self._call_api(video_id, 'dako@prod', version='v1') + formats, subtitles = self._extract_formats_and_subtitles(data, video_id) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'display_id': display_id, + 'title': strip_or_none(get_element_by_class( + 'dish-metadata__title', webpage) or self._html_search_meta('twitter:title', webpage)), + 'description': clean_html(get_element_by_class( + 'dish-description', webpage)) or self._html_search_meta( + ['description', 'twitter:description', 'og:description'], webpage), + '_old_archive_ids': [make_archive_id('Canvas', video_id)], } diff --git a/hypervideo_dl/extractor/vrv.py b/hypervideo_dl/extractor/vrv.py index 89fa7af..523c442 100644 --- a/hypervideo_dl/extractor/vrv.py +++ b/hypervideo_dl/extractor/vrv.py @@ -8,7 +8,8 @@ import time import urllib.parse from .common import InfoExtractor -from ..compat import compat_HTTPError, compat_urllib_parse_urlencode +from ..compat import compat_urllib_parse_urlencode +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, 
float_or_none, @@ -30,7 +31,7 @@ class VRVBaseIE(InfoExtractor): base_url = self._API_DOMAIN + '/core/' + path query = [ ('oauth_consumer_key', self._API_PARAMS['oAuthKey']), - ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])), + ('oauth_nonce', ''.join(random.choices(string.ascii_letters, k=32))), ('oauth_signature_method', 'HMAC-SHA1'), ('oauth_timestamp', int(time.time())), ] @@ -54,8 +55,8 @@ class VRVBaseIE(InfoExtractor): '?'.join([base_url, encoded_query]), video_id, note='Downloading %s JSON metadata' % note, headers=headers, data=data) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True) + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + raise ExtractorError(json.loads(e.cause.response.read().decode())['message'], expected=True) raise def _call_cms(self, path, video_id, note): diff --git a/hypervideo_dl/extractor/vshare.py b/hypervideo_dl/extractor/vshare.py index 1bc7ae4..443ed43 100644 --- a/hypervideo_dl/extractor/vshare.py +++ b/hypervideo_dl/extractor/vshare.py @@ -22,7 +22,7 @@ class VShareIE(InfoExtractor): packed = self._search_regex( r'(eval\(function.+)', webpage, 'packed code') unpacked = decode_packed_codes(packed) - digits = self._search_regex(r'\[((?:\d+,?)+)\]', unpacked, 'digits') + digits = self._search_regex(r'\[([\d,]+)\]', unpacked, 'digits') digits = [int(digit) for digit in digits.split(',')] key_digit = self._search_regex( r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit') diff --git a/hypervideo_dl/extractor/vzaar.py b/hypervideo_dl/extractor/vzaar.py index 6b9817c..19908a9 100644 --- a/hypervideo_dl/extractor/vzaar.py +++ b/hypervideo_dl/extractor/vzaar.py @@ -87,7 +87,7 @@ class VzaarIE(InfoExtractor): m3u8_id='hls', fatal=False) if hls_aes: for f in m3u8_formats: - f['_decryption_key_url'] = url_templ % ('goose', '') + qs + f['hls_aes'] = {'uri': url_templ % ('goose', '') + qs} formats.extend(m3u8_formats) return { diff --git a/hypervideo_dl/extractor/wat.py b/hypervideo_dl/extractor/wat.py index 7c62d28..9ea3fdd 100644 --- a/hypervideo_dl/extractor/wat.py +++ b/hypervideo_dl/extractor/wat.py @@ -41,6 +41,18 @@ class WatIE(InfoExtractor): 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], 'skip': 'This content is no longer available', }, + { + 'url': 'wat:14010600', + 'info_dict': { + 'id': '14010600', + 'title': 'Burger Quiz - S03 EP21 avec Eye Haidara, Anne Depétrini, Jonathan Zaccaï et Pio Marmaï', + 'thumbnail': 'https://photos.tf1.fr/1280/720/burger-quiz-11-9adb79-0@1x.jpg', + 'upload_date': '20230819', + 'duration': 2312, + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + } ] _GEO_BYPASS = False @@ -54,7 +66,7 @@ class WatIE(InfoExtractor): # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, - video_id, query={'context': 'MYTF1', 'pver': '4020003'}) + video_id, query={'pver': '5010000'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') diff --git a/hypervideo_dl/extractor/webcamerapl.py b/hypervideo_dl/extractor/webcamerapl.py new file mode 100644 index 0000000..a02d951 --- /dev/null +++ b/hypervideo_dl/extractor/webcamerapl.py @@ -0,0 +1,44 @@ +import codecs + +from .common import InfoExtractor + + +class WebcameraplIE(InfoExtractor): + _VALID_URL = r'https?://(?P<id>[\w-]+)\.webcamera\.pl' + _TESTS = [{ + 'url': 
'https://warszawa-plac-zamkowy.webcamera.pl', + 'info_dict': { + 'id': 'warszawa-plac-zamkowy', + 'ext': 'mp4', + 'title': r're:WIDOK NA PLAC ZAMKOWY W WARSZAWIE \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'live_status': 'is_live', + } + }, { + 'url': 'https://gdansk-stare-miasto.webcamera.pl/', + 'info_dict': { + 'id': 'gdansk-stare-miasto', + 'ext': 'mp4', + 'title': r're:GDAŃSK - widok na Stare Miasto \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'live_status': 'is_live', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + rot13_m3u8_url = self._search_regex(r'data-src\s*=\s*"(uggc[^"]+\.z3h8)"', + webpage, 'm3u8 url', default=None) + if not rot13_m3u8_url: + self.raise_no_formats('No video/audio found at the provided url', expected=True) + + m3u8_url = codecs.decode(rot13_m3u8_url, 'rot-13') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, live=True) + + return { + 'id': video_id, + 'title': self._html_search_regex(r'<h1\b[^>]*>([^>]+)</h1>', webpage, 'title'), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + } diff --git a/hypervideo_dl/extractor/weibo.py b/hypervideo_dl/extractor/weibo.py index 81a23b9..bc9a71a 100644 --- a/hypervideo_dl/extractor/weibo.py +++ b/hypervideo_dl/extractor/weibo.py @@ -31,7 +31,7 @@ class WeiboIE(InfoExtractor): # to get Referer url for genvisitor webpage, urlh = self._download_webpage_handle(url, video_id) - visitor_url = urlh.geturl() + visitor_url = urlh.url if 'passport.weibo.com' in visitor_url: # first visit diff --git a/hypervideo_dl/extractor/weverse.py b/hypervideo_dl/extractor/weverse.py new file mode 100644 index 0000000..bbf6285 --- /dev/null +++ b/hypervideo_dl/extractor/weverse.py @@ -0,0 +1,608 @@ +import base64 +import hashlib +import hmac +import itertools +import json +import re +import time +import urllib.parse +import uuid + +from .common import InfoExtractor +from .naver import NaverBaseIE +from .youtube import YoutubeIE +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + UserNotLive, + float_or_none, + int_or_none, + str_or_none, + traverse_obj, + try_call, + update_url_query, + url_or_none, +) + + +class WeverseBaseIE(InfoExtractor): + _NETRC_MACHINE = 'weverse' + _ACCOUNT_API_BASE = 'https://accountapi.weverse.io/web/api/v2' + _API_HEADERS = { + 'Referer': 'https://weverse.io/', + 'WEV-device-Id': str(uuid.uuid4()), + } + + def _perform_login(self, username, password): + if self._API_HEADERS.get('Authorization'): + return + + headers = { + 'x-acc-app-secret': '5419526f1c624b38b10787e5c10b2a7a', + 'x-acc-app-version': '2.2.6', + 'x-acc-language': 'en', + 'x-acc-service-id': 'weverse', + 'x-acc-trace-id': str(uuid.uuid4()), + 'x-clog-user-device-id': str(uuid.uuid4()), + } + check_username = self._download_json( + f'{self._ACCOUNT_API_BASE}/signup/email/status', None, + note='Checking username', query={'email': username}, headers=headers) + if not check_username.get('hasPassword'): + raise ExtractorError('Invalid username provided', expected=True) + + headers['content-type'] = 'application/json' + try: + auth = self._download_json( + f'{self._ACCOUNT_API_BASE}/auth/token/by-credentials', None, data=json.dumps({ + 'email': username, + 'password': password, + }, separators=(',', ':')).encode(), headers=headers, note='Logging in') + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + raise ExtractorError('Invalid password provided', expected=True) 
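Note: the _call_api that follows signs every Weverse request: the URL path (capped at 255 characters) concatenated with a millisecond timestamp is HMAC-SHA1'd with a static key from the player JS and sent as the wmsgpad/wmd query parameters. That signing step as a standalone sketch, with the key and parameter names taken from the diff:

    import base64
    import hashlib
    import hmac
    import time

    KEY = b'1b9cb6378d959b45714bec49971ade22e6e24e42'  # static key from the Weverse web player

    def sign_api_path(api_path):
        # wmsgpad is the request time in milliseconds; wmd authenticates path + time
        wmsgpad = int(time.time() * 1000)
        wmd = base64.b64encode(hmac.HMAC(
            KEY, f'{api_path[:255]}{wmsgpad}'.encode(), digestmod=hashlib.sha1).digest()).decode()
        return {'wmsgpad': wmsgpad, 'wmd': wmd}

    print(sign_api_path('/post/v1.0/post-1-117229444?fieldSet=postV1'))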
+ raise + + WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {auth["accessToken"]}' + + def _real_initialize(self): + if self._API_HEADERS.get('Authorization'): + return + + token = try_call(lambda: self._get_cookies('https://weverse.io/')['we2_access_token'].value) + if token: + WeverseBaseIE._API_HEADERS['Authorization'] = f'Bearer {token}' + + def _call_api(self, ep, video_id, data=None, note='Downloading API JSON'): + # Ref: https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/2488.a09b41ff.chunk.js + # From https://ssl.pstatic.net/static/wevweb/2_3_2_11101725/public/static/js/main.e206f7c1.js: + key = b'1b9cb6378d959b45714bec49971ade22e6e24e42' + api_path = update_url_query(ep, { + 'appId': 'be4d79eb8fc7bd008ee82c8ec4ff6fd4', + 'language': 'en', + 'platform': 'WEB', + 'wpf': 'pc', + }) + wmsgpad = int(time.time() * 1000) + wmd = base64.b64encode(hmac.HMAC( + key, f'{api_path[:255]}{wmsgpad}'.encode(), digestmod=hashlib.sha1).digest()).decode() + headers = {'Content-Type': 'application/json'} if data else {} + try: + return self._download_json( + f'https://global.apis.naver.com/weverse/wevweb{api_path}', video_id, note=note, + data=data, headers={**self._API_HEADERS, **headers}, query={ + 'wmsgpad': wmsgpad, + 'wmd': wmd, + }) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + self.raise_login_required( + 'Session token has expired. Log in again or refresh cookies in browser') + elif isinstance(e.cause, HTTPError) and e.cause.status == 403: + if 'Authorization' in self._API_HEADERS: + raise ExtractorError('Your account does not have access to this content', expected=True) + self.raise_login_required() + raise + + def _call_post_api(self, video_id): + path = '' if 'Authorization' in self._API_HEADERS else '/preview' + return self._call_api(f'/post/v1.0/post-{video_id}{path}?fieldSet=postV1', video_id) + + def _get_community_id(self, channel): + return str(self._call_api( + f'/community/v1.0/communityIdUrlPathByUrlPathArtistCode?keyword={channel}', + channel, note='Fetching community ID')['communityId']) + + def _get_formats(self, data, video_id): + formats = traverse_obj(data, ('videos', 'list', lambda _, v: url_or_none(v['source']), { + 'url': 'source', + 'width': ('encodingOption', 'width', {int_or_none}), + 'height': ('encodingOption', 'height', {int_or_none}), + 'vcodec': 'type', + 'vbr': ('bitrate', 'video', {int_or_none}), + 'abr': ('bitrate', 'audio', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'format_id': ('encodingOption', 'id', {str_or_none}), + })) + + for stream in traverse_obj(data, ('streams', lambda _, v: v['type'] == 'HLS' and url_or_none(v['source']))): + query = {} + for param in traverse_obj(stream, ('keys', lambda _, v: v['type'] == 'param' and v['name'])): + query[param['name']] = param.get('value', '') + fmts = self._extract_m3u8_formats( + stream['source'], video_id, 'mp4', m3u8_id='hls', fatal=False, query=query) + if query: + for fmt in fmts: + fmt['url'] = update_url_query(fmt['url'], query) + fmt['extra_param_to_segment_url'] = urllib.parse.urlencode(query) + formats.extend(fmts) + + return formats + + def _get_subs(self, caption_url): + subs_ext_re = r'\.(?:ttml|vtt)' + replace_ext = lambda x, y: re.sub(subs_ext_re, y, x) + if re.search(subs_ext_re, caption_url): + return [replace_ext(caption_url, '.ttml'), replace_ext(caption_url, '.vtt')] + return [caption_url] + + def _parse_post_meta(self, metadata): + return traverse_obj(metadata, { + 'title': ((('extension', 'mediaInfo', 
'title'), 'title'), {str}), + 'description': ((('extension', 'mediaInfo', 'body'), 'body'), {str}), + 'uploader': ('author', 'profileName', {str}), + 'uploader_id': ('author', 'memberId', {str}), + 'creator': ('community', 'communityName', {str}), + 'channel_id': (('community', 'author'), 'communityId', {str_or_none}), + 'duration': ('extension', 'video', 'playTime', {float_or_none}), + 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}), + 'release_timestamp': ('extension', 'video', 'onAirStartAt', {lambda x: int_or_none(x, 1000)}), + 'thumbnail': ('extension', (('mediaInfo', 'thumbnail', 'url'), ('video', 'thumb')), {url_or_none}), + 'view_count': ('extension', 'video', 'playCount', {int_or_none}), + 'like_count': ('extension', 'video', 'likeCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False) + + def _extract_availability(self, data): + return self._availability(**traverse_obj(data, ((('extension', 'video'), None), { + 'needs_premium': 'paid', + 'needs_subscription': 'membershipOnly', + }), get_all=False, expected_type=bool), needs_auth=True) + + def _extract_live_status(self, data): + data = traverse_obj(data, ('extension', 'video', {dict})) or {} + if data.get('type') == 'LIVE': + return traverse_obj({ + 'ONAIR': 'is_live', + 'DONE': 'post_live', + 'STANDBY': 'is_upcoming', + 'DELAY': 'is_upcoming', + }, (data.get('status'), {str})) or 'is_live' + return 'was_live' if data.get('liveToVod') else 'not_live' + + +class WeverseIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/live/(?P<id>[\d-]+)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/live/0-107323480', + 'md5': '1fa849f00181eef9100d3c8254c47979', + 'info_dict': { + 'id': '0-107323480', + 'ext': 'mp4', + 'title': '행복한 평이루💜', + 'description': '', + 'uploader': 'Billlie', + 'uploader_id': '5ae14aed7b7cdc65fa87c41fe06cc936', + 'channel': 'billlie', + 'channel_id': '72', + 'channel_url': 'https://weverse.io/billlie', + 'creator': 'Billlie', + 'timestamp': 1666262062, + 'upload_date': '20221020', + 'release_timestamp': 1666262058, + 'release_date': '20221020', + 'duration': 3102, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'was_live', + }, + }, { + 'url': 'https://weverse.io/lesserafim/live/2-102331763', + 'md5': 'e46125c08b13a6c8c1f4565035cca987', + 'info_dict': { + 'id': '2-102331763', + 'ext': 'mp4', + 'title': '🎂김채원 생신🎂', + 'description': '🎂김채원 생신🎂', + 'uploader': 'LE SSERAFIM ', + 'uploader_id': 'd26ddc1e258488a0a2b795218d14d59d', + 'channel': 'lesserafim', + 'channel_id': '47', + 'channel_url': 'https://weverse.io/lesserafim', + 'creator': 'LE SSERAFIM', + 'timestamp': 1659353400, + 'upload_date': '20220801', + 'release_timestamp': 1659353400, + 'release_date': '20220801', + 'duration': 3006, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'was_live', + 'subtitles': { + 'id_ID': 'count:2', + 'en_US': 'count:2', + 'es_ES': 'count:2', + 'vi_VN': 'count:2', + 'th_TH': 'count:2', + 'zh_CN': 'count:2', + 'zh_TW': 'count:2', + 'ja_JP': 'count:2', + 'ko_KR': 'count:2', + }, + }, + }, { + 'url': 'https://weverse.io/treasure/live/2-117230416', + 'info_dict': { + 'id': '2-117230416', + 'ext': 'mp4', + 'title': r're:스껄도려님 첫 스무살 생파🦋', + 'description': '', + 'uploader': 'TREASURE', + 'uploader_id': 
'77eabbc449ca37f7970054a136f60082', + 'channel': 'treasure', + 'channel_id': '20', + 'channel_url': 'https://weverse.io/treasure', + 'creator': 'TREASURE', + 'timestamp': 1680667651, + 'upload_date': '20230405', + 'release_timestamp': 1680667639, + 'release_date': '20230405', + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'is_live', + }, + 'skip': 'Livestream has ended', + }] + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('artist', 'id') + post = self._call_post_api(video_id) + api_video_id = post['extension']['video']['videoId'] + availability = self._extract_availability(post) + live_status = self._extract_live_status(post) + video_info, formats = {}, [] + + if live_status == 'is_upcoming': + self.raise_no_formats('Livestream has not yet started', expected=True) + + elif live_status == 'is_live': + video_info = self._call_api( + f'/video/v1.0/lives/{api_video_id}/playInfo?preview.format=json&preview.version=v2', + video_id, note='Downloading live JSON') + playback = self._parse_json(video_info['lipPlayback'], video_id) + m3u8_url = traverse_obj(playback, ( + 'media', lambda _, v: v['protocol'] == 'HLS', 'path', {url_or_none}), get_all=False) + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls', live=True) + + elif live_status == 'post_live': + if availability in ('premium_only', 'subscriber_only'): + self.report_drm(video_id) + self.raise_no_formats( + 'Livestream has ended and downloadable VOD is not available', expected=True) + + else: + infra_video_id = post['extension']['video']['infraVideoId'] + in_key = self._call_api( + f'/video/v1.0/vod/{api_video_id}/inKey?preview=false', video_id, + data=b'{}', note='Downloading VOD API key')['inKey'] + + video_info = self._download_json( + f'https://global.apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{infra_video_id}', + video_id, note='Downloading VOD JSON', query={ + 'key': in_key, + 'sid': traverse_obj(post, ('extension', 'video', 'serviceId')) or '2070', + 'pid': str(uuid.uuid4()), + 'nonce': int(time.time() * 1000), + 'devt': 'html5_pc', + 'prv': 'Y' if post.get('membershipOnly') else 'N', + 'aup': 'N', + 'stpb': 'N', + 'cpl': 'en', + 'env': 'prod', + 'lc': 'en', + 'adi': '[{"adSystem":"null"}]', + 'adu': '/', + }) + + formats = self._get_formats(video_info, video_id) + has_drm = traverse_obj(video_info, ('meta', 'provider', 'name', {str.lower})) == 'drm' + if has_drm and formats: + self.report_warning( + 'Requested content is DRM-protected, only a 30-second preview is available', video_id) + elif has_drm and not formats: + self.report_drm(video_id) + + return { + 'id': video_id, + 'channel': channel, + 'channel_url': f'https://weverse.io/{channel}', + 'formats': formats, + 'availability': availability, + 'live_status': live_status, + **self._parse_post_meta(post), + **NaverBaseIE.process_subtitles(video_info, self._get_subs), + } + + +class WeverseMediaIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/media/(?P<id>[\d-]+)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/media/4-116372884', + 'md5': '8efc9cfd61b2f25209eb1a5326314d28', + 'info_dict': { + 'id': 'e-C9wLSQs6o', + 'ext': 'mp4', + 'title': 'Billlie | \'EUNOIA\' Performance Video (heartbeat ver.)', + 'description': 'md5:6181caaf2a2397bca913ffe368c104e5', + 'channel': 'Billlie', + 'channel_id': 'UCyc9sUCxELTDK9vELO5Fzeg', + 'channel_url': 
'https://www.youtube.com/channel/UCyc9sUCxELTDK9vELO5Fzeg', + 'uploader': 'Billlie', + 'uploader_id': '@Billlie', + 'uploader_url': 'http://www.youtube.com/@Billlie', + 'upload_date': '20230403', + 'duration': 211, + 'age_limit': 0, + 'playable_in_embed': True, + 'live_status': 'not_live', + 'availability': 'public', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/e-C9wLSQs6o/maxresdefault.jpg', + 'categories': ['Entertainment'], + 'tags': 'count:7', + }, + }, { + 'url': 'https://weverse.io/billlie/media/3-102914520', + 'md5': '031551fcbd716bc4f080cb6174a43d8a', + 'info_dict': { + 'id': '3-102914520', + 'ext': 'mp4', + 'title': 'From. SUHYEON🌸', + 'description': 'Billlie 멤버별 독점 영상 공개💙💜', + 'uploader': 'Billlie_official', + 'uploader_id': 'f569c6e92f7eaffef0a395037dcaa54f', + 'channel': 'billlie', + 'channel_id': '72', + 'channel_url': 'https://weverse.io/billlie', + 'creator': 'Billlie', + 'timestamp': 1662174000, + 'upload_date': '20220903', + 'release_timestamp': 1662174000, + 'release_date': '20220903', + 'duration': 17.0, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'not_live', + }, + }] + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('artist', 'id') + post = self._call_post_api(video_id) + media_type = traverse_obj(post, ('extension', 'mediaInfo', 'mediaType', {str.lower})) + youtube_id = traverse_obj(post, ('extension', 'youtube', 'youtubeVideoId', {str})) + + if media_type == 'vod': + return self.url_result(f'https://weverse.io/{channel}/live/{video_id}', WeverseIE) + elif media_type == 'youtube' and youtube_id: + return self.url_result(youtube_id, YoutubeIE) + elif media_type == 'image': + self.raise_no_formats('No video content found in webpage', expected=True) + elif media_type: + raise ExtractorError(f'Unsupported media type "{media_type}"') + + self.raise_no_formats('No video content found in webpage') + + +class WeverseMomentIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<artist>[^/?#]+)/moment/(?P<uid>[\da-f]+)/post/(?P<id>[\d-]+)' + _TESTS = [{ + 'url': 'https://weverse.io/secretnumber/moment/66a07e164b56a696ee71c99315ffe27b/post/1-117229444', + 'md5': '87733ac19a54081b7dfc2442036d282b', + 'info_dict': { + 'id': '1-117229444', + 'ext': 'mp4', + 'title': '今日もめっちゃいい天気☀️🌤️', + 'uploader': '레아', + 'uploader_id': '66a07e164b56a696ee71c99315ffe27b', + 'channel': 'secretnumber', + 'channel_id': '56', + 'creator': 'SECRET NUMBER', + 'duration': 10, + 'upload_date': '20230405', + 'timestamp': 1680653968, + 'thumbnail': r're:^https?://.*\.jpe?g$', + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + }, + 'skip': 'Moment has expired', + }] + + def _real_extract(self, url): + channel, uploader_id, video_id = self._match_valid_url(url).group('artist', 'uid', 'id') + post = self._call_post_api(video_id) + api_video_id = post['extension']['moment']['video']['videoId'] + video_info = self._call_api( + f'/cvideo/v1.0/cvideo-{api_video_id}/playInfo?videoId={api_video_id}', video_id, + note='Downloading moment JSON')['playInfo'] + + return { + 'id': video_id, + 'channel': channel, + 'uploader_id': uploader_id, + 'formats': self._get_formats(video_info, video_id), + 'availability': self._extract_availability(post), + **traverse_obj(post, { + 'title': ((('extension', 'moment', 'body'), 'body'), {str}), + 
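Note: mappings like the ones used throughout these extractors do most of the metadata work in this diff: traverse_obj accepts a dict template whose values are traversal paths, where a set containing a type acts as a type filter and a set containing a callable applies it as a transform. A small usage sketch against made-up API data, with the helpers imported from this project's utils:

    from hypervideo_dl.utils import int_or_none, traverse_obj

    post = {'author': {'profileName': '레아'}, 'publishedAt': 1680653968000, 'emotionCount': '12'}
    meta = traverse_obj(post, {
        'uploader': ('author', 'profileName', {str}),                    # type filter
        'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}),  # milliseconds -> seconds
        'like_count': ('emotionCount', {int_or_none}),                   # coerce numeric strings
    })
    assert meta == {'uploader': '레아', 'timestamp': 1680653968, 'like_count': 12}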
'uploader': ('author', 'profileName', {str}), + 'creator': (('community', 'author'), 'communityName', {str}), + 'channel_id': (('community', 'author'), 'communityId', {str_or_none}), + 'duration': ('extension', 'moment', 'video', 'uploadInfo', 'playTime', {float_or_none}), + 'timestamp': ('publishedAt', {lambda x: int_or_none(x, 1000)}), + 'thumbnail': ('extension', 'moment', 'video', 'uploadInfo', 'imageUrl', {url_or_none}), + 'like_count': ('emotionCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False), + **NaverBaseIE.process_subtitles(video_info, self._get_subs), + } + + +class WeverseTabBaseIE(WeverseBaseIE): + _ENDPOINT = None + _PATH = None + _QUERY = {} + _RESULT_IE = None + + def _entries(self, channel_id, channel, first_page): + query = self._QUERY.copy() + + for page in itertools.count(1): + posts = first_page if page == 1 else self._call_api( + update_url_query(self._ENDPOINT % channel_id, query), channel, + note=f'Downloading {self._PATH} tab page {page}') + + for post in traverse_obj(posts, ('data', lambda _, v: v['postId'])): + yield self.url_result( + f'https://weverse.io/{channel}/{self._PATH}/{post["postId"]}', + self._RESULT_IE, post['postId'], **self._parse_post_meta(post), + channel=channel, channel_url=f'https://weverse.io/{channel}', + availability=self._extract_availability(post), + live_status=self._extract_live_status(post)) + + query['after'] = traverse_obj(posts, ('paging', 'nextParams', 'after', {str})) + if not query['after']: + break + + def _real_extract(self, url): + channel = self._match_id(url) + channel_id = self._get_community_id(channel) + + first_page = self._call_api( + update_url_query(self._ENDPOINT % channel_id, self._QUERY), channel, + note=f'Downloading {self._PATH} tab page 1') + + return self.playlist_result( + self._entries(channel_id, channel, first_page), f'{channel}-{self._PATH}', + **traverse_obj(first_page, ('data', ..., { + 'playlist_title': ('community', 'communityName', {str}), + 'thumbnail': ('author', 'profileImageUrl', {url_or_none}), + }), get_all=False)) + + +class WeverseLiveTabIE(WeverseTabBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/live/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/live/', + 'playlist_mincount': 55, + 'info_dict': { + 'id': 'billlie-live', + 'title': 'Billlie', + 'thumbnail': r're:^https?://.*\.jpe?g$', + }, + }] + + _ENDPOINT = '/post/v1.0/community-%s/liveTabPosts' + _PATH = 'live' + _QUERY = {'fieldSet': 'postsV1'} + _RESULT_IE = WeverseIE + + +class WeverseMediaTabIE(WeverseTabBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/media(?:/|/all|/new)?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://weverse.io/billlie/media/', + 'playlist_mincount': 231, + 'info_dict': { + 'id': 'billlie-media', + 'title': 'Billlie', + 'thumbnail': r're:^https?://.*\.jpe?g$', + }, + }, { + 'url': 'https://weverse.io/lesserafim/media/all', + 'only_matching': True, + }, { + 'url': 'https://weverse.io/lesserafim/media/new', + 'only_matching': True, + }] + + _ENDPOINT = '/media/v1.0/community-%s/more' + _PATH = 'media' + _QUERY = {'fieldSet': 'postsV1', 'filterType': 'RECENT'} + _RESULT_IE = WeverseMediaIE + + +class WeverseLiveIE(WeverseBaseIE): + _VALID_URL = r'https?://(?:www\.|m\.)?weverse.io/(?P<id>[^/?#]+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://weverse.io/purplekiss', + 'info_dict': { + 'id': '3-116560493', + 'ext': 'mp4', + 'title': r're:모하냥🫶🏻', + 'description': '내일은 금요일~><', + 'uploader': '채인', + 
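Note: WeverseTabBaseIE above pages through tab listings with a cursor: each response's paging.nextParams.after value is fed back as the after query parameter until the API stops returning one. Reduced to a self-contained generator, with fetch_page as a hypothetical stand-in for _call_api:

    import itertools

    def entries(fetch_page):
        # fetch_page(query) -> {'data': [...], 'paging': {'nextParams': {'after': ...}}}
        query = {}
        for page in itertools.count(1):  # page only numbers requests, as in the real extractor's notes
            posts = fetch_page(query)
            yield from posts.get('data') or []
            query['after'] = ((posts.get('paging') or {}).get('nextParams') or {}).get('after')
            if not query['after']:
                break  # no cursor in the response means this was the last page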
'uploader_id': '1ffb1d9d904d6b3db2783f876eb9229d', + 'channel': 'purplekiss', + 'channel_id': '35', + 'channel_url': 'https://weverse.io/purplekiss', + 'creator': 'PURPLE KISS', + 'timestamp': 1680780892, + 'upload_date': '20230406', + 'release_timestamp': 1680780883, + 'release_date': '20230406', + 'thumbnail': 'https://weverse-live.pstatic.net/v1.0/live/62044/thumb', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'availability': 'needs_auth', + 'live_status': 'is_live', + }, + 'skip': 'Livestream has ended', + }, { + 'url': 'https://weverse.io/billlie/', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel = self._match_id(url) + channel_id = self._get_community_id(channel) + + video_id = traverse_obj( + self._call_api(update_url_query(f'/post/v1.0/community-{channel_id}/liveTab', { + 'debugMessage': 'true', + 'fields': 'onAirLivePosts.fieldSet(postsV1).limit(10),reservedLivePosts.fieldSet(postsV1).limit(10)', + }), channel, note='Downloading live JSON'), ( + ('onAirLivePosts', 'reservedLivePosts'), 'data', + lambda _, v: self._extract_live_status(v) in ('is_live', 'is_upcoming'), 'postId', {str}), + get_all=False) + + if not video_id: + raise UserNotLive(video_id=channel) + + return self.url_result(f'https://weverse.io/{channel}/live/{video_id}', WeverseIE) diff --git a/hypervideo_dl/extractor/wevidi.py b/hypervideo_dl/extractor/wevidi.py new file mode 100644 index 0000000..3b6d032 --- /dev/null +++ b/hypervideo_dl/extractor/wevidi.py @@ -0,0 +1,108 @@ +from .common import InfoExtractor +from ..utils import clean_html, float_or_none, get_element_by_class, js_to_json, traverse_obj + + +class WeVidiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?wevidi\.net/watch/(?P<id>[\w-]{11})' + _TESTS = [{ + 'url': 'https://wevidi.net/watch/2th7UO5F4KV', + 'md5': 'b913d1ff5bbad499e2c7ef4aa6d829d7', + 'info_dict': { + 'id': '2th7UO5F4KV', + 'ext': 'mp4', + 'title': 'YouTube Alternative: WeVidi - customizable channels & more', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:73a27d0a87d49fbcc5584566326ebeed', + 'uploader': 'eclecRC', + 'duration': 932.098, + } + }, { + 'url': 'https://wevidi.net/watch/ievRuuQHbPS', + 'md5': 'ce8a94989a959bff9003fa27ee572935', + 'info_dict': { + 'id': 'ievRuuQHbPS', + 'ext': 'mp4', + 'title': 'WeVidi Playlists', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:32cdfca272687390d9bd9b0c9c6153ee', + 'uploader': 'WeVidi', + 'duration': 36.1999, + } + }, { + 'url': 'https://wevidi.net/watch/PcMzDWaQSWb', + 'md5': '55ee0d3434be5d9e5cc76b83f2bb57ec', + 'info_dict': { + 'id': 'PcMzDWaQSWb', + 'ext': 'mp4', + 'title': 'Cat blep', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:e2c9e2b54b8bb424cc64937c8fdc068f', + 'uploader': 'WeVidi', + 'duration': 41.972, + } + }, { + 'url': 'https://wevidi.net/watch/wJnRqDHNe_u', + 'md5': 'c8f263dd47e66cc17546b3abf47b5a77', + 'info_dict': { + 'id': 'wJnRqDHNe_u', + 'ext': 'mp4', + 'title': 'Gissy Talks: YouTube Alternatives', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:e65036f0d4af80e0af191bd11af5195e', + 'uploader': 'GissyEva', + 'duration': 630.451, + } + }, { + 'url': 'https://wevidi.net/watch/4m1c4yJR_yc', + 'md5': 'c63ce5ca6990dce86855fc02ca5bc1ed', + 'info_dict': { + 'id': '4m1c4yJR_yc', + 'ext': 'mp4', + 'title': 'Enough of that! 
- Awesome Exilez Podcast', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:96af99dd63468b2dfab3020560e3e9b2', + 'uploader': 'eclecRC', + 'duration': 6.804, + } + }] + + def _extract_formats(self, wvplayer_props): + # Taken from WeVidi player JS: https://wevidi.net/layouts/default/static/player.min.js + resolution_map = { + 1: 144, + 2: 240, + 3: 360, + 4: 480, + 5: 720, + 6: 1080 + } + + src_path = f'{wvplayer_props["srcVID"]}/{wvplayer_props["srcUID"]}/{wvplayer_props["srcNAME"]}' + for res in traverse_obj(wvplayer_props, ('resolutions', ..., {int}, {lambda x: x or None})): + format_id = str(-(res // -2) - 1) + yield { + 'acodec': 'mp4a.40.2', + 'ext': 'mp4', + 'format_id': format_id, + 'height': resolution_map.get(res), + 'url': f'https://www.wevidi.net/videoplayback/{src_path}/{format_id}', + 'vcodec': 'avc1.42E01E', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + wvplayer_props = self._search_json( + r'WVPlayer\(', webpage, 'player', video_id, + transform_source=lambda x: js_to_json(x.replace('||', '}'))) + + return { + 'id': video_id, + 'title': clean_html(get_element_by_class('video_title', webpage)), + 'description': clean_html(get_element_by_class('descr_long', webpage)), + 'uploader': clean_html(get_element_by_class('username', webpage)), + 'formats': list(self._extract_formats(wvplayer_props)), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': float_or_none(wvplayer_props.get('duration')), + } diff --git a/hypervideo_dl/extractor/weyyak.py b/hypervideo_dl/extractor/weyyak.py new file mode 100644 index 0000000..ef12be8 --- /dev/null +++ b/hypervideo_dl/extractor/weyyak.py @@ -0,0 +1,86 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + parse_age_limit, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class WeyyakIE(InfoExtractor): + _VALID_URL = r'https?://weyyak\.com/(?P<lang>\w+)/(?:player/)?(?P<type>episode|movie)/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://weyyak.com/en/player/episode/1341952/Ribat-Al-Hob-Episode49', + 'md5': '0caf55c1a615531c8fe60f146ae46849', + 'info_dict': { + 'id': '1341952', + 'ext': 'mp4', + 'title': 'Ribat Al Hob', + 'duration': 2771, + 'alt_title': 'رباط الحب', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 49', + 'episode_number': 49, + 'timestamp': 1485907200, + 'upload_date': '20170201', + 'thumbnail': r're:^https://content\.weyyak\.com/.+/poster-image', + 'categories': ['Drama', 'Thrillers', 'Romance'], + 'tags': 'count:8', + }, + }, + { + 'url': 'https://weyyak.com/en/movie/233255/8-Seconds', + 'md5': 'fe740ae0f63e4d1c8a7fc147a410c564', + 'info_dict': { + 'id': '233255', + 'ext': 'mp4', + 'title': '8 Seconds', + 'duration': 6490, + 'alt_title': '8 ثواني', + 'description': 'md5:45b83a155c30b49950624c7e99600b9d', + 'age_limit': 15, + 'release_year': 2015, + 'timestamp': 1683106031, + 'upload_date': '20230503', + 'thumbnail': r're:^https://content\.weyyak\.com/.+/poster-image', + 'categories': ['Drama', 'Social'], + 'cast': ['Ceylin Adiyaman', 'Esra Inal'], + }, + }, + ] + + def _real_extract(self, url): + video_id, lang, type_ = self._match_valid_url(url).group('id', 'lang', 'type') + + path = 'episode/' if type_ == 'episode' else 'contents/moviedetails?contentkey=' + data = self._download_json( + f'https://msapifo-prod-me.weyyak.z5.com/v1/{lang}/{path}{video_id}', video_id)['data'] + m3u8_url = self._download_json( + 
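Note on the WeVidi format loop above: -(res // -2) is ceiling division by two, so the expression maps the player's six resolution indices onto three CDN path ids, (1, 2) -> '0', (3, 4) -> '1', (5, 6) -> '2'. A quick check of that equivalence:

    import math

    def wevidi_format_id(res):
        return str(-(res // -2) - 1)  # same as str(math.ceil(res / 2) - 1)

    for res in (1, 2, 3, 4, 5, 6):
        assert wevidi_format_id(res) == str(math.ceil(res / 2) - 1)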
f'https://api-weyyak.akamaized.net/get_info/{data["video_id"]}', + video_id, 'Extracting video details')['url_video'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'alt_title': ('translated_title', {str}), + 'description': ('synopsis', {str}), + 'duration': ('length', {float_or_none}), + 'age_limit': ('age_rating', {parse_age_limit}), + 'season_number': ('season_number', {int_or_none}), + 'episode_number': ('episode_number', {int_or_none}), + 'thumbnail': ('imagery', 'thumbnail', {url_or_none}), + 'categories': ('genres', ..., {str}), + 'tags': ('tags', ..., {str}), + 'cast': (('main_actor', 'main_actress'), {str}), + 'timestamp': ('insertedAt', {unified_timestamp}), + 'release_year': ('production_year', {int_or_none}), + }), + } diff --git a/hypervideo_dl/extractor/whyp.py b/hypervideo_dl/extractor/whyp.py new file mode 100644 index 0000000..fef89c3 --- /dev/null +++ b/hypervideo_dl/extractor/whyp.py @@ -0,0 +1,50 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + str_or_none, + traverse_obj, + url_or_none, +) + + +class WhypIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?whyp\.it/tracks/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.whyp.it/tracks/18337/home-page-example-track-b4kq7', + 'md5': 'c1187b42ebf8605284e3dc92aeb33d16', + 'info_dict': { + 'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3', + 'id': '18337', + 'title': 'Home Page Example Track', + 'description': 'md5:bd758000fb93f3159339c852b5b9133c', + 'ext': 'mp3', + 'duration': 52.82, + 'uploader': 'Brad', + 'uploader_id': '1', + 'thumbnail': 'https://cdn.whyp.it/a537bb36-3373-4c61-96c8-27fc1b2f427a.jpg', + }, + }, { + 'url': 'https://www.whyp.it/tracks/18337', + 'only_matching': True, + }] + + def _real_extract(self, url): + unique_id = self._match_id(url) + webpage = self._download_webpage(url, unique_id) + data = self._search_nuxt_data(webpage, unique_id)['rawTrack'] + + return { + 'url': data['audio_url'], + 'id': unique_id, + **traverse_obj(data, { + 'title': 'title', + 'description': 'description', + 'duration': ('duration', {float_or_none}), + 'uploader': ('user', 'username'), + 'uploader_id': ('user', 'id', {str_or_none}), + 'thumbnail': ('artwork_url', {url_or_none}), + }), + 'ext': 'mp3', + 'vcodec': 'none', + 'http_headers': {'Referer': 'https://whyp.it/'}, + } diff --git a/hypervideo_dl/extractor/wimbledon.py b/hypervideo_dl/extractor/wimbledon.py new file mode 100644 index 0000000..0223e54 --- /dev/null +++ b/hypervideo_dl/extractor/wimbledon.py @@ -0,0 +1,61 @@ +from .common import InfoExtractor +from ..utils import ( + parse_duration, + traverse_obj, +) + + +class WimbledonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?wimbledon\.com/\w+/video/media/(?P<id>\d+)\.html' + _TESTS = [{ + 'url': 'https://www.wimbledon.com/en_GB/video/media/6330247525112.html', + 'info_dict': { + 'id': '6330247525112', + 'ext': 'mp4', + 'timestamp': 1687972186, + 'description': '', + 'thumbnail': r're:^https://[\w.-]+\.prod\.boltdns\.net/[^?#]+/image\.jpg', + 'upload_date': '20230628', + 'title': 'Coco Gauff | My Wimbledon Inspiration', + 'tags': ['features', 'trending', 'homepage'], + 'uploader_id': '3506358525001', + 'duration': 163072.0, + }, + }, { + 'url': 'https://www.wimbledon.com/en_GB/video/media/6308703111112.html', + 'info_dict': { + 'id': '6308703111112', + 'ext': 'mp4', + 'thumbnail': 
r're:^https://[\w.-]+\.prod\.boltdns\.net/[^?#]+/image\.jpg', + 'description': 'null', + 'upload_date': '20220629', + 'uploader_id': '3506358525001', + 'title': 'Roblox | WimbleWorld ', + 'duration': 101440.0, + 'tags': ['features', 'kids'], + 'timestamp': 1656500867, + }, + }, { + 'url': 'https://www.wimbledon.com/en_US/video/media/6309327106112.html', + 'only_matching': True, + }, { + 'url': 'https://www.wimbledon.com/es_Es/video/media/6308377909112.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + metadata = self._download_json( + f'https://www.wimbledon.com/relatedcontent/rest/v2/wim_v1/en/content/wim_v1_{video_id}_en', video_id) + + return { + '_type': 'url_transparent', + 'url': f'http://players.brightcove.net/3506358525001/default_default/index.html?videoId={video_id}', + 'ie_key': 'BrightcoveNew', + 'id': video_id, + **traverse_obj(metadata, { + 'title': 'title', + 'description': 'description', + 'duration': ('metadata', 'duration', {parse_duration}), + }), + } diff --git a/hypervideo_dl/extractor/wistia.py b/hypervideo_dl/extractor/wistia.py index 38dcc2f..bce5e83 100644 --- a/hypervideo_dl/extractor/wistia.py +++ b/hypervideo_dl/extractor/wistia.py @@ -1,17 +1,20 @@ import re -import urllib.error import urllib.parse from base64 import b64decode from .common import InfoExtractor +from ..networking import HEADRequest +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, + determine_ext, float_or_none, int_or_none, parse_qs, traverse_obj, try_get, update_url_query, + urlhandle_detect_ext, ) @@ -34,6 +37,16 @@ class WistiaBaseIE(InfoExtractor): return embed_config + def _get_real_ext(self, url): + ext = determine_ext(url, default_ext='bin') + if ext == 'bin': + urlh = self._request_webpage( + HEADRequest(url), None, note='Checking media extension', + errnote='HEAD request returned error', fatal=False) + if urlh: + ext = urlhandle_detect_ext(urlh, default='bin') + return 'mp4' if ext == 'mov' else ext + def _extract_media(self, embed_config): data = embed_config['media'] video_id = data['hashedId'] @@ -51,13 +64,13 @@ class WistiaBaseIE(InfoExtractor): continue elif atype in ('still', 'still_image'): thumbnails.append({ - 'url': aurl, + 'url': aurl.replace('.bin', f'.{self._get_real_ext(aurl)}'), 'width': int_or_none(a.get('width')), 'height': int_or_none(a.get('height')), 'filesize': int_or_none(a.get('size')), }) else: - aext = a.get('ext') + aext = a.get('ext') or self._get_real_ext(aurl) display_name = a.get('display_name') format_id = atype if atype and atype.endswith('_video') and display_name: @@ -169,26 +182,26 @@ class WistiaIE(WistiaBaseIE): 'md5': '10c1ce9c4dde638202513ed17a3767bd', 'info_dict': { 'id': 'a6ndpko1wg', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'Episode 2: Boxed Water\'s retention is thirsty', 'upload_date': '20210324', 'description': 'md5:da5994c2c2d254833b412469d9666b7a', 'duration': 966.0, 'timestamp': 1616614369, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.png', } }, { 'url': 'wistia:5vd7p4bct5', 'md5': 'b9676d24bf30945d97060638fbfe77f0', 'info_dict': { 'id': '5vd7p4bct5', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'md5:eaa9f64c4efd7b5f098b9b6118597679', 'description': 'md5:a9bea0315f0616aa5df2dc413ddcdd0f', 'upload_date': '20220915', 'timestamp': 1663258727, 'duration': 623.019, - 'thumbnail': 
r're:https?://embed(?:-ssl)?.wistia.com/.+\.(?:jpg|bin)$', + 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.jpg$', }, }, { 'url': 'wistia:sh7fpupwlt', @@ -208,25 +221,25 @@ class WistiaIE(WistiaBaseIE): 'url': 'https://www.weidert.com/blog/wistia-channels-video-marketing-tool', 'info_dict': { 'id': 'cqwukac3z1', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'How Wistia Channels Can Help Capture Inbound Value From Your Video Content', 'duration': 158.125, 'timestamp': 1618974400, 'description': 'md5:27abc99a758573560be72600ef95cece', 'upload_date': '20210421', - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.jpg', } }, { 'url': 'https://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', 'md5': 'b9676d24bf30945d97060638fbfe77f0', 'info_dict': { 'id': '5vd7p4bct5', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', 'upload_date': '20220915', 'timestamp': 1663258727, 'duration': 623.019, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.jpg', 'description': 'a Paywall Videos video', }, }] @@ -302,9 +315,9 @@ class WistiaChannelIE(WistiaBaseIE): 'url': 'https://fast.wistia.net/embed/channel/3802iirk0l?wchannelid=3802iirk0l&wmediaid=sp5dqjzw3n', 'info_dict': { 'id': 'sp5dqjzw3n', - 'ext': 'bin', + 'ext': 'mp4', 'title': 'The Roof S2: The Modern CRO', - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.png', 'duration': 86.487, 'description': 'A sales leader on The Roof? 
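Note: the test expectation changes here (.bin becoming mp4, jpg or png) all follow from the new _get_real_ext helper above: when an asset URL carries the opaque .bin extension, a HEAD request is issued and the real type is detected from the response, with mov normalized to mp4. Outside the extractor framework the same probe can be approximated with the standard library; this is a rough stand-in for urlhandle_detect_ext, which additionally inspects Content-Disposition:

    import mimetypes
    import urllib.request

    def probe_ext(url, default='bin'):
        # HEAD the URL and guess an extension from the Content-Type header
        req = urllib.request.Request(url, method='HEAD')
        with urllib.request.urlopen(req) as resp:
            guess = mimetypes.guess_extension(resp.headers.get_content_type())
        ext = (guess or f'.{default}').lstrip('.')
        return 'mp4' if ext == 'mov' else ext  # mirror the diff's mov -> mp4 normalization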
Man, they really must be letting anyone up here this season.\n', 'timestamp': 1619790290, @@ -334,12 +347,12 @@ class WistiaChannelIE(WistiaBaseIE): 'info_dict': { 'id': 'pz0m0l0if3', 'title': 'A Framework for Improving Product Team Performance', - 'ext': 'bin', + 'ext': 'mp4', 'timestamp': 1653935275, 'upload_date': '20220530', 'description': 'Learn how to help your company improve and achieve your product related goals.', 'duration': 1854.39, - 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.bin', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.png', }, 'params': {'noplaylist': True, 'skip_download': True}, }] @@ -352,7 +365,7 @@ class WistiaChannelIE(WistiaBaseIE): try: data = self._download_embed_config('channel', channel_id, url) - except (ExtractorError, urllib.error.HTTPError): + except (ExtractorError, HTTPError): # Some channels give a 403 from the JSON API self.report_warning('Failed to download channel data from API, falling back to webpage.') webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id) diff --git a/hypervideo_dl/extractor/wrestleuniverse.py b/hypervideo_dl/extractor/wrestleuniverse.py new file mode 100644 index 0000000..dd12804 --- /dev/null +++ b/hypervideo_dl/extractor/wrestleuniverse.py @@ -0,0 +1,307 @@ +import base64 +import binascii +import json +import time +import uuid + +from .common import InfoExtractor +from ..dependencies import Cryptodome +from ..utils import ( + ExtractorError, + int_or_none, + jwt_decode_hs256, + traverse_obj, + try_call, + url_or_none, + urlencode_postdata, + variadic, +) + + +class WrestleUniverseBaseIE(InfoExtractor): + _NETRC_MACHINE = 'wrestleuniverse' + _VALID_URL_TMPL = r'https?://(?:www\.)?wrestle-universe\.com/(?:(?P<lang>\w{2})/)?%s/(?P<id>\w+)' + _API_HOST = 'api.wrestle-universe.com' + _API_PATH = None + _REAL_TOKEN = None + _TOKEN_EXPIRY = None + _REFRESH_TOKEN = None + _DEVICE_ID = None + _LOGIN_QUERY = {'key': 'AIzaSyCaRPBsDQYVDUWWBXjsTrHESi2r_F3RAdA'} + _LOGIN_HEADERS = { + 'Accept': '*/*', + 'Content-Type': 'application/json', + 'X-Client-Version': 'Chrome/JsCore/9.9.4/FirebaseCore-web', + 'X-Firebase-gmpid': '1:307308870738:web:820f38fe5150c8976e338b', + 'Referer': 'https://www.wrestle-universe.com/', + 'Origin': 'https://www.wrestle-universe.com', + } + + @property + def _TOKEN(self): + if not self._REAL_TOKEN or not self._TOKEN_EXPIRY: + token = try_call(lambda: self._get_cookies('https://www.wrestle-universe.com/')['token'].value) + if not token and not self._REFRESH_TOKEN: + self.raise_login_required() + self._TOKEN = token + + if not self._REAL_TOKEN or self._TOKEN_EXPIRY <= int(time.time()): + if not self._REFRESH_TOKEN: + raise ExtractorError( + 'Expired token. 
Refresh your cookies in browser and try again', expected=True) + self._refresh_token() + + return self._REAL_TOKEN + + @_TOKEN.setter + def _TOKEN(self, value): + self._REAL_TOKEN = value + + expiry = traverse_obj(value, ({jwt_decode_hs256}, 'exp', {int_or_none})) + if not expiry: + raise ExtractorError('There was a problem with the auth token') + self._TOKEN_EXPIRY = expiry + + def _perform_login(self, username, password): + login = self._download_json( + 'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword', None, + 'Logging in', query=self._LOGIN_QUERY, headers=self._LOGIN_HEADERS, data=json.dumps({ + 'returnSecureToken': True, + 'email': username, + 'password': password, + }, separators=(',', ':')).encode(), expected_status=400) + token = traverse_obj(login, ('idToken', {str})) + if not token: + raise ExtractorError( + f'Unable to log in: {traverse_obj(login, ("error", "message"))}', expected=True) + self._REFRESH_TOKEN = traverse_obj(login, ('refreshToken', {str})) + if not self._REFRESH_TOKEN: + self.report_warning('No refresh token was granted') + self._TOKEN = token + + def _real_initialize(self): + if self._DEVICE_ID: + return + + self._DEVICE_ID = self._configuration_arg('device_id', [None], ie_key=self._NETRC_MACHINE)[0] + if not self._DEVICE_ID: + self._DEVICE_ID = self.cache.load(self._NETRC_MACHINE, 'device_id') + if self._DEVICE_ID: + return + self._DEVICE_ID = str(uuid.uuid4()) + + self.cache.store(self._NETRC_MACHINE, 'device_id', self._DEVICE_ID) + + def _refresh_token(self): + refresh = self._download_json( + 'https://securetoken.googleapis.com/v1/token', None, 'Refreshing token', + query=self._LOGIN_QUERY, data=urlencode_postdata({ + 'grant_type': 'refresh_token', + 'refresh_token': self._REFRESH_TOKEN, + }), headers={ + **self._LOGIN_HEADERS, + 'Content-Type': 'application/x-www-form-urlencoded', + }) + if traverse_obj(refresh, ('refresh_token', {str})): + self._REFRESH_TOKEN = refresh['refresh_token'] + token = traverse_obj(refresh, 'access_token', 'id_token', expected_type=str) + if not token: + raise ExtractorError('No auth token returned from refresh request') + self._TOKEN = token + + def _call_api(self, video_id, param='', msg='API', auth=True, data=None, query={}, fatal=True): + headers = {'CA-CID': ''} + if data: + headers['Content-Type'] = 'application/json;charset=utf-8' + data = json.dumps(data, separators=(',', ':')).encode() + if auth and self._TOKEN: + headers['Authorization'] = f'Bearer {self._TOKEN}' + return self._download_json( + f'https://{self._API_HOST}/v1/{self._API_PATH}/{video_id}{param}', video_id, + note=f'Downloading {msg} JSON', errnote=f'Failed to download {msg} JSON', + data=data, headers=headers, query=query, fatal=fatal) + + def _call_encrypted_api(self, video_id, param='', msg='API', data={}, query={}, fatal=True): + if not Cryptodome.RSA: + raise ExtractorError('pycryptodomex not found. 
Please install', expected=True) + private_key = Cryptodome.RSA.generate(2048) + cipher = Cryptodome.PKCS1_OAEP.new(private_key, hashAlgo=Cryptodome.SHA1) + + def decrypt(data): + if not data: + return None + try: + return cipher.decrypt(base64.b64decode(data)).decode() + except (ValueError, binascii.Error) as e: + raise ExtractorError(f'Could not decrypt data: {e}') + + token = base64.b64encode(private_key.public_key().export_key('DER')).decode() + api_json = self._call_api(video_id, param, msg, data={ + 'deviceId': self._DEVICE_ID, + 'token': token, + **data, + }, query=query, fatal=fatal) + return api_json, decrypt + + def _download_metadata(self, url, video_id, lang, props_keys): + metadata = self._call_api(video_id, msg='metadata', query={'al': lang or 'ja'}, auth=False, fatal=False) + if not metadata: + webpage = self._download_webpage(url, video_id) + nextjs_data = self._search_nextjs_data(webpage, video_id) + metadata = traverse_obj(nextjs_data, ( + 'props', 'pageProps', *variadic(props_keys, (str, bytes, dict, set)), {dict})) or {} + return metadata + + def _get_formats(self, data, path, video_id=None): + hls_url = traverse_obj(data, path, get_all=False) + if not hls_url and not data.get('canWatch'): + self.raise_no_formats( + 'This account does not have access to the requested content', expected=True) + elif not hls_url: + self.raise_no_formats('No supported formats found') + return self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls', live=True) + + +class WrestleUniverseVODIE(WrestleUniverseBaseIE): + _VALID_URL = WrestleUniverseBaseIE._VALID_URL_TMPL % 'videos' + _TESTS = [{ + 'url': 'https://www.wrestle-universe.com/en/videos/dp8mpjmcKfxzUhEHM2uFws', + 'info_dict': { + 'id': 'dp8mpjmcKfxzUhEHM2uFws', + 'ext': 'mp4', + 'title': 'The 3rd “Futari wa Princess” Max Heart Tournament', + 'description': 'md5:318d5061e944797fbbb81d5c7dd00bf5', + 'location': '埼玉・春日部ふれあいキューブ', + 'channel': 'tjpw', + 'duration': 7119, + 'timestamp': 1674979200, + 'upload_date': '20230129', + 'thumbnail': 'https://image.asset.wrestle-universe.com/8FjD67P8rZc446RBQs5RBN/8FjD67P8rZc446RBQs5RBN', + 'chapters': 'count:7', + 'cast': 'count:21', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + _API_PATH = 'videoEpisodes' + + def _real_extract(self, url): + lang, video_id = self._match_valid_url(url).group('lang', 'id') + metadata = self._download_metadata(url, video_id, lang, 'videoEpisodeFallbackData') + video_data = self._call_api(video_id, ':watch', 'watch', data={ + # 'deviceId' is required if ignoreDeviceRestriction is False + 'ignoreDeviceRestriction': True, + }) + + return { + 'id': video_id, + 'formats': self._get_formats(video_data, ( + (('protocolHls', 'url'), ('chromecastUrls', ...)), {url_or_none}), video_id), + **traverse_obj(metadata, { + 'title': ('displayName', {str}), + 'description': ('description', {str}), + 'channel': ('labels', 'group', {str}), + 'location': ('labels', 'venue', {str}), + 'timestamp': ('watchStartTime', {int_or_none}), + 'thumbnail': ('keyVisualUrl', {url_or_none}), + 'cast': ('casts', ..., 'displayName', {str}), + 'duration': ('duration', {int}), + 'chapters': ('videoChapters', lambda _, v: isinstance(v.get('start'), int), { + 'title': ('displayName', {str}), + 'start_time': ('start', {int}), + 'end_time': ('end', {int}), + }), + }), + } + + +class WrestleUniversePPVIE(WrestleUniverseBaseIE): + _VALID_URL = WrestleUniverseBaseIE._VALID_URL_TMPL % 'lives' + _TESTS = [{ + 'note': 'HLS AES-128 key obtained via API', + 'url': 
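Note: _call_encrypted_api above is a small key-exchange dance: a fresh 2048-bit RSA key pair is generated per call, its public half travels to the server as base64 DER in the token field, and sensitive response values (the HLS AES-128 key and IV) come back encrypted to it. The round trip, sketched locally with pycryptodomex playing both sides:

    import base64

    from Cryptodome.Cipher import PKCS1_OAEP
    from Cryptodome.Hash import SHA1
    from Cryptodome.PublicKey import RSA

    private_key = RSA.generate(2048)
    cipher = PKCS1_OAEP.new(private_key, hashAlgo=SHA1)
    token = base64.b64encode(private_key.public_key().export_key('DER')).decode()

    # server side: encrypt a secret to the public key received in `token`
    server_key = RSA.import_key(base64.b64decode(token))
    sealed = base64.b64encode(
        PKCS1_OAEP.new(server_key, hashAlgo=SHA1).encrypt(b'0123456789abcdef'))

    # client side: what the extractor's decrypt() closure does with such a response field
    assert cipher.decrypt(base64.b64decode(sealed)) == b'0123456789abcdef'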
'https://www.wrestle-universe.com/en/lives/buH9ibbfhdJAY4GKZcEuJX', + 'info_dict': { + 'id': 'buH9ibbfhdJAY4GKZcEuJX', + 'ext': 'mp4', + 'title': '【PPV】Beyond the origins, into the future', + 'description': 'md5:9a872db68cd09be4a1e35a3ee8b0bdfc', + 'channel': 'tjpw', + 'location': '東京・Twin Box AKIHABARA', + 'duration': 10098, + 'timestamp': 1675076400, + 'upload_date': '20230130', + 'thumbnail': 'https://image.asset.wrestle-universe.com/rJs2m7cBaLXrwCcxMdQGRM/rJs2m7cBaLXrwCcxMdQGRM', + 'thumbnails': 'count:3', + 'hls_aes': { + 'key': '5633184acd6e43f1f1ac71c6447a4186', + 'iv': '5bac71beb33197d5600337ce86de7862', + }, + }, + 'params': { + 'skip_download': 'm3u8', + }, + 'skip': 'No longer available', + }, { + 'note': 'unencrypted HLS', + 'url': 'https://www.wrestle-universe.com/en/lives/wUG8hP5iApC63jbtQzhVVx', + 'info_dict': { + 'id': 'wUG8hP5iApC63jbtQzhVVx', + 'ext': 'mp4', + 'title': 'GRAND PRINCESS \'22', + 'description': 'md5:e4f43d0d4262de3952ff34831bc99858', + 'channel': 'tjpw', + 'location': '東京・両国国技館', + 'duration': 18044, + 'timestamp': 1647665400, + 'upload_date': '20220319', + 'thumbnail': 'https://image.asset.wrestle-universe.com/i8jxSTCHPfdAKD4zN41Psx/i8jxSTCHPfdAKD4zN41Psx', + 'thumbnails': 'count:3', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + _API_PATH = 'events' + + def _real_extract(self, url): + lang, video_id = self._match_valid_url(url).group('lang', 'id') + metadata = self._download_metadata(url, video_id, lang, 'eventFallbackData') + + info = { + 'id': video_id, + **traverse_obj(metadata, { + 'title': ('displayName', {str}), + 'description': ('description', {str}), + 'channel': ('labels', 'group', {str}), + 'location': ('labels', 'venue', {str}), + 'timestamp': ('startTime', {int_or_none}), + 'thumbnails': (('keyVisualUrl', 'alterKeyVisualUrl', 'heroKeyVisualUrl'), {'url': {url_or_none}}), + }), + } + + ended_time = traverse_obj(metadata, ('endedTime', {int_or_none})) + if info.get('timestamp') and ended_time: + info['duration'] = ended_time - info['timestamp'] + + video_data, decrypt = self._call_encrypted_api( + video_id, ':watchArchive', 'watch archive', data={'method': 1}) + info['formats'] = self._get_formats(video_data, ( + ('hls', None), ('urls', 'chromecastUrls'), ..., {url_or_none}), video_id) + for f in info['formats']: + # bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values + if f.get('tbr'): + f['tbr'] = int(f['tbr'] / 2.5) + + hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt})) + if hls_aes_key: + info['hls_aes'] = { + 'key': hls_aes_key, + 'iv': traverse_obj(video_data, ('hls', 'iv', {decrypt})), + } + elif traverse_obj(video_data, ('hls', 'encryptType', {int})): + self.report_warning('HLS AES-128 key was not found in API response') + + return info diff --git a/hypervideo_dl/extractor/wykop.py b/hypervideo_dl/extractor/wykop.py new file mode 100644 index 0000000..1d29cc8 --- /dev/null +++ b/hypervideo_dl/extractor/wykop.py @@ -0,0 +1,268 @@ +import json + +from .common import InfoExtractor +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + format_field, + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class WykopBaseExtractor(InfoExtractor): + def _get_token(self, force_refresh=False): + if not force_refresh: + maybe_cached = self.cache.load('wykop', 'bearer') + if maybe_cached: + return maybe_cached + + new_token = traverse_obj( + self._do_call_api('auth', None, 'Downloading anonymous auth token', data={ + # hardcoded in frontend + 'key': 
'w53947240748', + 'secret': 'd537d9e0a7adc1510842059ae5316419', + }), ('data', 'token')) + + self.cache.store('wykop', 'bearer', new_token) + return new_token + + def _do_call_api(self, path, video_id, note='Downloading JSON metadata', data=None, headers={}): + if data: + data = json.dumps({'data': data}).encode() + headers['Content-Type'] = 'application/json' + + return self._download_json( + f'https://wykop.pl/api/v3/{path}', video_id, + note=note, data=data, headers=headers) + + def _call_api(self, path, video_id, note='Downloading JSON metadata'): + token = self._get_token() + for retrying in range(2): + try: + return self._do_call_api(path, video_id, note, headers={'Authorization': f'Bearer {token}'}) + except ExtractorError as e: + if not retrying and isinstance(e.cause, HTTPError) and e.cause.status == 403: + token = self._get_token(True) + continue + raise + + def _common_data_extract(self, data): + author = traverse_obj(data, ('author', 'username'), expected_type=str) + + return { + '_type': 'url_transparent', + 'display_id': data.get('slug'), + 'url': traverse_obj(data, + ('media', 'embed', 'url'), # what gets an iframe embed + ('source', 'url'), # clickable url (dig only) + expected_type=url_or_none), + 'thumbnail': traverse_obj( + data, ('media', 'photo', 'url'), ('media', 'embed', 'thumbnail'), expected_type=url_or_none), + 'uploader': author, + 'uploader_id': author, + 'uploader_url': format_field(author, None, 'https://wykop.pl/ludzie/%s'), + 'timestamp': parse_iso8601(data.get('created_at'), delimiter=' '), # time it got submitted + 'like_count': traverse_obj(data, ('votes', 'up'), expected_type=int), + 'dislike_count': traverse_obj(data, ('votes', 'down'), expected_type=int), + 'comment_count': traverse_obj(data, ('comments', 'count'), expected_type=int), + 'age_limit': 18 if data.get('adult') else 0, + 'tags': data.get('tags'), + } + + +class WykopDigIE(WykopBaseExtractor): + IE_NAME = 'wykop:dig' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/link/6912923/najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth', + 'info_dict': { + 'id': 'rlSTBvViflc', + 'ext': 'mp4', + 'title': 'Najbardziej zrzędliwy kot na świecie I Frozen Planet II I BBC Earth', + 'display_id': 'najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth', + 'description': 'md5:ac0f87dea1cdcb6b0c53f3612a095c87', + 'tags': ['zwierzaczki', 'koty', 'smiesznykotek', 'humor', 'rozrywka', 'ciekawostki'], + 'age_limit': 0, + 'timestamp': 1669154480, + 'release_timestamp': 1669194241, + 'release_date': '20221123', + 'uploader': 'starnak', + 'uploader_id': 'starnak', + 'uploader_url': 'https://wykop.pl/ludzie/starnak', + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + 'view_count': int, + 'channel': 'BBC Earth', + 'channel_id': 'UCwmZiChSryoWQCZMIQezgTg', + 'channel_url': 'https://www.youtube.com/channel/UCwmZiChSryoWQCZMIQezgTg', + 'categories': ['Pets & Animals'], + 'upload_date': '20220923', + 'duration': 191, + 'channel_follower_count': int, + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + }, + }] + + @classmethod + def suitable(cls, url): + return cls._match_valid_url(url) and not WykopDigCommentIE.suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api(f'links/{video_id}', video_id)['data'] + + return { + **self._common_data_extract(data), + 'id': video_id, + 'title': 
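Note: WykopBaseExtractor above caches its anonymous bearer token and, on a 403, refreshes it exactly once before giving up; the two-iteration for loop is what bounds the retry. The same shape generalized into a sketch, with hypothetical names and PermissionError standing in for the HTTP 403:

    class TokenClient:
        def __init__(self, fetch_token, request):
            self._fetch_token = fetch_token  # () -> fresh bearer token
            self._request = request          # (path, token) -> data, raises PermissionError on 403
            self._token = None

        def call(self, path):
            self._token = self._token or self._fetch_token()  # reuse the cached token if any
            for retrying in range(2):
                try:
                    return self._request(path, self._token)
                except PermissionError:
                    if retrying:  # the refreshed token failed too: give up
                        raise
                    self._token = self._fetch_token()  # force-refresh once, then retry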
data['title'], + 'description': data.get('description'), + # time it got "digged" to the homepage + 'release_timestamp': parse_iso8601(data.get('published_at'), delimiter=' '), + } + + +class WykopDigCommentIE(WykopBaseExtractor): + IE_NAME = 'wykop:dig:comment' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<dig_id>\d+)/[^/]+/komentarz/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/link/6992589/strollowal-oszusta-przez-ponad-24-minuty-udawal-naiwniaka-i-nagral-rozmowe/komentarz/114540527/podobna-sytuacja-ponizej-ciekawa-dyskusja-z-oszustem-na-sam-koniec-sam-bylem-w-biurze-swiadkiem-podobnej-rozmowy-niemal-zakonczonej-sukcesem-bandyty-g', + 'info_dict': { + 'id': 'u6tEi2FmKZY', + 'ext': 'mp4', + 'title': 'md5:e7c741c5baa7ed6478000caf72865577', + 'display_id': 'md5:45b2d12bd0e262d09cc7cf7abc8412db', + 'description': 'md5:bcec7983429f9c0630f9deb9d3d1ba5e', + 'timestamp': 1674476945, + 'uploader': 'Bartholomew', + 'uploader_id': 'Bartholomew', + 'uploader_url': 'https://wykop.pl/ludzie/Bartholomew', + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + 'tags': [], + 'availability': 'public', + 'duration': 1838, + 'upload_date': '20230117', + 'categories': ['Entertainment'], + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'playable_in_embed': True, + 'live_status': 'not_live', + 'age_limit': 0, + 'chapters': 'count:3', + 'channel': 'Poszukiwacze Okazji', + 'channel_id': 'UCzzvJDZThwv06dR4xmzrZBw', + 'channel_url': 'https://www.youtube.com/channel/UCzzvJDZThwv06dR4xmzrZBw', + }, + }] + + def _real_extract(self, url): + dig_id, comment_id = self._search_regex(self._VALID_URL, url, 'dig and comment ids', group=('dig_id', 'id')) + data = self._call_api(f'links/{dig_id}/comments/{comment_id}', comment_id)['data'] + + return { + **self._common_data_extract(data), + 'id': comment_id, + 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}", + 'description': data.get('content'), + } + + +class WykopPostIE(WykopBaseExtractor): + IE_NAME = 'wykop:post' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/wpis/68893343/kot-koty-smiesznykotek', + 'info_dict': { + 'id': 'PL8JMjiUPHUhwc9ZlKa_5IFeBwBV8Xe7jI', + 'title': 'PawelW124 - #kot #koty #smiesznykotek', + 'description': '#kot #koty #smiesznykotek', + 'display_id': 'kot-koty-smiesznykotek', + 'tags': ['kot', 'koty', 'smiesznykotek'], + 'uploader': 'PawelW124', + 'uploader_id': 'PawelW124', + 'uploader_url': 'https://wykop.pl/ludzie/PawelW124', + 'timestamp': 1668938142, + 'age_limit': 0, + 'like_count': int, + 'dislike_count': int, + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + 'comment_count': int, + 'channel': 'Revan', + 'channel_id': 'UCW9T_-uZoiI7ROARQdTDyOw', + 'channel_url': 'https://www.youtube.com/channel/UCW9T_-uZoiI7ROARQdTDyOw', + 'upload_date': '20221120', + 'modified_date': '20220814', + 'availability': 'public', + 'view_count': int, + }, + 'playlist_mincount': 15, + 'params': { + 'flat_playlist': True, + } + }] + + @classmethod + def suitable(cls, url): + return cls._match_valid_url(url) and not WykopPostCommentIE.suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api(f'entries/{video_id}', video_id)['data'] + + return { + **self._common_data_extract(data), + 'id': video_id, + 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}", + 'description': data.get('content'), + } + + +class 
WykopPostCommentIE(WykopBaseExtractor): + IE_NAME = 'wykop:post:comment' + _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<post_id>\d+)/[^/#]+#(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://wykop.pl/wpis/70084873/test-test-test#249303979', + 'info_dict': { + 'id': 'confusedquickarmyant', + 'ext': 'mp4', + 'title': 'tpap - treść komentarza', + 'display_id': 'tresc-komentarza', + 'description': 'treść komentarza', + 'uploader': 'tpap', + 'uploader_id': 'tpap', + 'uploader_url': 'https://wykop.pl/ludzie/tpap', + 'timestamp': 1675349470, + 'upload_date': '20230202', + 'tags': [], + 'duration': 2.12, + 'age_limit': 0, + 'categories': [], + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'thumbnail': r're:https?://wykop\.pl/cdn/.+', + }, + }] + + def _real_extract(self, url): + post_id, comment_id = self._search_regex(self._VALID_URL, url, 'post and comment ids', group=('post_id', 'id')) + data = self._call_api(f'entries/{post_id}/comments/{comment_id}', comment_id)['data'] + + return { + **self._common_data_extract(data), + 'id': comment_id, + 'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}", + 'description': data.get('content'), + } diff --git a/hypervideo_dl/extractor/xanimu.py b/hypervideo_dl/extractor/xanimu.py new file mode 100644 index 0000000..e0b7bf9 --- /dev/null +++ b/hypervideo_dl/extractor/xanimu.py @@ -0,0 +1,51 @@ +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class XanimuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?xanimu\.com/(?P<id>[^/]+)/?' + _TESTS = [{ + 'url': 'https://xanimu.com/51944-the-princess-the-frog-hentai/', + 'md5': '899b88091d753d92dad4cb63bbf357a7', + 'info_dict': { + 'id': '51944-the-princess-the-frog-hentai', + 'ext': 'mp4', + 'title': 'The Princess + The Frog Hentai', + 'thumbnail': 'https://xanimu.com/storage/2020/09/the-princess-and-the-frog-hentai.jpg', + 'description': r're:^Enjoy The Princess \+ The Frog Hentai', + 'duration': 207.0, + 'age_limit': 18 + } + }, { + 'url': 'https://xanimu.com/huge-expansion/', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + formats = [] + for format in ['videoHigh', 'videoLow']: + format_url = self._search_json(r'var\s+%s\s*=' % re.escape(format), webpage, format, + video_id, default=None, contains_pattern=r'[\'"]([^\'"]+)[\'"]') + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format, + 'quality': -2 if format.endswith('Low') else None, + }) + + return { + 'id': video_id, + 'formats': formats, + 'title': self._search_regex(r'[\'"]headline[\'"]:\s*[\'"]([^"]+)[\'"]', webpage, + 'title', default=None) or self._html_extract_title(webpage), + 'thumbnail': self._html_search_meta('thumbnailUrl', webpage, default=None), + 'description': self._html_search_meta('description', webpage, default=None), + 'duration': int_or_none(self._search_regex(r'duration:\s*[\'"]([^\'"]+?)[\'"]', + webpage, 'duration', fatal=False)), + 'age_limit': 18 + } diff --git a/hypervideo_dl/extractor/xhamster.py b/hypervideo_dl/extractor/xhamster.py index 59eecec..3722479 100644 --- a/hypervideo_dl/extractor/xhamster.py +++ b/hypervideo_dl/extractor/xhamster.py @@ -21,7 +21,7 @@ from ..utils import ( class XHamsterIE(InfoExtractor): - _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com)' + _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com|xhvid\.com)' _VALID_URL = 
r'''(?x) https?:// (?:.+?\.)?%s/ @@ -120,6 +120,9 @@ class XHamsterIE(InfoExtractor): }, { 'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf', 'only_matching': True, + }, { + 'url': 'https://xhvid.com/videos/lk-mm-xhc6wn6', + 'only_matching': True, }] def _real_extract(self, url): @@ -180,7 +183,7 @@ class XHamsterIE(InfoExtractor): 'height': get_height(quality), 'filesize': format_sizes.get(quality), 'http_headers': { - 'Referer': urlh.geturl(), + 'Referer': urlh.url, }, }) xplayer_sources = try_get( @@ -422,6 +425,9 @@ class XHamsterUserIE(InfoExtractor): }, { 'url': 'https://xhday.com/users/mobhunter', 'only_matching': True, + }, { + 'url': 'https://xhvid.com/users/pelushe21', + 'only_matching': True, }] def _entries(self, user_id): diff --git a/hypervideo_dl/extractor/ximalaya.py b/hypervideo_dl/extractor/ximalaya.py index b25be77..3d5e6cf 100644 --- a/hypervideo_dl/extractor/ximalaya.py +++ b/hypervideo_dl/extractor/ximalaya.py @@ -36,7 +36,7 @@ class XimalayaIE(XimalayaBaseIE): 'height': 180 } ], - 'categories': ['人文'], + 'categories': ['其他'], 'duration': 93, 'view_count': int, 'like_count': int, @@ -123,7 +123,7 @@ class XimalayaIE(XimalayaBaseIE): class XimalayaAlbumIE(XimalayaBaseIE): IE_NAME = 'ximalaya:album' IE_DESC = '喜马拉雅FM 专辑' - _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/\d+/album/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?:\d+/)?album/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.ximalaya.com/61425525/album/5534601/', 'info_dict': { @@ -131,6 +131,13 @@ class XimalayaAlbumIE(XimalayaBaseIE): 'id': '5534601', }, 'playlist_mincount': 323, + }, { + 'url': 'https://www.ximalaya.com/album/6912905', + 'info_dict': { + 'title': '埃克哈特《修炼当下的力量》', + 'id': '6912905', + }, + 'playlist_mincount': 41, }] def _real_extract(self, url): @@ -151,7 +158,7 @@ class XimalayaAlbumIE(XimalayaBaseIE): return self._download_json( 'https://www.ximalaya.com/revision/album/v1/getTracksList', playlist_id, note=f'Downloading tracks list page {page_idx}', - query={'albumId': playlist_id, 'pageNum': page_idx, 'sort': 1})['data'] + query={'albumId': playlist_id, 'pageNum': page_idx})['data'] def _get_entries(self, page_data): for e in page_data['tracks']: diff --git a/hypervideo_dl/extractor/xtube.py b/hypervideo_dl/extractor/xtube.py index ce4480c..db82925 100644 --- a/hypervideo_dl/extractor/xtube.py +++ b/hypervideo_dl/extractor/xtube.py @@ -2,12 +2,12 @@ import itertools import re from .common import InfoExtractor +from ..networking import Request from ..utils import ( int_or_none, js_to_json, orderedSet, parse_duration, - sanitized_Request, str_to_int, url_or_none, ) @@ -186,7 +186,7 @@ class XTubeUserIE(InfoExtractor): entries = [] for pagenum in itertools.count(1): - request = sanitized_Request( + request = Request( 'http://www.xtube.com/profile/%s/videos/%d' % (user_id, pagenum), headers={ 'Cookie': 'popunder=4', diff --git a/hypervideo_dl/extractor/xvideos.py b/hypervideo_dl/extractor/xvideos.py index 5c505c8..5df0715 100644 --- a/hypervideo_dl/extractor/xvideos.py +++ b/hypervideo_dl/extractor/xvideos.py @@ -157,3 +157,24 @@ class XVideosIE(InfoExtractor): 'thumbnails': thumbnails, 'age_limit': 18, } + + +class XVideosQuickiesIE(InfoExtractor): + IE_NAME = 'xvideos:quickies' + _VALID_URL = r'https?://(?P<domain>(?:[^/]+\.)?xvideos2?\.com)/amateur-channels/[^#]+#quickies/a/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.xvideos.com/amateur-channels/wifeluna#quickies/a/47258683', + 'md5': '16e322a93282667f1963915568f782c1', + 'info_dict': { + 'id': 
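Several hunks in this section (xtube, xhamster, and yesjapan and youtube.py below) perform the same mechanical migration from the legacy urllib-style helpers to the in-tree networking layer. A sketch of the mapping, with the import path taken from the xtube hunk and a filled-in example URL that is illustrative only:

# Old helper-based style (visible in the pre-images of these hunks):
#     req = sanitized_Request(url, headers={'Cookie': 'popunder=4'})
#     final_url = urlh.geturl()
#     status, body = e.cause.code, e.cause.read(512)
from hypervideo_dl.networking import Request  # in-tree code spells this ..networking

req = Request('http://www.xtube.com/profile/someone/videos/1',
              headers={'Cookie': 'popunder=4'})
# Response objects now expose .url for the post-redirect URL, and wrapped
# HTTPError causes expose .status and .response.read() in place of
# .code and .read().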
'47258683', + 'ext': 'mp4', + 'title': 'Verification video', + 'age_limit': 18, + 'duration': 16, + 'thumbnail': r're:^https://cdn.*-pic.xvideos-cdn.com/.+\.jpg', + } + }] + + def _real_extract(self, url): + domain, id_ = self._match_valid_url(url).group('domain', 'id') + return self.url_result(f'https://{domain}/video{id_}/_', XVideosIE, id_) diff --git a/hypervideo_dl/extractor/yahoo.py b/hypervideo_dl/extractor/yahoo.py index a69715b..24148a0 100644 --- a/hypervideo_dl/extractor/yahoo.py +++ b/hypervideo_dl/extractor/yahoo.py @@ -2,7 +2,6 @@ import hashlib import itertools import urllib.parse -from .brightcove import BrightcoveNewIE from .common import InfoExtractor, SearchInfoExtractor from .youtube import YoutubeIE from ..utils import ( @@ -11,7 +10,6 @@ from ..utils import ( int_or_none, mimetype2ext, parse_iso8601, - smuggle_url, traverse_obj, try_get, url_or_none, @@ -337,121 +335,6 @@ class YahooSearchIE(SearchInfoExtractor): break -class YahooGyaOPlayerIE(InfoExtractor): - IE_NAME = 'yahoo:gyao:player' - _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/c/y)/(?P<id>\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TESTS = [{ - 'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/', - 'info_dict': { - 'id': '5993125228001', - 'ext': 'mp4', - 'title': 'フューリー 【字幕版】', - 'description': 'md5:21e691c798a15330eda4db17a8fe45a5', - 'uploader_id': '4235717419001', - 'upload_date': '20190124', - 'timestamp': 1548294365, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://streaming.yahoo.co.jp/c/y/01034/v00133/v0000000000000000706/', - 'only_matching': True, - }, { - 'url': 'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682', - 'only_matching': True, - }, { - 'url': 'https://gyao.yahoo.co.jp/episode/5fa1226c-ef8d-4e93-af7a-fd92f4e30597', - 'only_matching': True, - }] - _GEO_BYPASS = False - - def _real_extract(self, url): - video_id = self._match_id(url).replace('/', ':') - headers = self.geo_verification_headers() - headers['Accept'] = 'application/json' - resp = self._download_json( - 'https://gyao.yahoo.co.jp/apis/playback/graphql', video_id, query={ - 'appId': 'dj00aiZpPUNJeDh2cU1RazU3UCZzPWNvbnN1bWVyc2VjcmV0Jng9NTk-', - 'query': '''{ - content(parameter: {contentId: "%s", logicaAgent: PC_WEB}) { - video { - delivery { - id - } - title - } - } -}''' % video_id, - }, headers=headers) - content = resp['data']['content'] - if not content: - msg = resp['errors'][0]['message'] - if msg == 'not in japan': - self.raise_geo_restricted(countries=['JP']) - raise ExtractorError(msg) - video = content['video'] - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': video['title'], - 'url': smuggle_url( - 'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video['delivery']['id'], - {'geo_countries': ['JP']}), - 'ie_key': BrightcoveNewIE.ie_key(), - } - - -class YahooGyaOIE(InfoExtractor): - IE_NAME = 'yahoo:gyao' - _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _TESTS = [{ - 'url': 
'https://gyao.yahoo.co.jp/title/%E3%82%BF%E3%82%A4%E3%83%A0%E3%83%9C%E3%82%AB%E3%83%B3%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA%20%E3%83%A4%E3%83%83%E3%82%BF%E3%83%BC%E3%83%9E%E3%83%B3/5f60ceb3-6e5e-40ef-ba40-d68b598d067f', - 'info_dict': { - 'id': '5f60ceb3-6e5e-40ef-ba40-d68b598d067f', - }, - 'playlist_mincount': 80, - }, { - 'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/', - 'only_matching': True, - }, { - 'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/', - 'only_matching': True, - }, { - 'url': 'https://gyao.yahoo.co.jp/title/%E3%81%97%E3%82%83%E3%81%B9%E3%81%8F%E3%82%8A007/5b025a49-b2e5-4dc7-945c-09c6634afacf', - 'only_matching': True, - }, { - 'url': 'https://gyao.yahoo.co.jp/title/5b025a49-b2e5-4dc7-945c-09c6634afacf', - 'only_matching': True, - }] - - def _entries(self, program_id): - page = 1 - while True: - playlist = self._download_json( - f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}&serviceId=gy', program_id, - note=f'Downloading JSON metadata page {page}') - if not playlist: - break - for video in playlist['videos']: - video_id = video.get('id') - if not video_id: - continue - if video.get('streamingAvailability') == 'notYet': - continue - yield self.url_result( - 'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'), - YahooGyaOPlayerIE.ie_key(), video_id) - if playlist.get('ended'): - break - page += 1 - - def _real_extract(self, url): - program_id = self._match_id(url).replace('/', ':') - return self.playlist_result(self._entries(program_id), program_id) - - class YahooJapanNewsIE(InfoExtractor): IE_NAME = 'yahoo:japannews' IE_DESC = 'Yahoo! Japan News' diff --git a/hypervideo_dl/extractor/yandexvideo.py b/hypervideo_dl/extractor/yandexvideo.py index 535b61f..727250e 100644 --- a/hypervideo_dl/extractor/yandexvideo.py +++ b/hypervideo_dl/extractor/yandexvideo.py @@ -270,9 +270,9 @@ class ZenYandexIE(InfoExtractor): for s_url in stream_urls: ext = determine_ext(s_url) if ext == 'mpd': - formats.extend(self._extract_mpd_formats(s_url, id, mpd_id='dash')) + formats.extend(self._extract_mpd_formats(s_url, video_id, mpd_id='dash')) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats(s_url, id, 'mp4')) + formats.extend(self._extract_m3u8_formats(s_url, video_id, 'mp4')) return { 'id': video_id, 'title': video_json.get('title') or self._og_search_title(webpage), diff --git a/hypervideo_dl/extractor/yappy.py b/hypervideo_dl/extractor/yappy.py new file mode 100644 index 0000000..7b3d0cb --- /dev/null +++ b/hypervideo_dl/extractor/yappy.py @@ -0,0 +1,127 @@ +from .common import InfoExtractor +from ..utils import ( + OnDemandPagedList, + int_or_none, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class YappyIE(InfoExtractor): + _VALID_URL = r'https?://yappy\.media/video/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://yappy.media/video/47fea6d8586f48d1a0cf96a7342aabd2', + 'info_dict': { + 'id': '47fea6d8586f48d1a0cf96a7342aabd2', + 'ext': 'mp4', + 'title': 'Куда нажимать? Как снимать? Смотри видос и погнали!🤘🏻', + 'timestamp': 1661893200, + 'description': 'Куда нажимать? Как снимать? 
Смотри видос и погнали!🤘🏻', + 'thumbnail': 'https://cdn-st.ritm.media/static/pic/thumbnails/0c7c4d73388f47848acaf540d2e2bb8c-thumbnail.jpg', + 'upload_date': '20220830', + 'view_count': int, + 'like_count': int, + 'uploader_id': '59a0c8c485e5410b9c43474bf4c6a373', + 'categories': ['Образование и наука', 'Лайфхак', 'Технологии', 'Арт/искусство'], + 'repost_count': int, + 'uploader': 'YAPPY', + } + }, { + 'url': 'https://yappy.media/video/3862451954ad4bd58ae2ccefddb0bd33', + 'info_dict': { + 'id': '3862451954ad4bd58ae2ccefddb0bd33', + 'ext': 'mp4', + 'title': 'Опиши свой характер 3 словами🙃\n#психология #дружба #отношения', + 'timestamp': 1674726985, + 'like_count': int, + 'description': 'Опиши свой характер 3 словами🙃\n#психология #дружба #отношения', + 'uploader_id': '6793ee3581974a3586fc01e157de6c99', + 'view_count': int, + 'repost_count': int, + 'uploader': 'LENA SHTURMAN', + 'upload_date': '20230126', + 'thumbnail': 'https://cdn-st.ritm.media/static/pic/user_thumbnails/6e76bb4bbad640b6/9ec84c115b2b1967/1674716171.jpg', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_ld = self._search_json_ld(webpage, video_id) + nextjs_data = self._search_nextjs_data(webpage, video_id) + + media_data = ( + traverse_obj( + nextjs_data, ('props', 'pageProps', ('data', 'OpenGraphParameters')), get_all=False) + or self._download_json(f'https://yappy.media/api/video/{video_id}', video_id)) + + media_url = traverse_obj(media_data, ('link', {url_or_none})) or '' + has_watermark = media_url.endswith('-wm.mp4') + + formats = [{ + 'url': media_url, + 'ext': 'mp4', + 'format_note': 'Watermarked' if has_watermark else None, + 'preference': -10 if has_watermark else None + }] if media_url else [] + + if has_watermark: + formats.append({ + 'url': media_url.replace('-wm.mp4', '.mp4'), + 'ext': 'mp4' + }) + + audio_link = traverse_obj(media_data, ('audio', 'link')) + if audio_link: + formats.append({ + 'url': audio_link, + 'ext': 'mp3', + 'acodec': 'mp3', + 'vcodec': 'none' + }) + + return { + 'id': video_id, + 'title': (json_ld.get('description') or self._html_search_meta(['og:title'], webpage) + or self._html_extract_title(webpage)), + 'formats': formats, + 'thumbnail': (media_data.get('thumbnail') + or self._html_search_meta(['og:image', 'og:image:secure_url'], webpage)), + 'description': (media_data.get('description') or json_ld.get('description') + or self._html_search_meta(['description', 'og:description'], webpage)), + 'timestamp': unified_timestamp(media_data.get('publishedAt') or json_ld.get('timestamp')), + 'view_count': int_or_none(media_data.get('viewsCount') or json_ld.get('view_count')), + 'like_count': int_or_none(media_data.get('likesCount')), + 'uploader': traverse_obj(media_data, ('creator', 'firstName')), + 'uploader_id': traverse_obj(media_data, ('creator', ('uuid', 'nickname')), get_all=False), + 'categories': traverse_obj(media_data, ('categories', ..., 'name')) or None, + 'repost_count': int_or_none(media_data.get('sharingCount')) + } + + +class YappyProfileIE(InfoExtractor): + _VALID_URL = r'https?://yappy\.media/profile/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://yappy.media/profile/59a0c8c485e5410b9c43474bf4c6a373', + 'info_dict': { + 'id': '59a0c8c485e5410b9c43474bf4c6a373', + }, + 'playlist_mincount': 527, + }] + + def _real_extract(self, url): + profile_id = self._match_id(url) + + def fetch_page(page_num): + page_num += 1 + videos = self._download_json( + 
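The YappyIE format handling above encodes a small heuristic: when the API's `link` points at a watermarked rendition (suffix `-wm.mp4`), the extractor deprioritizes it and synthesizes the clean URL by swapping the suffix. Condensed into a standalone sketch (same literals as the patch):

def yappy_formats(media_url):
    has_watermark = media_url.endswith('-wm.mp4')
    formats = [{
        'url': media_url,
        'ext': 'mp4',
        'format_note': 'Watermarked' if has_watermark else None,
        'preference': -10 if has_watermark else None,  # sorts below clean variants
    }]
    if has_watermark:
        # Assumption taken from the patch: the unwatermarked file lives at
        # the same path minus the '-wm' suffix.
        formats.append({'url': media_url.replace('-wm.mp4', '.mp4'), 'ext': 'mp4'})
    return formats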
f'https://yappy.media/api/video/list/{profile_id}?page={page_num}', + profile_id, f'Downloading profile page {page_num} JSON') + + for video in traverse_obj(videos, ('results', lambda _, v: v['uuid'])): + yield self.url_result( + f'https://yappy.media/video/{video["uuid"]}', YappyIE, + video['uuid'], video.get('description')) + + return self.playlist_result(OnDemandPagedList(fetch_page, 15), profile_id) diff --git a/hypervideo_dl/extractor/yesjapan.py b/hypervideo_dl/extractor/yesjapan.py index b45fa8f..94e4166 100644 --- a/hypervideo_dl/extractor/yesjapan.py +++ b/hypervideo_dl/extractor/yesjapan.py @@ -1,9 +1,6 @@ from .common import InfoExtractor -from ..utils import ( - HEADRequest, - get_element_by_attribute, - parse_iso8601, -) +from ..networking import HEADRequest +from ..utils import get_element_by_attribute, parse_iso8601 class YesJapanIE(InfoExtractor): @@ -42,7 +39,7 @@ class YesJapanIE(InfoExtractor): req = self._request_webpage( redirect_req, video_id, note='Resolving final URL', errnote='Could not resolve final URL', fatal=False) if req: - video_url = req.geturl() + video_url = req.url formats = [{ 'format_id': 'sd', diff --git a/hypervideo_dl/extractor/yle_areena.py b/hypervideo_dl/extractor/yle_areena.py index 118dc12..c5b45f0 100644 --- a/hypervideo_dl/extractor/yle_areena.py +++ b/hypervideo_dl/extractor/yle_areena.py @@ -1,40 +1,94 @@ from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import int_or_none, traverse_obj, url_or_none +from ..utils import ( + int_or_none, + smuggle_url, + traverse_obj, + unified_strdate, + url_or_none, +) class YleAreenaIE(InfoExtractor): _VALID_URL = r'https?://areena\.yle\.fi/(?P<id>[\d-]+)' - _TESTS = [{ - 'url': 'https://areena.yle.fi/1-4371942', - 'md5': '932edda0ecf5dfd6423804182d32f8ac', - 'info_dict': { - 'id': '0_a3tjk92c', - 'ext': 'mp4', - 'title': 'Pouchit', - 'description': 'md5:d487309c3abbe5650265bbd1742d2f82', - 'series': 'Modernit miehet', - 'season': 'Season 1', - 'season_number': 1, - 'episode': 'Episode 2', - 'episode_number': 2, - 'thumbnail': 'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/0_a3tjk92c/version/100061', - 'uploader_id': 'ovp@yle.fi', - 'duration': 1435, - 'view_count': int, - 'upload_date': '20181204', - 'timestamp': 1543916210, - 'subtitles': {'fin': [{'url': r're:^https?://', 'ext': 'srt'}]}, - 'age_limit': 7, - } - }] + _TESTS = [ + { + 'url': 'https://areena.yle.fi/1-4371942', + 'md5': '932edda0ecf5dfd6423804182d32f8ac', + 'info_dict': { + 'id': '0_a3tjk92c', + 'ext': 'mp4', + 'title': 'Pouchit', + 'description': 'md5:d487309c3abbe5650265bbd1742d2f82', + 'series': 'Modernit miehet', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 2', + 'episode_number': 2, + 'thumbnail': 'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/0_a3tjk92c/version/100061', + 'uploader_id': 'ovp@yle.fi', + 'duration': 1435, + 'view_count': int, + 'upload_date': '20181204', + 'release_date': '20190106', + 'timestamp': 1543916210, + 'subtitles': {'fin': [{'url': r're:^https?://', 'ext': 'srt'}]}, + 'age_limit': 7, + 'webpage_url': 'https://areena.yle.fi/1-4371942' + } + }, + { + 'url': 'https://areena.yle.fi/1-2158940', + 'md5': 'cecb603661004e36af8c5188b5212b12', + 'info_dict': { + 'id': '1_l38iz9ur', + 'ext': 'mp4', + 'title': 'Albi haluaa vessan', + 'description': 'md5:15236d810c837bed861fae0e88663c33', + 'series': 'Albi Lumiukko', + 'season': None, + 'season_number': None, + 'episode': None, + 'episode_number': None, + 'thumbnail': 
'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/1_l38iz9ur/version/100021', + 'uploader_id': 'ovp@yle.fi', + 'duration': 319, + 'view_count': int, + 'upload_date': '20211202', + 'release_date': '20211215', + 'timestamp': 1638448202, + 'subtitles': {}, + 'age_limit': 0, + 'webpage_url': 'https://areena.yle.fi/1-2158940' + } + }, + { + 'url': 'https://areena.yle.fi/1-64829589', + 'info_dict': { + 'id': '1-64829589', + 'ext': 'mp4', + 'title': 'HKO & Mälkki & Tanner', + 'description': 'md5:b4f1b1af2c6569b33f75179a86eea156', + 'series': 'Helsingin kaupunginorkesterin konsertteja', + 'thumbnail': r're:^https?://.+\.jpg$', + 'release_date': '20230120', + }, + 'params': { + 'skip_download': 'm3u8', + }, + }, + ] def _real_extract(self, url): video_id = self._match_id(url) info = self._search_json_ld(self._download_webpage(url, video_id), video_id, default={}) video_data = self._download_json( f'https://player.api.yle.fi/v1/preview/{video_id}.json?app_id=player_static_prod&app_key=8930d72170e48303cf5f3867780d549b', - video_id) + video_id, headers={ + 'origin': 'https://areena.yle.fi', + 'referer': 'https://areena.yle.fi/', + 'content-type': 'application/json' + }) # Example title: 'K1, J2: Pouchit | Modernit miehet' series, season_number, episode_number, episode = self._search_regex( @@ -52,20 +106,33 @@ class YleAreenaIE(InfoExtractor): 'name': sub.get('kind'), }) + kaltura_id = traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id'), expected_type=str) + if kaltura_id: + info_dict = { + '_type': 'url_transparent', + 'url': smuggle_url(f'kaltura:1955031:{kaltura_id}', {'source_url': url}), + 'ie_key': KalturaIE.ie_key(), + } + else: + info_dict = { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + video_data['data']['ongoing_ondemand']['manifest_url'], video_id, 'mp4', m3u8_id='hls'), + } + return { - '_type': 'url_transparent', - 'url': 'kaltura:1955031:%s' % traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id')), - 'ie_key': KalturaIE.ie_key(), + **info_dict, 'title': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'title', 'fin'), expected_type=str) or episode or info.get('title')), 'description': description, 'series': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'series', 'title', 'fin'), expected_type=str) or series), 'season_number': (int_or_none(self._search_regex(r'Kausi (\d+)', description, 'season number', default=None)) - or int(season_number)), + or int_or_none(season_number)), 'episode_number': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'episode_number'), expected_type=int_or_none) - or int(episode_number)), + or int_or_none(episode_number)), 'thumbnails': traverse_obj(info, ('thumbnails', ..., {'url': 'url'})), 'age_limit': traverse_obj(video_data, ('data', 'ongoing_ondemand', 'content_rating', 'age_restriction'), expected_type=int_or_none), 'subtitles': subtitles, + 'release_date': unified_strdate(traverse_obj(video_data, ('data', 'ongoing_ondemand', 'start_time'), expected_type=str)), } diff --git a/hypervideo_dl/extractor/youku.py b/hypervideo_dl/extractor/youku.py index 624975b..7ecd9f1 100644 --- a/hypervideo_dl/extractor/youku.py +++ b/hypervideo_dl/extractor/youku.py @@ -6,6 +6,7 @@ import time from .common import InfoExtractor from ..utils import ( ExtractorError, + clean_html, get_element_by_class, js_to_json, str_or_none, @@ -26,49 +27,9 @@ class YoukuIE(InfoExtractor): ''' _TESTS = [{ - # MD5 is unstable - 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', - 'info_dict': { 
- 'id': 'XMTc1ODE5Njcy', - 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', - 'ext': 'mp4', - 'duration': 74.73, - 'thumbnail': r're:^https?://.*', - 'uploader': '。躲猫猫、', - 'uploader_id': '36017967', - 'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4', - 'tags': list, - } - }, { 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', 'only_matching': True, }, { - 'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html', - 'info_dict': { - 'id': 'XODgxNjg1Mzk2', - 'ext': 'mp4', - 'title': '武媚娘传奇 85', - 'duration': 1999.61, - 'thumbnail': r're:^https?://.*', - 'uploader': '疯狂豆花', - 'uploader_id': '62583473', - 'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky', - 'tags': list, - }, - }, { - 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html', - 'info_dict': { - 'id': 'XMTI1OTczNDM5Mg', - 'ext': 'mp4', - 'title': '花千骨 04', - 'duration': 2363, - 'thumbnail': r're:^https?://.*', - 'uploader': '放剧场-花千骨', - 'uploader_id': '772849359', - 'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==', - 'tags': list, - }, - }, { 'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', 'note': 'Video protected with password', 'info_dict': { @@ -81,6 +42,7 @@ class YoukuIE(InfoExtractor): 'uploader_id': '322014285', 'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==', 'tags': list, + 'skip': '404', }, 'params': { 'videopassword': '100600', @@ -96,31 +58,41 @@ class YoukuIE(InfoExtractor): 'thumbnail': r're:^https?://.*', 'uploader': '明月庄主moon', 'uploader_id': '38465621', - 'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0', + 'uploader_url': 'https://www.youku.com/profile/index/?uid=UMTUzODYyNDg0', 'tags': list, }, }, { - 'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805', + 'url': 'https://v.youku.com/v_show/id_XNTA2NTA0MjA1Mg==.html', 'info_dict': { - 'id': 'XMjIyNzAzMTQ4NA', + 'id': 'XNTA2NTA0MjA1Mg', 'ext': 'mp4', - 'title': '卡马乔国足开大脚长传冲吊集锦', - 'duration': 289, + 'title': 'Minecraft我的世界:建造超大巨型航空飞机,菜鸟vs高手vs黑客', + 'duration': 542.13, 'thumbnail': r're:^https?://.*', - 'uploader': '阿卜杜拉之星', - 'uploader_id': '2382249', - 'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==', + 'uploader': '波哥游戏解说', + 'uploader_id': '156688084', + 'uploader_url': 'https://www.youku.com/profile/index/?uid=UNjI2NzUyMzM2', 'tags': list, }, }, { - 'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html', - 'only_matching': True, + 'url': 'https://v.youku.com/v_show/id_XNTE1MzczOTg4MA==.html', + 'info_dict': { + 'id': 'XNTE1MzczOTg4MA', + 'ext': 'mp4', + 'title': '国产超A特工片', + 'duration': 362.97, + 'thumbnail': r're:^https?://.*', + 'uploader': '陈晓娟说历史', + 'uploader_id': '1640913339', + 'uploader_url': 'https://www.youku.com/profile/index/?uid=UNjU2MzY1MzM1Ng==', + 'tags': list, + }, }] @staticmethod def get_ysuid(): - return '%d%s' % (int(time.time()), ''.join([ - random.choice(string.ascii_letters) for i in range(3)])) + return '%d%s' % (int(time.time()), ''.join( + random.choices(string.ascii_letters, k=3))) def get_format_name(self, fm): _dict = { @@ -151,7 +123,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0532', + 'ccode': '0524', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, @@ -182,7 +154,7 @@ class YoukuIE(InfoExtractor): else: msg = 'Youku server reported error %i' % error.get('code') if error_note is not None: - msg += ': ' + error_note + msg += ': ' + clean_html(error_note) raise ExtractorError(msg) # get video title diff --git a/hypervideo_dl/extractor/youporn.py 
b/hypervideo_dl/extractor/youporn.py index 8f1b991..6ee0abc 100644 --- a/hypervideo_dl/extractor/youporn.py +++ b/hypervideo_dl/extractor/youporn.py @@ -6,6 +6,7 @@ from ..utils import ( int_or_none, merge_dicts, str_to_int, + traverse_obj, unified_strdate, url_or_none, ) @@ -86,32 +87,31 @@ class YouPornIE(InfoExtractor): }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') definitions = self._download_json( - 'https://www.youporn.com/api/video/media_definitions/%s/' % video_id, - display_id) + f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id) + + def get_format_data(data, f): + return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl'])) formats = [] - for definition in definitions: - if not isinstance(definition, dict): - continue - video_url = url_or_none(definition.get('videoUrl')) - if not video_url: - continue - f = { - 'url': video_url, - 'filesize': int_or_none(definition.get('videoSize')), - } + # Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s + for hls_url in traverse_obj(get_format_data(definitions, 'hls'), ( + lambda _, v: not isinstance(v['defaultQuality'], bool), 'videoUrl'), (..., 'videoUrl')): + formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls')) + + for definition in get_format_data(definitions, 'mp4'): + f = traverse_obj(definition, { + 'url': 'videoUrl', + 'filesize': ('videoSize', {int_or_none}) + }) height = int_or_none(definition.get('quality')) # Video URL's path looks like this: # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 # We will benefit from it by extracting some metadata - mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url) + mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', definition['videoUrl']) if mobj: if not height: height = int(mobj.group('height')) @@ -179,6 +179,7 @@ class YouPornIE(InfoExtractor): 'tags') data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False) + data.pop('url', None) return merge_dicts(data, { 'id': video_id, 'display_id': display_id, diff --git a/hypervideo_dl/extractor/youtube.py b/hypervideo_dl/extractor/youtube.py index f7e3c75..8d606b2 100644 --- a/hypervideo_dl/extractor/youtube.py +++ b/hypervideo_dl/extractor/youtube.py @@ -15,13 +15,13 @@ import sys import threading import time import traceback -import urllib.error import urllib.parse from .common import InfoExtractor, SearchInfoExtractor from .openload import PhantomJSwrapper from ..compat import functools from ..jsinterp import JSInterpreter +from ..networking.exceptions import HTTPError, network_exceptions from ..utils import ( NO_DEFAULT, ExtractorError, @@ -41,7 +41,6 @@ from ..utils import ( join_nonempty, js_to_json, mimetype2ext, - network_exceptions, orderedSet, parse_codecs, parse_count, @@ -66,6 +65,7 @@ from ..utils import ( variadic, ) +STREAMING_DATA_CLIENT_NAME = '__hypervideo_dl_client' # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -248,11 +248,16 @@ 
def _split_innertube_client(client_name): return client_name, base, variant[0] if variant else None +def short_client_name(client_name): + main, *parts = _split_innertube_client(client_name)[0].replace('embedscreen', 'e_s').split('_') + return join_nonempty(main[:4], ''.join(x[0] for x in parts)).upper() + + def build_innertube_clients(): THIRD_PARTY = { 'embedUrl': 'https://www.youtube.com/', # Can be any valid URL } - BASE_CLIENTS = ('android', 'web', 'tv', 'ios', 'mweb') + BASE_CLIENTS = ('ios', 'android', 'web', 'tv', 'mweb') priority = qualities(BASE_CLIENTS[::-1]) for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): @@ -286,13 +291,14 @@ class BadgeType(enum.Enum): AVAILABILITY_PREMIUM = enum.auto() AVAILABILITY_SUBSCRIPTION = enum.auto() LIVE_NOW = enum.auto() + VERIFIED = enum.auto() class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _RESERVED_NAMES = ( - r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' + r'channel|c|user|playlist|watch|w|v|embed|e|live|watch_popup|clip|' r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|' r'browse|oembed|get_video_info|iframe_api|s/player|source|' r'storefront|oops|index|account|t/terms|about|upload|signin|logout') @@ -312,6 +318,40 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?invidious\.pussthecat\.org', r'(?:www\.)?invidious\.zee\.li', r'(?:www\.)?invidious\.ethibox\.fr', + r'(?:www\.)?iv\.ggtyler\.dev', + r'(?:www\.)?inv\.vern\.i2p', + r'(?:www\.)?am74vkcrjp2d5v36lcdqgsj2m6x36tbrkhsruoegwfcizzabnfgf5zyd\.onion', + r'(?:www\.)?inv\.riverside\.rocks', + r'(?:www\.)?invidious\.silur\.me', + r'(?:www\.)?inv\.bp\.projectsegfau\.lt', + r'(?:www\.)?invidious\.g4c3eya4clenolymqbpgwz3q3tawoxw56yhzk4vugqrl6dtu3ejvhjid\.onion', + r'(?:www\.)?invidious\.slipfox\.xyz', + r'(?:www\.)?invidious\.esmail5pdn24shtvieloeedh7ehz3nrwcdivnfhfcedl7gf4kwddhkqd\.onion', + r'(?:www\.)?inv\.vernccvbvyi5qhfzyqengccj7lkove6bjot2xhh5kajhwvidqafczrad\.onion', + r'(?:www\.)?invidious\.tiekoetter\.com', + r'(?:www\.)?iv\.odysfvr23q5wgt7i456o5t3trw2cw5dgn56vbjfbq2m7xsc5vqbqpcyd\.onion', + r'(?:www\.)?invidious\.nerdvpn\.de', + r'(?:www\.)?invidious\.weblibre\.org', + r'(?:www\.)?inv\.odyssey346\.dev', + r'(?:www\.)?invidious\.dhusch\.de', + r'(?:www\.)?iv\.melmac\.space', + r'(?:www\.)?watch\.thekitty\.zone', + r'(?:www\.)?invidious\.privacydev\.net', + r'(?:www\.)?ng27owmagn5amdm7l5s3rsqxwscl5ynppnis5dqcasogkyxcfqn7psid\.onion', + r'(?:www\.)?invidious\.drivet\.xyz', + r'(?:www\.)?vid\.priv\.au', + r'(?:www\.)?euxxcnhsynwmfidvhjf6uzptsmh4dipkmgdmcmxxuo7tunp3ad2jrwyd\.onion', + r'(?:www\.)?inv\.vern\.cc', + r'(?:www\.)?invidious\.esmailelbob\.xyz', + r'(?:www\.)?invidious\.sethforprivacy\.com', + r'(?:www\.)?yt\.oelrichsgarcia\.de', + r'(?:www\.)?yt\.artemislena\.eu', + r'(?:www\.)?invidious\.flokinet\.to', + r'(?:www\.)?invidious\.baczek\.me', + r'(?:www\.)?y\.com\.sb', + r'(?:www\.)?invidious\.epicsite\.xyz', + r'(?:www\.)?invidious\.lidarshield\.cloud', + r'(?:www\.)?yt\.funami\.tech', r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', r'(?:www\.)?osbivz6guyeahrwp2lnwyjk2xos342h4ocsxyqrlaopqjuhwn2djiiyd\.onion', r'(?:www\.)?u2cvlit75owumwpy4dj2hsmvkq7nvrclkpht7xgyye2pyoxhpmclkrad\.onion', @@ -390,6 +430,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?piped\.qdi\.fi', r'(?:www\.)?piped\.video', r'(?:www\.)?piped\.aeong\.one', + r'(?:www\.)?piped\.moomoo\.me', + r'(?:www\.)?piped\.chauvet\.pro', + 
r'(?:www\.)?watch\.leptons\.xyz', + r'(?:www\.)?pd\.vern\.cc', + r'(?:www\.)?piped\.hostux\.net', + r'(?:www\.)?piped\.lunar\.icu', + # Hyperpipe instances from https://hyperpipe.codeberg.page/ + r'(?:www\.)?hyperpipe\.surge\.sh', + r'(?:www\.)?hyperpipe\.esmailelbob\.xyz', + r'(?:www\.)?listen\.whatever\.social', + r'(?:www\.)?music\.adminforge\.de', ) # extracted from account/account_menu ep @@ -406,6 +457,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _IGNORED_WARNINGS = {'Unavailable videos will be hidden during playback'} + _YT_HANDLE_RE = r'@[\w.-]{3,30}' # https://support.google.com/youtube/answer/11585688?hl=en + _YT_CHANNEL_UCID_RE = r'UC[\w-]{22}' + + def ucid_or_none(self, ucid): + return self._search_regex(rf'^({self._YT_CHANNEL_UCID_RE})$', ucid, 'UC-id', default=None) + + def handle_or_none(self, handle): + return self._search_regex(rf'^({self._YT_HANDLE_RE})$', handle, '@-handle', default=None) + + def handle_from_url(self, url): + return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_HANDLE_RE})', + url, 'channel handle', default=None) + + def ucid_from_url(self, url): + return self._search_regex(rf'^(?:https?://(?:www\.)?youtube\.com)?/({self._YT_CHANNEL_UCID_RE})', + url, 'channel id', default=None) + @functools.cached_property def _preferred_lang(self): """ @@ -428,16 +496,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): cookies = self._get_cookies('https://www.youtube.com/') if cookies.get('__Secure-3PSID'): return - consent_id = None - consent = cookies.get('CONSENT') - if consent: - if 'YES' in consent.value: - return - consent_id = self._search_regex( - r'PENDING\+(\d+)', consent.value, 'consent', default=None) - if not consent_id: - consent_id = random.randint(100, 999) - self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + socs = cookies.get('SOCS') + if socs and not socs.value.startswith('CAA'): # not consented + return + self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True) # accept all (required for mixes) def _initialize_pref(self): cookies = self._get_cookies('https://www.youtube.com/') @@ -723,17 +785,26 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _extract_and_report_alerts(self, data, *args, **kwargs): return self._report_alerts(self._extract_alerts(data), *args, **kwargs) - def _extract_badges(self, renderer: dict): - privacy_icon_map = { + def _extract_badges(self, badge_list: list): + """ + Extract known BadgeType's from a list of badge renderers. 
+ @returns [{'type': BadgeType}] + """ + icon_type_map = { 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, - 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC + 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC, + 'CHECK_CIRCLE_THICK': BadgeType.VERIFIED, + 'OFFICIAL_ARTIST_BADGE': BadgeType.VERIFIED, + 'CHECK': BadgeType.VERIFIED, } badge_style_map = { 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, - 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW + 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW, + 'BADGE_STYLE_TYPE_VERIFIED': BadgeType.VERIFIED, + 'BADGE_STYLE_TYPE_VERIFIED_ARTIST': BadgeType.VERIFIED, } label_map = { @@ -741,13 +812,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'private': BadgeType.AVAILABILITY_PRIVATE, 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, 'live': BadgeType.LIVE_NOW, - 'premium': BadgeType.AVAILABILITY_PREMIUM + 'premium': BadgeType.AVAILABILITY_PREMIUM, + 'verified': BadgeType.VERIFIED, + 'official artist channel': BadgeType.VERIFIED, } badges = [] - for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer'), default=[]): + for badge in traverse_obj(badge_list, (..., lambda key, _: re.search(r'[bB]adgeRenderer$', key))): badge_type = ( - privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) + icon_type_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) or badge_style_map.get(traverse_obj(badge, 'style')) ) if badge_type: @@ -755,11 +828,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): continue # fallback, won't work in some languages - label = traverse_obj(badge, 'label', expected_type=str, default='') + label = traverse_obj( + badge, 'label', ('accessibilityData', 'label'), 'tooltip', 'iconTooltip', get_all=False, expected_type=str, default='') for match, label_badge_type in label_map.items(): if match in label.lower(): - badges.append({'type': badge_type}) - continue + badges.append({'type': label_badge_type}) + break return badges @@ -785,7 +859,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): runs = item runs = runs[:min(len(runs), max_runs or len(runs))] - text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str, default=[])) + text = ''.join(traverse_obj(runs, (..., 'text'), expected_type=str)) if text: return text @@ -805,7 +879,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """ thumbnails = [] for path in path_list or [()]: - for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...), default=[]): + for thumbnail in traverse_obj(data, (*variadic(path), 'thumbnails', ...)): thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue @@ -825,9 +899,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def extract_relative_time(relative_time_text): """ Extracts a relative time from string and converts to dt object - e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago' """ - mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text) + + # XXX: this could be moved to a general function in utils.py + # The relative time text strings are roughly the same as what + # Javascript's Intl.RelativeTimeFormat function generates. 
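The widened pattern below accepts the abbreviated units YouTube now emits ('8 yr ago', '5 min ago') alongside the long forms. As a worked example, here is a self-contained converter built on the same regex; the unit table and the month/year lengths are rough approximations for illustration, not the extractor's own conversion helper:

import datetime
import re

_UNIT_SECONDS = {  # months/years approximated as 30/365 days
    'sec': 1, 's': 1, 'min': 60, 'h': 3600, 'd': 86400,
    'w': 7 * 86400, 'mo': 30 * 86400, 'y': 365 * 86400,
}

def relative_time_to_dt(text, now=None):
    now = now or datetime.datetime.now(datetime.timezone.utc)
    mobj = re.search(
        r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*'
        r'(?P<unit>sec(?:ond)?|s|min(?:ute)?|h(?:our|r)?|d(?:ay)?|w(?:eek|k)?|mo(?:nth)?|y(?:ear|r)?)s?\s*ago',
        text)
    if not mobj:
        return None
    if mobj.group('start'):
        return {'now': now, 'today': now,
                'yesterday': now - datetime.timedelta(days=1)}[mobj.group('start')]
    # Normalize the matched unit to the short prefixes in _UNIT_SECONDS;
    # 's' is checked last so 'sec' wins for the long forms.
    unit = mobj.group('unit')
    for prefix in ('sec', 'min', 'mo', 'h', 'd', 'w', 'y', 's'):
        if unit.startswith(prefix):
            return now - datetime.timedelta(
                seconds=int(mobj.group('time')) * _UNIT_SECONDS[prefix])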
+ # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat + mobj = re.search( + r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>sec(?:ond)?|s|min(?:ute)?|h(?:our|r)?|d(?:ay)?|w(?:eek|k)?|mo(?:nth)?|y(?:ear|r)?)s?\s*ago', + relative_time_text) if mobj: start = mobj.group('start') if start: @@ -871,15 +952,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): except ExtractorError as e: if not isinstance(e.cause, network_exceptions): return self._error_or_warning(e, fatal=fatal) - elif not isinstance(e.cause, urllib.error.HTTPError): + elif not isinstance(e.cause, HTTPError): retry.error = e continue - first_bytes = e.cause.read(512) + first_bytes = e.cause.response.read(512) if not is_html(first_bytes): yt_error = try_get( self._parse_json( - self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), + self._webpage_read_content(e.cause.response, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), lambda x: x['error']['message'], str) if yt_error: self._report_alerts([('ERROR', yt_error)], fatal=False) @@ -887,7 +968,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289 # We also want to catch all other network exceptions since errors in later pages can be troublesome # See https://github.com/hypervideo/hypervideo/issues/507#issuecomment-880188210 - if e.cause.code not in (403, 429): + if e.cause.status not in (403, 429): retry.error = e continue return self._error_or_warning(e, fatal=fatal) @@ -911,7 +992,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): @staticmethod def is_music_url(url): - return re.match(r'https?://music\.youtube\.com/', url) is not None + return re.match(r'(https?://)?music\.youtube\.com/', url) is not None def _extract_video(self, renderer): video_id = renderer.get('videoId') @@ -940,11 +1021,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if not channel_id: channel_id = traverse_obj(reel_header_renderer, ('channelNavigationEndpoint', 'browseEndpoint', 'browseId')) + channel_id = self.ucid_or_none(channel_id) + overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) - badges = self._extract_badges(renderer) - + badges = self._extract_badges(traverse_obj(renderer, 'badges')) + owner_badges = self._extract_badges(traverse_obj(renderer, 'ownerBadges')) navigation_url = urljoin('https://www.youtube.com/', traverse_obj( renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) or '' @@ -968,6 +1051,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): else self._get_count({'simpleText': view_count_text})) view_count_field = 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count' + channel = (self._get_text(renderer, 'ownerText', 'shortBylineText') + or self._get_text(reel_header_renderer, 'channelTitleText')) + + channel_handle = traverse_obj(renderer, ( + 'shortBylineText', 'runs', ..., 'navigationEndpoint', + (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl'))), + expected_type=self.handle_from_url, get_all=False) return { '_type': 'url', 'ie_key': YoutubeIE.ie_key(), @@ -977,9 +1067,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'description': description, 'duration': duration, 'channel_id': channel_id, - 'channel': (self._get_text(renderer, 
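Two small fixes in the `_extract_badges` fallback above are easy to miss: the loop now appends `label_badge_type` (the matched map value) rather than the enclosing `badge_type`, which is always falsy by the time the fallback runs, and it exits with `break` instead of `continue`, so one renderer can no longer emit duplicate entries. The corrected fallback behaves like this condensed sketch (`label_map` as defined in the patch):

def badge_from_label(label, label_map):
    # First matching label wins; returning immediately mirrors the new
    # `break`, guaranteeing at most one badge per renderer from this path.
    for match, label_badge_type in label_map.items():
        if match in (label or '').lower():
            return {'type': label_badge_type}
    return None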
'ownerText', 'shortBylineText') - or self._get_text(reel_header_renderer, 'channelTitleText')), + 'channel': channel, 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None, + 'uploader': channel, + 'uploader_id': channel_handle, + 'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None), 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), 'timestamp': (self._parse_time_text(time_text) if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) @@ -993,7 +1085,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), view_count_field: view_count, - 'live_status': live_status + 'live_status': live_status, + 'channel_is_verified': True if self._has_badge(owner_badges, BadgeType.VERIFIED) else None } @@ -1012,7 +1105,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: - (?:(?:v|embed|e|shorts)/(?!videoseries|live_stream)) # v/ or embed/ or e/ or shorts/ + (?:(?:v|embed|e|shorts|live)/(?!videoseries|live_stream)) # v/ or embed/ or e/ or shorts/ |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! @@ -1181,9 +1274,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'channel': 'Philipp Hagemeister', 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', @@ -1202,7 +1292,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'start_time': 1, 'end_time': 9, 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Philipp Hagemeister', + 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', + 'uploader_id': '@PhilippHagemeister', + 'heatmap': 'count:100', } }, { @@ -1214,9 +1308,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20120608', 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', - 'uploader': 'SET India', - 'uploader_id': 'setindia', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', 'age_limit': 18, }, 'skip': 'Private video', @@ -1228,9 +1319,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', 'channel': 'Philipp Hagemeister', 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', @@ -1247,7 +1335,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'live_status': 'not_live', 'age_limit': 0, 'comment_count': int, - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader': 'Philipp Hagemeister', + 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', + 'uploader_id': '@PhilippHagemeister', + 
                'heatmap': 'count:100',
            },
            'params': {
                'skip_download': True,
@@ -1260,10 +1352,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
-                'uploader_id': '8KVIDEO',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
                'description': '',
-                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
@@ -1281,8 +1370,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
                'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
                'duration': 244,
-                'uploader': 'AfrojackVEVO',
-                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
                'abr': 129.495,
                'like_count': int,
@@ -1294,13 +1381,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'live_status': 'not_live',
                'thumbnail': 'https://i.ytimg.com/vi_webp/IB3lcPjvWLA/maxresdefault.webp',
                'channel': 'Afrojack',
-                'uploader_url': 'http://www.youtube.com/user/AfrojackVEVO',
                'tags': 'count:19',
                'availability': 'public',
                'categories': ['Music'],
                'age_limit': 0,
                'alt_title': 'The Spark',
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'Afrojack',
+                'uploader_url': 'https://www.youtube.com/@Afrojack',
+                'uploader_id': '@Afrojack',
            },
            'params': {
                'youtube_include_dash_manifest': True,
@@ -1317,9 +1406,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
                'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
                'duration': 142,
-                'uploader': 'The Witcher',
-                'uploader_id': 'WitcherGame',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
                'upload_date': '20140605',
                'age_limit': 18,
                'categories': ['Gaming'],
@@ -1333,7 +1419,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'channel_id': 'UCzybXLxv08IApdjdN0mJhEg',
                'playable_in_embed': True,
                'view_count': int,
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'The Witcher',
+                'uploader_url': 'https://www.youtube.com/@thewitcher',
+                'uploader_id': '@thewitcher',
+                'comment_count': int,
+                'channel_is_verified': True,
+                'heatmap': 'count:100',
            },
        },
        {
@@ -1345,12 +1437,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'Godzilla 2 (Official Video)',
                'description': 'md5:bf77e03fcae5529475e500129b05668a',
                'upload_date': '20200408',
-                'uploader_id': 'FlyingKitty900',
-                'uploader': 'FlyingKitty',
                'age_limit': 18,
                'availability': 'needs_auth',
                'channel_id': 'UCYQT13AtrJC0gsM1far_zJg',
-                'uploader_url': 'http://www.youtube.com/user/FlyingKitty900',
                'channel': 'FlyingKitty',
                'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg',
                'view_count': int,
@@ -1361,7 +1450,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'like_count': int,
                'duration': 177,
                'playable_in_embed': True,
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'FlyingKitty',
+                'uploader_url': 'https://www.youtube.com/@FlyingKitty900',
+                'uploader_id': '@FlyingKitty900',
+                'comment_count': int,
+                'channel_is_verified': True,
            },
        },
        {
@@ -1372,13 +1466,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
                'ext': 'mp4',
                'upload_date': '20191228',
-                'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
-                'uploader': 'Projekt Melody',
                'description': 'md5:17eccca93a786d51bc67646756894066',
                'age_limit': 18,
                'like_count': int,
                'availability': 'needs_auth',
-                'uploader_url': 'http://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
                'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
                'view_count': int,
                'thumbnail': 'https://i.ytimg.com/vi_webp/Tq92D6wQ1mg/sddefault.webp',
@@ -1390,7 +1481,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'duration': 106,
                'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
                'comment_count': int,
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'Projekt Melody',
+                'uploader_url': 'https://www.youtube.com/@ProjektMelody',
+                'uploader_id': '@ProjektMelody',
            },
        },
        {
@@ -1400,8 +1494,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'id': 'MeJVWBSsPAY',
                'ext': 'mp4',
                'title': 'OOMPH! - Such Mich Find Mich (Lyrics)',
-                'uploader': 'Herr Lurik',
-                'uploader_id': 'st3in234',
                'description': 'Fan Video. Music & Lyrics by OOMPH!.',
                'upload_date': '20130730',
                'track': 'Such mich find mich',
@@ -1418,11 +1510,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'channel_id': 'UCdR3RSDPqub28LjZx0v9-aA',
                'categories': ['Music'],
                'availability': 'public',
-                'uploader_url': 'http://www.youtube.com/user/st3in234',
                'channel_url': 'https://www.youtube.com/channel/UCdR3RSDPqub28LjZx0v9-aA',
                'live_status': 'not_live',
                'artist': 'OOMPH!',
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'Herr Lurik',
+                'uploader_url': 'https://www.youtube.com/@HerrLurik',
+                'uploader_id': '@HerrLurik',
            },
        },
        {
@@ -1439,11 +1533,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'ext': 'mp4',
                'duration': 266,
                'upload_date': '20100430',
-                'uploader_id': 'deadmau5',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
                'creator': 'deadmau5',
                'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
-                'uploader': 'deadmau5',
                'title': 'Deadmau5 - Some Chords (HD)',
                'alt_title': 'Some Chords',
                'availability': 'public',
@@ -1461,7 +1552,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'channel_url': 'https://www.youtube.com/channel/UCYEK6xds6eo-3tr4xRdflmQ',
                'categories': ['Music'],
                'album': 'Some Chords',
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'deadmau5',
+                'uploader_url': 'https://www.youtube.com/@deadmau5',
+                'uploader_id': '@deadmau5',
            },
            'expected_warnings': [
                'DASH manifest missing',
@@ -1475,10 +1569,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'ext': 'mp4',
                'duration': 6085,
                'upload_date': '20150827',
-                'uploader_id': 'olympic',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
                'description': 'md5:04bbbf3ccceb6795947572ca36f45904',
-                'uploader': 'Olympics',
                'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
                'like_count': int,
                'release_timestamp': 1343767800,
@@ -1494,7 +1585,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'live_status': 'was_live',
                'view_count': int,
                'channel_url': 'https://www.youtube.com/channel/UCTl3QQTvqHFjurroKxexy2Q',
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'Olympics',
+                'uploader_url': 'https://www.youtube.com/@Olympics',
+                'uploader_id': '@Olympics',
+                'channel_is_verified': True,
            },
            'params': {
                'skip_download': 'requires avconv',
@@ -1509,10 +1604,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'stretched_ratio': 16 / 9.,
                'duration': 85,
                'upload_date': '20110310',
-                'uploader_id': 'AllenMeow',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
                'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
-                'uploader': '孫ᄋᄅ',
                'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
                'playable_in_embed': True,
                'channel': '孫ᄋᄅ',
@@ -1527,7 +1619,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'live_status': 'not_live',
                'availability': 'unlisted',
                'comment_count': int,
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': '孫ᄋᄅ',
+                'uploader_url': 'https://www.youtube.com/@AllenMeow',
+                'uploader_id': '@AllenMeow',
            },
        },
        # url_encoded_fmt_stream_map is empty string
@@ -1539,8 +1634,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
                'description': '',
                'upload_date': '20150404',
-                'uploader_id': 'spbelect',
-                'uploader': 'Наблюдатели Петербурга',
            },
            'params': {
                'skip_download': 'requires avconv',
@@ -1557,9 +1650,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'description': 'md5:116377fd2963b81ec4ce64b542173306',
                'duration': 220,
                'upload_date': '20150625',
-                'uploader_id': 'dorappi2000',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
-                'uploader': 'dorappi2000',
                'formats': 'mincount:31',
            },
            'skip': 'not actual anymore',
@@ -1572,9 +1662,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'id': 'CsmdDsKjzN8',
                'ext': 'mp4',
                'upload_date': '20150501',  # According to '<meta itemprop="datePublished"', but in other places it's 20150510
-                'uploader': 'Airtek',
                'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
-                'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
                'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
            },
            'params': {
@@ -1585,6 +1673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        },
        {
            # Multifeed videos (multiple cameras), URL can be of any Camera
+            # TODO: fix multifeed titles
            'url': 'https://www.youtube.com/watch?v=zaPI8MvL8pg',
            'info_dict': {
                'id': 'zaPI8MvL8pg',
@@ -1596,16 +1685,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    'id': 'j5yGuxZ8lLU',
                    'ext': 'mp4',
                    'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04 (Chris)',
-                    'uploader': 'WiiLikeToPlay',
                    'description': 'md5:563ccbc698b39298481ca3c571169519',
-                    'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray',
                    'duration': 10120,
                    'channel_follower_count': int,
                    'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg',
                    'availability': 'public',
                    'playable_in_embed': True,
                    'upload_date': '20131105',
-                    'uploader_id': 'WiiRikeToPray',
                    'categories': ['Gaming'],
                    'live_status': 'was_live',
                    'tags': 'count:24',
@@ -1618,17 +1704,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    'channel': 'WiiLikeToPlay',
                    'view_count': int,
                    'release_date': '20131106',
+                    'uploader': 'WiiLikeToPlay',
+                    'uploader_id': '@WLTP',
+                    'uploader_url': 'https://www.youtube.com/@WLTP',
                },
            }, {
                'info_dict': {
                    'id': 'zaPI8MvL8pg',
                    'ext': 'mp4',
                    'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04 (Tyson)',
-                    'uploader_id': 'WiiRikeToPray',
                    'availability': 'public',
                    'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg',
                    'channel': 'WiiLikeToPlay',
-                    'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray',
                    'channel_follower_count': int,
                    'description': 'md5:563ccbc698b39298481ca3c571169519',
                    'duration': 10108,
@@ -1636,7 +1723,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    'like_count': int,
                    'tags': 'count:24',
                    'channel_id': 'UCN2XePorRokPB9TEgRZpddg',
-                    'uploader': 'WiiLikeToPlay',
                    'release_timestamp': 1383701915,
                    'comment_count': int,
                    'upload_date': '20131105',
@@ -1646,6 +1732,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    'live_status': 'was_live',
                    'categories': ['Gaming'],
                    'view_count': int,
+                    'uploader': 'WiiLikeToPlay',
+                    'uploader_id': '@WLTP',
+                    'uploader_url': 'https://www.youtube.com/@WLTP',
                },
            }, {
                'info_dict': {
@@ -1659,12 +1748,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    'playable_in_embed': True,
                    'upload_date': '20131105',
                    'description': 'md5:563ccbc698b39298481ca3c571169519',
-                    'uploader_id': 'WiiRikeToPray',
-                    'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray',
                    'channel_follower_count': int,
                    'tags': 'count:24',
                    'release_date': '20131106',
-                    'uploader': 'WiiLikeToPlay',
                    'comment_count': int,
                    'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg',
                    'channel': 'WiiLikeToPlay',
@@ -1674,6 +1760,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    'age_limit': 0,
                    'duration': 10128,
                    'view_count': int,
+                    'uploader': 'WiiLikeToPlay',
+                    'uploader_id': '@WLTP',
+                    'uploader_url': 'https://www.youtube.com/@WLTP',
                },
            }],
            'params': {'skip_download': True},
@@ -1710,9 +1799,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
                'duration': 133,
                'upload_date': '20151119',
-                'uploader_id': 'IronSoulElf',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
-                'uploader': 'IronSoulElf',
                'creator': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
                'track': 'Dark Walk',
                'artist': 'Todd Haberman;\nDaniel Law Heath and Aaron Kaplan',
@@ -1749,8 +1835,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
                'description': 'md5:ee18a25c350637c8faff806845bddee9',
                'upload_date': '20151107',
-                'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
-                'uploader': 'CH GAMER DROID',
            },
            'params': {
                'skip_download': True,
@@ -1772,9 +1856,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'description': 'md5:a677553cf0840649b731a3024aeff4cc',
                'duration': 721,
                'upload_date': '20150128',
-                'uploader_id': 'BerkmanCenter',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
-                'uploader': 'The Berkman Klein Center for Internet & Society',
                'license': 'Creative Commons Attribution license (reuse allowed)',
                'channel_id': 'UCuLGmD72gJDBwmLw06X58SA',
                'channel_url': 'https://www.youtube.com/channel/UCuLGmD72gJDBwmLw06X58SA',
@@ -1788,16 +1869,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'thumbnail': 'https://i.ytimg.com/vi_webp/M4gD1WSo5mA/maxresdefault.webp',
                'live_status': 'not_live',
                'playable_in_embed': True,
-                'comment_count': int,
                'channel_follower_count': int,
                'chapters': list,
+                'uploader': 'The Berkman Klein Center for Internet & Society',
+                'uploader_id': '@BKCHarvard',
+                'uploader_url': 'https://www.youtube.com/@BKCHarvard',
            },
            'params': {
                'skip_download': True,
            },
        },
        {
-            # Channel-like uploader_url
            'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
            'info_dict': {
                'id': 'eQcmzGIKrzg',
@@ -1806,9 +1888,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
                'duration': 4060,
                'upload_date': '20151120',
-                'uploader': 'Bernie Sanders',
-                'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
                'license': 'Creative Commons Attribution license (reuse allowed)',
                'playable_in_embed': True,
                'tags': 'count:12',
@@ -1825,6 +1904,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'comment_count': int,
                'channel_follower_count': int,
                'chapters': list,
+                'uploader': 'Bernie Sanders',
+                'uploader_url': 'https://www.youtube.com/@BernieSanders',
+                'uploader_id': '@BernieSanders',
+                'channel_is_verified': True,
+                'heatmap': 'count:100',
            },
            'params': {
                'skip_download': True,
@@ -1848,9 +1932,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'Piku - Trailer',
                'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
                'upload_date': '20150811',
-                'uploader': 'FlixMatrix',
-                'uploader_id': 'FlixMatrixKaravan',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
                'license': 'Standard YouTube License',
            },
            'params': {
@@ -1868,9 +1949,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
                'duration': 2085,
                'upload_date': '20170118',
-                'uploader': 'Vsauce',
-                'uploader_id': 'Vsauce',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
                'series': 'Mind Field',
                'season_number': 1,
                'episode_number': 1,
@@ -1888,7 +1966,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'like_count': int,
                'playable_in_embed': True,
                'live_status': 'not_live',
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'Vsauce',
+                'uploader_url': 'https://www.youtube.com/@Vsauce',
+                'uploader_id': '@Vsauce',
+                'comment_count': int,
+                'channel_is_verified': True,
            },
            'params': {
                'skip_download': True,
@@ -1908,9 +1991,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
                'duration': 965,
                'upload_date': '20140124',
-                'uploader': 'New Century Foundation',
-                'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
            },
            'params': {
                'skip_download': True,
@@ -1955,9 +2035,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
                'duration': 433,
                'upload_date': '20130923',
-                'uploader': 'Amelia Putri Harwita',
-                'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
                'formats': 'maxcount:10',
            },
            'params': {
@@ -1968,6 +2045,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        },
        {
            # Youtube Music Auto-generated description
+            # TODO: fix metadata extraction
            'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
            'info_dict': {
                'id': 'MgNrAu2pzNs',
@@ -1975,8 +2053,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'Voyeur Girl',
                'description': 'md5:7ae382a65843d6df2685993e90a8628f',
                'upload_date': '20190312',
-                'uploader': 'Stephen - Topic',
-                'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
                'artist': 'Stephen',
                'track': 'Voyeur Girl',
                'album': 'it\'s too much love to know my dear',
@@ -1984,12 +2060,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'release_year': 2019,
                'alt_title': 'Voyeur Girl',
                'view_count': int,
-                'uploader_url': 'http://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
                'playable_in_embed': True,
                'like_count': int,
                'categories': ['Music'],
                'channel_url': 'https://www.youtube.com/channel/UC-pWHpBjdGG69N9mM2auIAA',
-                'channel': 'Stephen',
+                'channel': 'Stephen',  # TODO: should be "Stephen - Topic"
+                'uploader': 'Stephen',
                'availability': 'public',
                'creator': 'Stephen',
                'duration': 169,
@@ -2017,9 +2093,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
                'description': 'md5:bf577a41da97918e94fa9798d9228825',
                'upload_date': '20090125',
-                'uploader': 'Prochorowka',
-                'uploader_id': 'Prochorowka',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
                'artist': 'Panjabi MC',
                'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
                'album': 'Beware of the Boys (Mundian To Bach Ke)',
@@ -2038,11 +2111,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'IMG 3456',
                'description': '',
                'upload_date': '20170613',
-                'uploader_id': 'ElevageOrVert',
-                'uploader': 'ElevageOrVert',
                'view_count': int,
                'thumbnail': 'https://i.ytimg.com/vi_webp/x41yOUIvK2k/maxresdefault.webp',
-                'uploader_url': 'http://www.youtube.com/user/ElevageOrVert',
                'like_count': int,
                'channel_id': 'UCo03ZQPBW5U4UC3regpt1nw',
                'tags': [],
@@ -2053,8 +2123,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'duration': 7,
                'playable_in_embed': True,
                'live_status': 'not_live',
-                'channel': 'ElevageOrVert',
-                'channel_follower_count': int
+                'channel': 'l\'Or Vert asbl',
+                'channel_follower_count': int,
+                'uploader': 'l\'Or Vert asbl',
+                'uploader_url': 'https://www.youtube.com/@ElevageOrVert',
+                'uploader_id': '@ElevageOrVert',
            },
            'params': {
                'skip_download': True,
@@ -2072,11 +2145,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'Part 77 Sort a list of simple types in c#',
                'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
                'upload_date': '20130831',
-                'uploader_id': 'kudvenkat',
-                'uploader': 'kudvenkat',
                'channel_id': 'UCCTVrRB5KpIiK6V2GGVsR1Q',
                'like_count': int,
-                'uploader_url': 'http://www.youtube.com/user/kudvenkat',
                'channel_url': 'https://www.youtube.com/channel/UCCTVrRB5KpIiK6V2GGVsR1Q',
                'live_status': 'not_live',
                'categories': ['Education'],
@@ -2091,6 +2161,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'comment_count': int,
                'channel_follower_count': int,
                'chapters': list,
+                'uploader': 'kudvenkat',
+                'uploader_url': 'https://www.youtube.com/@Csharp-video-tutorialsBlogspot',
+                'uploader_id': '@Csharp-video-tutorialsBlogspot',
+                'channel_is_verified': True,
+                'heatmap': 'count:100',
            },
            'params': {
                'skip_download': True,
@@ -2114,9 +2189,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': 'Burn Out',
                'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
                'upload_date': '20141120',
-                'uploader': 'The Cinematic Orchestra - Topic',
-                'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
                'artist': 'The Cinematic Orchestra',
                'track': 'Burn Out',
                'album': 'Every Day',
@@ -2135,7 +2207,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'thumbnail': 'https://i.ytimg.com/vi/OtqTfy26tG0/maxresdefault.jpg',
                'categories': ['Music'],
                'playable_in_embed': True,
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'The Cinematic Orchestra',
+                'comment_count': int,
            },
            'params': {
                'skip_download': True,
@@ -2154,13 +2228,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'ext': 'mp4',
                'title': 'San Diego teen commits suicide after bullying over embarrassing video',
                'channel_id': 'UC-SJ6nODDmufqBzPBwCvYvQ',
-                'uploader': 'CBS Mornings',
-                'uploader_id': 'CBSThisMorning',
                'upload_date': '20140716',
                'description': 'md5:acde3a73d3f133fc97e837a9f76b53b7',
                'duration': 170,
                'categories': ['News & Politics'],
-                'uploader_url': 'http://www.youtube.com/user/CBSThisMorning',
                'view_count': int,
                'channel': 'CBS Mornings',
                'tags': ['suicide', 'bullying', 'video', 'cbs', 'news'],
@@ -2171,7 +2242,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'like_count': int,
                'live_status': 'not_live',
                'playable_in_embed': True,
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'CBS Mornings',
+                'uploader_url': 'https://www.youtube.com/@CBSMornings',
+                'uploader_id': '@CBSMornings',
+                'comment_count': int,
+                'channel_is_verified': True,
            }
        },
        {
@@ -2183,9 +2259,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
                'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
                'upload_date': '20201120',
-                'uploader': 'Walk around Japan',
-                'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
-                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
                'duration': 1456,
                'categories': ['Travel & Events'],
                'channel_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
@@ -2198,7 +2271,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'channel_url': 'https://www.youtube.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
                'live_status': 'not_live',
                'playable_in_embed': True,
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'Walk around Japan',
+                'uploader_url': 'https://www.youtube.com/@walkaroundjapan7124',
+                'uploader_id': '@walkaroundjapan7124',
            },
            'params': {
                'skip_download': True,
@@ -2224,13 +2300,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'ext': '3gp',
                'upload_date': '20210624',
                'channel_id': 'UCp68_FLety0O-n9QU6phsgw',
-                'uploader': 'colinfurze',
-                'uploader_id': 'colinfurze',
                'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCp68_FLety0O-n9QU6phsgw',
                'description': 'md5:5d5991195d599b56cd0c4148907eec50',
                'duration': 596,
                'categories': ['Entertainment'],
-                'uploader_url': 'http://www.youtube.com/user/colinfurze',
                'view_count': int,
                'channel': 'colinfurze',
                'tags': ['Colin', 'furze', 'Terry', 'tunnel', 'underground', 'bunker'],
@@ -2242,6 +2315,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'playable_in_embed': True,
                'channel_follower_count': int,
                'chapters': list,
+                'uploader': 'colinfurze',
+                'uploader_url': 'https://www.youtube.com/@colinfurze',
+                'uploader_id': '@colinfurze',
+                'comment_count': int,
+                'channel_is_verified': True,
+                'heatmap': 'count:100',
            },
            'params': {
                'format': '17',  # 3gp format available on android
@@ -2267,10 +2346,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'ext': 'mhtml',
                'format_id': 'sb0',
                'title': 'Your Brain is Plastic',
-                'uploader_id': 'scishow',
                'description': 'md5:89cd86034bdb5466cd87c6ba206cd2bc',
                'upload_date': '20140324',
-                'uploader': 'SciShow',
                'like_count': int,
                'channel_id': 'UCZYTClx2T1of7BRZ86-8fow',
                'channel_url': 'https://www.youtube.com/channel/UCZYTClx2T1of7BRZ86-8fow',
@@ -2278,7 +2355,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'thumbnail': 'https://i.ytimg.com/vi/5KLPxDtMqe8/maxresdefault.jpg',
                'playable_in_embed': True,
                'tags': 'count:12',
-                'uploader_url': 'http://www.youtube.com/user/scishow',
                'availability': 'public',
                'channel': 'SciShow',
                'live_status': 'not_live',
@@ -2287,6 +2363,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'age_limit': 0,
                'channel_follower_count': int,
                'chapters': list,
+                'uploader': 'SciShow',
+                'uploader_url': 'https://www.youtube.com/@SciShow',
+                'uploader_id': '@SciShow',
+                'comment_count': int,
+                'channel_is_verified': True,
+                'heatmap': 'count:100',
            },
            'params': {'format': 'mhtml', 'skip_download': True}
        }, {
            # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939)
@@ -2296,9 +2378,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'ext': 'mp4',
                'title': 'The NP that test your phone performance 🙂',
                'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d',
-                'uploader': 'Leon Nguyen',
-                'uploader_id': 'VNSXIII',
-                'uploader_url': 'http://www.youtube.com/user/VNSXIII',
                'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA',
                'channel_url': 'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA',
                'duration': 21,
@@ -2314,7 +2393,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'channel': 'Leon Nguyen',
                'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp',
                'comment_count': int,
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'Leon Nguyen',
+                'uploader_url': 'https://www.youtube.com/@LeonNguyen',
+                'uploader_id': '@LeonNguyen',
+                'heatmap': 'count:100',
            }
        }, {
            # Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date
@@ -2324,9 +2407,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'ext': 'mp4',
                'title': 'The NP that test your phone performance 🙂',
                'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d',
-                'uploader': 'Leon Nguyen',
-                'uploader_id': 'VNSXIII',
-                'uploader_url': 'http://www.youtube.com/user/VNSXIII',
                'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA',
                'channel_url': 'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA',
                'duration': 21,
@@ -2342,7 +2422,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'channel': 'Leon Nguyen',
                'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp',
                'comment_count': int,
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'Leon Nguyen',
+                'uploader_url': 'https://www.youtube.com/@LeonNguyen',
+                'uploader_id': '@LeonNguyen',
+                'heatmap': 'count:100',
            },
            'params': {'compat_opts': ['no-youtube-prefer-utc-upload-date']}
        }, {
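
Aside (not part of the patch): the two Leon Nguyen tests above pin down the UTC handling of upload_date and its compat escape hatch. The epoch second 1641170939 quoted in the test comment falls on 2022-01-03 in UTC but still on 2022-01-02 in US timezones, which is exactly the difference between the two expected values. A minimal worked example in plain Python:

    from datetime import datetime, timedelta, timezone

    ts = 1641170939  # upload time quoted in the test comment above

    # UTC conversion: the behaviour the extractor now prefers by default
    print(datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y%m%d'))  # 20220103

    # A US-Pacific conversion lands on the previous calendar day; the
    # 'no-youtube-prefer-utc-upload-date' compat opt restores this result
    print(datetime.fromtimestamp(ts, tz=timezone(timedelta(hours=-8))).strftime('%Y%m%d'))  # 20220102
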
@@ -2352,10 +2436,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'id': 'mzZzzBU6lrM',
                'ext': 'mp4',
                'title': 'I Met GeorgeNotFound In Real Life...',
-                'description': 'md5:cca98a355c7184e750f711f3a1b22c84',
-                'uploader': 'Quackity',
-                'uploader_id': 'QuackityHQ',
-                'uploader_url': 'http://www.youtube.com/user/QuackityHQ',
+                'description': 'md5:978296ec9783a031738b684d4ebf302d',
                'channel_id': 'UC_8NknAFiyhOUaZqHR3lq3Q',
                'channel_url': 'https://www.youtube.com/channel/UC_8NknAFiyhOUaZqHR3lq3Q',
                'duration': 955,
@@ -2372,7 +2453,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'availability': 'public',
                'channel': 'Quackity',
                'thumbnail': 'https://i.ytimg.com/vi/mzZzzBU6lrM/maxresdefault.jpg',
-                'channel_follower_count': int
+                'channel_follower_count': int,
+                'uploader': 'Quackity',
+                'uploader_id': '@Quackity',
+                'uploader_url': 'https://www.youtube.com/@Quackity',
+                'comment_count': int,
+                'channel_is_verified': True,
+                'heatmap': 'count:100',
            }
        }, {
            # continuous livestream. Microformat upload date should be preferred.
            'url': 'https://www.youtube.com/watch?v=kgx4WGK0oNU',
            'info_dict': {
                'id': 'kgx4WGK0oNU',
                'title': r're:jazz\W{0,3}r&b.+',
                'ext': 'mp4',
                'channel_id': 'UC84whx2xxsiA1gXHXXqKGOA',
@@ -2390,48 +2477,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'channel_url': 'https://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA',
                'live_status': 'is_live',
                'thumbnail': 'https://i.ytimg.com/vi/kgx4WGK0oNU/maxresdefault.jpg',
-                'uploader': '阿鲍Abao',
-                'uploader_url': 'http://www.youtube.com/channel/UC84whx2xxsiA1gXHXXqKGOA',
                'channel': 'Abao in Tokyo',
                'channel_follower_count': int,
                'release_date': '20211127',
                'tags': 'count:39',
                'categories': ['People & Blogs'],
                'like_count': int,
-                'uploader_id': 'UC84whx2xxsiA1gXHXXqKGOA',
                'view_count': int,
                'playable_in_embed': True,
                'description': 'md5:2ef1d002cad520f65825346e2084e49d',
                'concurrent_view_count': int,
+                'uploader': 'Abao in Tokyo',
+                'uploader_url': 'https://www.youtube.com/@abaointokyo',
+                'uploader_id': '@abaointokyo',
            },
            'params': {'skip_download': True}
        }, {
-            # Story. Requires specific player params to work.
-            'url': 'https://www.youtube.com/watch?v=vv8qTUWmulI',
-            'info_dict': {
-                'id': 'vv8qTUWmulI',
-                'ext': 'mp4',
-                'availability': 'unlisted',
-                'view_count': int,
-                'channel_id': 'UCzIZ8HrzDgc-pNQDUG6avBA',
-                'upload_date': '20220526',
-                'categories': ['Education'],
-                'title': 'Story',
-                'channel': 'IT\'S HISTORY',
-                'description': '',
-                'uploader_id': 'BlastfromthePast',
-                'duration': 12,
-                'uploader': 'IT\'S HISTORY',
-                'playable_in_embed': True,
-                'age_limit': 0,
-                'live_status': 'not_live',
-                'tags': [],
-                'thumbnail': 'https://i.ytimg.com/vi_webp/vv8qTUWmulI/maxresdefault.webp',
-                'uploader_url': 'http://www.youtube.com/user/BlastfromthePast',
-                'channel_url': 'https://www.youtube.com/channel/UCzIZ8HrzDgc-pNQDUG6avBA',
-            },
-            'skip': 'stories get removed after some period of time',
-        }, {
            'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA',
            'info_dict': {
                'id': 'tjjjtzRLHvA',
@@ -2440,11 +2501,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'upload_date': '20220323',
                'like_count': int,
                'availability': 'unlisted',
-                'channel': 'nao20010128nao',
-                'thumbnail': 'https://i.ytimg.com/vi_webp/tjjjtzRLHvA/maxresdefault.webp',
+                'channel': 'Lesmiscore',
+                'thumbnail': r're:^https?://.*\.jpg',
                'age_limit': 0,
-                'uploader': 'nao20010128nao',
-                'uploader_id': 'nao20010128nao',
                'categories': ['Music'],
                'view_count': int,
                'description': '',
@@ -2455,7 +2514,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'channel_follower_count': int,
                'duration': 6,
                'tags': [],
-                'uploader_url': 'http://www.youtube.com/user/nao20010128nao',
+                'uploader_id': '@lesmiscore',
+                'uploader': 'Lesmiscore',
+                'uploader_url': 'https://www.youtube.com/@lesmiscore',
            }
        }, {
            # Prefer primary title+description language metadata by default
@@ -2473,16 +2534,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'like_count': int,
                'playable_in_embed': True,
                'availability': 'unlisted',
-                'thumbnail': 'https://i.ytimg.com/vi_webp/el3E4MbxRqQ/maxresdefault.webp',
+                'thumbnail': r're:^https?://.*\.jpg',
                'age_limit': 0,
                'duration': 5,
-                'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
-                'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
                'live_status': 'not_live',
                'upload_date': '20220908',
                'categories': ['People & Blogs'],
-                'uploader': 'cole-dlp-test-acc',
                'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
+                'uploader_url': 'https://www.youtube.com/@coletdjnz',
+                'uploader_id': '@coletdjnz',
+                'uploader': 'cole-dlp-test-acc',
            },
            'params': {'skip_download': True}
        }, {
@@ -2497,18 +2558,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'live_status': 'not_live',
                'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
                'upload_date': '20220728',
-                'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA',
                'view_count': int,
                'categories': ['People & Blogs'],
-                'thumbnail': 'https://i.ytimg.com/vi_webp/gHKT4uU8Zng/maxresdefault.webp',
+                'thumbnail': r're:^https?://.*\.jpg',
                'title': 'dlp test video title translated (fr)',
                'availability': 'public',
-                'uploader': 'cole-dlp-test-acc',
                'age_limit': 0,
                'description': 'dlp test video description translated (fr)',
                'playable_in_embed': True,
                'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
-                'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA',
+                'uploader_url': 'https://www.youtube.com/@coletdjnz',
+                'uploader_id': '@coletdjnz',
+                'uploader': 'cole-dlp-test-acc',
            },
            'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}},
            'expected_warnings': [r'Preferring "fr" translated fields'],
@@ -2524,7 +2585,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'ext': 'mp4',
                'categories': ['Entertainment'],
                'description': 'md5:e8031ff6e426cdb6a77670c9b81f6fa6',
-                'uploader_url': 'http://www.youtube.com/user/MrBeast6000',
                'live_status': 'not_live',
                'duration': 937,
                'channel_follower_count': int,
@@ -2534,17 +2594,118 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'playable_in_embed': True,
                'view_count': int,
                'upload_date': '20221112',
-                'uploader': 'MrBeast',
-                'uploader_id': 'MrBeast6000',
                'channel_url': 'https://www.youtube.com/channel/UCX6OQ3DkcsbYNE6H8uQQuVA',
                'age_limit': 0,
                'availability': 'public',
                'channel_id': 'UCX6OQ3DkcsbYNE6H8uQQuVA',
                'like_count': int,
                'tags': [],
+                'uploader': 'MrBeast',
+                'uploader_url': 'https://www.youtube.com/@MrBeast',
+                'uploader_id': '@MrBeast',
+                'comment_count': int,
+                'channel_is_verified': True,
+                'heatmap': 'count:100',
            },
            'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'},
-        }
+        }, {
+            'note': 'Audio formats with Dynamic Range Compression',
+            'url': 'https://www.youtube.com/watch?v=Tq92D6wQ1mg',
+            'info_dict': {
+                'id': 'Tq92D6wQ1mg',
+                'ext': 'webm',
+                'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
+                'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
+                'channel_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
+                'channel_follower_count': int,
+                'description': 'md5:17eccca93a786d51bc67646756894066',
+                'upload_date': '20191228',
+                'tags': ['mmd', 'dance', 'mikumikudance', 'kpop', 'vtuber'],
+                'playable_in_embed': True,
+                'like_count': int,
+                'categories': ['Entertainment'],
+                'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg',
+                'age_limit': 18,
+                'channel': 'Projekt Melody',
+                'view_count': int,
+                'availability': 'needs_auth',
+                'comment_count': int,
+                'live_status': 'not_live',
+                'duration': 106,
+                'uploader': 'Projekt Melody',
+                'uploader_id': '@ProjektMelody',
+                'uploader_url': 'https://www.youtube.com/@ProjektMelody',
+            },
+            'params': {'extractor_args': {'youtube': {'player_client': ['tv_embedded']}}, 'format': '251-drc'},
+        },
+        {
+            'url': 'https://www.youtube.com/live/qVv6vCqciTM',
+            'info_dict': {
+                'id': 'qVv6vCqciTM',
+                'ext': 'mp4',
+                'age_limit': 0,
+                'comment_count': int,
+                'chapters': 'count:13',
+                'upload_date': '20221223',
+                'thumbnail': 'https://i.ytimg.com/vi/qVv6vCqciTM/maxresdefault.jpg',
+                'channel_url': 'https://www.youtube.com/channel/UCIdEIHpS0TdkqRkHL5OkLtA',
+                'like_count': int,
+                'release_date': '20221223',
+                'tags': ['Vtuber', '月ノ美兎', '名取さな', 'にじさんじ', 'クリスマス', '3D配信'],
+                'title': '【 #インターネット女クリスマス 】3Dで歌ってはしゃぐインターネットの女たち【月ノ美兎/名取さな】',
+                'view_count': int,
+                'playable_in_embed': True,
+                'duration': 4438,
+                'availability': 'public',
+                'channel_follower_count': int,
+                'channel_id': 'UCIdEIHpS0TdkqRkHL5OkLtA',
+                'categories': ['Entertainment'],
+                'live_status': 'was_live',
+                'release_timestamp': 1671793345,
+                'channel': 'さなちゃんねる',
+                'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d',
+                'uploader': 'さなちゃんねる',
+                'uploader_url': 'https://www.youtube.com/@sana_natori',
+                'uploader_id': '@sana_natori',
+                'channel_is_verified': True,
+                'heatmap': 'count:100',
+            },
+        },
+        {
+            # Fallbacks when webpage and web client is unavailable
+            'url': 'https://www.youtube.com/watch?v=wSSmNUl9Snw',
+            'info_dict': {
+                'id': 'wSSmNUl9Snw',
+                'ext': 'mp4',
+                # 'categories': ['Science & Technology'],
+                'view_count': int,
+                'chapters': 'count:2',
+                'channel': 'Scott Manley',
+                'like_count': int,
+                'age_limit': 0,
+                # 'availability': 'public',
+                'channel_follower_count': int,
+                'live_status': 'not_live',
+                'upload_date': '20170831',
+                'duration': 682,
+                'tags': 'count:8',
+                'uploader_url': 'https://www.youtube.com/@scottmanley',
+                'description': 'md5:f4bed7b200404b72a394c2f97b782c02',
+                'uploader': 'Scott Manley',
+                'uploader_id': '@scottmanley',
+                'title': 'The Computer Hack That Saved Apollo 14',
+                'channel_id': 'UCxzC4EngIsMrPmbm6Nxvb-A',
+                'thumbnail': r're:^https?://.*\.webp',
+                'channel_url': 'https://www.youtube.com/channel/UCxzC4EngIsMrPmbm6Nxvb-A',
+                'playable_in_embed': True,
+                'comment_count': int,
+                'channel_is_verified': True,
+                'heatmap': 'count:100',
+            },
+            'params': {
+                'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}},
+            },
+        },
    ]

    _WEBPAGE_TESTS = [
@@ -2558,8 +2719,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'title': 'Feynman: Mirrors FUN TO IMAGINE 6',
            'upload_date': '20080526',
            'description': 'md5:873c81d308b979f0e23ee7e620b312a3',
-            'uploader': 'Christopher Sykes',
-            'uploader_id': 'ChristopherJSykes',
            'age_limit': 0,
            'tags': ['feynman', 'mirror', 'science', 'physics', 'imagination', 'fun', 'cool', 'puzzle'],
            'channel_id': 'UCCeo--lls1vna5YJABWAcVA',
@@ -2575,7 +2734,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'view_count': int,
            'categories': ['Science & Technology'],
            'channel_follower_count': int,
-            'uploader_url': 'http://www.youtube.com/user/ChristopherJSykes',
+            'uploader': 'Christopher Sykes',
+            'uploader_url': 'https://www.youtube.com/@ChristopherSykesDocumentaries',
+            'uploader_id': '@ChristopherSykesDocumentaries',
+            'heatmap': 'count:100',
        },
        'params': {
            'skip_download': True,
@@ -2608,11 +2770,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                return
            _, _, prs, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url)
-            video_details = traverse_obj(
-                prs, (..., 'videoDetails'), expected_type=dict, default=[])
+            video_details = traverse_obj(prs, (..., 'videoDetails'), expected_type=dict)
            microformats = traverse_obj(
                prs, (..., 'microformat', 'playerMicroformatRenderer'),
-                expected_type=dict, default=[])
+                expected_type=dict)
            _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
            is_live = live_status == 'is_live'
            start_time = time.time()
@@ -2621,18 +2782,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            """
            @returns (manifest_url, manifest_stream_number, is_live) or None
            """
-            with lock:
-                refetch_manifest(format_id, delay)
-
-            f = next((f for f in formats if f['format_id'] == format_id), None)
-            if not f:
-                if not is_live:
-                    self.to_screen(f'{video_id}: Video is no longer live')
-                else:
-                    self.report_warning(
-                        f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}')
-                return None
-            return f['manifest_url'], f['manifest_stream_number'], is_live
+            for retry in self.RetryManager(fatal=False):
+                with lock:
+                    refetch_manifest(format_id, delay)
+
+                f = next((f for f in formats if f['format_id'] == format_id), None)
+                if not f:
+                    if not is_live:
+                        retry.error = f'{video_id}: Video is no longer live'
+                    else:
+                        retry.error = f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}'
+                    continue
+                return f['manifest_url'], f['manifest_stream_number'], is_live
+            return None

        for f in formats:
            f['is_live'] = is_live
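
Aside (not part of the patch): the hunk above swaps a single-shot manifest lookup for the project's RetryManager idiom: iterate over the manager, assign retry.error to mark a failed attempt and continue, and fall through to return None once attempts are exhausted. A self-contained sketch of the same control flow; the stand-in class and the sample data below are illustrative only, not the fork's implementation:

    class RetryManager:
        # Minimal stand-in: yields itself once per attempt and stops as soon
        # as an attempt finishes without setting .error
        def __init__(self, retries=3):
            self.retries, self.error = retries, None

        def __iter__(self):
            for attempt in range(self.retries + 1):
                self.error = None
                yield self
                if self.error is None:
                    return  # the last attempt succeeded
                print(f'Attempt {attempt + 1} failed: {self.error}')

    def fetch_manifest(formats, format_id):
        for retry in RetryManager():
            f = next((f for f in formats if f['format_id'] == format_id), None)
            if not f:
                retry.error = f'no refreshed manifest for {format_id}'
                continue
            return f['manifest_url']
        return None  # reached only when every attempt set retry.error

    print(fetch_manifest([{'format_id': 'hls-1', 'manifest_url': 'https://example.com/index.m3u8'}], 'hls-1'))
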
@@ -2668,7 +2830,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                # Obtain from MPD's maximum seq value
                old_mpd_url = mpd_url
                last_error = ctx.pop('last_error', None)
-                expire_fast = immediate or last_error and isinstance(last_error, urllib.error.HTTPError) and last_error.code == 403
+                expire_fast = immediate or last_error and isinstance(last_error, HTTPError) and last_error.status == 403
                mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000)
                                                   or (mpd_url, stream_number, False))
                if not refresh_sequence:
@@ -2867,17 +3029,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)',
             r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)',
-             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
-             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\))?',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
-            r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+            r'("|\')signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
-            r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
-            r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')
@@ -2951,7 +3110,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            return funcname

        return json.loads(js_to_json(self._search_regex(
-            rf'var {re.escape(funcname)}\s*=\s*(\[.+?\]);', jscode,
+            rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode,
            f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)]

    def _extract_n_function_code(self, video_id, player_url):
@@ -3100,11 +3259,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
            ), expected_type=list)

-        return self._extract_chapters(
+        return self._extract_chapters_helper(
            chapter_list,
-            chapter_time=lambda chapter: float_or_none(
+            start_function=lambda chapter: float_or_none(
                traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
-            chapter_title=lambda chapter: traverse_obj(
+            title_function=lambda chapter: traverse_obj(
                chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
            duration=duration)
@@ -3112,83 +3271,74 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        content_list = traverse_obj(
            data,
            ('engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'macroMarkersListRenderer', 'contents'),
-            expected_type=list, default=[])
+            expected_type=list)
        chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription'))
        chapter_title = lambda chapter: self._get_text(chapter, 'title')

        return next(filter(None, (
-            self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
-                                   chapter_time, chapter_title, duration)
+            self._extract_chapters_helper(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
+                                          chapter_time, chapter_title, duration)
            for contents in content_list)), [])

-    def _extract_chapters_from_description(self, description, duration):
-        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
-        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
-        return self._extract_chapters(
-            re.findall(sep_re % (duration_re, r'.+?'), description or ''),
-            chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1],
-            duration=duration, strict=False) or self._extract_chapters(
-            re.findall(sep_re % (r'.+?', duration_re), description or ''),
-            chapter_time=lambda x: parse_duration(x[1]), chapter_title=lambda x: x[0],
-            duration=duration, strict=False)
-
-    def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True):
-        if not duration:
-            return
-        chapter_list = [{
-            'start_time': chapter_time(chapter),
-            'title': chapter_title(chapter),
-        } for chapter in chapter_list or []]
-        if not strict:
-            chapter_list.sort(key=lambda c: c['start_time'] or 0)
-
-        chapters = [{'start_time': 0}]
-        for idx, chapter in enumerate(chapter_list):
-            if chapter['start_time'] is None:
-                self.report_warning(f'Incomplete chapter {idx}')
-            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
-                chapters.append(chapter)
-            elif chapter not in chapters:
-                self.report_warning(
-                    f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"')
-        return chapters[1:]
+    def _extract_heatmap_from_player_overlay(self, data):
+        content_list = traverse_obj(data, (
+            'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer', 'playerBar',
+            'multiMarkersPlayerBarRenderer', 'markersMap', ..., 'value', 'heatmap', 'heatmapRenderer', 'heatMarkers', {list}))
+        return next(filter(None, (
+            traverse_obj(contents, (..., 'heatMarkerRenderer', {
+                'start_time': ('timeRangeStartMillis', {functools.partial(float_or_none, scale=1000)}),
+                'end_time': {lambda x: (x['timeRangeStartMillis'] + x['markerDurationMillis']) / 1000},
+                'value': ('heatMarkerIntensityScoreNormalized', {float_or_none}),
+            })) for contents in content_list)), None)

    def _extract_comment(self, comment_renderer, parent=None):
        comment_id = comment_renderer.get('commentId')
        if not comment_id:
            return

-        text = self._get_text(comment_renderer, 'contentText')
+        info = {
+            'id': comment_id,
+            'text': self._get_text(comment_renderer, 'contentText'),
+            'like_count': self._get_count(comment_renderer, 'voteCount'),
+            'author_id': traverse_obj(comment_renderer, ('authorEndpoint', 'browseEndpoint', 'browseId', {self.ucid_or_none})),
+            'author': self._get_text(comment_renderer, 'authorText'),
+            'author_thumbnail': traverse_obj(comment_renderer, ('authorThumbnail', 'thumbnails', -1, 'url', {url_or_none})),
+            'parent': parent or 'root',
+        }

        # Timestamp is an estimate calculated from the current time and time_text
        time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
        timestamp = self._parse_time_text(time_text)

-        author = self._get_text(comment_renderer, 'authorText')
-        author_id = try_get(comment_renderer,
-                            lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str)
+        info.update({
+            # FIXME: non-standard, but we need a way of showing that it is an estimate.
+            '_time_text': time_text,
+            'timestamp': timestamp,
+        })

-        votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
-                                                       lambda x: x['likeCount']), str)) or 0
-        author_thumbnail = try_get(comment_renderer,
-                                   lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], str)
+        info['author_url'] = urljoin(
+            'https://www.youtube.com', traverse_obj(comment_renderer, ('authorEndpoint', (
+                ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url'))),
+                expected_type=str, get_all=False))

-        author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
-        is_favorited = 'creatorHeart' in (try_get(
-            comment_renderer, lambda x: x['actionButtons']['commentActionButtonsRenderer'], dict) or {})
-        return {
-            'id': comment_id,
-            'text': text,
-            'timestamp': timestamp,
-            'time_text': time_text,
-            'like_count': votes,
-            'is_favorited': is_favorited,
-            'author': author,
-            'author_id': author_id,
-            'author_thumbnail': author_thumbnail,
-            'author_is_uploader': author_is_uploader,
-            'parent': parent or 'root'
-        }
+        author_is_uploader = traverse_obj(comment_renderer, 'authorIsChannelOwner')
+        if author_is_uploader is not None:
+            info['author_is_uploader'] = author_is_uploader
+
+        comment_abr = traverse_obj(
+            comment_renderer, ('actionButtons', 'commentActionButtonsRenderer'), expected_type=dict)
+        if comment_abr is not None:
+            info['is_favorited'] = 'creatorHeart' in comment_abr
+
+        badges = self._extract_badges([traverse_obj(comment_renderer, 'authorCommentBadge')])
+        if self._has_badge(badges, BadgeType.VERIFIED):
+            info['author_is_verified'] = True
+
+        is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge')
+        if is_pinned:
+            info['is_pinned'] = True
+
+        return info

    def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None):
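
Aside (not part of the patch): both new helpers above lean on traverse_obj's dict-projection form, a mapping of output keys to paths (or {callable} transforms) applied to each matched object. For the heat map, the projection reduces to simple arithmetic on each heatMarkerRenderer; a plain-Python equivalent, with invented sample numbers:

    # Field names as in the player overlay JSON used above; the values are made up.
    markers = [{'heatMarkerRenderer': {
        'timeRangeStartMillis': 0,
        'markerDurationMillis': 5000,
        'heatMarkerIntensityScoreNormalized': 0.83,
    }}]

    heatmap = [{
        'start_time': m['heatMarkerRenderer']['timeRangeStartMillis'] / 1000,
        'end_time': (m['heatMarkerRenderer']['timeRangeStartMillis']
                     + m['heatMarkerRenderer']['markerDurationMillis']) / 1000,
        'value': m['heatMarkerRenderer']['heatMarkerIntensityScoreNormalized'],
    } for m in markers]

    print(heatmap)  # [{'start_time': 0.0, 'end_time': 5.0, 'value': 0.83}]
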
@@ -3201,7 +3351,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            expected_comment_count = self._get_count(
                comments_header_renderer, 'countText', 'commentsCount')

-            if expected_comment_count:
+            if expected_comment_count is not None:
                tracker['est_total'] = expected_comment_count
                self.to_screen(f'Downloading ~{expected_comment_count} comments')
            comment_sort_index = int(get_single_config_arg('comment_sort') != 'top')  # 1 = new, 0 = top
@@ -3236,6 +3386,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                comment = self._extract_comment(comment_renderer, parent)
                if not comment:
                    continue
+                comment_id = comment['id']
+                if comment.get('is_pinned'):
+                    tracker['pinned_comment_ids'].add(comment_id)
+                # Sometimes YouTube may break and give us infinite looping comments.
+                # See: https://github.com/hypervideo/hypervideo/issues/6290
+                if comment_id in tracker['seen_comment_ids']:
+                    if comment_id in tracker['pinned_comment_ids'] and not comment.get('is_pinned'):
+                        # Pinned comments may appear a second time in newest first sort
+                        # See: https://github.com/hypervideo/hypervideo/issues/6712
+                        continue
+                    self.report_warning(
+                        'Detected YouTube comments looping. Stopping comment extraction '
+                        f'{"for this thread" if parent else ""} as we probably cannot get any more.')
+                    yield
+                else:
+                    tracker['seen_comment_ids'].add(comment['id'])

                tracker['running_total'] += 1
                tracker['total_reply_comments' if parent else 'total_parent_comments'] += 1
@@ -3257,10 +3423,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            if not tracker:
                tracker = dict(
                    running_total=0,
-                    est_total=0,
+                    est_total=None,
                    current_page_thread=0,
                    total_parent_comments=0,
-                    total_reply_comments=0)
+                    total_reply_comments=0,
+                    seen_comment_ids=set(),
+                    pinned_comment_ids=set()
+                )

        # TODO: Deprecated
        # YouTube comments have a max depth of 2
@@ -3287,11 +3456,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                continuation = self._build_api_continuation_query(self._generate_comment_continuation(video_id))
                is_forced_continuation = True

+        continuation_items_path = (
+            'onResponseReceivedEndpoints', ..., ('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems')
        for page_num in itertools.count(0):
            if not continuation:
                break
            headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response))
-            comment_prog_str = f"({tracker['running_total']}/{tracker['est_total']})"
+            comment_prog_str = f"({tracker['running_total']}/~{tracker['est_total']})"
            if page_num == 0:
                if is_first_continuation:
                    note_prefix = 'Downloading comment section API JSON'
@@ -3302,31 +3473,37 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                        '       ' if parent else '', ' replies' if parent else '',
                        page_num, comment_prog_str)
+
+            # Do a deep check for incomplete data as sometimes YouTube may return no comments for a continuation
+            # Ignore check if YouTube says the comment count is 0.
+            check_get_keys = None
+            if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
+                check_get_keys = [[*continuation_items_path, ..., (
+                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
            try:
                response = self._extract_response(
                    item_id=None, query=continuation,
                    ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
-                    check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None)
+                    check_get_keys=check_get_keys)
            except ExtractorError as e:
                # Ignore incomplete data error for replies if retries didn't work.
                # This is to allow any other parent comments and comment threads to be downloaded.
                # See: https://github.com/hypervideo/hypervideo/issues/4669
-                if 'incomplete data' in str(e).lower() and parent and self.get_param('ignoreerrors') is True:
-                    self.report_warning(
-                        'Received incomplete data for a comment reply thread and retrying did not help. '
-                        'Ignoring to let other comments be downloaded.')
-                else:
-                    raise
+                if 'incomplete data' in str(e).lower() and parent:
+                    if self.get_param('ignoreerrors') in (True, 'only_download'):
+                        self.report_warning(
+                            'Received incomplete data for a comment reply thread and retrying did not help. '
+                            'Ignoring to let other comments be downloaded. Pass --no-ignore-errors to not ignore.')
+                        return
+                    else:
+                        raise ExtractorError(
+                            'Incomplete data received for comment reply thread. '
+                            'Pass --ignore-errors to ignore and allow rest of comments to download.',
+                            expected=True)
+                raise
            is_forced_continuation = False
-            continuation_contents = traverse_obj(
-                response, 'onResponseReceivedEndpoints', expected_type=list, default=[])
-            continuation = None
-            for continuation_section in continuation_contents:
-                continuation_items = traverse_obj(
-                    continuation_section,
-                    (('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems'),
-                    get_all=False, expected_type=list) or []
+            for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]):
                if is_first_continuation:
                    continuation = extract_header(continuation_items)
                    is_first_continuation = False
@@ -3389,7 +3566,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        if traverse_obj(player_response, ('playabilityStatus', 'desktopLegacyAgeGateReason')):
            return True

-        reasons = traverse_obj(player_response, ('playabilityStatus', ('status', 'reason')), default=[])
+        reasons = traverse_obj(player_response, ('playabilityStatus', ('status', 'reason')))
        AGE_GATE_REASONS = (
            'confirm your age', 'age-restricted', 'inappropriate',  # reason
            'age_verification_required', 'age_check_required',  # status
@@ -3400,8 +3577,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
    def _is_unplayable(player_response):
        return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'

-    _STORY_PLAYER_PARAMS = '8AEB'
-
    def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data):

        session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
@@ -3413,8 +3588,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        yt_query = {
            'videoId': video_id,
        }
-        if smuggled_data.get('is_story') or _split_innertube_client(client)[0] == 'android':
-            yt_query['params'] = self._STORY_PLAYER_PARAMS
+        if _split_innertube_client(client)[0] == 'android':
+            yt_query['params'] = 'CgIQBg=='
+
+        pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0]
+        if pp_arg:
+            yt_query['params'] = pp_arg

        yt_query.update(self._generate_player_context(sts))
        return self._extract_response(
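
Aside (not part of the patch): the hard-coded _STORY_PLAYER_PARAMS is gone, and arbitrary player params can now be injected through the new player_params extractor argument read above. Assuming the fork keeps yt-dlp's Python API and extractor-args plumbing, usage would look roughly like this; the value reused here, 'CgIQBg==', is the android default from the hunk above, and the video URL is arbitrary:

    import hypervideo_dl  # assumed to expose YoutubeDL like upstream yt-dlp

    opts = {'extractor_args': {'youtube': {'player_params': ['CgIQBg==']}}}
    with hypervideo_dl.YoutubeDL(opts) as ydl:
        info = ydl.extract_info('https://www.youtube.com/watch?v=BaW_jenozKc', download=False)
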
@@ -3426,7 +3605,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):

    def _get_requested_clients(self, url, smuggled_data):
        requested_clients = []
-        default = ['android', 'web']
+        default = ['ios', 'android', 'web']
        allowed_clients = sorted(
            (client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'),
            key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True)
@@ -3513,6 +3692,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    self.report_warning(
                        f'Skipping player response from {client} client (got player response for video "{pr_video_id}" instead of "{video_id}")' + bug_reports_message())
                else:
+                    # Save client name for introspection later
+                    name = short_client_name(client)
+                    sd = traverse_obj(pr, ('streamingData', {dict})) or {}
+                    sd[STREAMING_DATA_CLIENT_NAME] = name
+                    for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})):
+                        f[STREAMING_DATA_CLIENT_NAME] = name
                    prs.append(pr)

        # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
@@ -3532,10 +3717,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):

    def _needs_live_processing(self, live_status, duration):
        if (live_status == 'is_live' and self.get_param('live_from_start')
-                or live_status == 'post_live' and (duration or 0) > 4 * 3600):
+                or live_status == 'post_live' and (duration or 0) > 2 * 3600):
            return live_status

    def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration):
+        CHUNK_SIZE = 10 << 20
        itags, stream_ids = collections.defaultdict(set), []
        itag_qualities, res_qualities = {}, {0: None}
        q = qualities([
@@ -3545,7 +3731,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'audio_quality_ultralow', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high',  # Audio only formats
            'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
        ])
-        streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
+        streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...))
+        format_types = self._configuration_arg('formats')
+        all_formats = 'duplicate' in format_types
+        if self._configuration_arg('include_duplicate_formats'):
+            all_formats = True
+            self._downloader.deprecated_feature('[youtube] include_duplicate_formats extractor argument is deprecated. '
+                                                'Use formats=duplicate extractor argument instead')
+
+        def build_fragments(f):
+            return LazyList({
+                'url': update_url_query(f['url'], {
+                    'range': f'{range_start}-{min(range_start + CHUNK_SIZE - 1, f["filesize"])}'
+                })
+            } for range_start in range(0, f['filesize'], CHUNK_SIZE))

        for fmt in streaming_formats:
            if fmt.get('targetDurationSec'):
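
Aside (not part of the patch): build_fragments above slices one progressive URL into roughly 10 MiB pieces by appending a 'range' query parameter per chunk. The slicing itself, isolated into plain Python with a toy filesize (same arithmetic as the f-string in the hunk):

    CHUNK_SIZE = 10 << 20  # 10 MiB, as in the diff

    def build_ranges(filesize, chunk_size=CHUNK_SIZE):
        # Mirrors the 'range' values that build_fragments() adds to the URL;
        # note the last chunk is clamped to the total filesize.
        return [f'{start}-{min(start + chunk_size - 1, filesize)}'
                for start in range(0, filesize, chunk_size)]

    print(build_ranges(25 << 20))
    # ['0-10485759', '10485760-20971519', '20971520-26214400']
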
See https://github.com/hypervideo/hypervideo/issues/3372 - 'source_preference': -10 if throttled else -5 if itag == '22' else -1, - 'fps': int_or_none(fmt.get('fps')) or None, + 'source_preference': ((-10 if throttled else -5 if itag == '22' else -1) + + (100 if 'Premium' in name else 0)), + 'fps': fps if fps > 1 else None, # For some formats, fps is wrongly returned as 1 'audio_channels': fmt.get('audioChannels'), 'height': height, - 'quality': q(quality), + 'quality': q(quality) - bool(fmt.get('isDrc')) / 2, 'has_drm': bool(fmt.get('drmFamilies')), 'tbr': tbr, 'url': fmt_url, 'width': int_or_none(fmt.get('width')), 'language': join_nonempty(audio_track.get('id', '').split('.')[0], - 'desc' if language_preference < -1 else ''), + 'desc' if language_preference < -1 else '') or None, 'language_preference': language_preference, # Strictly de-prioritize damaged and 3gp formats 'preference': -10 if is_damaged else -2 if itag == '17' else None, @@ -3663,27 +3870,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if mime_mobj: dct['ext'] = mimetype2ext(mime_mobj.group(1)) dct.update(parse_codecs(mime_mobj.group(2))) - no_audio = dct.get('acodec') == 'none' - no_video = dct.get('vcodec') == 'none' - if no_audio: - dct['vbr'] = tbr - if no_video: - dct['abr'] = tbr - if no_audio or no_video: - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - if dct.get('ext'): - dct['container'] = dct['ext'] + '_dash' - if itag: itags[itag].add(('https', dct.get('language'))) stream_ids.append(stream_id) - yield dct + single_stream = 'none' in (dct.get('acodec'), dct.get('vcodec')) + if single_stream and dct.get('ext'): + dct['container'] = dct['ext'] + '_dash' + + if (all_formats or 'dashy' in format_types) and dct['filesize']: + yield { + **dct, + 'format_id': f'{dct["format_id"]}-dashy' if all_formats else dct['format_id'], + 'protocol': 'http_dash_segments', + 'fragments': build_fragments(dct), + } + if all_formats or 'dashy' not in format_types: + dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE} + yield dct needs_live_processing = self._needs_live_processing(live_status, duration) - skip_bad_formats = not self._configuration_arg('include_incomplete_formats') + skip_bad_formats = 'incomplete' not in format_types + if self._configuration_arg('include_incomplete_formats'): + skip_bad_formats = False + self._downloader.deprecated_feature('[youtube] include_incomplete_formats extractor argument is deprecated. ' + 'Use formats=incomplete extractor argument instead') skip_manifests = set(self._configuration_arg('skip')) if (not self.get_param('youtube_include_hls_manifest', True) @@ -3695,35 +3905,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor): skip_manifests.add('dash') if self._configuration_arg('include_live_dash'): self._downloader.deprecated_feature('[youtube] include_live_dash extractor argument is deprecated. 
 '
-                        'Use include_incomplete_formats extractor argument instead')
+                        'Use formats=incomplete extractor argument instead')
         elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live':
             skip_manifests.add('dash')

-        def process_manifest_format(f, proto, itag):
+        def process_manifest_format(f, proto, client_name, itag):
             key = (proto, f.get('language'))
-            if key in itags[itag]:
+            if not all_formats and key in itags[itag]:
                 return False
             itags[itag].add(key)

-            if any(p != proto for p, _ in itags[itag]):
+            if itag and all_formats:
+                f['format_id'] = f'{itag}-{proto}'
+            elif any(p != proto for p, _ in itags[itag]):
                 f['format_id'] = f'{itag}-{proto}'
             elif itag:
                 f['format_id'] = itag

+            if f.get('source_preference') is None:
+                f['source_preference'] = -1
+
+            if itag in ('616', '235'):
+                f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ')
+                f['source_preference'] += 100
+
             f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1))
             if f['quality'] == -1 and f.get('height'):
                 f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))])
+            if self.get_param('verbose') or all_formats:
+                f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ')
+            if f.get('fps') and f['fps'] <= 1:
+                del f['fps']
+
+            if proto == 'hls' and f.get('has_drm'):
+                f['has_drm'] = 'maybe'
+                f['source_preference'] -= 5
             return True

         subtitles = {}
         for sd in streaming_data:
+            client_name = sd.get(STREAMING_DATA_CLIENT_NAME)
+
             hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl')
             if hls_manifest_url:
                 fmts, subs = self._extract_m3u8_formats_and_subtitles(
                     hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live')
                 subtitles = self._merge_subtitles(subs, subtitles)
                 for f in fmts:
-                    if process_manifest_format(f, 'hls', self._search_regex(
+                    if process_manifest_format(f, 'hls', client_name, self._search_regex(
                             r'/itag/(\d+)', f['url'], 'itag', default=None)):
                         yield f
@@ -3732,7 +3961,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
                 subtitles = self._merge_subtitles(subs, subtitles)  # Prioritize HLS subs over DASH
                 for f in formats:
-                    if process_manifest_format(f, 'dash', f['format_id']):
+                    if process_manifest_format(f, 'dash', client_name, f['format_id']):
                         f['filesize'] = int_or_none(self._search_regex(
                             r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
                         if needs_live_processing:
@@ -3783,8 +4012,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         webpage = None
         if 'webpage' not in self._configuration_arg('player_skip'):
             query = {'bpctr': '9999999999', 'has_verified': '1'}
-            if smuggled_data.get('is_story'):
-                query['pp'] = self._STORY_PLAYER_PARAMS
+            pp = self._configuration_arg('player_params', [None], casesense=True)[0]
+            if pp:
+                query['pp'] = pp
             webpage = self._download_webpage(
                 webpage_url, video_id, fatal=False, query=query)
@@ -3810,8 +4040,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             else 'was_live' if live_content
             else 'not_live' if False in (is_live, live_content)
             else None)
-        streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
+        streaming_data = traverse_obj(player_responses, (..., 'streamingData'))
         *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration)
+        if all(f.get('has_drm') for f in formats):
+            # If there are no formats that definitely don't have DRM, all have DRM
+            for f in formats:
+                f['has_drm'] = True

         return live_broadcast_details, live_status, streaming_data, formats, subtitles
@@ -3825,7 +4059,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         webpage, master_ytcfg, player_responses, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url)

         playability_statuses = traverse_obj(
-            player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[])
+            player_responses, (..., 'playabilityStatus'), expected_type=dict)

         trailer_video_id = get_first(
             playability_statuses,
@@ -3838,11 +4072,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None))
                       if webpage else (lambda x: None))

-        video_details = traverse_obj(
-            player_responses, (..., 'videoDetails'), expected_type=dict, default=[])
+        video_details = traverse_obj(player_responses, (..., 'videoDetails'), expected_type=dict)
         microformats = traverse_obj(
             player_responses, (..., 'microformat', 'playerMicroformatRenderer'),
-            expected_type=dict, default=[])
+            expected_type=dict)

         translated_title = self._get_text(microformats, (..., 'title'))
         video_title = (self._preferred_lang and translated_title
@@ -3975,10 +4208,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             self._downloader._sort_thumbnails(original_thumbnails)

         category = get_first(microformats, 'category') or search_meta('genre')
-        channel_id = str_or_none(
+        channel_id = self.ucid_or_none(str_or_none(
             get_first(video_details, 'channelId')
             or get_first(microformats, 'externalChannelId')
-            or search_meta('channelId'))
+            or search_meta('channelId')))
         owner_profile_url = get_first(microformats, 'ownerProfileUrl')

         live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
@@ -3997,7 +4230,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 for fmt in filter(is_bad_format, formats):
                     fmt['preference'] = (fmt.get('preference') or -1) - 10
-                    fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 4 hours)', delim=' ')
+                    fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 2 hours)', delim=' ')

             if needs_live_processing:
                 self._prepare_live_from_start_formats(
@@ -4005,6 +4238,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):

         formats.extend(self._extract_storyboard(player_responses, duration))

+        channel_handle = self.handle_from_url(owner_profile_url)
+
         info = {
             'id': video_id,
             'title': video_title,
@@ -4014,11 +4249,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             # URL checking if user don't care about getting the best possible thumbnail
             'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')),
             'description': video_description,
-            'uploader': get_first(video_details, 'author'),
-            'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
-            'uploader_url': owner_profile_url,
             'channel_id': channel_id,
-            'channel_url': format_field(channel_id, None, 'https://www.youtube.com/channel/%s'),
+            'channel_url': format_field(channel_id, None, 'https://www.youtube.com/channel/%s', default=None),
             'duration': duration,
             'view_count': int_or_none(
                 get_first((video_details, microformats), (..., 'viewCount'))
@@ -4048,10 +4280,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             # Converted into dicts to remove duplicates
             captions = {
                 get_lang_code(sub): sub
-                for sub in traverse_obj(pctr, (..., 'captionTracks', ...), default=[])}
+                for sub in traverse_obj(pctr, (..., 'captionTracks', ...))}
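A note on the process_manifest_format hunk above: it dedupes formats per itag by (protocol, language) and only composes an `{itag}-{proto}` format_id when the same itag is seen under more than one protocol (or when all formats were requested). A minimal self-contained sketch of that dedup rule; the names `itags` and `keep_format` are illustrative, not the extractor's API:

    from collections import defaultdict

    itags = defaultdict(set)  # itag -> {(protocol, language), ...}

    def keep_format(itag, proto, language, all_formats=False):
        key = (proto, language)
        if not all_formats and key in itags[itag]:
            return False  # this itag was already yielded for this protocol/language
        itags[itag].add(key)
        return True

    assert keep_format('22', 'https', 'en')
    assert not keep_format('22', 'https', 'en')  # duplicate is dropped
    assert keep_format('22', 'hls', 'en')        # same itag, new protocol is kept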
             translation_languages = {
                 lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1)
-                for lang in traverse_obj(pctr, (..., 'translationLanguages', ...), default=[])}
+                for lang in traverse_obj(pctr, (..., 'translationLanguages', ...))}

             def process_language(container, base_url, lang_code, sub_name, query):
                 lang_subs = container.setdefault(lang_code, [])
@@ -4090,9 +4322,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         continue
                     trans_code += f'-{lang_code}'
                     trans_name += format_field(lang_name, None, ' from %s')
-                # Add an "-orig" label to the original language so that it can be distinguished.
-                # The subs are returned without "-orig" as well for compatibility
                 if lang_code == f'a-{orig_trans_code}':
+                    # Set audio language based on original subtitles
+                    for f in formats:
+                        if f.get('acodec') != 'none' and not f.get('language'):
+                            f['language'] = orig_trans_code
+                    # Add an "-orig" label to the original language so that it can be distinguished.
+                    # The subs are returned without "-orig" as well for compatibility
                     process_language(
                         automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {})
                     # Setting tlang=lang returns damaged subtitles.
@@ -4112,15 +4348,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     info[d_k] = parse_duration(query[k][0])

         # Youtube Music Auto-generated description
-        if video_description:
+        if (video_description or '').strip().endswith('\nAuto-generated by YouTube.'):
+            # XXX: Causes catastrophic backtracking if description has "·"
+            # E.g. https://www.youtube.com/watch?v=DoPaAxMQoiI
+            # Simulating atomic groups:  (?P<a>[^xy]+)x  =>  (?=(?P<a>[^xy]+))(?P=a)x
+            # reduces it, but does not fully fix it. https://regex101.com/r/8Ssf2h/2
             mobj = re.search(
                 r'''(?xs)
-                    (?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+
-                    (?P<album>[^\n]+)
+                    (?=(?P<track>[^\n·]+))(?P=track)·
+                    (?=(?P<artist>[^\n]+))(?P=artist)\n+
+                    (?=(?P<album>[^\n]+))(?P=album)\n
                     (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?
                     (?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?
-                    (.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?
-                    .+\nAuto-generated\ by\ YouTube\.\s*$
+                    (.+?\nArtist\s*:\s*
+                        (?=(?P<clean_artist>[^\n]+))(?P=clean_artist)\n
+                    )?.+\nAuto-generated\ by\ YouTube\.\s*$
                 ''', video_description)
             if mobj:
                 release_year = mobj.group('release_year')
@@ -4140,22 +4382,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         initial_data = None
         if webpage:
             initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False)
+            if not traverse_obj(initial_data, 'contents'):
+                self.report_warning('Incomplete data received in embedded initial data; re-fetching using API.')
+                initial_data = None
         if not initial_data:
             query = {'videoId': video_id}
             query.update(self._get_checkok_params())
             initial_data = self._extract_response(
                 item_id=video_id, ep='next', fatal=False,
-                ytcfg=master_ytcfg, query=query,
+                ytcfg=master_ytcfg, query=query, check_get_keys='contents',
                 headers=self.generate_api_headers(ytcfg=master_ytcfg),
                 note='Downloading initial data API JSON')

         info['comment_count'] = traverse_obj(initial_data, (
             'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'itemSectionRenderer',
-            'contents', ..., 'commentsEntryPointHeaderRenderer', 'commentCount', 'simpleText'
+            'contents', ..., 'commentsEntryPointHeaderRenderer', 'commentCount'
         ), (
             'engagementPanels', lambda _, v: v['engagementPanelSectionListRenderer']['panelIdentifier'] == 'comment-item-section',
-            'engagementPanelSectionListRenderer', 'header', 'engagementPanelTitleHeaderRenderer', 'contextualInfo', 'runs', ..., 'text'
-        ), expected_type=int_or_none, get_all=False)
+            'engagementPanelSectionListRenderer', 'header', 'engagementPanelTitleHeaderRenderer', 'contextualInfo'
+        ), expected_type=self._get_count, get_all=False)

         try:  # This will error if there is no livechat
             initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
@@ -4178,6 +4423,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             or self._extract_chapters_from_description(video_description, duration)
             or None)

+        info['heatmap'] = self._extract_heatmap_from_player_overlay(initial_data)
+
         contents = traverse_obj(
             initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'),
             expected_type=list, default=[])
@@ -4205,9 +4452,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     list) or []):
                 tbrs = variadic(
                     traverse_obj(
-                        tlb, 'toggleButtonRenderer',
-                        ('segmentedLikeDislikeButtonRenderer', ..., 'toggleButtonRenderer'),
-                        default=[]))
+                        tlb, ('toggleButtonRenderer', ...),
+                        ('segmentedLikeDislikeButtonRenderer', ..., 'toggleButtonRenderer')))
                 for tbr in tbrs:
                     for getter, regex in [(
                             lambda x: x['defaultText']['accessibility']['accessibilityData'],
@@ -4245,6 +4491,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'channel': self._get_text(vor, 'title'),
                 'channel_follower_count': self._get_count(vor, 'subscriberCountText')})

+            if not channel_handle:
+                channel_handle = self.handle_from_url(
+                    traverse_obj(vor, (
+                        ('navigationEndpoint', ('title', 'runs', ..., 'navigationEndpoint')),
+                        (('commandMetadata', 'webCommandMetadata', 'url'), ('browseEndpoint', 'canonicalBaseUrl')),
+                        {str}), get_all=False))
+
             rows = try_get(
                 vsir,
                 lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
@@ -4270,13 +4523,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         info['artist'] = mrr_contents_text
                     elif mrr_title == 'Song':
                         info['track'] = mrr_contents_text
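A note on the `(?=(?P<a>...))(?P=a)` rewrite in the music-description regex above: it simulates an atomic group, because Python's `re` never backtracks into a lookahead once it has succeeded, so the backreference consumes the greedy capture whole instead of retrying every shorter prefix. A small demonstration under that reading; the sample strings are illustrative:

    import re

    desc = 'Song Title · Some Artist\nSome Album\n'
    plain = re.compile(r'(?P<track>[^·\n]+)·')
    atomic = re.compile(r'(?=(?P<track>[^·\n]+))(?P=track)·')
    # Both forms capture the same text on success...
    assert plain.match(desc).group('track') == atomic.match(desc).group('track')
    # ...but on failure the atomic form gives up immediately instead of
    # re-trying every shorter prefix of [^·\n]+, which is what explodes
    # when a description contains many '·' characters.
    assert atomic.match('no separator here\n') is None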
+            owner_badges = self._extract_badges(traverse_obj(vsir, ('owner', 'videoOwnerRenderer', 'badges')))
+            if self._has_badge(owner_badges, BadgeType.VERIFIED):
+                info['channel_is_verified'] = True

-        fallbacks = {
-            'channel': 'uploader',
-            'channel_id': 'uploader_id',
-            'channel_url': 'uploader_url',
-        }
-
+        info.update({
+            'uploader': info.get('channel'),
+            'uploader_id': channel_handle,
+            'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
+        })
         # The upload date for scheduled, live and past live streams / premieres in microformats
         # may be different from the stream date. Although not in UTC, we will prefer it in this case.
         # See: https://github.com/hypervideo/hypervideo/pull/2223#issuecomment-1008485139
@@ -4288,19 +4543,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', [])
         ):
             upload_date = strftime_or_none(
-                self._parse_time_text(self._get_text(vpir, 'dateText')), '%Y%m%d') or upload_date
+                self._parse_time_text(self._get_text(vpir, 'dateText'))) or upload_date
         info['upload_date'] = upload_date

-        for to, frm in fallbacks.items():
-            if not info.get(to):
-                info[to] = info.get(frm)
-
         for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
             v = info.get(s_k)
             if v:
                 info[d_k] = v

-        badges = self._extract_badges(traverse_obj(contents, (..., 'videoPrimaryInfoRenderer'), get_all=False))
+        badges = self._extract_badges(traverse_obj(vpir, 'badges'))

         is_private = (self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE)
                       or get_first(video_details, 'isPrivate', expected_type=bool))
@@ -4355,19 +4606,6 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
                 return info_dict
             return wrapper

-    def _extract_channel_id(self, webpage):
-        channel_id = self._html_search_meta(
-            'channelId', webpage, 'channel id', default=None)
-        if channel_id:
-            return channel_id
-        channel_url = self._html_search_meta(
-            ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
-             'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
-             'twitter:app:url:googleplay'), webpage, 'channel url')
-        return self._search_regex(
-            r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
-            channel_url, 'channel id')
-
     @staticmethod
     def _extract_basic_item_renderer(item):
         # Modified from _extract_grid_item_renderer
@@ -4382,6 +4620,44 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
         elif key.startswith('grid') and key.endswith('Renderer'):
             return renderer

+    def _extract_channel_renderer(self, renderer):
+        channel_id = self.ucid_or_none(renderer['channelId'])
+        title = self._get_text(renderer, 'title')
+        channel_url = format_field(channel_id, None, 'https://www.youtube.com/channel/%s', default=None)
+        channel_handle = self.handle_from_url(
+            traverse_obj(renderer, (
+                'navigationEndpoint', (('commandMetadata', 'webCommandMetadata', 'url'),
+                                       ('browseEndpoint', 'canonicalBaseUrl')),
+                {str}), get_all=False))
+        if not channel_handle:
+            # As of 2023-06-01, YouTube sets subscriberCountText to the handle in search
+            channel_handle = self.handle_or_none(self._get_text(renderer, 'subscriberCountText'))
+        return {
+            '_type': 'url',
+            'url': channel_url,
+            'id': channel_id,
+            'ie_key': YoutubeTabIE.ie_key(),
+            'channel': title,
+            'uploader': title,
+            'channel_id': channel_id,
+            'channel_url': channel_url,
+            'title': title,
+            'uploader_id': channel_handle,
+            'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
+            # See above. YouTube sets videoCountText to the subscriber text in search channel renderers.
+            # However, in feed/channels this is set correctly to the subscriber count
+            'channel_follower_count': traverse_obj(
+                renderer, 'subscriberCountText', 'videoCountText', expected_type=self._get_count),
+            'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'),
+            'playlist_count': (
+                # videoCountText may be the subscriber count
+                self._get_count(renderer, 'videoCountText')
+                if self._get_count(renderer, 'subscriberCountText') is not None else None),
+            'description': self._get_text(renderer, 'descriptionSnippet'),
+            'channel_is_verified': True if self._has_badge(
+                self._extract_badges(traverse_obj(renderer, 'ownerBadges')), BadgeType.VERIFIED) else None,
+        }
+
     def _grid_entries(self, grid_renderer):
         for item in grid_renderer['items']:
             if not isinstance(item, dict):
@@ -4407,9 +4683,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
                 # channel
                 channel_id = renderer.get('channelId')
                 if channel_id:
-                    yield self.url_result(
-                        'https://www.youtube.com/channel/%s' % channel_id,
-                        ie=YoutubeTabIE.ie_key(), video_title=title)
+                    yield self._extract_channel_renderer(renderer)
                     continue
                 # generic endpoint URL support
                 ep_url = urljoin('https://www.youtube.com/', try_get(
@@ -4425,8 +4699,11 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
     def _music_reponsive_list_entry(self, renderer):
         video_id = traverse_obj(renderer, ('playlistItemData', 'videoId'))
         if video_id:
+            title = traverse_obj(renderer, (
+                'flexColumns', 0, 'musicResponsiveListItemFlexColumnRenderer',
+                'text', 'runs', 0, 'text'))
             return self.url_result(f'https://music.youtube.com/watch?v={video_id}',
-                                   ie=YoutubeIE.ie_key(), video_id=video_id)
+                                   ie=YoutubeIE.ie_key(), video_id=video_id, title=title)
         playlist_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'playlistId'))
         if playlist_id:
             video_id = traverse_obj(renderer, ('navigationEndpoint', 'watchEndpoint', 'videoId'))
@@ -4485,11 +4762,19 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):

     def _rich_entries(self, rich_grid_renderer):
         renderer = traverse_obj(
-            rich_grid_renderer, ('content', ('videoRenderer', 'reelItemRenderer')), get_all=False) or {}
+            rich_grid_renderer,
+            ('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer')), get_all=False) or {}
         video_id = renderer.get('videoId')
-        if not video_id:
+        if video_id:
+            yield self._extract_video(renderer)
+            return
+        playlist_id = renderer.get('playlistId')
+        if playlist_id:
+            yield self.url_result(
+                f'https://www.youtube.com/playlist?list={playlist_id}',
+                ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
+                video_title=self._get_text(renderer, 'title'))
             return
-        yield self._extract_video(renderer)

     def _video_entry(self, video_renderer):
         video_id = video_renderer.get('videoId')
@@ -4605,7 +4890,8 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
             'videoRenderer': lambda x: [self._video_entry(x)],
             'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}),
             'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}),
-            'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)]
+            'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)],
+            'richGridRenderer': lambda x: self._extract_entries(x, continuation_list),
         }
         for key, renderer in isr_content.items():
             if key not in known_renderers:
@@ -4633,10 +4919,15 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
                 or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})

         yield from extract_entries(parent_renderer)
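Deep paths like the `flexColumns` lookup above go through yt-dlp's `traverse_obj` helper. As a rough mental model only (this is not the real helper, which also supports branching, `...` wildcards, and callables), a fixed key/index path behaves like the stand-in below:

    from functools import reduce

    def get_path(obj, path, default=None):
        # Follow dict keys and list indices; any miss yields the default.
        try:
            return reduce(lambda o, key: o[key], path, obj)
        except (KeyError, IndexError, TypeError):
            return default

    renderer = {'flexColumns': [{'musicResponsiveListItemFlexColumnRenderer': {
        'text': {'runs': [{'text': 'Some title'}]}}}]}
    assert get_path(renderer, ('flexColumns', 0,
                               'musicResponsiveListItemFlexColumnRenderer',
                               'text', 'runs', 0, 'text')) == 'Some title'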
         continuation = continuation_list[0]
-
+        seen_continuations = set()
         for page_num in itertools.count(1):
             if not continuation:
                 break
+            continuation_token = continuation.get('continuation')
+            if continuation_token is not None and continuation_token in seen_continuations:
+                self.write_debug('Detected YouTube feed looping - assuming end of feed.')
+                break
+            seen_continuations.add(continuation_token)
             headers = self.generate_api_headers(
                 ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data)
             response = self._extract_response(
@@ -4717,13 +5008,14 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
         metadata_renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), expected_type=dict)
         if metadata_renderer:
+            channel_id = traverse_obj(metadata_renderer, ('externalId', {self.ucid_or_none}),
+                                      ('channelUrl', {self.ucid_from_url}))
             info.update({
-                'uploader': metadata_renderer.get('title'),
-                'uploader_id': metadata_renderer.get('externalId'),
-                'uploader_url': metadata_renderer.get('channelUrl'),
+                'channel': metadata_renderer.get('title'),
+                'channel_id': channel_id,
             })
-            if info['uploader_id']:
-                info['id'] = info['uploader_id']
+            if info['channel_id']:
+                info['id'] = info['channel_id']
         else:
             metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict)
@@ -4776,6 +5068,19 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
             'thumbnails': (primary_thumbnails or playlist_thumbnails) + avatar_thumbnails + channel_banners,
         })

+        channel_handle = (
+            traverse_obj(metadata_renderer, (('vanityChannelUrl', ('ownerUrls', ...)), {self.handle_from_url}), get_all=False)
+            or traverse_obj(data, ('header', ..., 'channelHandleText', {self.handle_or_none}), get_all=False))
+
+        if channel_handle:
+            info.update({
+                'uploader_id': channel_handle,
+                'uploader_url': format_field(channel_handle, None, 'https://www.youtube.com/%s', default=None),
+            })
+
+        channel_badges = self._extract_badges(traverse_obj(data, ('header', ..., 'badges'), get_all=False))
+        if self._has_badge(channel_badges, BadgeType.VERIFIED):
+            info['channel_is_verified'] = True
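The seen_continuations guard above is a generic defence for any cursor-paginated API: if the server hands back a continuation token that has already been used, the feed has looped and pagination should stop. A sketch of the idea with a stubbed fetcher; `fetch_page` and its `continuation` field are illustrative, not YouTube's API:

    def paginate(fetch_page, first_token):
        """Yield pages until the API stops returning tokens or repeats one."""
        seen, token = set(), first_token
        while token:
            if token in seen:
                break  # server is looping - treat as end of feed
            seen.add(token)
            page = fetch_page(token)
            yield page
            token = page.get('continuation')

    pages = iter([{'items': [1], 'continuation': 'B'},
                  {'items': [2], 'continuation': 'B'}])  # 'B' repeats
    assert len(list(paginate(lambda tok: next(pages), 'A'))) == 2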
         # Playlist stats is a text runs array containing [video count, view count, last updated].
         # last updated or (view count and last updated) may be missing.
         playlist_stats = get_first(
@@ -4784,17 +5089,21 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
         last_updated_unix = self._parse_time_text(
             self._get_text(playlist_stats, 2)  # deprecated, remove when old layout discontinued
             or self._get_text(playlist_header_renderer, ('byline', 1, 'playlistBylineRenderer', 'text')))
-        info['modified_date'] = strftime_or_none(last_updated_unix, '%Y%m%d')
+        info['modified_date'] = strftime_or_none(last_updated_unix)

         info['view_count'] = self._get_count(playlist_stats, 1)
         if info['view_count'] is None:  # 0 is allowed
             info['view_count'] = self._get_count(playlist_header_renderer, 'viewCountText')
+        if info['view_count'] is None:
+            info['view_count'] = self._get_count(data, (
+                'contents', 'twoColumnBrowseResultsRenderer', 'tabs', ..., 'tabRenderer', 'content', 'sectionListRenderer',
+                'contents', ..., 'itemSectionRenderer', 'contents', ..., 'channelAboutFullMetadataRenderer', 'viewCountText'))

         info['playlist_count'] = self._get_count(playlist_stats, 0)
         if info['playlist_count'] is None:  # 0 is allowed
             info['playlist_count'] = self._get_count(playlist_header_renderer, ('byline', 0, 'playlistBylineRenderer', 'text'))

-        if not info.get('uploader_id'):
+        if not info.get('channel_id'):
             owner = traverse_obj(playlist_header_renderer, 'ownerText')
             if not owner:  # Deprecated
                 owner = traverse_obj(
@@ -4803,16 +5112,17 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
             owner_text = self._get_text(owner)
             browse_ep = traverse_obj(owner, ('runs', 0, 'navigationEndpoint', 'browseEndpoint')) or {}
             info.update({
-                'uploader': self._search_regex(r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text),
-                'uploader_id': browse_ep.get('browseId'),
-                'uploader_url': urljoin('https://www.youtube.com', browse_ep.get('canonicalBaseUrl'))
+                'channel': self._search_regex(r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text),
+                'channel_id': self.ucid_or_none(browse_ep.get('browseId')),
+                'uploader_id': self.handle_from_url(urljoin('https://www.youtube.com', browse_ep.get('canonicalBaseUrl')))
             })

         info.update({
-            'channel': info['uploader'],
-            'channel_id': info['uploader_id'],
-            'channel_url': info['uploader_url']
+            'uploader': info['channel'],
+            'channel_url': format_field(info.get('channel_id'), None, 'https://www.youtube.com/channel/%s', default=None),
+            'uploader_url': format_field(info.get('uploader_id'), None, 'https://www.youtube.com/%s', default=None),
         })
+
         return info

     def _extract_inline_playlist(self, playlist, playlist_id, data, ytcfg):
@@ -4879,7 +5189,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
         playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {}
         player_header_privacy = playlist_header_renderer.get('privacy')

-        badges = self._extract_badges(sidebar_renderer)
+        badges = self._extract_badges(traverse_obj(sidebar_renderer, 'badges'))

         # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
         privacy_setting_icon = get_first(
@@ -4951,7 +5261,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
                     data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {}
                 except ExtractorError as e:
                     if isinstance(e.cause, network_exceptions):
-                        if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code not in (403, 429):
+                        if not isinstance(e.cause, HTTPError) or e.cause.status not in (403, 429):
                             retry.error = e
                             continue
                     self._error_or_warning(e, fatal=fatal)
@@ -5060,7 +5370,7 @@ class 
YoutubeTabIE(YoutubeTabBaseInfoExtractor): IE_DESC = 'YouTube Tabs' _VALID_URL = r'''(?x: https?:// - (?:\w+\.)? + (?!consent\.)(?:\w+\.)? (?: youtube(?:kids)?\.com| %(invidious)s @@ -5089,12 +5399,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'Igor Kleiner - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', 'uploader': 'Igor Kleiner', - 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'uploader_id': '@IgorDataScience', + 'uploader_url': 'https://www.youtube.com/@IgorDataScience', 'channel': 'Igor Kleiner', 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', 'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'], 'channel_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', - 'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', 'channel_follower_count': int }, }, { @@ -5105,9 +5415,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'id': 'UCqj7Cz7revf5maW9g5pgNcg', 'title': 'Igor Kleiner - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', 'uploader': 'Igor Kleiner', - 'uploader_url': 'https://www.youtube.com/channel/UCqj7Cz7revf5maW9g5pgNcg', + 'uploader_id': '@IgorDataScience', + 'uploader_url': 'https://www.youtube.com/@IgorDataScience', 'tags': ['"критическое', 'мышление"', '"наука', 'просто"', 'математика', '"анализ', 'данных"'], 'channel_id': 'UCqj7Cz7revf5maW9g5pgNcg', 'channel': 'Igor Kleiner', @@ -5122,14 +5432,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Playlists', 'description': 'md5:e1384e8a133307dd10edee76e875d62f', - 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', - 'uploader': '3Blue1Brown', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', - 'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', 'channel': '3Blue1Brown', 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'uploader_id': '@3blue1brown', + 'uploader_url': 'https://www.youtube.com/@3blue1brown', + 'uploader': '3Blue1Brown', 'tags': ['Mathematics'], - 'channel_follower_count': int + 'channel_follower_count': int, + 'channel_is_verified': True, }, }, { 'note': 'playlists, singlepage', @@ -5140,10 +5451,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'ThirstForScience - Playlists', 'description': 'md5:609399d937ea957b0f53cbffb747a14c', 'uploader': 'ThirstForScience', - 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', - 'uploader_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ', - 'channel_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ', + 'uploader_url': 'https://www.youtube.com/@ThirstForScience', + 'uploader_id': '@ThirstForScience', 'channel_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'channel_url': 'https://www.youtube.com/channel/UCAEtajcuhQ6an9WEzY9LEMQ', 'tags': 'count:13', 'channel': 'ThirstForScience', 'channel_follower_count': int @@ -5155,8 +5466,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'note': 'basic, single video playlist', 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'title': 'youtube-dl public playlist', 'description': '', @@ -5165,17 +5474,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'modified_date': '20201130', 'channel': 'Sergey M.', 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader_url': 
'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'availability': 'public', + 'uploader': 'Sergey M.', + 'uploader_url': 'https://www.youtube.com/@sergeym.6173', + 'uploader_id': '@sergeym.6173', }, 'playlist_count': 1, }, { 'note': 'empty playlist', 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'title': 'youtube-dl empty playlist', 'tags': [], @@ -5184,8 +5493,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'modified_date': '20160902', 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'availability': 'public', + 'uploader_url': 'https://www.youtube.com/@sergeym.6173', + 'uploader_id': '@sergeym.6173', + 'uploader': 'Sergey M.', }, 'playlist_count': 0, }, { @@ -5196,10 +5507,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'lex will - Home', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_id': '@lexwill718', 'channel': 'lex will', 'tags': ['bible', 'history', 'prophesy'], - 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_url': 'https://www.youtube.com/@lexwill718', 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'channel_follower_count': int @@ -5213,11 +5524,11 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'lex will - Videos', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_id': '@lexwill718', 'tags': ['bible', 'history', 'prophesy'], 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_url': 'https://www.youtube.com/@lexwill718', 'channel': 'lex will', 'channel_follower_count': int }, @@ -5230,9 +5541,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'lex will - Videos', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_id': '@lexwill718', 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_url': 'https://www.youtube.com/@lexwill718', 'channel': 'lex will', 'tags': ['bible', 'history', 'prophesy'], 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', @@ -5247,8 +5558,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'lex will - Playlists', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'uploader_id': '@lexwill718', + 'uploader_url': 'https://www.youtube.com/@lexwill718', 'channel': 'lex will', 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', @@ -5263,14 +5574,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Community', 'description': 
'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'channel': 'lex will', 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'tags': ['bible', 'history', 'prophesy'], - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader_url': 'https://www.youtube.com/@lexwill718', + 'uploader_id': '@lexwill718', + 'uploader': 'lex will', }, 'playlist_mincount': 18, }, { @@ -5280,14 +5591,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'title': 'lex will - Channels', 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'uploader_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'channel': 'lex will', 'channel_url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'channel_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', 'tags': ['bible', 'history', 'prophesy'], - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader_url': 'https://www.youtube.com/@lexwill718', + 'uploader_id': '@lexwill718', + 'uploader': 'lex will', }, 'playlist_mincount': 12, }, { @@ -5298,14 +5609,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'id': 'UCYO_jab_esuFRV4b17AJtAw', 'title': '3Blue1Brown - Search - linear algebra', 'description': 'md5:e1384e8a133307dd10edee76e875d62f', - 'uploader': '3Blue1Brown', - 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', - 'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', 'tags': ['Mathematics'], 'channel': '3Blue1Brown', 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', - 'channel_follower_count': int + 'channel_follower_count': int, + 'uploader_url': 'https://www.youtube.com/@3blue1brown', + 'uploader_id': '@3blue1brown', + 'uploader': '3Blue1Brown', + 'channel_is_verified': True, }, }, { 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', @@ -5322,17 +5634,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'uploader': 'Christiaan008', - 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', 'tags': [], - 'uploader_url': 'https://www.youtube.com/c/ChRiStIaAn008', 'view_count': int, 'modified_date': '20150605', 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg', - 'channel_url': 'https://www.youtube.com/c/ChRiStIaAn008', + 'channel_url': 'https://www.youtube.com/channel/UCEPzS1rYsrkqzSLNp76nrcg', 'channel': 'Christiaan008', 'availability': 'public', + 'uploader_id': '@ChRiStIaAn008', + 'uploader': 'Christiaan008', + 'uploader_url': 'https://www.youtube.com/@ChRiStIaAn008', }, 'playlist_count': 96, }, { @@ -5341,17 +5653,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - 'uploader': 'Cauchemar', - 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', - 'channel_url': 'https://www.youtube.com/c/Cauchemar89', + 'channel_url': 'https://www.youtube.com/channel/UCBABnxM4Ar9ten8Mdjj1j0Q', 'tags': [], 'modified_date': r're:\d{8}', 'channel': 'Cauchemar', - 'uploader_url': 'https://www.youtube.com/c/Cauchemar89', 'view_count': int, 'description': '', 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', 'availability': 'public', + 
'uploader_id': '@Cauchemar89', + 'uploader': 'Cauchemar', + 'uploader_url': 'https://www.youtube.com/@Cauchemar89', }, 'playlist_mincount': 1123, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -5365,17 +5677,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - 'uploader': 'Interstellar Movie', - 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', - 'uploader_url': 'https://www.youtube.com/c/InterstellarMovie', 'tags': [], 'view_count': int, 'channel_id': 'UCXw-G3eDE9trcvY2sBMM_aA', - 'channel_url': 'https://www.youtube.com/c/InterstellarMovie', + 'channel_url': 'https://www.youtube.com/channel/UCXw-G3eDE9trcvY2sBMM_aA', 'channel': 'Interstellar Movie', 'description': '', 'modified_date': r're:\d{8}', 'availability': 'public', + 'uploader_id': '@InterstellarMovie', + 'uploader': 'Interstellar Movie', + 'uploader_url': 'https://www.youtube.com/@InterstellarMovie', }, 'playlist_mincount': 21, }, { @@ -5384,17 +5696,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', - 'uploader': 'Phim Siêu Nhân Nhật Bản', - 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', 'view_count': int, 'channel': 'Phim Siêu Nhân Nhật Bản', 'tags': [], - 'uploader_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', 'description': '', 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', 'modified_date': r're:\d{8}', 'availability': 'public', + 'uploader_url': 'https://www.youtube.com/@phimsieunhannhatban', + 'uploader_id': '@phimsieunhannhatban', + 'uploader': 'Phim Siêu Nhân Nhật Bản', }, 'playlist_mincount': 200, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -5404,17 +5716,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'title': 'Uploads from BlankTV', 'id': 'UU8l9frL61Yl5KFOl87nIm2w', - 'uploader': 'BlankTV', - 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w', 'channel': 'BlankTV', - 'channel_url': 'https://www.youtube.com/c/blanktv', + 'channel_url': 'https://www.youtube.com/channel/UC8l9frL61Yl5KFOl87nIm2w', 'channel_id': 'UC8l9frL61Yl5KFOl87nIm2w', 'view_count': int, 'tags': [], - 'uploader_url': 'https://www.youtube.com/c/blanktv', 'modified_date': r're:\d{8}', 'description': '', 'availability': 'public', + 'uploader_id': '@blanktv', + 'uploader': 'BlankTV', + 'uploader_url': 'https://www.youtube.com/@blanktv', }, 'playlist_mincount': 1000, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -5424,17 +5736,17 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'title': 'Data Analysis with Dr Mike Pound', 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', - 'uploader': 'Computerphile', 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', - 'uploader_url': 'https://www.youtube.com/user/Computerphile', 'tags': [], 'view_count': int, 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA', - 'channel_url': 'https://www.youtube.com/user/Computerphile', + 'channel_url': 'https://www.youtube.com/channel/UC9-y-6csu5WGm29I7JiwpnA', 'channel': 'Computerphile', 'availability': 'public', 'modified_date': '20190712', + 'uploader_id': '@Computerphile', + 'uploader': 'Computerphile', + 'uploader_url': 'https://www.youtube.com/@Computerphile', }, 'playlist_mincount': 11, }, { @@ -5447,9 +5759,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 
'id': 'FqZTN594JQw', 'ext': 'webm', 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', 'upload_date': '20150526', 'license': 'Standard YouTube License', 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', @@ -5472,12 +5781,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'Wq15eF5vCbI', # This will keep changing + 'id': 'hGkQjiJLjWQ', # This will keep changing 'ext': 'mp4', 'title': str, - 'uploader': 'Sky News', - 'uploader_id': 'skynews', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', 'upload_date': r're:\d{8}', 'description': str, 'categories': ['News & Politics'], @@ -5496,6 +5802,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ', 'channel_follower_count': int, 'concurrent_view_count': int, + 'uploader_url': 'https://www.youtube.com/@SkyNews', + 'uploader_id': '@SkyNews', + 'uploader': 'Sky News', + 'channel_is_verified': True, }, 'params': { 'skip_download': True, @@ -5507,9 +5817,6 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'id': 'a48o2S1cPoo', 'ext': 'mp4', 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', 'upload_date': '20150715', 'license': 'Standard YouTube License', 'description': 'md5:438179573adcdff3c97ebb1ee632b891', @@ -5590,41 +5897,40 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', 'info_dict': { 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'uploader': 'NoCopyrightSounds', 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', - 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', 'title': 'NCS : All Releases 💿', - 'uploader_url': 'https://www.youtube.com/c/NoCopyrightSounds', - 'channel_url': 'https://www.youtube.com/c/NoCopyrightSounds', + 'channel_url': 'https://www.youtube.com/channel/UC_aEa8K-EOJ3D6gOs7HcyNg', 'modified_date': r're:\d{8}', 'view_count': int, 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', 'tags': [], 'channel': 'NoCopyrightSounds', 'availability': 'public', + 'uploader_url': 'https://www.youtube.com/@NoCopyrightSounds', + 'uploader': 'NoCopyrightSounds', + 'uploader_id': '@NoCopyrightSounds', }, 'playlist_mincount': 166, - 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden', 'YouTube Music is not directly supported'], }, { + # TODO: fix 'unviewable' issue with this playlist when reloading with unavailable videos 'note': 'Topic, should redirect to playlist?list=UU...', 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', 'info_dict': { 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', - 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', 'title': 'Uploads from Royalty Free Music - Topic', - 'uploader': 'Royalty Free Music - Topic', 'tags': [], 'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', 'channel': 'Royalty Free Music - Topic', 'view_count': int, 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', - 'channel_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', 'modified_date': r're:\d{8}', - 
'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', 'description': '', 'availability': 'public', + 'uploader': 'Royalty Free Music - Topic', }, 'playlist_mincount': 101, + 'expected_warnings': ['YouTube Music is not directly supported', r'[Uu]navailable videos (are|will be) hidden'], }, { # Destination channel with only a hidden self tab (tab id is UCtFRv9O2AHqOZjjynzrv-xg) # Treat as a general feed @@ -5648,12 +5954,11 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'modified_date': r're:\d{8}', }, 'playlist_count': 50, + 'expected_warnings': ['YouTube Music is not directly supported'], }, { 'note': 'unlisted single video playlist', 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', 'info_dict': { - 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', - 'uploader': 'colethedj', 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', 'title': 'hypervideo unlisted playlist test', 'availability': 'unlisted', @@ -5662,11 +5967,31 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': 'colethedj', 'view_count': int, 'description': '', - 'uploader_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q', 'channel_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', 'channel_url': 'https://www.youtube.com/channel/UC9zHu_mHU96r19o-wV5Qs1Q', + 'uploader_url': 'https://www.youtube.com/@colethedj1894', + 'uploader_id': '@colethedj1894', + 'uploader': 'colethedj', }, + 'playlist': [{ + 'info_dict': { + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'id': 'BaW_jenozKc', + '_type': 'url', + 'ie_key': 'Youtube', + 'duration': 10, + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': 'https://www.youtube.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', + 'view_count': int, + 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc', + 'channel': 'Philipp Hagemeister', + 'uploader_id': '@PhilippHagemeister', + 'uploader_url': 'https://www.youtube.com/@PhilippHagemeister', + 'uploader': 'Philipp Hagemeister', + } + }], 'playlist_count': 1, + 'params': {'extract_flat': True}, }, { 'note': 'API Fallback: Recommended - redirects to home page. 
Requires visitorData', 'url': 'https://www.youtube.com/feed/recommended', @@ -5687,13 +6012,10 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', 'title': 'Cody\'sLab - Videos', 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa', - 'uploader': 'Cody\'sLab', - 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', 'channel': 'Cody\'sLab', 'channel_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', 'tags': [], 'channel_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw', - 'uploader_url': 'https://www.youtube.com/channel/UCu6mSoMNzHQiBIOCkHUa2Aw', 'channel_follower_count': int }, 'playlist_mincount': 650, @@ -5707,9 +6029,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', 'info_dict': { 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', - 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', 'title': 'Uploads from Royalty Free Music - Topic', - 'uploader': 'Royalty Free Music - Topic', 'modified_date': r're:\d{8}', 'channel_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', 'description': '', @@ -5717,14 +6037,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'tags': [], 'channel': 'Royalty Free Music - Topic', 'view_count': int, - 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', 'availability': 'public', + 'uploader': 'Royalty Free Music - Topic', }, 'playlist_mincount': 101, 'params': { 'skip_download': True, 'extractor_args': {'youtubetab': {'skip': ['webpage']}} }, + 'expected_warnings': ['YouTube Music is not directly supported', r'[Uu]navailable videos (are|will be) hidden'], }, { 'note': 'non-standard redirect to regional channel', 'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ', @@ -5737,15 +6058,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'modified_date': '20220407', 'channel_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', 'tags': [], - 'uploader_id': 'UCKcqXmCcyqnhgpA5P0oHH_Q', - 'uploader': 'pukkandan', 'availability': 'unlisted', 'channel_id': 'UCKcqXmCcyqnhgpA5P0oHH_Q', 'channel': 'pukkandan', 'description': 'Test for collaborative playlist', 'title': 'hypervideo test - collaborative playlist', 'view_count': int, - 'uploader_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', + 'uploader_url': 'https://www.youtube.com/@pukkandan', + 'uploader_id': '@pukkandan', + 'uploader': 'pukkandan', }, 'playlist_mincount': 2 }, { @@ -5754,15 +6075,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'id': 'UCiu-3thuViMebBjw_5nWYrA', 'tags': [], - 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', 'description': 'test description', 'title': 'cole-dlp-test-acc - 再生リスト', - 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', - 'uploader': 'cole-dlp-test-acc', 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', 'channel': 'cole-dlp-test-acc', - 'channel_follower_count': int, + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', + 'uploader': 'cole-dlp-test-acc', }, 'playlist_mincount': 1, 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, @@ -5776,14 +6096,14 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'tags': [], 'view_count': int, 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', - 'uploader': 'cole-dlp-test-acc', - 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', 'channel': 'cole-dlp-test-acc', 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', 'description': 'test', - 'uploader_url': 
'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', 'title': 'dlp test playlist', 'availability': 'public', + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', + 'uploader': 'cole-dlp-test-acc', }, 'playlist_mincount': 1, 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, @@ -5835,29 +6155,30 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_follower_count': int, 'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA', 'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', - 'uploader': 'Polka Ch. 尾丸ポルカ', - 'description': 'md5:3b8df1ac5af337aa206e37ee3d181ec9', + 'description': 'md5:e56b74b5bb7e9c701522162e9abfb822', 'channel': 'Polka Ch. 尾丸ポルカ', 'tags': 'count:35', - 'uploader_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', - 'uploader_id': 'UCK9V2B22uJYu3N7eR_BT9QA', + 'uploader_url': 'https://www.youtube.com/@OmaruPolka', + 'uploader': 'Polka Ch. 尾丸ポルカ', + 'uploader_id': '@OmaruPolka', }, 'playlist_count': 3, }, { # Shorts tab with channel with handle + # TODO: fix channel description 'url': 'https://www.youtube.com/@NotJustBikes/shorts', 'info_dict': { 'id': 'UC0intLFzLaudFG-xAvUEO-A', 'title': 'Not Just Bikes - Shorts', 'tags': 'count:12', - 'uploader': 'Not Just Bikes', 'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', - 'description': 'md5:7513148b1f02b924783157d84c4ea555', + 'description': 'md5:26bc55af26855a608a5cf89dfa595c8d', 'channel_follower_count': int, - 'uploader_id': 'UC0intLFzLaudFG-xAvUEO-A', 'channel_id': 'UC0intLFzLaudFG-xAvUEO-A', - 'uploader_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', 'channel': 'Not Just Bikes', + 'uploader_url': 'https://www.youtube.com/@NotJustBikes', + 'uploader': 'Not Just Bikes', + 'uploader_id': '@NotJustBikes', }, 'playlist_mincount': 10, }, { @@ -5869,12 +6190,12 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'tags': 'count:7', 'channel_id': 'UC3eYAvjCVwNHgkaGbXX3sig', 'channel_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', - 'uploader_id': 'UC3eYAvjCVwNHgkaGbXX3sig', 'channel': '中村悠一', - 'uploader_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', 'channel_follower_count': int, - 'uploader': '中村悠一', 'description': 'md5:e744f6c93dafa7a03c0c6deecb157300', + 'uploader_url': 'https://www.youtube.com/@Yuichi-Nakamura', + 'uploader_id': '@Yuichi-Nakamura', + 'uploader': '中村悠一', }, 'playlist_mincount': 60, }, { @@ -5893,15 +6214,15 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'id': 'UCgJ5_1F6yJhYLnyMszUdmUg', 'title': 'Shorts Break - Shorts', - 'tags': 'count:32', + 'tags': 'count:48', 'channel_id': 'UCgJ5_1F6yJhYLnyMszUdmUg', 'channel': 'Shorts Break', - 'description': 'md5:a6c234cf3d50d878ef8721e34457cd11', - 'uploader': 'Shorts Break', + 'description': 'md5:6de33c5e7ba686e5f3efd4e19c7ef499', 'channel_follower_count': int, - 'uploader_id': 'UCgJ5_1F6yJhYLnyMszUdmUg', - 'uploader_url': 'https://www.youtube.com/channel/UCgJ5_1F6yJhYLnyMszUdmUg', 'channel_url': 'https://www.youtube.com/channel/UCgJ5_1F6yJhYLnyMszUdmUg', + 'uploader_url': 'https://www.youtube.com/@ShortsBreak_Official', + 'uploader': 'Shorts Break', + 'uploader_id': '@ShortsBreak_Official', }, 'playlist_mincount': 30, }, { @@ -5924,31 +6245,28 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'playlist_mincount': 30, }, { # Shorts url result in shorts tab + # TODO: Fix channel id extraction 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts', 'info_dict': { 'id': 
'UCiu-3thuViMebBjw_5nWYrA', 'title': 'cole-dlp-test-acc - Shorts', - 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', 'channel': 'cole-dlp-test-acc', - 'channel_follower_count': int, 'description': 'test description', 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', 'tags': [], + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', 'uploader': 'cole-dlp-test-acc', - 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', - }, 'playlist': [{ 'info_dict': { + # Channel data is not currently available for short renderers (as of 2023-03-01) '_type': 'url', 'ie_key': 'Youtube', 'url': 'https://www.youtube.com/shorts/sSM9J5YH_60', 'id': 'sSM9J5YH_60', - 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', 'title': 'SHORT short', - 'channel': 'cole-dlp-test-acc', - 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', 'view_count': int, 'thumbnails': list, } @@ -5974,10 +6292,124 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': str, 'concurrent_view_count': int, 'channel': str, + 'uploader': str, + 'uploader_url': str, + 'uploader_id': str, + 'channel_is_verified': bool, # this will keep changing } }], - 'params': {'extract_flat': True}, + 'params': {'extract_flat': True, 'playlist_items': '1'}, 'playlist_mincount': 1 + }, { + # Channel renderer metadata. Contains number of videos on the channel + 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/channels', + 'info_dict': { + 'id': 'UCiu-3thuViMebBjw_5nWYrA', + 'title': 'cole-dlp-test-acc - Channels', + 'channel': 'cole-dlp-test-acc', + 'description': 'test description', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'tags': [], + 'uploader_url': 'https://www.youtube.com/@coletdjnz', + 'uploader_id': '@coletdjnz', + 'uploader': 'cole-dlp-test-acc', + }, + 'playlist': [{ + 'info_dict': { + '_type': 'url', + 'ie_key': 'YoutubeTab', + 'url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'title': 'PewDiePie', + 'channel': 'PewDiePie', + 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'thumbnails': list, + 'channel_follower_count': int, + 'playlist_count': int, + 'uploader': 'PewDiePie', + 'uploader_url': 'https://www.youtube.com/@PewDiePie', + 'uploader_id': '@PewDiePie', + 'channel_is_verified': True, + } + }], + 'params': {'extract_flat': True}, + }, { + 'url': 'https://www.youtube.com/@3blue1brown/about', + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'tags': ['Mathematics'], + 'title': '3Blue1Brown - About', + 'channel_follower_count': int, + 'channel_id': 'UCYO_jab_esuFRV4b17AJtAw', + 'channel': '3Blue1Brown', + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader_url': 'https://www.youtube.com/@3blue1brown', + 'uploader_id': '@3blue1brown', + 'uploader': '3Blue1Brown', + 'channel_is_verified': True, + }, + 'playlist_count': 0, + }, { + # Podcasts tab, with rich entry playlistRenderers + 'url': 'https://www.youtube.com/@99percentinvisiblepodcast/podcasts', + 'info_dict': { + 'id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'channel_id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'uploader_url': 'https://www.youtube.com/@99percentinvisiblepodcast', + 'description': 
'md5:3a0ed38f1ad42a68ef0428c04a15695c', + 'title': '99 Percent Invisible - Podcasts', + 'uploader': '99 Percent Invisible', + 'channel_follower_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCVMF2HD4ZgC0QHpU9Yq5Xrw', + 'tags': [], + 'channel': '99 Percent Invisible', + 'uploader_id': '@99percentinvisiblepodcast', + }, + 'playlist_count': 1, + }, { + # Releases tab, with rich entry playlistRenderers (same as Podcasts tab) + 'url': 'https://www.youtube.com/@AHimitsu/releases', + 'info_dict': { + 'id': 'UCgFwu-j5-xNJml2FtTrrB3A', + 'channel': 'A Himitsu', + 'uploader_url': 'https://www.youtube.com/@AHimitsu', + 'title': 'A Himitsu - Releases', + 'uploader_id': '@AHimitsu', + 'uploader': 'A Himitsu', + 'channel_id': 'UCgFwu-j5-xNJml2FtTrrB3A', + 'tags': 'count:16', + 'description': 'I make music', + 'channel_url': 'https://www.youtube.com/channel/UCgFwu-j5-xNJml2FtTrrB3A', + 'channel_follower_count': int, + 'channel_is_verified': True, + }, + 'playlist_mincount': 10, + }, { + # Playlist with only shorts, shown as reel renderers + # FIXME: future: YouTube currently doesn't give continuation for this, + # may do in future. + 'url': 'https://www.youtube.com/playlist?list=UUxqPAgubo4coVn9Lx1FuKcg', + 'info_dict': { + 'id': 'UUxqPAgubo4coVn9Lx1FuKcg', + 'channel_url': 'https://www.youtube.com/channel/UCxqPAgubo4coVn9Lx1FuKcg', + 'view_count': int, + 'uploader_id': '@BangyShorts', + 'description': '', + 'uploader_url': 'https://www.youtube.com/@BangyShorts', + 'channel_id': 'UCxqPAgubo4coVn9Lx1FuKcg', + 'channel': 'Bangy Shorts', + 'uploader': 'Bangy Shorts', + 'tags': [], + 'availability': 'public', + 'modified_date': '20230626', + 'title': 'Uploads from Bangy Shorts', + }, + 'playlist_mincount': 100, + 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }] @classmethod @@ -6044,6 +6476,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): original_tab_id, display_id = tab[1:], f'{item_id}{tab}' if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts: url = f'{pre}/videos{post}' + if smuggled_data.get('is_music_url'): + self.report_warning(f'YouTube Music is not directly supported. 
Redirecting to {url}') # Handle both video/playlist URLs qs = parse_qs(url) @@ -6192,15 +6626,15 @@ class YoutubePlaylistIE(InfoExtractor): 'title': '[OLD]Team Fortress 2 (Class-based LP)', 'id': 'PLBB231211A4F62143', 'uploader': 'Wickman', - 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', + 'uploader_id': '@WickmanVT', 'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2', 'view_count': int, - 'uploader_url': 'https://www.youtube.com/c/WickmanVT', + 'uploader_url': 'https://www.youtube.com/@WickmanVT', 'modified_date': r're:\d{8}', 'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', 'channel': 'Wickman', 'tags': [], - 'channel_url': 'https://www.youtube.com/c/WickmanVT', + 'channel_url': 'https://www.youtube.com/channel/UCKSpbfbl5kRQpTdL7kMc-1Q', 'availability': 'public', }, 'playlist_mincount': 29, @@ -6220,7 +6654,7 @@ class YoutubePlaylistIE(InfoExtractor): 'title': 'JODA15', 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', 'uploader': 'milan', - 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', + 'uploader_id': '@milan5503', 'description': '', 'channel_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw', 'tags': [], @@ -6228,7 +6662,7 @@ class YoutubePlaylistIE(InfoExtractor): 'view_count': int, 'channel': 'milan', 'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw', - 'uploader_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw', + 'uploader_url': 'https://www.youtube.com/@milan5503', 'availability': 'public', }, 'expected_warnings': [r'[Uu]navailable videos? (is|are|will be) hidden'], @@ -6239,13 +6673,13 @@ class YoutubePlaylistIE(InfoExtractor): 'title': '2018 Chinese New Singles (11/6 updated)', 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', 'uploader': 'LBK', - 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA', + 'uploader_id': '@music_king', 'description': 'md5:da521864744d60a198e3a88af4db0d9d', 'channel': 'LBK', 'view_count': int, - 'channel_url': 'https://www.youtube.com/c/愛低音的國王', + 'channel_url': 'https://www.youtube.com/channel/UC21nz3_MesPLqtDqwdvnoxA', 'tags': [], - 'uploader_url': 'https://www.youtube.com/c/愛低音的國王', + 'uploader_url': 'https://www.youtube.com/@music_king', 'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA', 'modified_date': r're:\d{8}', 'availability': 'public', @@ -6291,8 +6725,8 @@ class YoutubeYtBeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Small Scale Baler and Braiding Rugs', 'uploader': 'Backus-Page House Museum', - 'uploader_id': 'backuspagemuseum', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', + 'uploader_id': '@backuspagemuseum', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/@backuspagemuseum', 'upload_date': '20161008', 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', 'categories': ['Nonprofits & Activism'], @@ -6300,7 +6734,7 @@ class YoutubeYtBeIE(InfoExtractor): 'like_count': int, 'age_limit': 0, 'playable_in_embed': True, - 'thumbnail': 'https://i.ytimg.com/vi_webp/yeWKywCrFtk/maxresdefault.webp', + 'thumbnail': r're:^https?://.*\.webp', 'channel': 'Backus-Page House Museum', 'channel_id': 'UCEfMCQ9bs3tjvjy1s451zaw', 'live_status': 'not_live', @@ -6416,7 +6850,7 @@ class YoutubeNotificationsIE(YoutubeTabBaseInfoExtractor): if not video_id: browse_ep = traverse_obj( notification, ('navigationEndpoint', 'browseEndpoint'), expected_type=dict) - channel_id = traverse_obj(browse_ep, 'browseId', expected_type=str) + channel_id = self.ucid_or_none(traverse_obj(browse_ep, 'browseId', expected_type=str)) post_id = self._search_regex( r'/post/(.+)', traverse_obj(browse_ep, 'canonicalBaseUrl', expected_type=str), 'post id', default=None) 
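Many hunks in this diff route raw strings through ucid_or_none/handle_or_none/handle_from_url before trusting them as channel metadata. The patterns below are assumptions inferred from the identifiers appearing in these tests ('UC' plus 22 URL-safe characters; '@'-prefixed handles), not the library's actual definitions:

    import re

    UCID_RE = re.compile(r'UC[\w-]{22}')      # assumed shape of a channel ID
    HANDLE_RE = re.compile(r'@[\w.-]{3,30}')  # assumed shape of a handle

    def ucid_or_none(ucid):
        return ucid if ucid and UCID_RE.fullmatch(ucid) else None

    def handle_or_none(handle):
        return handle if handle and HANDLE_RE.fullmatch(handle) else None

    assert ucid_or_none('UCmlqkdCBesrv2Lak1mF_MxA') is not None
    assert ucid_or_none('@sergeym.6173') is None
    assert handle_or_none('@sergeym.6173') == '@sergeym.6173'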
@@ -6446,6 +6880,7 @@ class YoutubeNotificationsIE(YoutubeTabBaseInfoExtractor): 'title': title, 'channel_id': channel_id, 'channel': channel, + 'uploader': channel, 'thumbnails': self._extract_thumbnails(notification, 'videoThumbnail'), 'timestamp': timestamp, } @@ -6532,6 +6967,36 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): # }], }, }, { + # Channel results + 'url': 'https://www.youtube.com/results?search_query=kurzgesagt&sp=EgIQAg%253D%253D', + 'info_dict': { + 'id': 'kurzgesagt', + 'title': 'kurzgesagt', + }, + 'playlist': [{ + 'info_dict': { + '_type': 'url', + 'id': 'UCsXVk37bltHxD1rDPwtNM8Q', + 'url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', + 'ie_key': 'YoutubeTab', + 'channel': 'Kurzgesagt – In a Nutshell', + 'description': 'md5:4ae48dfa9505ffc307dad26342d06bfc', + 'title': 'Kurzgesagt – In a Nutshell', + 'channel_id': 'UCsXVk37bltHxD1rDPwtNM8Q', + # No longer available for search as it is set to the handle. + # 'playlist_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCsXVk37bltHxD1rDPwtNM8Q', + 'thumbnails': list, + 'uploader_id': '@kurzgesagt', + 'uploader_url': 'https://www.youtube.com/@kurzgesagt', + 'uploader': 'Kurzgesagt – In a Nutshell', + 'channel_is_verified': True, + 'channel_follower_count': int, + } + }], + 'params': {'extract_flat': True, 'playlist_items': '1'}, + 'playlist_mincount': 1, + }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True, }] @@ -6669,22 +7134,6 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): }] -class YoutubeStoriesIE(InfoExtractor): - IE_DESC = 'YouTube channel stories; "ytstories:" prefix' - IE_NAME = 'youtube:stories' - _VALID_URL = r'ytstories:UC(?P<id>[A-Za-z0-9_-]{21}[AQgw])$' - _TESTS = [{ - 'url': 'ytstories:UCwFCb4jeqaKWnciAYM-ZVHg', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = f'RLTD{self._match_id(url)}' - return self.url_result( - smuggle_url(f'https://www.youtube.com/playlist?list={playlist_id}&playnext=1', {'is_story': True}), - ie=YoutubeTabIE, video_id=playlist_id) - - class YoutubeShortsAudioPivotIE(InfoExtractor): IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video)' IE_NAME = 'youtube:shorts:pivot:audio' @@ -6784,11 +7233,14 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'title': 'Mobile Games on Console - Scott The Woz', 'upload_date': '20210920', 'uploader': 'Scott The Woz', - 'uploader_id': 'scottthewoz', - 'uploader_url': 'http://www.youtube.com/user/scottthewoz', + 'uploader_id': '@ScottTheWoz', + 'uploader_url': 'https://www.youtube.com/@ScottTheWoz', 'view_count': int, 'live_status': 'not_live', - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': 'count:20', + 'comment_count': int, + 'heatmap': 'count:100', } }] @@ -6816,6 +7268,53 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): } +class YoutubeConsentRedirectIE(YoutubeBaseInfoExtractor): + IE_NAME = 'youtube:consent' + IE_DESC = False # Do not list + _VALID_URL = r'https?://consent\.youtube\.com/m\?' 
+ _TESTS = [{ + 'url': 'https://consent.youtube.com/m?continue=https%3A%2F%2Fwww.youtube.com%2Flive%2FqVv6vCqciTM%3Fcbrd%3D1&gl=NL&m=0&pc=yt&hl=en&src=1', + 'info_dict': { + 'id': 'qVv6vCqciTM', + 'ext': 'mp4', + 'age_limit': 0, + 'uploader_id': '@sana_natori', + 'comment_count': int, + 'chapters': 'count:13', + 'upload_date': '20221223', + 'thumbnail': 'https://i.ytimg.com/vi/qVv6vCqciTM/maxresdefault.jpg', + 'channel_url': 'https://www.youtube.com/channel/UCIdEIHpS0TdkqRkHL5OkLtA', + 'uploader_url': 'https://www.youtube.com/@sana_natori', + 'like_count': int, + 'release_date': '20221223', + 'tags': ['Vtuber', '月ノ美兎', '名取さな', 'にじさんじ', 'クリスマス', '3D配信'], + 'title': '【 #インターネット女クリスマス 】3Dで歌ってはしゃぐインターネットの女たち【月ノ美兎/名取さな】', + 'view_count': int, + 'playable_in_embed': True, + 'duration': 4438, + 'availability': 'public', + 'channel_follower_count': int, + 'channel_id': 'UCIdEIHpS0TdkqRkHL5OkLtA', + 'categories': ['Entertainment'], + 'live_status': 'was_live', + 'release_timestamp': 1671793345, + 'channel': 'さなちゃんねる', + 'description': 'md5:6aebf95cc4a1d731aebc01ad6cc9806d', + 'uploader': 'さなちゃんねる', + 'channel_is_verified': True, + 'heatmap': 'count:100', + }, + 'add_ie': ['Youtube'], + 'params': {'skip_download': 'Youtube'}, + }] + + def _real_extract(self, url): + redirect_url = url_or_none(parse_qs(url).get('continue', [None])[-1]) + if not redirect_url: + raise ExtractorError('Invalid cookie consent redirect URL', expected=True) + return self.url_result(redirect_url) + + class YoutubeTruncatedIDIE(InfoExtractor): IE_NAME = 'youtube:truncated_id' IE_DESC = False # Do not list diff --git a/hypervideo_dl/extractor/zaiko.py b/hypervideo_dl/extractor/zaiko.py new file mode 100644 index 0000000..0ccacbb --- /dev/null +++ b/hypervideo_dl/extractor/zaiko.py @@ -0,0 +1,130 @@ +import base64 + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + extract_attributes, + int_or_none, + str_or_none, + traverse_obj, + try_call, + unescapeHTML, + url_or_none, +) + + +class ZaikoBaseIE(InfoExtractor): + def _download_real_webpage(self, url, video_id): + webpage, urlh = self._download_webpage_handle(url, video_id) + final_url = urlh.url + if 'zaiko.io/login' in final_url: + self.raise_login_required() + elif '/_buy/' in final_url: + raise ExtractorError('Your account does not have tickets to this event', expected=True) + return webpage + + def _parse_vue_element_attr(self, name, string, video_id): + page_elem = self._search_regex(rf'(<{name}[^>]+>)', string, name) + attrs = {} + for key, value in extract_attributes(page_elem).items(): + if key.startswith(':'): + attrs[key[1:]] = self._parse_json( + value, video_id, transform_source=unescapeHTML, fatal=False) + return attrs + + +class ZaikoIE(ZaikoBaseIE): + _VALID_URL = r'https?://(?:[\w-]+\.)?zaiko\.io/event/(?P<id>\d+)/stream(?:/\d+)+' + _TESTS = [{ + 'url': 'https://zaiko.io/event/324868/stream/20571/20571', + 'info_dict': { + 'id': '324868', + 'ext': 'mp4', + 'title': 'ZAIKO STREAMING TEST', + 'alt_title': '[VOD] ZAIKO STREAMING TEST_20210603(Do Not Delete)', + 'uploader_id': '454', + 'uploader': 'ZAIKO ZERO', + 'release_timestamp': 1583809200, + 'thumbnail': r're:https://[a-z0-9]+.cloudfront.net/[a-z0-9_]+/[a-z0-9_]+', + 'release_date': '20200310', + 'categories': ['Tech House'], + 'live_status': 'was_live', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_real_webpage(url, video_id) + stream_meta = 
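The new YoutubeConsentRedirectIE does one thing: it pulls the real destination out of the consent page's `continue` query parameter and hands it back through `url_result`. A standalone sketch of that unwrapping, assuming plain-stdlib parsing and a made-up helper name (the extractor itself uses yt-dlp's `parse_qs` and `url_or_none`):

```python
from urllib.parse import parse_qs, urlparse

def unwrap_consent_url(url):
    # The consent page keeps the original destination in 'continue';
    # take the last value, mirroring the [-1] indexing in the extractor.
    candidate = (parse_qs(urlparse(url).query).get('continue') or [None])[-1]
    if candidate and candidate.startswith(('http://', 'https://')):
        return candidate
    return None

print(unwrap_consent_url(
    'https://consent.youtube.com/m?continue=https%3A%2F%2Fwww.youtube.com'
    '%2Flive%2FqVv6vCqciTM%3Fcbrd%3D1&gl=NL&m=0&pc=yt&hl=en&src=1'))
# https://www.youtube.com/live/qVv6vCqciTM?cbrd=1
```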
diff --git a/hypervideo_dl/extractor/zaiko.py b/hypervideo_dl/extractor/zaiko.py
new file mode 100644
index 0000000..0ccacbb
--- /dev/null
+++ b/hypervideo_dl/extractor/zaiko.py
@@ -0,0 +1,130 @@
+import base64
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    extract_attributes,
+    int_or_none,
+    str_or_none,
+    traverse_obj,
+    try_call,
+    unescapeHTML,
+    url_or_none,
+)
+
+
+class ZaikoBaseIE(InfoExtractor):
+    def _download_real_webpage(self, url, video_id):
+        webpage, urlh = self._download_webpage_handle(url, video_id)
+        final_url = urlh.url
+        if 'zaiko.io/login' in final_url:
+            self.raise_login_required()
+        elif '/_buy/' in final_url:
+            raise ExtractorError('Your account does not have tickets to this event', expected=True)
+        return webpage
+
+    def _parse_vue_element_attr(self, name, string, video_id):
+        page_elem = self._search_regex(rf'(<{name}[^>]+>)', string, name)
+        attrs = {}
+        for key, value in extract_attributes(page_elem).items():
+            if key.startswith(':'):
+                attrs[key[1:]] = self._parse_json(
+                    value, video_id, transform_source=unescapeHTML, fatal=False)
+        return attrs
+
+
+class ZaikoIE(ZaikoBaseIE):
+    _VALID_URL = r'https?://(?:[\w-]+\.)?zaiko\.io/event/(?P<id>\d+)/stream(?:/\d+)+'
+    _TESTS = [{
+        'url': 'https://zaiko.io/event/324868/stream/20571/20571',
+        'info_dict': {
+            'id': '324868',
+            'ext': 'mp4',
+            'title': 'ZAIKO STREAMING TEST',
+            'alt_title': '[VOD] ZAIKO STREAMING TEST_20210603(Do Not Delete)',
+            'uploader_id': '454',
+            'uploader': 'ZAIKO ZERO',
+            'release_timestamp': 1583809200,
+            'thumbnail': r're:https://[a-z0-9]+.cloudfront.net/[a-z0-9_]+/[a-z0-9_]+',
+            'release_date': '20200310',
+            'categories': ['Tech House'],
+            'live_status': 'was_live',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_real_webpage(url, video_id)
+        stream_meta = self._parse_vue_element_attr('stream-page', webpage, video_id)
+
+        player_page = self._download_webpage(
+            stream_meta['stream-access']['video_source'], video_id,
+            'Downloading player page', headers={'referer': 'https://zaiko.io/'})
+        player_meta = self._parse_vue_element_attr('player', player_page, video_id)
+        status = traverse_obj(player_meta, ('initial_event_info', 'status', {str}))
+        live_status, msg, expected = {
+            'vod': ('was_live', 'No VOD stream URL was found', False),
+            'archiving': ('post_live', 'Event VOD is still being processed', True),
+            'deleting': ('post_live', 'This event has ended', True),
+            'deleted': ('post_live', 'This event has ended', True),
+            'error': ('post_live', 'This event has ended', True),
+            'disconnected': ('post_live', 'Stream has been disconnected', True),
+            'live_to_disconnected': ('post_live', 'Stream has been disconnected', True),
+            'live': ('is_live', 'No livestream URL found was found', False),
+            'waiting': ('is_upcoming', 'Live event has not yet started', True),
+            'cancelled': ('not_live', 'Event has been cancelled', True),
+        }.get(status) or ('not_live', f'Unknown event status "{status}"', False)
+
+        stream_url = traverse_obj(player_meta, ('initial_event_info', 'endpoint', {url_or_none}))
+        formats = self._extract_m3u8_formats(
+            stream_url, video_id, live=True, fatal=False) if stream_url else []
+        if not formats:
+            self.raise_no_formats(msg, expected=expected)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'live_status': live_status,
+            **traverse_obj(stream_meta, {
+                'title': ('event', 'name', {str}),
+                'uploader': ('profile', 'name', {str}),
+                'uploader_id': ('profile', 'id', {str_or_none}),
+                'release_timestamp': ('stream', 'start', 'timestamp', {int_or_none}),
+                'categories': ('event', 'genres', ..., {lambda x: x or None}),
+            }),
+            **traverse_obj(player_meta, ('initial_event_info', {
+                'alt_title': ('title', {str}),
+                'thumbnail': ('poster_url', {url_or_none}),
+            })),
+        }
+
+
+class ZaikoETicketIE(ZaikoBaseIE):
+    _VALID_URL = r'https?://(?:www.)?zaiko\.io/account/eticket/(?P<id>[\w=-]{49})'
+    _TESTS = [{
+        'url': 'https://zaiko.io/account/eticket/TZjMwMzQ2Y2EzMXwyMDIzMDYwNzEyMTMyNXw1MDViOWU2Mw==',
+        'playlist_count': 1,
+        'info_dict': {
+            'id': 'f30346ca31-20230607121325-505b9e63',
+            'title': 'ZAIKO STREAMING TEST',
+            'thumbnail': 'https://media.zkocdn.net/pf_1/1_3wdyjcjyupseatkwid34u',
+        },
+        'skip': 'Only available with the ticketholding account',
+    }]
+
+    def _real_extract(self, url):
+        ticket_id = self._match_id(url)
+        ticket_id = try_call(
+            lambda: base64.urlsafe_b64decode(ticket_id[1:]).decode().replace('|', '-')) or ticket_id
+
+        webpage = self._download_real_webpage(url, ticket_id)
+        eticket = self._parse_vue_element_attr('eticket', webpage, ticket_id)
+
+        return self.playlist_result(
+            [self.url_result(stream, ZaikoIE) for stream in traverse_obj(eticket, ('streams', ..., 'url'))],
+            ticket_id, **traverse_obj(eticket, ('ticket-details', {
+                'title': 'event_name',
+                'thumbnail': 'event_img_url',
+            })))
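Both Zaiko extractors read their metadata out of Vue-style bound attributes: the page embeds JSON as `:key='...'` attributes on a custom element, which `_parse_vue_element_attr` collects via `extract_attributes` and `unescapeHTML`. A self-contained approximation using only the standard library; the class name and sample markup below are illustrative, not taken from the site:

```python
import json
from html.parser import HTMLParser

class VueAttrParser(HTMLParser):
    """Collect ':'-prefixed (Vue-bound) attributes of one element as JSON."""
    def __init__(self, tag_name):
        super().__init__()
        self.tag_name, self.attrs = tag_name, {}

    def handle_starttag(self, tag, attrs):
        if tag == self.tag_name:
            for key, value in attrs:
                # HTMLParser has already unescaped HTML entities here,
                # which is the role unescapeHTML plays in the extractor.
                if key.startswith(':') and value:
                    self.attrs[key[1:]] = json.loads(value)

parser = VueAttrParser('stream-page')
parser.feed('''<stream-page :event='{"name": "ZAIKO STREAMING TEST"}'></stream-page>''')
print(parser.attrs['event']['name'])  # ZAIKO STREAMING TEST
```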
diff --git a/hypervideo_dl/extractor/zattoo.py b/hypervideo_dl/extractor/zattoo.py
index 22620c0..6bd9ea0 100644
--- a/hypervideo_dl/extractor/zattoo.py
+++ b/hypervideo_dl/extractor/zattoo.py
@@ -2,7 +2,8 @@ import re
 from uuid import uuid4
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError, compat_str
+from ..compat import compat_str
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -36,7 +37,7 @@ class ZattooPlatformBaseIE(InfoExtractor):
                     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                 })
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 400:
                 raise ExtractorError(
                     'Unable to login: incorrect username and/or password',
                     expected=True)
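This is the first of two hunks in this diff (the other is in zype.py at the end) migrating from `compat_HTTPError` to `..networking.exceptions.HTTPError`, which renames `.code` to `.status` and moves body reading onto a `.response` object. A toy analogue of that shape, with both classes stubbed locally rather than imported from the library, so the attribute layout can be exercised in isolation:

```python
class Response:
    """Local stub standing in for the new networking layer's response object."""
    def __init__(self, status, body=b''):
        self.status, self._body = status, body

    def read(self):
        return self._body

class HTTPError(Exception):
    """Local stub mirroring the attribute layout these hunks rely on."""
    def __init__(self, response):
        super().__init__(f'HTTP Error {response.status}')
        self.response = response
        self.status = response.status  # compat_HTTPError spelled this .code

try:
    raise HTTPError(Response(400, b'{"message": "bad credentials"}'))
except HTTPError as e:
    assert e.status == 400                  # was: e.cause.code
    assert b'message' in e.response.read()  # was: e.cause.read()
```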
diff --git a/hypervideo_dl/extractor/zdf.py b/hypervideo_dl/extractor/zdf.py
index fca426a..c04d51b 100644
--- a/hypervideo_dl/extractor/zdf.py
+++ b/hypervideo_dl/extractor/zdf.py
@@ -24,7 +24,7 @@ from ..utils import (
 
 class ZDFBaseIE(InfoExtractor):
     _GEO_COUNTRIES = ['DE']
-    _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd')
+    _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd')
 
     def _call_api(self, url, video_id, item, api_token=None, referrer=None):
         headers = {}
@@ -61,6 +61,9 @@ class ZDFBaseIE(InfoExtractor):
         elif mime_type == 'application/f4m+xml' or ext == 'f4m':
             new_formats = self._extract_f4m_formats(
                 update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)
+        elif ext == 'mpd':
+            new_formats = self._extract_mpd_formats(
+                format_url, video_id, mpd_id='dash', fatal=False)
         else:
             f = parse_codecs(meta.get('mimeCodec'))
             if not f and meta.get('type'):
@@ -174,7 +177,8 @@ class ZDFIE(ZDFBaseIE):
             'thumbnail': 'md5:e65f459f741be5455c952cd820eb188e',
             'title': 'heute journal vom 30.12.2021',
             'timestamp': 1640897100,
-        }
+        },
+        'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"',
     }, {
         'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
         'info_dict': {
@@ -189,7 +193,7 @@ class ZDFIE(ZDFBaseIE):
         },
     }, {
         'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html',
-        'md5': '1b93bdec7d02fc0b703c5e7687461628',
+        'md5': '57af4423db0455a3975d2dc4578536bc',
         'info_dict': {
             'ext': 'mp4',
             'id': 'video_funk_1770473',
@@ -198,7 +202,7 @@ class ZDFIE(ZDFBaseIE):
             'title': 'Alles ist verzaubert',
             'timestamp': 1635520560,
             'upload_date': '20211029',
-            'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-100~1920x1080?cb=1636466431799',
+            'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-102~1920x1080?cb=1663848412907',
         },
     }, {
         # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
@@ -241,10 +245,23 @@ class ZDFIE(ZDFBaseIE):
             'title': 'Das Geld anderer Leute',
             'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d',
             'duration': 2581.0,
-            'timestamp': 1654790700,
-            'upload_date': '20220609',
+            'timestamp': 1675160100,
+            'upload_date': '20230131',
             'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350',
         },
+    }, {
+        'url': 'https://www.zdf.de/dokumentation/terra-x/unser-gruener-planet-wuesten-doku-100.html',
+        'info_dict': {
+            'id': '220605_dk_gruener_planet_wuesten_tex',
+            'ext': 'mp4',
+            'title': 'Unser grüner Planet - Wüsten',
+            'description': 'md5:4fc647b6f9c3796eea66f4a0baea2862',
+            'duration': 2613.0,
+            'timestamp': 1654450200,
+            'upload_date': '20220605',
+            'format_note': 'uhd, main',
+            'thumbnail': 'https://www.zdf.de/assets/saguaro-kakteen-102~3840x2160?cb=1655910690796',
+        },
     }]
 
     def _extract_entry(self, url, player, content, video_id):
@@ -259,7 +276,7 @@ class ZDFIE(ZDFBaseIE):
             raise ExtractorError('Could not extract ptmd_path')
 
         info = self._extract_ptmd(
-            urljoin(url, ptmd_path.replace('{playerId}', 'ngplayer_2_4')), video_id, player['apiToken'], url)
+            urljoin(url, ptmd_path.replace('{playerId}', 'android_native_5')), video_id, player['apiToken'], url)
 
         thumbnails = []
         layouts = try_get(
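The ZDF change slots a DASH branch into the existing manifest dispatch: a format entry whose extension is `.mpd` is now routed to `_extract_mpd_formats` alongside the existing HLS and HDS branches. A minimal sketch of that routing logic with a made-up helper name; the HLS mime-type string is an assumption (only the f4m branch is visible in this hunk), and the return labels stand in for the real `_extract_*_formats` calls:

```python
from pathlib import PurePosixPath
from urllib.parse import urlparse

def pick_manifest_handler(format_url, mime_type=None):
    """Route a stream URL to a manifest parser, extension/mime first."""
    ext = PurePosixPath(urlparse(format_url).path).suffix.lstrip('.').lower()
    if mime_type == 'application/x-mpegURL' or ext == 'm3u8':
        return 'hls'     # _extract_m3u8_formats
    if mime_type == 'application/f4m+xml' or ext == 'f4m':
        return 'hds'     # _extract_f4m_formats
    if ext == 'mpd':
        return 'dash'    # _extract_mpd_formats -- the branch this diff adds
    return 'direct'      # plain progressive download

assert pick_manifest_handler('https://example.com/stream.mpd?x=1') == 'dash'
assert pick_manifest_handler('https://example.com/master.m3u8') == 'hls'
```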
diff --git a/hypervideo_dl/extractor/zee5.py b/hypervideo_dl/extractor/zee5.py
index a64eb9e..ca79cf0 100644
--- a/hypervideo_dl/extractor/zee5.py
+++ b/hypervideo_dl/extractor/zee5.py
@@ -1,14 +1,16 @@
 import json
-import random
-import string
+import time
+import uuid
 
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     int_or_none,
+    jwt_decode_hs256,
     parse_age_limit,
     str_or_none,
+    try_call,
     try_get,
     unified_strdate,
     unified_timestamp,
@@ -94,12 +96,12 @@ class Zee5IE(InfoExtractor):
         'url': 'https://www.zee5.com/music-videos/details/adhento-gaani-vunnapaatuga-jersey-nani-shraddha-srinath/0-0-56973',
         'only_matching': True
     }]
-    _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails/secure?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false'
-    _DEVICE_ID = ''.join(random.choices(string.ascii_letters + string.digits, k=20)).ljust(32, '0')
+    _DEVICE_ID = str(uuid.uuid4())
     _USER_TOKEN = None
     _LOGIN_HINT = 'Use "--username <mobile_number>" to login using otp or "--username token" and "--password <user_token>" to login using user token.'
     _NETRC_MACHINE = 'zee5'
     _GEO_COUNTRIES = ['IN']
+    _USER_COUNTRY = None
 
     def _perform_login(self, username, password):
         if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None:
@@ -118,16 +120,21 @@
             self._USER_TOKEN = otp_verify_json.get('token')
             if not self._USER_TOKEN:
                 raise ExtractorError(otp_request_json['message'], expected=True)
-        elif username.lower() == 'token' and len(password) > 1198:
+        elif username.lower() == 'token' and try_call(lambda: jwt_decode_hs256(password)):
             self._USER_TOKEN = password
         else:
             raise ExtractorError(self._LOGIN_HINT, expected=True)
 
+        token = jwt_decode_hs256(self._USER_TOKEN)
+        if token.get('exp', 0) <= int(time.time()):
+            raise ExtractorError('User token has expired', expected=True)
+        self._USER_COUNTRY = token.get('current_country')
+
     def _real_extract(self, url):
         video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
         access_token_request = self._download_json(
-            'https://useraction.zee5.com/token/platform_tokens.php?platform_name=web_app',
-            video_id, note='Downloading access token')
+            'https://launchapi.zee5.com/launch?platform_name=web_app',
+            video_id, note='Downloading access token')['platform_token']
         data = {
             'x-access-token': access_token_request['token']
         }
@@ -137,8 +144,13 @@
             data['X-Z5-Guest-Token'] = self._DEVICE_ID
 
         json_data = self._download_json(
-            self._DETAIL_API_URL.format(video_id, self._DEVICE_ID),
-            video_id, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8'))
+            'https://spapi.zee5.com/singlePlayback/getDetails/secure', video_id, query={
+                'content_id': video_id,
+                'device_id': self._DEVICE_ID,
+                'platform_name': 'desktop_web',
+                'country': self._USER_COUNTRY or self.get_param('geo_bypass_country') or 'IN',
+                'check_parental_control': False,
+            }, headers={'content-type': 'application/json'}, data=json.dumps(data).encode('utf-8'))
         asset_data = json_data['assetDetails']
         show_data = json_data.get('showDetails', {})
         if 'premium' in asset_data['business_type']:
@@ -228,8 +240,8 @@ class Zee5SeriesIE(InfoExtractor):
 
     def _entries(self, show_id):
         access_token_request = self._download_json(
-            'https://useraction.zee5.com/token/platform_tokens.php?platform_name=web_app',
-            show_id, note='Downloading access token')
+            'https://launchapi.zee5.com/launch?platform_name=web_app',
+            show_id, note='Downloading access token')['platform_token']
         headers = {
             'X-Access-Token': access_token_request['token'],
             'Referer': 'https://www.zee5.com/',
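Zee5's login path now trusts the user token only after inspecting its JWT claims: `jwt_decode_hs256` decodes the payload (without verifying the signature) so the extractor can reject expired tokens and pick up `current_country`. A standalone sketch of that decode-and-check, built around a throwaway token constructed on the spot rather than a real one:

```python
import base64
import json
import time

def decode_jwt_payload(token):
    """Read a JWT's claims without signature verification (like jwt_decode_hs256)."""
    payload = token.split('.')[1]
    payload += '=' * (-len(payload) % 4)  # restore stripped base64 padding
    return json.loads(base64.urlsafe_b64decode(payload))

# Build a throwaway token: header.payload.signature, base64url without padding
header = base64.urlsafe_b64encode(json.dumps({'alg': 'HS256'}).encode()).decode().rstrip('=')
claims = base64.urlsafe_b64encode(json.dumps(
    {'exp': int(time.time()) + 3600, 'current_country': 'IN'}).encode()).decode().rstrip('=')
token = f'{header}.{claims}.fake-signature'

payload = decode_jwt_payload(token)
if payload.get('exp', 0) <= int(time.time()):
    raise SystemExit('User token has expired')
print(payload.get('current_country'))  # IN
```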
diff --git a/hypervideo_dl/extractor/zingmp3.py b/hypervideo_dl/extractor/zingmp3.py
index a818c9f..007658c 100644
--- a/hypervideo_dl/extractor/zingmp3.py
+++ b/hypervideo_dl/extractor/zingmp3.py
@@ -1,16 +1,11 @@
-import functools
 import hashlib
 import hmac
+import itertools
 import json
 import urllib.parse
 
 from .common import InfoExtractor
-from ..utils import (
-    OnDemandPagedList,
-    int_or_none,
-    traverse_obj,
-    urljoin,
-)
+from ..utils import int_or_none, traverse_obj, try_call, urljoin
 
 
 class ZingMp3BaseIE(InfoExtractor):
@@ -37,6 +32,7 @@ class ZingMp3BaseIE(InfoExtractor):
         'info-artist': '/api/v2/page/get/artist',
         'user-list-song': '/api/v2/song/get/list',
         'user-list-video': '/api/v2/video/get/list',
+        'hub': '/api/v2/page/get/hub-detail',
     }
 
     def _api_url(self, url_type, params):
@@ -46,9 +42,9 @@ class ZingMp3BaseIE(InfoExtractor):
             ''.join(f'{k}={v}' for k, v in sorted(params.items())).encode()).hexdigest()
         data = {
             **params,
-            'apiKey': '88265e23d4284f25963e6eedac8fbfa3',
-            'sig': hmac.new(
-                b'2aa2d1c561e809b267f3638c4a307aab', f'{api_slug}{sha256}'.encode(), hashlib.sha512).hexdigest(),
+            'apiKey': 'X5BM3w8N7MKozC0B85o4KMlzLZKhV00y',
+            'sig': hmac.new(b'acOrvUS15XRW2o9JksiK1KgQ6Vbds8ZW',
+                            f'{api_slug}{sha256}'.encode(), hashlib.sha512).hexdigest(),
         }
         return f'{self._DOMAIN}{api_slug}?{urllib.parse.urlencode(data)}'
@@ -67,6 +63,19 @@ class ZingMp3BaseIE(InfoExtractor):
         for url in traverse_obj(items, (..., 'link')) or []:
             yield self.url_result(urljoin(self._DOMAIN, url))
 
+    def _fetch_page(self, id_, url_type, page):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
+    def _paged_list(self, _id, url_type):
+        count = 0
+        for page in itertools.count(1):
+            data = self._fetch_page(_id, url_type, page)
+            entries = list(self._parse_items(data.get('items')))
+            count += len(entries)
+            yield from entries
+            if not data.get('hasMore') or try_call(lambda: count > data['total']):
+                break
+
 
 class ZingMp3IE(ZingMp3BaseIE):
     _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip|embed'
@@ -166,8 +175,11 @@ class ZingMp3IE(ZingMp3BaseIE):
                     'height': int_or_none(res),
                 })
 
-        if not formats and item.get('msg') == 'Sorry, this content is not available in your country.':
-            self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
+        if not formats:
+            if item.get('msg') == 'Sorry, this content is not available in your country.':
+                self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
+            else:
+                self.raise_no_formats('The song is only for VIP accounts.')
 
         lyric = item.get('lyric') or self._call_api('lyric', {'id': item_id}, fatal=False).get('file')
 
@@ -200,7 +212,7 @@ class ZingMp3AlbumIE(ZingMp3BaseIE):
             'id': 'ZWZAEZZD',
             'title': 'Những Bài Hát Hay Nhất Của Mr. Siro',
         },
-        'playlist_mincount': 49,
+        'playlist_mincount': 20,
     }, {
         'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
         'only_matching': True,
@@ -305,22 +317,20 @@ class ZingMp3ChartMusicVideoIE(ZingMp3BaseIE):
             'id': 'IWZ9Z086',
             'title': 'the-loai-video_Khong-Loi',
         },
-        'playlist_mincount': 10,
+        'playlist_mincount': 1,
     }]
 
     def _fetch_page(self, song_id, url_type, page):
-        return self._parse_items(self._call_api(url_type, {
+        return self._call_api(url_type, {
             'id': song_id,
             'type': 'genre',
-            'page': page + 1,
+            'page': page,
             'count': self._PER_PAGE
-        }).get('items'))
+        })
 
     def _real_extract(self, url):
         song_id, regions, url_type = self._match_valid_url(url).group('id', 'regions', 'type')
-        return self.playlist_result(
-            OnDemandPagedList(functools.partial(self._fetch_page, song_id, url_type), self._PER_PAGE),
-            song_id, f'{url_type}_{regions}')
+        return self.playlist_result(self._paged_list(song_id, url_type), song_id, f'{url_type}_{regions}')
 
 
 class ZingMp3UserIE(ZingMp3BaseIE):
@@ -331,7 +341,7 @@ class ZingMp3UserIE(ZingMp3BaseIE):
         'info_dict': {
             'id': 'IWZ98609',
             'title': 'Mr. Siro - bai-hat',
-            'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5',
+            'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36',
         },
         'playlist_mincount': 91,
     }, {
@@ -339,7 +349,7 @@
         'info_dict': {
             'id': 'IWZ98609',
             'title': 'Mr. Siro - album',
-            'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5',
+            'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36',
         },
         'playlist_mincount': 3,
     }, {
@@ -347,7 +357,7 @@
         'info_dict': {
             'id': 'IWZ98609',
             'title': 'Mr. Siro - single',
-            'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5',
+            'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36',
         },
         'playlist_mincount': 20,
     }, {
@@ -355,19 +365,19 @@
         'info_dict': {
             'id': 'IWZ98609',
             'title': 'Mr. Siro - video',
-            'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5',
+            'description': 'md5:5bdcf45e955dc1b8d7f518f322ffef36',
         },
         'playlist_mincount': 15,
     }]
 
     def _fetch_page(self, user_id, url_type, page):
         url_type = 'user-list-song' if url_type == 'bai-hat' else 'user-list-video'
-        return self._parse_items(self._call_api(url_type, {
+        return self._call_api(url_type, {
             'id': user_id,
             'type': 'artist',
-            'page': page + 1,
+            'page': page,
             'count': self._PER_PAGE
-        }, query={'sort': 'new', 'sectionId': 'aSong'}).get('items'))
+        })
 
     def _real_extract(self, url):
         user_alias, url_type = self._match_valid_url(url).group('user', 'type')
@@ -376,10 +386,41 @@
         user_info = self._call_api('info-artist', {}, user_alias, query={'alias': user_alias})
         if url_type in ('bai-hat', 'video'):
-            entries = OnDemandPagedList(
-                functools.partial(self._fetch_page, user_info['id'], url_type), self._PER_PAGE)
+            entries = self._paged_list(user_info['id'], url_type)
         else:
             entries = self._parse_items(traverse_obj(user_info, (
-                'sections', lambda _, v: v['link'] == f'/{user_alias}/{url_type}', 'items', ...)))
+                'sections',
+                lambda _, v: v['sectionId'] == 'aAlbum' if url_type == 'album' else v['sectionId'] == 'aSingle',
+                'items', ...)))
         return self.playlist_result(
             entries, user_info['id'], f'{user_info.get("name")} - {url_type}', user_info.get('biography'))
+
+
+class ZingMp3HubIE(ZingMp3BaseIE):
+    IE_NAME = 'zingmp3:hub'
+    _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>hub)/(?P<regions>[^/]+)/(?P<id>[^\.]+)'
+    _TESTS = [{
+        'url': 'https://zingmp3.vn/hub/Nhac-Moi/IWZ9Z0CA.html',
+        'info_dict': {
+            'id': 'IWZ9Z0CA',
+            'title': 'Nhạc Mới',
+            'description': 'md5:1cc31b68a6f746427b07b2756c22a558',
+        },
+        'playlist_mincount': 20,
+    }, {
+        'url': 'https://zingmp3.vn/hub/Nhac-Viet/IWZ9Z087.html',
+        'info_dict': {
+            'id': 'IWZ9Z087',
+            'title': 'Nhạc Việt',
+            'description': 'md5:acc976c8bdde64d5c6ee4a92c39f7a77',
+        },
+        'playlist_mincount': 30,
+    }]
+
+    def _real_extract(self, url):
+        song_id, regions, url_type = self._match_valid_url(url).group('id', 'regions', 'type')
+        hub_detail = self._call_api(url_type, {'id': song_id})
+        entries = self._parse_items(traverse_obj(hub_detail, (
+            'sections', lambda _, v: v['sectionId'] == 'hub', 'items', ...)))
+        return self.playlist_result(
+            entries, song_id, hub_detail.get('title'), hub_detail.get('description'))
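Two techniques carry the zingmp3 rewrite: request signing in `_api_url` (each API URL gets a `sig` built by SHA-256-hashing the sorted query params, then HMAC-SHA512-signing the API slug plus that digest) and the hand-rolled `_paged_list` generator replacing `OnDemandPagedList`. The signing scheme sketched standalone; the key material and params below are placeholders, not the real values from the diff:

```python
import hashlib
import hmac
from urllib.parse import urlencode

def sign_api_url(domain, api_slug, params, api_key, secret):
    """Build a signed API URL the way zingmp3's _api_url does."""
    # Inner digest: SHA-256 over 'k1=v1k2=v2...' with keys sorted
    inner = hashlib.sha256(
        ''.join(f'{k}={v}' for k, v in sorted(params.items())).encode()).hexdigest()
    # Outer signature: HMAC-SHA512 over slug + inner digest
    sig = hmac.new(secret, f'{api_slug}{inner}'.encode(), hashlib.sha512).hexdigest()
    return f'{domain}{api_slug}?{urlencode({**params, "apiKey": api_key, "sig": sig})}'

print(sign_api_url('https://zingmp3.vn', '/api/v2/page/get/hub-detail',
                   {'id': 'IWZ9Z0CA'}, 'placeholder-key', b'placeholder-secret'))
```

The pagination side is visible directly in the `_paged_list` hunk above: pages are fetched eagerly in order and iteration stops when the API reports `hasMore` false or the running count exceeds `total`, which is why the subclass `_fetch_page` methods now return the raw API payload instead of pre-parsed items.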
diff --git a/hypervideo_dl/extractor/zoom.py b/hypervideo_dl/extractor/zoom.py
index ef8b715..3d7ccca 100644
--- a/hypervideo_dl/extractor/zoom.py
+++ b/hypervideo_dl/extractor/zoom.py
@@ -5,6 +5,7 @@ from ..utils import (
     str_or_none,
     js_to_json,
     parse_filesize,
+    traverse_obj,
     urlencode_postdata,
     urljoin,
 )
@@ -12,8 +13,8 @@ from ..utils import (
 
 class ZoomIE(InfoExtractor):
     IE_NAME = 'zoom'
-    _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9_.-]+)'
-    _TEST = {
+    _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?P<type>play|share)/(?P<id>[A-Za-z0-9_.-]+)'
+    _TESTS = [{
         'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5',
         'md5': 'ab445e8c911fddc4f9adc842c2c5d434',
         'info_dict': {
@@ -22,36 +23,73 @@ class ZoomIE(InfoExtractor):
             'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5',
             'ext': 'mp4',
             'title': 'China\'s "two sessions" and the new five-year plan',
         },
         'skip': 'Recording requires email authentication to access',
-    }
+    }, {
+        # play URL
+        'url': 'https://ffgolf.zoom.us/rec/play/qhEhXbrxq1Zoucx8CMtHzq1Z_2YZRPVCqWK_K-2FkEGRsSLDeOX8Tu4P6jtjZcRry8QhIbvKZdtr4UNo.QcPn2debFskI9whJ',
+        'md5': '2c4b1c4e5213ebf9db293e88d9385bee',
+        'info_dict': {
+            'id': 'qhEhXbrxq1Zoucx8CMtHzq1Z_2YZRPVCqWK_K-2FkEGRsSLDeOX8Tu4P6jtjZcRry8QhIbvKZdtr4UNo.QcPn2debFskI9whJ',
+            'ext': 'mp4',
+            'title': 'Prépa AF2023 - Séance 5 du 11 avril - R20/VM/GO',
+        },
+    }, {
+        # share URL
+        'url': 'https://us02web.zoom.us/rec/share/hkUk5Zxcga0nkyNGhVCRfzkA2gX_mzgS3LpTxEEWJz9Y_QpIQ4mZFOUx7KZRZDQA.9LGQBdqmDAYgiZ_8',
+        'md5': '90fdc7cfcaee5d52d1c817fc03c43c9b',
+        'info_dict': {
+            'id': 'hkUk5Zxcga0nkyNGhVCRfzkA2gX_mzgS3LpTxEEWJz9Y_QpIQ4mZFOUx7KZRZDQA.9LGQBdqmDAYgiZ_8',
+            'ext': 'mp4',
+            'title': 'Timea Andrea Lelik\'s Personal Meeting Room',
+        },
+    }]
 
-    def _real_extract(self, url):
-        base_url, play_id = self._match_valid_url(url).groups()
-        webpage = self._download_webpage(url, play_id)
+    def _get_page_data(self, webpage, video_id):
+        return self._search_json(
+            r'window\.__data__\s*=', webpage, 'data', video_id, transform_source=js_to_json)
 
+    def _get_real_webpage(self, url, base_url, video_id, url_type):
+        webpage = self._download_webpage(url, video_id, note=f'Downloading {url_type} webpage')
         try:
             form = self._form_hidden_inputs('password_form', webpage)
         except ExtractorError:
-            form = None
-        if form:
-            password = self.get_param('videopassword')
-            if not password:
-                raise ExtractorError(
-                    'This video is protected by a passcode, use the --video-password option', expected=True)
-            is_meeting = form.get('useWhichPasswd') == 'meeting'
-            validation = self._download_json(
-                base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''),
-                play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({
-                    'id': form[('meet' if is_meeting else 'file') + 'Id'],
-                    'passwd': password,
-                    'action': form.get('action'),
-                }))
-            if not validation.get('status'):
-                raise ExtractorError(validation['errorMessage'], expected=True)
-            webpage = self._download_webpage(url, play_id)
+            return webpage
+
+        password = self.get_param('videopassword')
+        if not password:
+            raise ExtractorError(
+                'This video is protected by a passcode, use the --video-password option', expected=True)
+        is_meeting = form.get('useWhichPasswd') == 'meeting'
+        validation = self._download_json(
+            base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''),
+            video_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({
+                'id': form[('meet' if is_meeting else 'file') + 'Id'],
+                'passwd': password,
+                'action': form.get('action'),
+            }))
+        if not validation.get('status'):
+            raise ExtractorError(validation['errorMessage'], expected=True)
+        return self._download_webpage(url, video_id, note=f'Re-downloading {url_type} webpage')
+
+    def _real_extract(self, url):
+        base_url, url_type, video_id = self._match_valid_url(url).group('base_url', 'type', 'id')
+
+        if url_type == 'share':
+            webpage = self._get_real_webpage(url, base_url, video_id, 'share')
+            meeting_id = self._get_page_data(webpage, video_id)['meetingId']
+            redirect_path = self._download_json(
+                f'{base_url}nws/recording/1.0/play/share-info/{meeting_id}',
+                video_id, note='Downloading share info JSON')['result']['redirectUrl']
+            url = urljoin(base_url, redirect_path)
+
+        webpage = self._get_real_webpage(url, base_url, video_id, 'play')
+        file_id = self._get_page_data(webpage, video_id)['fileId']
+        if not file_id:
+            # When things go wrong, file_id can be empty string
+            raise ExtractorError('Unable to extract file ID')
 
-        data = self._parse_json(self._search_regex(
-            r'(?s)window\.__data__\s*=\s*({.+?});',
-            webpage, 'data'), play_id, js_to_json)
+        data = self._download_json(
+            f'{base_url}nws/recording/1.0/play/info/{file_id}', video_id,
+            note='Downloading play info JSON')['result']
 
         subtitles = {}
         for _type in ('transcript', 'cc', 'chapter'):
@@ -67,11 +105,11 @@ class ZoomIE(InfoExtractor):
             formats.append({
                 'format_note': 'Camera stream',
                 'url': str_or_none(data.get('viewMp4Url')),
-                'width': int_or_none(data.get('viewResolvtionsWidth')),
-                'height': int_or_none(data.get('viewResolvtionsHeight')),
-                'format_id': str_or_none(data.get('recordingId')),
+                'width': int_or_none(traverse_obj(data, ('viewResolvtions', 0))),
+                'height': int_or_none(traverse_obj(data, ('viewResolvtions', 1))),
+                'format_id': str_or_none(traverse_obj(data, ('recording', 'id'))),
                 'ext': 'mp4',
-                'filesize_approx': parse_filesize(data.get('fileSize')),
+                'filesize_approx': parse_filesize(str_or_none(traverse_obj(data, ('recording', 'fileSizeInMB')))),
                 'preference': 0
             })
 
@@ -79,16 +117,16 @@ class ZoomIE(InfoExtractor):
             formats.append({
                 'format_note': 'Screen share stream',
                 'url': str_or_none(data.get('shareMp4Url')),
-                'width': int_or_none(data.get('shareResolvtionsWidth')),
-                'height': int_or_none(data.get('shareResolvtionsHeight')),
-                'format_id': str_or_none(data.get('shareVideoId')),
+                'width': int_or_none(traverse_obj(data, ('shareResolvtions', 0))),
+                'height': int_or_none(traverse_obj(data, ('shareResolvtions', 1))),
+                'format_id': str_or_none(traverse_obj(data, ('shareVideo', 'id'))),
                 'ext': 'mp4',
                 'preference': -1
             })
 
         return {
-            'id': play_id,
-            'title': data.get('topic'),
+            'id': video_id,
+            'title': str_or_none(traverse_obj(data, ('meet', 'topic'))),
             'subtitles': subtitles,
             'formats': formats,
             'http_headers': {
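The Zoom play-info JSON nests what used to be flat keys, hence the switch to `traverse_obj` lookups like `('viewResolvtions', 0)` and `('recording', 'id')` that return None instead of raising when a branch is missing. A sketch of that access pattern with a local stand-in for `traverse_obj`; the data shape is assumed from the fields the hunks touch, trimmed to those fields (the misspelled `viewResolvtions` key is taken verbatim from the API):

```python
def traverse(obj, path):
    """Tiny stand-in for traverse_obj: walk keys/indices, None on any miss."""
    for key in path:
        try:
            obj = obj[key]
        except (KeyError, IndexError, TypeError):
            return None
    return obj

data = {  # shape assumed from the hunks above
    'viewResolvtions': [1920, 1080],
    'recording': {'id': 'rec-123', 'fileSizeInMB': 245.3},
    'meet': {'topic': 'Weekly sync'},
}
print(traverse(data, ('viewResolvtions', 0)))  # 1920
print(traverse(data, ('recording', 'id')))     # rec-123
print(traverse(data, ('meet', 'topic')))       # Weekly sync
print(traverse(data, ('shareVideo', 'id')))    # None (key absent, no exception)
```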
diff --git a/hypervideo_dl/extractor/zype.py b/hypervideo_dl/extractor/zype.py
index 8cf9945..2f3b4c4 100644
--- a/hypervideo_dl/extractor/zype.py
+++ b/hypervideo_dl/extractor/zype.py
@@ -1,7 +1,7 @@ import re
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..networking.exceptions import HTTPError
 from ..utils import (
     dict_get,
     ExtractorError,
@@ -37,9 +37,9 @@ class ZypeIE(InfoExtractor):
             response = self._download_json(re.sub(
                 r'\.(?:js|html)\?', '.json?', url), video_id)['response']
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 403):
+            if isinstance(e.cause, HTTPError) and e.cause.status in (400, 401, 403):
                 raise ExtractorError(self._parse_json(
-                    e.cause.read().decode(), video_id)['message'], expected=True)
+                    e.cause.response.read().decode(), video_id)['message'], expected=True)
             raise
         body = response['body']