diff options
author | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-08-01 06:53:25 +0530 |
---|---|---|
committer | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-08-02 01:08:16 +0530 |
commit | bfd973ece3369c593b5e82a88cc16de80088a73e (patch) | |
tree | 6a61140e44f412d16ece6794b5b3e4ead4905b3c | |
parent | 1e8fe57e5cd0f33f940df87430d75e1230ec5b7a (diff) | |
download | hypervideo-pre-bfd973ece3369c593b5e82a88cc16de80088a73e.tar.lz hypervideo-pre-bfd973ece3369c593b5e82a88cc16de80088a73e.tar.xz hypervideo-pre-bfd973ece3369c593b5e82a88cc16de80088a73e.zip |
[extractors] Use new framework for existing embeds (#4307)
`Brightcove` is difficult to migrate because it's subclasses may depend
on the signature of the current functions. So it is left as-is for now
Note: Tests have not been migrated
138 files changed, 500 insertions, 1910 deletions
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 221c1598d..5ca92f18b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -446,7 +446,7 @@ from .dw import ( DWIE, DWArticleIE, ) -from .eagleplatform import EaglePlatformIE +from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE from .egghead import ( @@ -1555,6 +1555,7 @@ from .shared import ( SharedIE, VivoIE, ) +from .sharevideos import ShareVideosEmbedIE from .shemaroome import ShemarooMeIE from .showroomlive import ShowRoomLiveIE from .simplecast import ( diff --git a/yt_dlp/extractor/adobetv.py b/yt_dlp/extractor/adobetv.py index 941254243..d8e07b3a1 100644 --- a/yt_dlp/extractor/adobetv.py +++ b/yt_dlp/extractor/adobetv.py @@ -232,6 +232,7 @@ class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): class AdobeTVVideoIE(AdobeTVBaseIE): IE_NAME = 'adobetv:video' _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]'] _TEST = { # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners diff --git a/yt_dlp/extractor/ant1newsgr.py b/yt_dlp/extractor/ant1newsgr.py index cd0f36856..fac476e21 100644 --- a/yt_dlp/extractor/ant1newsgr.py +++ b/yt_dlp/extractor/ant1newsgr.py @@ -1,4 +1,3 @@ -import re import urllib.parse from .common import InfoExtractor @@ -7,7 +6,6 @@ from ..utils import ( ExtractorError, determine_ext, scale_thumbnails_to_max_format_width, - unescapeHTML, ) @@ -91,7 +89,7 @@ class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle') - embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) + embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage)) if not embed_urls: raise ExtractorError('no videos found for %s' % video_id, expected=True) return self.playlist_from_matches( @@ -104,6 +102,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): IE_DESC = 'ant1news.gr embedded videos' _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player' _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)'] _API_PATH = '/news/templates/data/jsonPlayer' _TESTS = [{ @@ -117,16 +116,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): }, }] - @classmethod - def _extract_urls(cls, webpage): - _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' - _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)' - for mobj in re.finditer(_EMBED_RE, webpage): - url = unescapeHTML(mobj.group('url')) - if not cls.suitable(url): - continue - yield url - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py index 09dfffdb0..cb9483569 100644 --- a/yt_dlp/extractor/anvato.py +++ b/yt_dlp/extractor/anvato.py @@ -340,30 +340,16 @@ class AnvatoIE(InfoExtractor): 'subtitles': subtitles, } - @staticmethod - def _extract_urls(ie, webpage, video_id): - entries = [] - for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): - anvplayer_data = ie._parse_json( - mobj.group('anvp'), video_id, transform_source=unescapeHTML, - fatal=False) - if not anvplayer_data: - continue - video = anvplayer_data.get('video') - if not isinstance(video, compat_str) or not video.isdigit(): - continue - access_key = anvplayer_data.get('accessKey') - if not access_key: - mcp = anvplayer_data.get('mcp') - if mcp: - access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( - mcp.lower()) + @classmethod + def _extract_from_webpage(cls, url, webpage): + for mobj in re.finditer(cls._ANVP_RE, webpage): + anvplayer_data = unescapeHTML(json.loads(mobj.group('anvp'))) or {} + video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey') if not access_key: + access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower()) + if not (video_id or '').isdigit() or not access_key: continue - entries.append(ie.url_result( - 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), - video_id=video)) - return entries + yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id) def _extract_anvato_videos(self, webpage, video_id): anvplayer_data = self._parse_json( diff --git a/yt_dlp/extractor/apa.py b/yt_dlp/extractor/apa.py index 847be6edf..c9147e855 100644 --- a/yt_dlp/extractor/apa.py +++ b/yt_dlp/extractor/apa.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -10,6 +8,7 @@ from ..utils import ( class APAIE(InfoExtractor): _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1'] _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', @@ -30,14 +29,6 @@ class APAIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', - webpage)] - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id, base_url = mobj.group('id', 'base_url') diff --git a/yt_dlp/extractor/aparat.py b/yt_dlp/extractor/aparat.py index cd6cd1c79..90464556d 100644 --- a/yt_dlp/extractor/aparat.py +++ b/yt_dlp/extractor/aparat.py @@ -10,6 +10,7 @@ from ..utils import ( class AparatIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' + _EMBED_REGEX = [r'<iframe .*?src="(?P<url>http://www\.aparat\.com/video/[^"]+)"'] _TESTS = [{ 'url': 'http://www.aparat.com/v/wP8On', diff --git a/yt_dlp/extractor/arcpublishing.py b/yt_dlp/extractor/arcpublishing.py index 2e3f3cc5f..de9ccc538 100644 --- a/yt_dlp/extractor/arcpublishing.py +++ b/yt_dlp/extractor/arcpublishing.py @@ -70,8 +70,8 @@ class ArcPublishingIE(InfoExtractor): ], 'video-api-cdn.%s.arcpublishing.com/api'), ] - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): entries = [] # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): diff --git a/yt_dlp/extractor/arkena.py b/yt_dlp/extractor/arkena.py index 9da2bfd5e..9a0273e2c 100644 --- a/yt_dlp/extractor/arkena.py +++ b/yt_dlp/extractor/arkena.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -19,6 +17,8 @@ class ArkenaIE(InfoExtractor): play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+) ) ''' + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1'] _TESTS = [{ 'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310', 'md5': '97f117754e5f3c020f5f26da4a44ebaf', @@ -50,15 +50,6 @@ class ArkenaIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index 9ec5203f1..980d37849 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -204,6 +204,7 @@ class ArteTVIE(ArteTVBaseIE): class ArteTVEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' + _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1'] _TESTS = [{ 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'info_dict': { @@ -219,12 +220,6 @@ class ArteTVEmbedIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', - webpage)] - def _real_extract(self, url): qs = parse_qs(url) json_url = qs['json_url'][0] diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 6f806d84e..b34fcb108 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -22,6 +22,7 @@ from ..utils import ( class BandcampIE(InfoExtractor): _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' + _EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"'] _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 4413a299a..9a0a4414e 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -46,6 +46,7 @@ class BBCCoUkIE(InfoExtractor): ) (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX + _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)'] _LOGIN_URL = 'https://account.bbc.com/signin' _NETRC_MACHINE = 'bbc' diff --git a/yt_dlp/extractor/bitchute.py b/yt_dlp/extractor/bitchute.py index c831092d4..24d321566 100644 --- a/yt_dlp/extractor/bitchute.py +++ b/yt_dlp/extractor/bitchute.py @@ -13,6 +13,7 @@ from ..utils import ( class BitChuteIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)' + _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/', 'md5': '7e427d7ed7af5a75b5855705ec750e2b', @@ -33,14 +34,6 @@ class BitChuteIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL, - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/blogger.py b/yt_dlp/extractor/blogger.py index d7aa7f94e..3d6e03304 100644 --- a/yt_dlp/extractor/blogger.py +++ b/yt_dlp/extractor/blogger.py @@ -1,5 +1,3 @@ -import re - from ..utils import ( mimetype2ext, parse_duration, @@ -13,7 +11,7 @@ from .common import InfoExtractor class BloggerIE(InfoExtractor): IE_NAME = 'blogger.com' _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)' - _VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''' + _EMBED_REGEX = [r'''<iframe[^>]+src=["'](?P<url>(?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']'''] _TESTS = [{ 'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw', 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', @@ -26,10 +24,6 @@ class BloggerIE(InfoExtractor): } }] - @staticmethod - def _extract_urls(webpage): - return re.findall(BloggerIE._VALID_EMBED, webpage) - def _real_extract(self, url): token_id = self._match_id(url) webpage = self._download_webpage(url, token_id) diff --git a/yt_dlp/extractor/buzzfeed.py b/yt_dlp/extractor/buzzfeed.py index 1b4cba63e..b30a3b7ae 100644 --- a/yt_dlp/extractor/buzzfeed.py +++ b/yt_dlp/extractor/buzzfeed.py @@ -81,7 +81,7 @@ class BuzzFeedIE(InfoExtractor): continue entries.append(self.url_result(video['url'])) - facebook_urls = FacebookIE._extract_urls(webpage) + facebook_urls = FacebookIE._extract_embed_urls(url, webpage) entries.extend([ self.url_result(facebook_url) for facebook_url in facebook_urls]) diff --git a/yt_dlp/extractor/channel9.py b/yt_dlp/extractor/channel9.py index 90a1ab2be..d0390d937 100644 --- a/yt_dlp/extractor/channel9.py +++ b/yt_dlp/extractor/channel9.py @@ -14,6 +14,7 @@ class Channel9IE(InfoExtractor): IE_DESC = 'Channel 9' IE_NAME = 'channel9' _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b'] _TESTS = [{ 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', @@ -78,12 +79,6 @@ class Channel9IE(InfoExtractor): _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b', - webpage) - def _extract_list(self, video_id, rss_url=None): if not rss_url: rss_url = self._RSS_URL % video_id diff --git a/yt_dlp/extractor/cinchcast.py b/yt_dlp/extractor/cinchcast.py index 393df3698..ff962aad1 100644 --- a/yt_dlp/extractor/cinchcast.py +++ b/yt_dlp/extractor/cinchcast.py @@ -7,6 +7,8 @@ from ..utils import ( class CinchcastIE(InfoExtractor): _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1'] + _TESTS = [{ 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', 'info_dict': { diff --git a/yt_dlp/extractor/cloudflarestream.py b/yt_dlp/extractor/cloudflarestream.py index 0a6073403..8bc0ad883 100644 --- a/yt_dlp/extractor/cloudflarestream.py +++ b/yt_dlp/extractor/cloudflarestream.py @@ -1,5 +1,4 @@ import base64 -import re from .common import InfoExtractor @@ -16,6 +15,7 @@ class CloudflareStreamIE(InfoExtractor): ) (?P<id>%s) ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE) + _EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1'] _TESTS = [{ 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', 'info_dict': { @@ -37,14 +37,6 @@ class CloudflareStreamIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE), - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net' diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index c91260cb0..a6933e738 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3882,6 +3882,11 @@ class InfoExtractor: class StopExtraction(Exception): pass + @classmethod + def _extract_url(cls, webpage): # TODO: Remove + """Only for compatibility with some older extractors""" + return next(iter(cls._extract_embed_urls(None, webpage) or []), None) + class SearchInfoExtractor(InfoExtractor): """ diff --git a/yt_dlp/extractor/condenast.py b/yt_dlp/extractor/condenast.py index cf6e40cb8..ffdd820e2 100644 --- a/yt_dlp/extractor/condenast.py +++ b/yt_dlp/extractor/condenast.py @@ -58,7 +58,10 @@ class CondeNastIE(InfoExtractor): )''' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys()) + _EMBED_REGEX = [r'''(?x) + <(?:iframe|script)[^>]+?src=(["\'])(?P<url> + (?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+? + )\1''' % '|'.join(_SITES.keys())] _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', diff --git a/yt_dlp/extractor/crooksandliars.py b/yt_dlp/extractor/crooksandliars.py index c831a3ae0..85c145e12 100644 --- a/yt_dlp/extractor/crooksandliars.py +++ b/yt_dlp/extractor/crooksandliars.py @@ -7,6 +7,8 @@ from ..utils import ( class CrooksAndLiarsIE(InfoExtractor): _VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P<id>[A-Za-z0-9]+)' + _EMBED_REGEX = [r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1'] + _TESTS = [{ 'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi', 'info_dict': { diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py index cb1523617..84393627a 100644 --- a/yt_dlp/extractor/cspan.py +++ b/yt_dlp/extractor/cspan.py @@ -163,7 +163,7 @@ class CSpanIE(InfoExtractor): video_id = m.group('id') video_type = 'program' if m.group('type') == 'prog' else 'clip' else: - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + senate_isvp_url = SenateISVPIE._extract_url(webpage) if senate_isvp_url: title = self._og_search_title(webpage) surl = smuggle_url(senate_isvp_url, {'force_title': title}) diff --git a/yt_dlp/extractor/dailymail.py b/yt_dlp/extractor/dailymail.py index 5451dbf00..f25d7a8c6 100644 --- a/yt_dlp/extractor/dailymail.py +++ b/yt_dlp/extractor/dailymail.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -12,6 +10,7 @@ from ..utils import ( class DailyMailIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)'] _TESTS = [{ 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', 'md5': 'f6129624562251f628296c3a9ffde124', @@ -26,12 +25,6 @@ class DailyMailIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index 46438891f..65a9feec5 100644 --- a/yt_dlp/extractor/dailymotion.py +++ b/yt_dlp/extractor/dailymotion.py @@ -99,6 +99,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? ''' IE_NAME = 'dailymotion' + _EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1'] _TESTS = [{ 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', 'md5': '074b95bdee76b9e3654137aee9c79dfe', @@ -208,18 +209,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor): } xid''' - @staticmethod - def _extract_urls(webpage): - urls = [] - # Look for embedded Dailymotion player + @classmethod + def _extract_embed_urls(cls, url, webpage): # https://developer.dailymotion.com/player#player-parameters - for mobj in re.finditer( - r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage): - urls.append(unescapeHTML(mobj.group('url'))) + yield from super()._extract_embed_urls(url, webpage) for mobj in re.finditer( r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage): - urls.append('https://www.dailymotion.com/embed/video/' + mobj.group('id')) - return urls + yield from 'https://www.dailymotion.com/embed/video/' + mobj.group('id') def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url) @@ -378,6 +374,15 @@ class DailymotionPlaylistIE(DailymotionPlaylistBaseIE): }] _OBJECT_TYPE = 'collection' + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Look for embedded Dailymotion playlist player (#3822) + for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', + webpage): + for p in re.findall(r'list\[\]=/playlist/([^/]+)/', unescapeHTML(mobj.group('url'))): + yield '//dailymotion.com/playlist/%s' % p + class DailymotionUserIE(DailymotionPlaylistBaseIE): IE_NAME = 'dailymotion:user' diff --git a/yt_dlp/extractor/dbtv.py b/yt_dlp/extractor/dbtv.py index 2beccd8b5..18be46f7e 100644 --- a/yt_dlp/extractor/dbtv.py +++ b/yt_dlp/extractor/dbtv.py @@ -1,10 +1,9 @@ -import re - from .common import InfoExtractor class DBTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1'] _TESTS = [{ 'url': 'https://www.dagbladet.no/video/PynxJnNWChE/', 'md5': 'b8f850ba1860adbda668d367f9b77699', @@ -28,12 +27,6 @@ class DBTVIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1', - webpage)] - def _real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() info = { diff --git a/yt_dlp/extractor/digiteka.py b/yt_dlp/extractor/digiteka.py index 5d244cb08..5fbc42ffe 100644 --- a/yt_dlp/extractor/digiteka.py +++ b/yt_dlp/extractor/digiteka.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import int_or_none @@ -25,6 +23,7 @@ class DigitekaIE(InfoExtractor): ) /id )/(?P<id>[\d+a-z]+)''' + _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)'] _TESTS = [{ # news 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', @@ -58,14 +57,6 @@ class DigitekaIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') diff --git a/yt_dlp/extractor/drtuber.py b/yt_dlp/extractor/drtuber.py index 3149e319f..824c2be12 100644 --- a/yt_dlp/extractor/drtuber.py +++ b/yt_dlp/extractor/drtuber.py @@ -11,6 +11,7 @@ from ..utils import ( class DrTuberIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?' + _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)'] _TESTS = [{ 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf', 'md5': '93e680cf2536ad0dfb7e74d94a89facd', @@ -33,12 +34,6 @@ class DrTuberIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)', - webpage) - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') diff --git a/yt_dlp/extractor/eagleplatform.py b/yt_dlp/extractor/eagleplatform.py index e2ecd4b7c..7e5047b56 100644 --- a/yt_dlp/extractor/eagleplatform.py +++ b/yt_dlp/extractor/eagleplatform.py @@ -1,3 +1,4 @@ +import functools import re from .common import InfoExtractor @@ -5,6 +6,7 @@ from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, + smuggle_url, unsmuggle_url, url_or_none, ) @@ -18,6 +20,7 @@ class EaglePlatformIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1'] _TESTS = [{ # http://lenta.ru/news/2015/03/06/navalny/ 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', @@ -52,14 +55,14 @@ class EaglePlatformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - # Regular iframe embedding - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', - webpage) - if mobj is not None: - return mobj.group('url') + @classmethod + def _extract_embed_urls(cls, url, webpage): + add_referer = functools.partial(smuggle_url, data={'referrer': url}) + + res = tuple(super()._extract_embed_urls(url, webpage)) + if res: + return map(add_referer, res) + PLAYER_JS_RE = r''' <script[^>]+ src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) @@ -74,7 +77,7 @@ class EaglePlatformIE(InfoExtractor): data-id=["\'](?P<id>\d+) ''' % PLAYER_JS_RE, webpage) if mobj is not None: - return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())] # Generalization of "Javascript code usage", "Combined usage" and # "Usage without attaching to DOM" embeddings (see # http://dultonmedia.github.io/eplayer/) @@ -95,7 +98,7 @@ class EaglePlatformIE(InfoExtractor): </script> ''' % PLAYER_JS_RE, webpage) if mobj is not None: - return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())] @staticmethod def _handle_error(response): @@ -201,3 +204,14 @@ class EaglePlatformIE(InfoExtractor): 'age_limit': age_limit, 'formats': formats, } + + +class ClipYouEmbedIE(InfoExtractor): + _VALID_URL = False + + @classmethod + def _extract_embed_urls(cls, url, webpage): + mobj = re.search( + r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) + if mobj is not None: + yield smuggle_url('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), {'referrer': url}) diff --git a/yt_dlp/extractor/embedly.py b/yt_dlp/extractor/embedly.py index a8d1f3c55..483d018bb 100644 --- a/yt_dlp/extractor/embedly.py +++ b/yt_dlp/extractor/embedly.py @@ -1,3 +1,5 @@ +import re +import urllib.parse from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote @@ -9,5 +11,14 @@ class EmbedlyIE(InfoExtractor): 'only_matching': True, }] + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Bypass suitable check + for mobj in re.finditer(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage): + yield mobj.group('url') + + for mobj in re.finditer(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage): + yield urllib.parse.unquote(mobj.group('url')) + def _real_extract(self, url): return self.url_result(compat_urllib_parse_unquote(self._match_id(url))) diff --git a/yt_dlp/extractor/ertgr.py b/yt_dlp/extractor/ertgr.py index 276543653..eb52ad031 100644 --- a/yt_dlp/extractor/ertgr.py +++ b/yt_dlp/extractor/ertgr.py @@ -15,7 +15,6 @@ from ..utils import ( parse_iso8601, str_or_none, try_get, - unescapeHTML, url_or_none, variadic, ) @@ -275,6 +274,7 @@ class ERTWebtvEmbedIE(InfoExtractor): IE_DESC = 'ert.gr webtv embedded videos' _BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php') _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>(?:https?:)?{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)'] _TESTS = [{ 'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg', @@ -287,17 +287,6 @@ class ERTWebtvEmbedIE(InfoExtractor): }, }] - @classmethod - def _extract_urls(cls, webpage): - EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' - EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)' - - for mobj in re.finditer(EMBED_RE, webpage): - url = unescapeHTML(mobj.group('url')) - if not cls.suitable(url): - continue - yield url - def _real_extract(self, url): video_id = self._match_id(url) formats, subs = self._extract_m3u8_formats_and_subtitles( diff --git a/yt_dlp/extractor/expressen.py b/yt_dlp/extractor/expressen.py index 5aba21ba7..5381e9880 100644 --- a/yt_dlp/extractor/expressen.py +++ b/yt_dlp/extractor/expressen.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -17,6 +15,7 @@ class ExpressenIE(InfoExtractor): tv/(?:[^/]+/)* (?P<id>[^/?#&]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1'] _TESTS = [{ 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', 'md5': 'deb2ca62e7b1dcd19fa18ba37523f66e', @@ -45,13 +44,6 @@ class ExpressenIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1', - webpage)] - def _real_extract(self, url): display_id = self._match_id(url) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 5b34f3bff..d434b359a 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -57,6 +57,13 @@ class FacebookIE(InfoExtractor): ) (?P<id>[0-9]+) ''' + _EMBED_REGEX = [ + r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', + # Facebook API embed https://developers.facebook.com/docs/plugins/embedded-video-player + r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ + data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', + ] _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' @@ -311,21 +318,6 @@ class FacebookIE(InfoExtractor): 'graphURI': '/api/graphql/' } - @staticmethod - def _extract_urls(webpage): - urls = [] - for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', - webpage): - urls.append(mobj.group('url')) - # Facebook API embed - # see https://developers.facebook.com/docs/plugins/embedded-video-player - for mobj in re.finditer(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ - data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage): - urls.append(mobj.group('url')) - return urls - def _perform_login(self, username, password): login_page_req = sanitized_Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') diff --git a/yt_dlp/extractor/foxnews.py b/yt_dlp/extractor/foxnews.py index e8513f2c2..2343dd20d 100644 --- a/yt_dlp/extractor/foxnews.py +++ b/yt_dlp/extractor/foxnews.py @@ -56,8 +56,8 @@ class FoxNewsIE(AMPIE): }, ] - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): return [ f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' for mobj in re.finditer( @@ -125,4 +125,4 @@ class FoxNewsArticleIE(InfoExtractor): 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) return self.url_result( - FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) + FoxNewsIE._extract_embed_urls(url, webpage)[0], FoxNewsIE.ie_key()) diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index 5902eaca0..ba9e69161 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -32,6 +32,7 @@ class FranceTVIE(InfoExtractor): (?P<id>[^@]+)(?:@(?P<catalog>.+))? ) ''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1'] _TESTS = [{ # without catalog @@ -370,7 +371,7 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): webpage = self._download_webpage(url, display_id) - dailymotion_urls = DailymotionIE._extract_urls(webpage) + dailymotion_urls = DailymotionIE._extract_embed_urls(url, webpage) if dailymotion_urls: return self.playlist_result([ self.url_result(dailymotion_url, DailymotionIE.ie_key()) diff --git a/yt_dlp/extractor/gedidigital.py b/yt_dlp/extractor/gedidigital.py index 4ae5362b4..4cc678021 100644 --- a/yt_dlp/extractor/gedidigital.py +++ b/yt_dlp/extractor/gedidigital.py @@ -11,7 +11,7 @@ from ..utils import ( class GediDigitalIE(InfoExtractor): - _VALID_URL = r'''(?x:(?P<url>(?:https?:)//video\. + _VALID_URL = r'''(?x:(?P<base_url>(?:https?:)//video\. (?: (?: (?:espresso\.)?repubblica @@ -34,6 +34,12 @@ class GediDigitalIE(InfoExtractor): |lasentinella )\.gelocal )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*))''' + _EMBED_REGEX = [rf'''(?x) + (?: + data-frame-src=| + <iframe[^\n]+src= + ) + (["'])(?P<url>{_VALID_URL})\1'''] _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', 'md5': '84658d7fb9e55a6e57ecc77b73137494', @@ -109,22 +115,9 @@ class GediDigitalIE(InfoExtractor): urls[i] = urljoin(base_url(e), url_basename(e)) return urls - @staticmethod - def _extract_urls(webpage): - entries = [ - mobj.group('eurl') - for mobj in re.finditer(r'''(?x) - (?: - data-frame-src=| - <iframe[^\n]+src= - ) - (["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)] - return GediDigitalIE._sanitize_urls(entries) - - @staticmethod - def _extract_url(webpage): - urls = GediDigitalIE._extract_urls(webpage) - return urls[0] if urls else None + @classmethod + def _extract_embed_urls(cls, url, webpage): + return cls._sanitize_urls(tuple(super()._extract_embed_urls(url, webpage))) @staticmethod def _clean_formats(formats): @@ -139,8 +132,7 @@ class GediDigitalIE(InfoExtractor): formats[:] = clean_formats def _real_extract(self, url): - video_id = self._match_id(url) - url = self._match_valid_url(url).group('url') + video_id, url = self._match_valid_url(url).group('id', 'base_url') webpage = self._download_webpage(url, video_id) title = self._html_search_meta( ['twitter:title', 'og:title'], webpage, fatal=True) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index ec1cbf005..d3ed7ce46 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -5,109 +5,9 @@ import xml.etree.ElementTree from . import gen_extractor_classes from .common import InfoExtractor # isort: split -from .ant1newsgr import Ant1NewsGrEmbedIE -from .anvato import AnvatoIE -from .apa import APAIE -from .arcpublishing import ArcPublishingIE -from .arkena import ArkenaIE -from .arte import ArteTVEmbedIE -from .bitchute import BitChuteIE -from .blogger import BloggerIE from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE -from .channel9 import Channel9IE -from .cloudflarestream import CloudflareStreamIE from .commonprotocols import RtmpIE -from .condenast import CondeNastIE -from .dailymail import DailyMailIE -from .dailymotion import DailymotionIE -from .dbtv import DBTVIE -from .digiteka import DigitekaIE -from .drtuber import DrTuberIE -from .eagleplatform import EaglePlatformIE -from .ertgr import ERTWebtvEmbedIE -from .expressen import ExpressenIE -from .facebook import FacebookIE -from .foxnews import FoxNewsIE -from .gedidigital import GediDigitalIE -from .gfycat import GfycatIE -from .glomex import GlomexEmbedIE -from .googledrive import GoogleDriveIE -from .indavideo import IndavideoEmbedIE -from .instagram import InstagramIE -from .joj import JojIE -from .jwplatform import JWPlatformIE -from .kaltura import KalturaIE -from .kinja import KinjaEmbedIE -from .limelight import LimelightBaseIE -from .mainstreaming import MainStreamingIE -from .medialaan import MedialaanIE -from .mediaset import MediasetIE -from .mediasite import MediasiteIE -from .megaphone import MegaphoneIE -from .megatvcom import MegaTVComEmbedIE -from .mofosex import MofosexEmbedIE -from .mtv import MTVServicesEmbeddedIE -from .myvi import MyviIE -from .nbc import NBCSportsVPlayerIE -from .nexx import NexxEmbedIE, NexxIE -from .odnoklassniki import OdnoklassnikiIE -from .onionstudios import OnionStudiosIE -from .ooyala import OoyalaIE -from .panopto import PanoptoBaseIE -from .peertube import PeerTubeIE -from .piksel import PikselIE -from .pladform import PladformIE -from .pornhub import PornHubIE -from .rcs import RCSEmbedsIE -from .redtube import RedTubeIE -from .rumble import RumbleEmbedIE -from .rutube import RutubeIE -from .rutv import RUTVIE -from .ruutu import RuutuIE -from .senategov import SenateISVPIE -from .simplecast import SimplecastIE -from .soundcloud import SoundcloudEmbedIE -from .spankwire import SpankwireIE -from .sportbox import SportBoxIE -from .spotify import SpotifyBaseIE -from .springboardplatform import SpringboardPlatformIE -from .substack import SubstackIE -from .svt import SVTIE -from .teachable import TeachableIE -from .ted import TedEmbedIE -from .theplatform import ThePlatformIE -from .threeqsdn import ThreeQSDNIE -from .tiktok import TikTokIE -from .tnaflix import TNAFlixNetworkEmbedIE -from .tube8 import Tube8IE -from .tunein import TuneInBaseIE -from .tvc import TVCIE -from .tvopengr import TVOpenGrEmbedIE -from .tvp import TVPEmbedIE -from .twentymin import TwentyMinutenIE -from .udn import UDNEmbedIE -from .ustream import UstreamIE -from .vbox7 import Vbox7IE -from .vice import ViceIE -from .videa import VideaIE -from .videomore import VideomoreIE -from .videopress import VideoPressIE -from .viewlift import ViewLiftEmbedIE -from .vimeo import VHXEmbedIE, VimeoIE -from .viqeo import ViqeoIE -from .vk import VKIE -from .vshare import VShareIE -from .vzaar import VzaarIE -from .washingtonpost import WashingtonPostIE -from .webcaster import WebcasterFeedIE -from .wimtv import WimTVIE -from .wistia import WistiaIE -from .xfileshare import XFileShareIE -from .xhamster import XHamsterEmbedIE -from .yapfiles import YapFilesIE -from .youporn import YouPornIE from .youtube import YoutubeIE -from .zype import ZypeIE from ..compat import compat_etree_fromstring from ..utils import ( KNOWN_EXTENSIONS, @@ -115,7 +15,6 @@ from ..utils import ( UnsupportedError, determine_ext, dict_get, - float_or_none, format_field, int_or_none, is_html, @@ -1197,7 +1096,7 @@ class GenericIE(InfoExtractor): 'timestamp': 468923808, 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', }, - 'add_ie': [JWPlatformIE.ie_key()], + 'add_ie': ['JWPlatform'], }, { # Video.js embed, multiple formats @@ -1733,7 +1632,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [ArkenaIE.ie_key()], + 'add_ie': ['Arkena'], }, { 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', @@ -1745,7 +1644,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [Vbox7IE.ie_key()], + 'add_ie': ['Vbox7'], }, { # DBTV embeds @@ -1777,7 +1676,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [TwentyMinutenIE.ie_key()], + 'add_ie': ['TwentyMinuten'], }, { # VideoPress embed @@ -1792,7 +1691,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [VideoPressIE.ie_key()], + 'add_ie': ['VideoPress'], }, { # Rutube embed @@ -1809,7 +1708,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [RutubeIE.ie_key()], + 'add_ie': ['Rutube'], }, { # glomex:embed @@ -1881,7 +1780,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Integrated Senate Video Player', }, - 'add_ie': [SenateISVPIE.ie_key()], + 'add_ie': ['SenateISVP'], }, { # Limelight embeds (1 channel embed + 4 media embeds) @@ -1928,7 +1827,7 @@ class GenericIE(InfoExtractor): 'uploader': 'The Washington Post', 'upload_date': '20160211', }, - 'add_ie': [WashingtonPostIE.ie_key()], + 'add_ie': ['WashingtonPost'], }, { # Mediaset embed @@ -1941,7 +1840,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [MediasetIE.ie_key()], + 'add_ie': ['Mediaset'], }, { # JOJ.sk embeds @@ -1951,7 +1850,7 @@ class GenericIE(InfoExtractor): 'title': 'Slovenskom sa prehnala vlna silných búrok', }, 'playlist_mincount': 5, - 'add_ie': [JojIE.ie_key()], + 'add_ie': ['Joj'], }, { # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) @@ -2017,7 +1916,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [SpringboardPlatformIE.ie_key()], + 'add_ie': ['SpringboardPlatform'], }, { 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', @@ -2026,7 +1925,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Котята', }, - 'add_ie': [YapFilesIE.ie_key()], + 'add_ie': ['YapFiles'], 'params': { 'skip_download': True, }, @@ -2039,7 +1938,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': '31c9291ab41fac05471db4e73aa11717', }, - 'add_ie': [CloudflareStreamIE.ie_key()], + 'add_ie': ['CloudflareStream'], 'params': { 'skip_download': True, }, @@ -2066,7 +1965,7 @@ class GenericIE(InfoExtractor): 'uploader': 'StreetKitchen', 'uploader_id': '546363', }, - 'add_ie': [IndavideoEmbedIE.ie_key()], + 'add_ie': ['IndavideoEmbed'], 'params': { 'skip_download': True, }, @@ -2441,10 +2340,10 @@ class GenericIE(InfoExtractor): # Panopto embeds 'url': 'https://www.monash.edu/learning-teaching/teachhq/learning-technologies/panopto/how-to/insert-a-quiz-into-a-panopto-video', 'info_dict': { - 'title': 'Insert a quiz into a Panopto video', - 'id': 'insert-a-quiz-into-a-panopto-video' + 'ext': 'mp4', + 'id': '0bd3f16c-824a-436a-8486-ac5900693aef', + 'title': 'Quizzes in Panopto', }, - 'playlist_count': 1 }, { # Ruutu embed @@ -2529,24 +2428,17 @@ class GenericIE(InfoExtractor): }, { 'url': 'https://www.skimag.com/video/ski-people-1980/', + 'md5': '022a7e31c70620ebec18deeab376ee03', 'info_dict': { - 'id': 'ski-people-1980', - 'title': 'Ski People (1980)', - }, - 'playlist_count': 1, - 'playlist': [{ - 'md5': '022a7e31c70620ebec18deeab376ee03', - 'info_dict': { - 'id': 'YTmgRiNU', - 'ext': 'mp4', - 'title': '1980 Ski People', - 'timestamp': 1610407738, - 'description': 'md5:cf9c3d101452c91e141f292b19fe4843', - 'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720', - 'duration': 5688.0, - 'upload_date': '20210111', - } - }] + 'id': 'YTmgRiNU', + 'ext': 'mp4', + 'title': '1980 Ski People', + 'timestamp': 1610407738, + 'description': 'md5:cf9c3d101452c91e141f292b19fe4843', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720', + 'duration': 5688.0, + 'upload_date': '20210111', + } }, { 'note': 'Rumble embed', @@ -2888,14 +2780,8 @@ class GenericIE(InfoExtractor): r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', lambda x: unescapeHTML(x.group(0)), webpage) - # TODO: Remove - video_title, video_description, video_thumbnail, age_limit, video_uploader = \ - info_dict['title'], info_dict['description'], info_dict['thumbnail'], info_dict['age_limit'], domain_name - - # TODO: Move Embeds - self._downloader.write_debug('Looking for single embeds') - - # Look for Brightcove Legacy Studio embeds + # TODO: Move to respective extractors + self._downloader.write_debug('Looking for Brightcove embeds') bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: entries = [{ @@ -2906,853 +2792,17 @@ class GenericIE(InfoExtractor): return { '_type': 'playlist', - 'title': video_title, + 'title': info_dict['title'], 'id': video_id, 'entries': entries, } - - # Look for Brightcove New Studio embeds bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage) if bc_urls: return self.playlist_from_matches( - bc_urls, video_id, video_title, + bc_urls, video_id, info_dict['title'], getter=lambda x: smuggle_url(x, {'referrer': url}), ie='BrightcoveNew') - # Look for Nexx embeds - nexx_urls = NexxIE._extract_urls(webpage) - if nexx_urls: - return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) - - # Look for Nexx iFrame embeds - nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) - if nexx_embed_urls: - return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) - - # Look for ThePlatform embeds - tp_urls = ThePlatformIE._extract_urls(webpage) - if tp_urls: - return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') - - arc_urls = ArcPublishingIE._extract_urls(webpage) - if arc_urls: - return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) - - mychannels_urls = MedialaanIE._extract_urls(webpage) - if mychannels_urls: - return self.playlist_from_matches( - mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key()) - - # Look for embedded rtl.nl player - matches = re.findall( - r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', - webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') - - vimeo_urls = VimeoIE._extract_urls(url, webpage) - if vimeo_urls: - return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - - vhx_url = VHXEmbedIE._extract_url(url, webpage) - if vhx_url: - return self.url_result(vhx_url, VHXEmbedIE.ie_key()) - - # Invidious Instances - # https://github.com/yt-dlp/yt-dlp/issues/195 - # https://github.com/iv-org/invidious/pull/1730 - youtube_url = self._search_regex( - r'<link rel="alternate" href="(https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"', - webpage, 'youtube link', default=None) - if youtube_url: - return self.url_result(youtube_url, YoutubeIE.ie_key()) - - # Look for YouTube embeds - youtube_urls = YoutubeIE._extract_urls(webpage) - if youtube_urls: - return self.playlist_from_matches( - youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) - - matches = DailymotionIE._extract_urls(webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title) - - # Look for embedded Dailymotion playlist player (#3822) - m = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) - if m: - playlists = re.findall( - r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) - if playlists: - return self.playlist_from_matches( - playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) - - # Look for DailyMail embeds - dailymail_urls = DailyMailIE._extract_urls(webpage) - if dailymail_urls: - return self.playlist_from_matches( - dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) - - # Look for Teachable embeds, must be before Wistia - teachable_url = TeachableIE._extract_url(webpage, url) - if teachable_url: - return self.url_result(teachable_url) - - # Look for embedded Wistia player - wistia_urls = WistiaIE._extract_urls(webpage) - if wistia_urls: - playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) - playlist['entries'] = list(playlist['entries']) - for entry in playlist['entries']: - entry.update({ - '_type': 'url_transparent', - 'uploader': video_uploader, - }) - return playlist - - # Look for SVT player - svt_url = SVTIE._extract_url(webpage) - if svt_url: - return self.url_result(svt_url, 'SVT') - - # Look for Bandcamp pages with custom domain - mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) - if mobj is not None: - burl = unescapeHTML(mobj.group(1)) - # Don't set the extractor because it can be a track url or an album - return self.url_result(burl) - - # Check for Substack custom domains - substack_url = SubstackIE._extract_url(webpage, url) - if substack_url: - return self.url_result(substack_url, SubstackIE) - - # Look for embedded Vevo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Viddler player - mobj = re.search( - r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NYTimes player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Libsyn player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) - or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) - or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) - if mobj is not None: - embed_token = self._search_regex( - r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', - webpage, 'ooyala embed token', default=None) - return OoyalaIE._build_url_result(smuggle_url( - mobj.group('ec'), { - 'domain': url, - 'embed_token': embed_token, - })) - - # Look for multiple Ooyala embeds on SBN network websites - mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) - if mobj is not None: - embeds = self._parse_json(mobj.group(1), video_id, fatal=False) - if embeds: - return self.playlist_from_matches( - embeds, video_id, video_title, - getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') - - # Look for Aparat videos - mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Aparat') - - # Look for MPORA videos - mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Mpora') - - # Look for embedded Facebook player - facebook_urls = FacebookIE._extract_urls(webpage) - if facebook_urls: - return self.playlist_from_matches(facebook_urls, video_id, video_title) - - # Look for embedded VK player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'VK') - - # Look for embedded Odnoklassniki player - odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage) - if odnoklassniki_url: - return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) - - # Look for sibnet embedded player - sibnet_urls = VKIE._extract_sibnet_urls(webpage) - if sibnet_urls: - return self.playlist_from_matches(sibnet_urls, video_id, video_title) - - # Look for embedded ivi player - mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Ivi') - - # Look for embedded Huffington Post player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'HuffPost') - - # Look for embed.ly - mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage) - if mobj is not None: - return self.url_result(urllib.parse.unquote(mobj.group('url'))) - - # Look for funnyordie embed - matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) - if matches: - return self.playlist_from_matches( - matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') - - # Look for Simplecast embeds - simplecast_urls = SimplecastIE._extract_urls(webpage) - if simplecast_urls: - return self.playlist_from_matches( - simplecast_urls, video_id, video_title) - - # Look for BBC iPlayer embed - matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') - - # Look for embedded RUTV player - rutv_url = RUTVIE._extract_url(webpage) - if rutv_url: - return self.url_result(rutv_url, 'RUTV') - - # Look for embedded TVC player - tvc_url = TVCIE._extract_url(webpage) - if tvc_url: - return self.url_result(tvc_url, 'TVC') - - # Look for embedded SportBox player - sportbox_urls = SportBoxIE._extract_urls(webpage) - if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) - - # Look for embedded Spotify player - spotify_urls = SpotifyBaseIE._extract_urls(webpage) - if spotify_urls: - return self.playlist_from_matches(spotify_urls, video_id, video_title) - - # Look for embedded XHamster player - xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) - if xhamster_urls: - return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') - - # Look for embedded TNAFlixNetwork player - tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) - if tnaflix_urls: - return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) - - # Look for embedded PornHub player - pornhub_urls = PornHubIE._extract_urls(webpage) - if pornhub_urls: - return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) - - # Look for embedded DrTuber player - drtuber_urls = DrTuberIE._extract_urls(webpage) - if drtuber_urls: - return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) - - # Look for embedded RedTube player - redtube_urls = RedTubeIE._extract_urls(webpage) - if redtube_urls: - return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) - - # Look for embedded Tube8 player - tube8_urls = Tube8IE._extract_urls(webpage) - if tube8_urls: - return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) - - # Look for embedded Mofosex player - mofosex_urls = MofosexEmbedIE._extract_urls(webpage) - if mofosex_urls: - return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key()) - - # Look for embedded Spankwire player - spankwire_urls = SpankwireIE._extract_urls(webpage) - if spankwire_urls: - return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) - - # Look for embedded YouPorn player - youporn_urls = YouPornIE._extract_urls(webpage) - if youporn_urls: - return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key()) - - # Look for embedded Tvigle player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Tvigle') - - # Look for embedded TED player - ted_urls = TedEmbedIE._extract_urls(webpage) - if ted_urls: - return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key()) - - # Look for embedded Ustream videos - ustream_url = UstreamIE._extract_url(webpage) - if ustream_url: - return self.url_result(ustream_url, UstreamIE.ie_key()) - - # Look for embedded arte.tv player - arte_urls = ArteTVEmbedIE._extract_urls(webpage) - if arte_urls: - return self.playlist_from_matches(arte_urls, video_id, video_title) - - # Look for embedded francetv player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Myvi.ru player - myvi_url = MyviIE._extract_url(webpage) - if myvi_url: - return self.url_result(myvi_url) - - # Look for embedded soundcloud player - soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage) - if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML) - - # Look for tunein player - tunein_urls = TuneInBaseIE._extract_urls(webpage) - if tunein_urls: - return self.playlist_from_matches(tunein_urls, video_id, video_title) - - # Look for embedded mtvservices player - mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) - if mtvservices_url: - return self.url_result(mtvservices_url, ie='MTVServicesEmbedded') - - # Look for embedded yahoo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Yahoo') - - # Look for embedded sbs.com.au player - mobj = re.search( - r'''(?x) - (?: - <meta\s+property="og:video"\s+content=| - <iframe[^>]+?src= - ) - (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'SBS') - - # Look for embedded Cinchcast player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Cinchcast') - - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', - webpage) - if not mobj: - mobj = re.search( - r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'MLB') - - mobj = re.search( - r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, - webpage) - if mobj is not None: - return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') - - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Livestream') - - # Look for Zapiks embed - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Zapiks') - - # Look for Kaltura embeds - kaltura_urls = KalturaIE._extract_urls(webpage) - if kaltura_urls: - return self.playlist_from_matches( - kaltura_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'source_url': url}), - ie=KalturaIE.ie_key()) - - # Look for EaglePlatform embeds - eagleplatform_url = EaglePlatformIE._extract_url(webpage) - if eagleplatform_url: - return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) - - # Look for ClipYou (uses EaglePlatform) embeds - mobj = re.search( - r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) - if mobj is not None: - return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') - - # Look for Pladform embeds - pladform_url = PladformIE._extract_url(webpage) - if pladform_url: - return self.url_result(pladform_url) - - # Look for Videomore embeds - videomore_url = VideomoreIE._extract_url(webpage) - if videomore_url: - return self.url_result(videomore_url) - - # Look for Webcaster embeds - webcaster_url = WebcasterFeedIE._extract_url(self, webpage) - if webcaster_url: - return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key()) - - # Look for Playwire embeds - mobj = re.search( - r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Crooks and Liars embeds - mobj = re.search( - r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NBC Sports VPlayer embeds - nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) - if nbc_sports_url: - return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') - - # Look for NBC News embeds - nbc_news_embed_url = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage) - if nbc_news_embed_url: - return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews') - - # Look for Google Drive embeds - google_drive_url = GoogleDriveIE._extract_url(webpage) - if google_drive_url: - return self.url_result(google_drive_url, 'GoogleDrive') - - # Look for UDN embeds - mobj = re.search( - r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) - if mobj is not None: - return self.url_result( - urllib.parse.urljoin(url, mobj.group('url')), 'UDNEmbed') - - # Look for Senate ISVP iframe - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - return self.url_result(senate_isvp_url, 'SenateISVP') - - # Look for Kinja embeds - kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) - if kinja_embed_urls: - return self.playlist_from_matches( - kinja_embed_urls, video_id, video_title) - - # Look for OnionStudios embeds - onionstudios_url = OnionStudiosIE._extract_url(webpage) - if onionstudios_url: - return self.url_result(onionstudios_url) - - # Look for Blogger embeds - blogger_urls = BloggerIE._extract_urls(webpage) - if blogger_urls: - return self.playlist_from_matches(blogger_urls, video_id, video_title, ie=BloggerIE.ie_key()) - - # Look for ViewLift embeds - viewlift_url = ViewLiftEmbedIE._extract_url(webpage) - if viewlift_url: - return self.url_result(viewlift_url) - - # Look for JWPlatform embeds - jwplatform_urls = JWPlatformIE._extract_urls(webpage) - if jwplatform_urls: - return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) - - # Look for Digiteka embeds - digiteka_url = DigitekaIE._extract_url(webpage) - if digiteka_url: - return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) - - # Look for Arkena embeds - arkena_url = ArkenaIE._extract_url(webpage) - if arkena_url: - return self.url_result(arkena_url, ArkenaIE.ie_key()) - - # Look for Piksel embeds - piksel_url = PikselIE._extract_url(webpage) - if piksel_url: - return self.url_result(piksel_url, PikselIE.ie_key()) - - # Look for Limelight embeds - limelight_urls = LimelightBaseIE._extract_urls(webpage, url) - if limelight_urls: - return self.playlist_result( - limelight_urls, video_id, video_title, video_description) - - # Look for Anvato embeds - anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) - if anvato_urls: - return self.playlist_result( - anvato_urls, video_id, video_title, video_description) - - # Look for AdobeTVVideo embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), - 'AdobeTVVideo') - - # Look for Vine embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') - - # Look for VODPlatform embeds - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') - - # Look for Mangomolo embeds - mobj = re.search( - r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?// - (?: - admin\.mangomolo\.com/analytics/index\.php/customers/embed| - player\.mangomolo\.com/v1 - )/ - (?: - video\?.*?\bid=(?P<video_id>\d+)| - (?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) - ).+?)\1''', webpage) - if mobj is not None: - info = { - '_type': 'url_transparent', - 'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - video_id = mobj.group('video_id') - if video_id: - info.update({ - 'ie_key': 'MangomoloVideo', - 'id': video_id, - }) - else: - info.update({ - 'ie_key': 'MangomoloLive', - 'id': mobj.group('channel_id'), - }) - return info - - # Look for Instagram embeds - instagram_embed_url = InstagramIE._extract_embed_url(webpage) - if instagram_embed_url is not None: - return self.url_result( - self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) - - # Look for 3Q SDN embeds - threeqsdn_url = ThreeQSDNIE._extract_url(webpage) - if threeqsdn_url: - return { - '_type': 'url_transparent', - 'ie_key': ThreeQSDNIE.ie_key(), - 'url': self._proto_relative_url(threeqsdn_url), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - - # Look for VBOX7 embeds - vbox7_url = Vbox7IE._extract_url(webpage) - if vbox7_url: - return self.url_result(vbox7_url, Vbox7IE.ie_key()) - - # Look for DBTV embeds - dbtv_urls = DBTVIE._extract_urls(webpage) - if dbtv_urls: - return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) - - # Look for Videa embeds - videa_urls = VideaIE._extract_urls(webpage) - if videa_urls: - return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) - - # Look for 20 minuten embeds - twentymin_urls = TwentyMinutenIE._extract_urls(webpage) - if twentymin_urls: - return self.playlist_from_matches( - twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) - - # Look for VideoPress embeds - videopress_urls = VideoPressIE._extract_urls(webpage) - if videopress_urls: - return self.playlist_from_matches( - videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) - - # Look for Rutube embeds - rutube_urls = RutubeIE._extract_urls(webpage) - if rutube_urls: - return self.playlist_from_matches( - rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) - - # Look for Glomex embeds - glomex_urls = list(GlomexEmbedIE._extract_urls(webpage, url)) - if glomex_urls: - return self.playlist_from_matches( - glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key()) - - # Look for megatv.com embeds - megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage)) - if megatvcom_urls: - return self.playlist_from_matches( - megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key()) - - # Look for ant1news.gr embeds - ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) - if ant1newsgr_urls: - return self.playlist_from_matches( - ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key()) - - # Look for WashingtonPost embeds - wapo_urls = WashingtonPostIE._extract_urls(webpage) - if wapo_urls: - return self.playlist_from_matches( - wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) - - # Look for Mediaset embeds - mediaset_urls = MediasetIE._extract_urls(self, webpage) - if mediaset_urls: - return self.playlist_from_matches( - mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) - - # Look for JOJ.sk embeds - joj_urls = JojIE._extract_urls(webpage) - if joj_urls: - return self.playlist_from_matches( - joj_urls, video_id, video_title, ie=JojIE.ie_key()) - - # Look for megaphone.fm embeds - mpfn_urls = MegaphoneIE._extract_urls(webpage) - if mpfn_urls: - return self.playlist_from_matches( - mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) - - # Look for vzaar embeds - vzaar_urls = VzaarIE._extract_urls(webpage) - if vzaar_urls: - return self.playlist_from_matches( - vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) - - channel9_urls = Channel9IE._extract_urls(webpage) - if channel9_urls: - return self.playlist_from_matches( - channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) - - vshare_urls = VShareIE._extract_urls(webpage) - if vshare_urls: - return self.playlist_from_matches( - vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) - - # Look for Mediasite embeds - mediasite_urls = MediasiteIE._extract_urls(webpage) - if mediasite_urls: - entries = [ - self.url_result(smuggle_url( - urllib.parse.urljoin(url, mediasite_url), - {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) - for mediasite_url in mediasite_urls] - return self.playlist_result(entries, video_id, video_title) - - springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) - if springboardplatform_urls: - return self.playlist_from_matches( - springboardplatform_urls, video_id, video_title, - ie=SpringboardPlatformIE.ie_key()) - - yapfiles_urls = YapFilesIE._extract_urls(webpage) - if yapfiles_urls: - return self.playlist_from_matches( - yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) - - vice_urls = ViceIE._extract_urls(webpage) - if vice_urls: - return self.playlist_from_matches( - vice_urls, video_id, video_title, ie=ViceIE.ie_key()) - - xfileshare_urls = XFileShareIE._extract_urls(webpage) - if xfileshare_urls: - return self.playlist_from_matches( - xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) - - cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) - if cloudflarestream_urls: - return self.playlist_from_matches( - cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - - peertube_urls = PeerTubeIE._extract_urls(webpage, url) - if peertube_urls: - return self.playlist_from_matches( - peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) - - indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) - if indavideo_urls: - return self.playlist_from_matches( - indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) - - apa_urls = APAIE._extract_urls(webpage) - if apa_urls: - return self.playlist_from_matches( - apa_urls, video_id, video_title, ie=APAIE.ie_key()) - - foxnews_urls = FoxNewsIE._extract_urls(webpage) - if foxnews_urls: - return self.playlist_from_matches( - foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) - - sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer( - r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', - webpage)] - if sharevideos_urls: - return self.playlist_from_matches( - sharevideos_urls, video_id, video_title) - - viqeo_urls = ViqeoIE._extract_urls(webpage) - if viqeo_urls: - return self.playlist_from_matches( - viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) - - expressen_urls = ExpressenIE._extract_urls(webpage) - if expressen_urls: - return self.playlist_from_matches( - expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) - - zype_urls = ZypeIE._extract_urls(webpage) - if zype_urls: - return self.playlist_from_matches( - zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) - - gedi_urls = GediDigitalIE._extract_urls(webpage) - if gedi_urls: - return self.playlist_from_matches( - gedi_urls, video_id, video_title, ie=GediDigitalIE.ie_key()) - - # Look for RCS media group embeds - rcs_urls = RCSEmbedsIE._extract_urls(webpage) - if rcs_urls: - return self.playlist_from_matches( - rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key()) - - wimtv_urls = WimTVIE._extract_urls(webpage) - if wimtv_urls: - return self.playlist_from_matches( - wimtv_urls, video_id, video_title, ie=WimTVIE.ie_key()) - - bitchute_urls = BitChuteIE._extract_urls(webpage) - if bitchute_urls: - return self.playlist_from_matches( - bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key()) - - rumble_urls = RumbleEmbedIE._extract_urls(webpage) - if len(rumble_urls) == 1: - return self.url_result(rumble_urls[0], RumbleEmbedIE.ie_key()) - if rumble_urls: - return self.playlist_from_matches( - rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) - - # Look for (tvopen|ethnos).gr embeds - tvopengr_urls = list(TVOpenGrEmbedIE._extract_urls(webpage)) - if tvopengr_urls: - return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key()) - - # Look for ert.gr webtv embeds - ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage)) - if len(ertwebtv_urls) == 1: - return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True) - elif ertwebtv_urls: - return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key()) - - tvp_urls = TVPEmbedIE._extract_urls(webpage) - if tvp_urls: - return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key()) - - # Look for MainStreaming embeds - mainstreaming_urls = MainStreamingIE._extract_urls(webpage) - if mainstreaming_urls: - return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key()) - - # Look for Gfycat Embeds - gfycat_urls = GfycatIE._extract_urls(webpage) - if gfycat_urls: - return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key()) - - panopto_urls = PanoptoBaseIE._extract_urls(webpage) - if panopto_urls: - return self.playlist_from_matches(panopto_urls, video_id, video_title) - - # Look for Ruutu embeds - ruutu_urls = RuutuIE._extract_urls(webpage) - if ruutu_urls: - return self.playlist_from_matches(ruutu_urls, video_id, video_title) - - # Look for Tiktok embeds - tiktok_urls = TikTokIE._extract_urls(webpage) - if tiktok_urls: - return self.playlist_from_matches(tiktok_urls, video_id, video_title) - # TODO: END: Move Embeds - self._downloader.write_debug('Looking for embeds') embeds = [] for ie in gen_extractor_classes(): @@ -3784,7 +2834,7 @@ class GenericIE(InfoExtractor): return { **info_dict, '_type': 'url', - 'ie_key': JWPlatformIE.ie_key(), + 'ie_key': 'JWPlatform', 'url': jwplayer_data['playlist'], } try: @@ -4045,9 +3095,9 @@ class GenericIE(InfoExtractor): entry_info_dict = { 'id': video_id, - 'uploader': video_uploader, - 'title': video_title, - 'age_limit': age_limit, + 'uploader': domain_name, + 'title': info_dict['title'], + 'age_limit': info_dict['age_limit'], 'http_headers': headers, } diff --git a/yt_dlp/extractor/gfycat.py b/yt_dlp/extractor/gfycat.py index 60f06ccd7..9d091c113 100644 --- a/yt_dlp/extractor/gfycat.py +++ b/yt_dlp/extractor/gfycat.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -11,6 +9,7 @@ from ..utils import ( class GfycatIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?i:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)' + _EMBED_REGEX = [rf'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'info_dict': { @@ -82,14 +81,6 @@ class GfycatIE(InfoExtractor): 'only_matching': True }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>%s)' % GfycatIE._VALID_URL, - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/glomex.py b/yt_dlp/extractor/glomex.py index 85ffa4c05..86fe1b024 100644 --- a/yt_dlp/extractor/glomex.py +++ b/yt_dlp/extractor/glomex.py @@ -174,7 +174,7 @@ class GlomexEmbedIE(GlomexBaseIE): return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url) @classmethod - def _extract_urls(cls, webpage, origin_url): + def _extract_embed_urls(cls, url, webpage): # https://docs.glomex.com/publisher/video-player-integration/javascript-api/ quot_re = r'["\']' @@ -183,9 +183,9 @@ class GlomexEmbedIE(GlomexBaseIE): (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+ )(?P=q)''' for mobj in re.finditer(regex, webpage): - url = unescapeHTML(mobj.group('url')) - if cls.suitable(url): - yield cls._smuggle_origin_url(url, origin_url) + embed_url = unescapeHTML(mobj.group('url')) + if cls.suitable(embed_url): + yield cls._smuggle_origin_url(embed_url, url) regex = fr'''(?x) <glomex-player [^>]+?>| @@ -193,7 +193,7 @@ class GlomexEmbedIE(GlomexBaseIE): for mobj in re.finditer(regex, webpage): attrs = extract_attributes(mobj.group(0)) if attrs.get('data-integration-id') and attrs.get('data-playlist-id'): - yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url) + yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], url) # naive parsing of inline scripts for hard-coded integration parameters regex = fr'''(?x) @@ -206,7 +206,7 @@ class GlomexEmbedIE(GlomexBaseIE): continue playlist_id = re.search(regex % 'playlistId', script) if playlist_id: - yield cls.build_player_url(playlist_id, integration_id, origin_url) + yield cls.build_player_url(playlist_id, integration_id, url) def _real_extract(self, url): url, origin_url = self._unsmuggle_origin_url(url) diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index d7475b6da..cb123b874 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -77,13 +77,13 @@ class GoogleDriveIE(InfoExtractor): _caption_formats_ext = [] _captions_xml = None - @staticmethod - def _extract_url(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): mobj = re.search( r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})', webpage) if mobj: - return 'https://drive.google.com/file/d/%s' % mobj.group('id') + yield 'https://drive.google.com/file/d/%s' % mobj.group('id') def _download_subtitles_xml(self, video_id, subtitles_id, hl): if self._captions_xml: diff --git a/yt_dlp/extractor/heise.py b/yt_dlp/extractor/heise.py index 84e5d3023..a80eaaf81 100644 --- a/yt_dlp/extractor/heise.py +++ b/yt_dlp/extractor/heise.py @@ -121,7 +121,7 @@ class HeiseIE(InfoExtractor): if kaltura_id: return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id) - yt_urls = YoutubeIE._extract_urls(webpage) + yt_urls = YoutubeIE._extract_embed_urls(url, webpage) if yt_urls: return self.playlist_from_matches( yt_urls, video_id, title, ie=YoutubeIE.ie_key()) diff --git a/yt_dlp/extractor/huffpost.py b/yt_dlp/extractor/huffpost.py index 7286dbcd7..27ebc8b6c 100644 --- a/yt_dlp/extractor/huffpost.py +++ b/yt_dlp/extractor/huffpost.py @@ -17,6 +17,7 @@ class HuffPostIE(InfoExtractor): HPLEmbedPlayer/\?segmentId= ) (?P<id>[0-9a-f]+)''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1'] _TEST = { 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', diff --git a/yt_dlp/extractor/indavideo.py b/yt_dlp/extractor/indavideo.py index fb041a182..b397c168c 100644 --- a/yt_dlp/extractor/indavideo.py +++ b/yt_dlp/extractor/indavideo.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -12,6 +10,14 @@ from ..utils import ( class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)' + # Some example URLs covered by generic extractor: + # http://indavideo.hu/video/Vicces_cica_1 + # http://index.indavideo.hu/video/2015_0728_beregszasz + # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # http://erotika.indavideo.hu/video/Amator_tini_punci + # http://film.indavideo.hu/video/f_hrom_nagymamm_volt + # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)'] _TESTS = [{ 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', @@ -37,20 +43,6 @@ class IndavideoEmbedIE(InfoExtractor): 'only_matching': True, }] - # Some example URLs covered by generic extractor: - # http://indavideo.hu/video/Vicces_cica_1 - # http://index.indavideo.hu/video/2015_0728_beregszasz - # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko - # http://erotika.indavideo.hu/video/Amator_tini_punci - # http://film.indavideo.hu/video/f_hrom_nagymamm_volt - # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index 04afacb90..94db75640 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -243,6 +243,7 @@ class InstagramIOSIE(InfoExtractor): class InstagramIE(InstagramBaseIE): _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1'] _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -346,23 +347,16 @@ class InstagramIE(InstagramBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_embed_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', - webpage) - if mobj: - return mobj.group('url') - - blockquote_el = get_element_by_attribute( - 'class', 'instagram-media', webpage) - if blockquote_el is None: - return + @classmethod + def _extract_embed_urls(cls, url, webpage): + res = tuple(super()._extract_embed_urls(url, webpage)) + if res: + return res - mobj = re.search( - r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) + mobj = re.search(r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', + get_element_by_attribute('class', 'instagram-media', webpage) or '') if mobj: - return mobj.group('link') + return [mobj.group('link')] def _real_extract(self, url): video_id, url = self._match_valid_url(url).group('id', 'url') diff --git a/yt_dlp/extractor/ivi.py b/yt_dlp/extractor/ivi.py index 699746943..6772fcbb9 100644 --- a/yt_dlp/extractor/ivi.py +++ b/yt_dlp/extractor/ivi.py @@ -13,6 +13,7 @@ class IviIE(InfoExtractor): IE_DESC = 'ivi.ru' IE_NAME = 'ivi' _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' + _EMBED_REGEX = [r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1'] _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] _LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c' diff --git a/yt_dlp/extractor/joj.py b/yt_dlp/extractor/joj.py index 1c4676e95..298b37823 100644 --- a/yt_dlp/extractor/joj.py +++ b/yt_dlp/extractor/joj.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -18,6 +16,7 @@ class JojIE(InfoExtractor): ) (?P<id>[^/?#^]+) ''' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1'] _TESTS = [{ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', 'info_dict': { @@ -38,14 +37,6 @@ class JojIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/jwplatform.py b/yt_dlp/extractor/jwplatform.py index 2cb7ca3d7..d6b8420a8 100644 --- a/yt_dlp/extractor/jwplatform.py +++ b/yt_dlp/extractor/jwplatform.py @@ -22,13 +22,8 @@ class JWPlatformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - urls = JWPlatformIE._extract_urls(webpage) - return urls[0] if urls else None - - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): for tag, key in ((r'(?:script|iframe)', 'src'), ('input', 'value')): # <input value=URL> is used by hyland.com # if we find <iframe>, dont look for <input> diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py index f4092aa71..f62c9791c 100644 --- a/yt_dlp/extractor/kaltura.py +++ b/yt_dlp/extractor/kaltura.py @@ -111,13 +111,8 @@ class KalturaIE(InfoExtractor): } ] - @staticmethod - def _extract_url(webpage): - urls = KalturaIE._extract_urls(webpage) - return urls[0] if urls else None - - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site finditer = ( list(re.finditer( @@ -159,14 +154,14 @@ class KalturaIE(InfoExtractor): for k, v in embed_info.items(): if v: embed_info[k] = v.strip() - url = 'kaltura:%(partner_id)s:%(id)s' % embed_info + embed_url = 'kaltura:%(partner_id)s:%(id)s' % embed_info escaped_pid = re.escape(embed_info['partner_id']) service_mobj = re.search( r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), webpage) if service_mobj: - url = smuggle_url(url, {'service_url': service_mobj.group('id')}) - urls.append(url) + embed_url = smuggle_url(embed_url, {'service_url': service_mobj.group('id')}) + urls.append(embed_url) return urls def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): diff --git a/yt_dlp/extractor/kinja.py b/yt_dlp/extractor/kinja.py index c00abfbc1..3747d8eea 100644 --- a/yt_dlp/extractor/kinja.py +++ b/yt_dlp/extractor/kinja.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import ( compat_str, @@ -10,8 +8,6 @@ from ..utils import ( parse_iso8601, strip_or_none, try_get, - unescapeHTML, - urljoin, ) @@ -55,6 +51,7 @@ class KinjaEmbedIE(InfoExtractor): vine| youtube-(?:list|video) )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX) + _EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//{_DOMAIN_REGEX})?{_COMMON_REGEX}(?:(?!\1).)+)\1'] _TESTS = [{ 'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621', 'only_matching': True, @@ -119,12 +116,6 @@ class KinjaEmbedIE(InfoExtractor): 'youtube-video': ('youtube.com/embed/', 'Youtube'), } - @staticmethod - def _extract_urls(webpage, url): - return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer( - r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX), - webpage)] - def _real_extract(self, url): video_type, video_id = self._match_valid_url(url).groups() diff --git a/yt_dlp/extractor/libsyn.py b/yt_dlp/extractor/libsyn.py index 8245a3481..29bbb03de 100644 --- a/yt_dlp/extractor/libsyn.py +++ b/yt_dlp/extractor/libsyn.py @@ -10,6 +10,7 @@ from ..utils import ( class LibsynIE(InfoExtractor): _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1'] _TESTS = [{ 'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/', diff --git a/yt_dlp/extractor/limelight.py b/yt_dlp/extractor/limelight.py index 25667fc07..90065094b 100644 --- a/yt_dlp/extractor/limelight.py +++ b/yt_dlp/extractor/limelight.py @@ -17,7 +17,7 @@ class LimelightBaseIE(InfoExtractor): _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' @classmethod - def _extract_urls(cls, webpage, source_url): + def _extract_embed_urls(cls, url, webpage): lm = { 'Media': 'media', 'Channel': 'channel', @@ -25,7 +25,7 @@ class LimelightBaseIE(InfoExtractor): } def smuggle(url): - return smuggle_url(url, {'source_url': source_url}) + return smuggle_url(url, {'source_url': url}) entries = [] for kind, video_id in re.findall( diff --git a/yt_dlp/extractor/livestream.py b/yt_dlp/extractor/livestream.py index 4b90c22c5..70449dce5 100644 --- a/yt_dlp/extractor/livestream.py +++ b/yt_dlp/extractor/livestream.py @@ -23,6 +23,8 @@ from ..utils import ( class LivestreamIE(InfoExtractor): IE_NAME = 'livestream' _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?' + _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"'] + _TESTS = [{ 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', 'md5': '53274c76ba7754fb0e8d072716f2292b', diff --git a/yt_dlp/extractor/mainstreaming.py b/yt_dlp/extractor/mainstreaming.py index c144c7592..213a1df57 100644 --- a/yt_dlp/extractor/mainstreaming.py +++ b/yt_dlp/extractor/mainstreaming.py @@ -14,6 +14,7 @@ from ..utils import ( class MainStreamingIE(InfoExtractor): _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=["\']?(?P<url>{_VALID_URL})["\']?'] IE_DESC = 'MainStreaming Player' _TESTS = [ @@ -102,13 +103,6 @@ class MainStreamingIE(InfoExtractor): } ] - @staticmethod - def _extract_urls(webpage): - mobj = re.findall( - r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' % MainStreamingIE._VALID_URL, webpage) - if mobj: - return [group[0] for group in mobj] - def _playlist_entries(self, host, playlist_content): for entry in playlist_content: content_id = entry.get('contentID') diff --git a/yt_dlp/extractor/mangomolo.py b/yt_dlp/extractor/mangomolo.py index a392e9b54..568831aa8 100644 --- a/yt_dlp/extractor/mangomolo.py +++ b/yt_dlp/extractor/mangomolo.py @@ -3,11 +3,29 @@ from ..compat import ( compat_b64decode, compat_urllib_parse_unquote, ) -from ..utils import int_or_none +from ..utils import classproperty, int_or_none class MangomoloBaseIE(InfoExtractor): - _BASE_REGEX = r'https?://(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)' + _BASE_REGEX = r'(?:https?:)?//(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)' + _SLUG = None + + @classproperty + def _VALID_URL(cls): + return f'{cls._BASE_REGEX}{cls._SLUG}' + + @classproperty + def _EMBED_REGEX(cls): + return [rf'<iframe[^>]+src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1'] + + def _extract_from_webpage(self, url, webpage): + for res in super()._extract_from_webpage(url, webpage): + yield { + **res, + '_type': 'url_transparent', + 'id': self._search_regex(self._SLUG, res['url'], 'id', group='id'), + 'uploader': self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader'), + } def _get_real_id(self, page_id): return page_id @@ -41,14 +59,15 @@ class MangomoloBaseIE(InfoExtractor): class MangomoloVideoIE(MangomoloBaseIE): _TYPE = 'video' IE_NAME = 'mangomolo:' + _TYPE - _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'video\?.*?\bid=(?P<id>\d+)' + _SLUG = r'video\?.*?\bid=(?P<id>\d+)' + _IS_LIVE = False class MangomoloLiveIE(MangomoloBaseIE): _TYPE = 'live' IE_NAME = 'mangomolo:' + _TYPE - _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'(live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)' + _SLUG = r'(?:live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)' _IS_LIVE = True def _get_real_id(self, page_id): diff --git a/yt_dlp/extractor/medialaan.py b/yt_dlp/extractor/medialaan.py index 297f8c4b2..6daa50846 100644 --- a/yt_dlp/extractor/medialaan.py +++ b/yt_dlp/extractor/medialaan.py @@ -69,8 +69,8 @@ class MedialaanIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): entries = [] for element in re.findall(r'(<div[^>]+data-mychannels-type="video"[^>]*>)', webpage): mychannels_id = extract_attributes(element).get('data-mychannels-id') diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index f396c1bd3..4e549fe5e 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -167,8 +167,7 @@ class MediasetIE(ThePlatformBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_urls(ie, webpage): + def _extract_from_webpage(self, url, webpage): def _qs(url): return parse_qs(url) @@ -188,8 +187,7 @@ class MediasetIE(ThePlatformBaseIE): video_id = embed_qs.get('id', [None])[0] if not video_id: continue - urlh = ie._request_webpage( - embed_url, video_id, note='Following embed URL redirect') + urlh = self._request_webpage(embed_url, video_id, note='Following embed URL redirect') embed_url = urlh.geturl() program_guid = _program_guid(_qs(embed_url)) if program_guid: diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index 30464bad0..0ffd01cd2 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -13,7 +13,7 @@ from ..utils import ( str_or_none, try_call, try_get, - unescapeHTML, + smuggle_url, unsmuggle_url, url_or_none, urljoin, @@ -25,6 +25,7 @@ _ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0 class MediasiteIE(InfoExtractor): _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE + _EMBED_REGEX = [r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE] _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -112,13 +113,10 @@ class MediasiteIE(InfoExtractor): 5: 'video3', } - @staticmethod - def _extract_urls(webpage): - return [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer( - r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE, - webpage)] + @classmethod + def _extract_embed_urls(cls, url, webpage): + for embed_url in super()._extract_embed_urls(url, webpage): + yield smuggle_url(embed_url, {'UrlReferrer': url}) def __extract_slides(self, *, stream_id, snum, Stream, duration, images): slide_base_url = Stream['SlideBaseUrl'] diff --git a/yt_dlp/extractor/megaphone.py b/yt_dlp/extractor/megaphone.py index 0c150ef45..af80523e3 100644 --- a/yt_dlp/extractor/megaphone.py +++ b/yt_dlp/extractor/megaphone.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import js_to_json @@ -8,6 +6,7 @@ class MegaphoneIE(InfoExtractor): IE_NAME = 'megaphone.fm' IE_DESC = 'megaphone.fm embedded players' _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)' + _EMBED_REGEX = [rf'<iframe[^>]*?\ssrc=["\'](?P<url>{_VALID_URL})'] _TEST = { 'url': 'https://player.megaphone.fm/GLT9749789991?"', 'md5': '4816a0de523eb3e972dc0dda2c191f96', @@ -45,8 +44,3 @@ class MegaphoneIE(InfoExtractor): 'duration': episode_data['duration'], 'formats': formats, } - - @classmethod - def _extract_urls(cls, webpage): - return [m[0] for m in re.findall( - r'<iframe[^>]*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)] diff --git a/yt_dlp/extractor/megatvcom.py b/yt_dlp/extractor/megatvcom.py index ec481d016..54c7b7f9f 100644 --- a/yt_dlp/extractor/megatvcom.py +++ b/yt_dlp/extractor/megatvcom.py @@ -104,7 +104,7 @@ class MegaTVComEmbedIE(MegaTVComBaseIE): IE_NAME = 'megatvcom:embed' IE_DESC = 'megatv.com embedded videos' _VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)' - _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''') + _EMBED_REGEX = [rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)'''] _TESTS = [{ 'url': 'https://www.megatv.com/embed/?p=2020520979', @@ -134,11 +134,6 @@ class MegaTVComEmbedIE(MegaTVComBaseIE): }, }] - @classmethod - def _extract_urls(cls, webpage): - for mobj in cls._EMBED_RE.finditer(webpage): - yield unescapeHTML(mobj.group('url')) - def _match_canonical_url(self, webpage): LINK_RE = r'''(?x) <link(?: diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 5fb97083a..dd1f54f87 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -92,6 +92,10 @@ class MLBIE(MLBBaseIE): (?P<id>\d+) ) ''' + _EMBED_REGEX = [ + r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', + r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)', + ] _TESTS = [ { 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', diff --git a/yt_dlp/extractor/mofosex.py b/yt_dlp/extractor/mofosex.py index 66a098c97..4221ef3e3 100644 --- a/yt_dlp/extractor/mofosex.py +++ b/yt_dlp/extractor/mofosex.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -59,17 +57,12 @@ class MofosexIE(KeezMoviesIE): class MofosexEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)'] _TESTS = [{ 'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py index d161c33c1..10cd304eb 100644 --- a/yt_dlp/extractor/mtv.py +++ b/yt_dlp/extractor/mtv.py @@ -331,6 +331,7 @@ class MTVServicesInfoExtractor(InfoExtractor): class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): IE_NAME = 'mtvservices:embedded' _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1'] _TEST = { # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/ @@ -346,13 +347,6 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): }, } - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage) - if mobj: - return mobj.group('url') - def _get_feed_url(self, uri, url=None): video_id = self._id_from_uri(uri) config = self._download_json( diff --git a/yt_dlp/extractor/myvi.py b/yt_dlp/extractor/myvi.py index b31cf4493..df7200be2 100644 --- a/yt_dlp/extractor/myvi.py +++ b/yt_dlp/extractor/myvi.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from .vimple import SprutoBaseIE @@ -26,6 +24,7 @@ class MyviIE(SprutoBaseIE): ) (?P<id>[\da-zA-Z_-]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1'] _TESTS = [{ 'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', 'md5': '571bbdfba9f9ed229dc6d34cc0f335bf', @@ -56,13 +55,6 @@ class MyviIE(SprutoBaseIE): 'only_matching': True, }] - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index 365c2e60d..910cbedf6 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -184,6 +184,7 @@ class NBCIE(ThePlatformIE): class NBCSportsVPlayerIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' + _EMBED_REGEX = [r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % _VALID_URL_BASE] _TESTS = [{ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', @@ -207,13 +208,6 @@ class NBCSportsVPlayerIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - video_urls = re.search( - r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) - if video_urls: - return video_urls.group('url') - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -317,6 +311,7 @@ class NBCSportsStreamIE(AdobePassIE): class NBCNewsIE(ThePlatformIE): _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1'] _TESTS = [ { diff --git a/yt_dlp/extractor/nexx.py b/yt_dlp/extractor/nexx.py index 01376be3d..69c48652c 100644 --- a/yt_dlp/extractor/nexx.py +++ b/yt_dlp/extractor/nexx.py @@ -114,8 +114,8 @@ class NexxIE(InfoExtractor): webpage) return mobj.group('id') if mobj else None - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): # Reference: # 1. https://nx-s.akamaized.net/files/201510/44.pdf @@ -135,10 +135,6 @@ class NexxIE(InfoExtractor): return entries - @staticmethod - def _extract_url(webpage): - return NexxIE._extract_urls(webpage)[0] - def _handle_error(self, response): if traverse_obj(response, ('metadata', 'notice'), expected_type=str): self.report_warning('%s said: %s' % (self.IE_NAME, response['metadata']['notice'])) @@ -498,6 +494,8 @@ class NexxIE(InfoExtractor): class NexxEmbedIE(InfoExtractor): _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)' + # Reference. https://nx-s.akamaized.net/files/201510/44.pdf + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1'] _TESTS = [{ 'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1', 'md5': '16746bfc28c42049492385c989b26c4a', @@ -521,16 +519,6 @@ class NexxEmbedIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - # Reference: - # 1. https://nx-s.akamaized.net/files/201510/44.pdf - - # iFrame Embed Integration - return [mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1', - webpage)] - def _real_extract(self, url): embed_id = self._match_id(url) diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py index f388688c4..fe6986a82 100644 --- a/yt_dlp/extractor/nytimes.py +++ b/yt_dlp/extractor/nytimes.py @@ -103,6 +103,7 @@ class NYTimesBaseIE(InfoExtractor): class NYTimesIE(NYTimesBaseIE): _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>'] _TESTS = [{ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py index 36a7f5f4e..4faec914e 100644 --- a/yt_dlp/extractor/odnoklassniki.py +++ b/yt_dlp/extractor/odnoklassniki.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, @@ -31,6 +29,7 @@ class OdnoklassnikiIE(InfoExtractor): ) (?P<id>[\d-]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1'] _TESTS = [{ 'note': 'Coub embedded', 'url': 'http://ok.ru/video/1484130554189', @@ -161,13 +160,6 @@ class OdnoklassnikiIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): try: return self._extract_desktop(url) diff --git a/yt_dlp/extractor/onionstudios.py b/yt_dlp/extractor/onionstudios.py index 9776b4d97..5fa49e142 100644 --- a/yt_dlp/extractor/onionstudios.py +++ b/yt_dlp/extractor/onionstudios.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import js_to_json @@ -7,6 +5,7 @@ from ..utils import js_to_json class OnionStudiosIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:video(?:s/[^/]+-|/)|embed\?.*\bid=)(?P<id>\d+)(?!-)' + _EMBED_REGEX = [r'(?s)<(?:iframe|bulbs-video)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/(?:embed.+?|video/\d+\.json))\1'] _TESTS = [{ 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', @@ -29,13 +28,6 @@ class OnionStudiosIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'(?s)<(?:iframe|bulbs-video)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/(?:embed.+?|video/\d+\.json))\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/ooyala.py b/yt_dlp/extractor/ooyala.py index 77017f08b..146c1f981 100644 --- a/yt_dlp/extractor/ooyala.py +++ b/yt_dlp/extractor/ooyala.py @@ -10,6 +10,7 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + smuggle_url, try_get, unsmuggle_url, ) @@ -151,6 +152,29 @@ class OoyalaIE(OoyalaBaseIE): } ] + def _extract_from_webpage(self, url, webpage): + mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) + or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) + or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) + or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) + or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) + if mobj is not None: + embed_token = self._search_regex( + r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', + webpage, 'ooyala embed token', default=None) + yield self._build_url_result(smuggle_url( + mobj.group('ec'), { + 'domain': url, + 'embed_token': embed_token, + })) + return + + # Look for multiple Ooyala embeds on SBN network websites + mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) + if mobj is not None: + for v in self._parse_json(mobj.group(1), self._generic_id(url), fatal=False) or []: + yield self._build_url_result(smuggle_url(v['provider_video_id'], {'domain': url})) + @staticmethod def _url_for_embed_code(embed_code): return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index 3388f7f39..5f5edb26b 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -1,4 +1,3 @@ -import re import calendar import json import functools @@ -73,15 +72,10 @@ class PanoptoBaseIE(InfoExtractor): def _parse_fragment(url): return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()} - @staticmethod - def _extract_urls(webpage): - return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=["\'](?P<url>%s/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)' % PanoptoIE.BASE_URL_RE, - webpage)] - class PanoptoIE(PanoptoBaseIE): _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)' + _EMBED_REGEX = [rf'<iframe[^>]+src=["\'](?P<url>{PanoptoBaseIE.BASE_URL_RE}/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)'] _TESTS = [ { 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb', diff --git a/yt_dlp/extractor/peertube.py b/yt_dlp/extractor/peertube.py index 0d3bc18a8..6d280e41c 100644 --- a/yt_dlp/extractor/peertube.py +++ b/yt_dlp/extractor/peertube.py @@ -1057,6 +1057,7 @@ class PeerTubeIE(InfoExtractor): ) (?P<id>%s) ''' % (_INSTANCES_RE, _UUID_RE) + _EMBED_REGEX = [r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//{_INSTANCES_RE}/videos/embed/{cls._UUID_RE})'''] _TESTS = [{ 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', 'md5': '8563064d245a4be5705bddb22bb00a28', @@ -1158,16 +1159,15 @@ class PeerTubeIE(InfoExtractor): '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): return 'peertube:%s:%s' % mobj.group('host', 'id') - @staticmethod - def _extract_urls(webpage, source_url): - entries = re.findall( - r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' - % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) - if not entries: - peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) - if peertube_url: - entries = [peertube_url] - return entries + @classmethod + def _extract_embed_urls(cls, url, webpage): + embeds = tuple(super()._extract_embed_urls(url, webpage)) + if embeds: + return embeds + + peertube_url = cls._extract_peertube_url(webpage, url) + if peertube_url: + return [peertube_url] def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): return self._download_json( diff --git a/yt_dlp/extractor/periscope.py b/yt_dlp/extractor/periscope.py index fc8591a2c..2ff6589d5 100644 --- a/yt_dlp/extractor/periscope.py +++ b/yt_dlp/extractor/periscope.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -67,6 +65,7 @@ class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' + _EMBED_REGEX = [r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1'] # Alive example URLs can be found here https://www.periscope.tv/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', @@ -92,13 +91,6 @@ class PeriscopeIE(PeriscopeBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): token = self._match_id(url) diff --git a/yt_dlp/extractor/piksel.py b/yt_dlp/extractor/piksel.py index 14a540859..fba7242f5 100644 --- a/yt_dlp/extractor/piksel.py +++ b/yt_dlp/extractor/piksel.py @@ -30,6 +30,7 @@ class PikselIE(InfoExtractor): )\.jp| vidego\.baltimorecity\.gov )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)''' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)'] _TESTS = [ { 'url': 'http://player.piksel.com/v/ums2867l', @@ -62,14 +63,6 @@ class PikselIE(InfoExtractor): } ] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)', - webpage) - if mobj: - return mobj.group('url') - def _call_api(self, app_token, resource, display_id, query, fatal=True): response = (self._download_json( 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), diff --git a/yt_dlp/extractor/pladform.py b/yt_dlp/extractor/pladform.py index 301f5c838..8be08a5bc 100644 --- a/yt_dlp/extractor/pladform.py +++ b/yt_dlp/extractor/pladform.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -24,6 +22,7 @@ class PladformIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1'] _TESTS = [{ 'url': 'http://out.pladform.ru/player?pl=18079&type=html5&videoid=100231282', 'info_dict': { @@ -61,13 +60,6 @@ class PladformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/playwire.py b/yt_dlp/extractor/playwire.py index ab7f71493..683dbf4a5 100644 --- a/yt_dlp/extractor/playwire.py +++ b/yt_dlp/extractor/playwire.py @@ -7,6 +7,8 @@ from ..utils import ( class PlaywireIE(InfoExtractor): _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)' + _EMBED_REGEX = [r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1'] + _TESTS = [{ 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', 'md5': 'e6398701e3595888125729eaa2329ed9', diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 35468b4fc..6afaf5e6e 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -128,6 +128,7 @@ class PornHubIE(PornHubBaseIE): ) (?P<id>[\da-z]+) ''' % PornHubBaseIE._PORNHUB_HOST_RE + _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)'] _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': 'a6391306d050e4547f62b3f485dd9ba9', @@ -257,12 +258,6 @@ class PornHubIE(PornHubBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', - webpage) - def _extract_count(self, pattern, webpage, name): return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None)) diff --git a/yt_dlp/extractor/rcs.py b/yt_dlp/extractor/rcs.py index abbc167c0..28ba42eed 100644 --- a/yt_dlp/extractor/rcs.py +++ b/yt_dlp/extractor/rcs.py @@ -281,6 +281,20 @@ class RCSEmbedsIE(RCSBaseIE): (?:gazzanet\.)?gazzetta )\.it) /video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)''' + _EMBED_REGEX = [r'''(?x) + (?: + data-frame-src=| + <iframe[^\n]+src= + ) + (["']) + (?P<url>(?:https?:)?//video\. + (?: + rcs| + (?:corriere\w+\.)?corriere| + (?:gazzanet\.)?gazzetta + ) + \.it/video-embed/.+?) + \1'''] _TESTS = [{ 'url': 'https://video.rcs.it/video-embed/iodonna-0001585037', 'md5': '623ecc8ffe7299b2d0c1046d8331a9df', @@ -321,30 +335,9 @@ class RCSEmbedsIE(RCSBaseIE): urls[i] = urljoin(base_url(e), url_basename(e)) return urls - @staticmethod - def _extract_urls(webpage): - entries = [ - mobj.group('url') - for mobj in re.finditer(r'''(?x) - (?: - data-frame-src=| - <iframe[^\n]+src= - ) - (["']) - (?P<url>(?:https?:)?//video\. - (?: - rcs| - (?:corriere\w+\.)?corriere| - (?:gazzanet\.)?gazzetta - ) - \.it/video-embed/.+?) - \1''', webpage)] - return RCSEmbedsIE._sanitize_urls(entries) - - @staticmethod - def _extract_url(webpage): - urls = RCSEmbedsIE._extract_urls(webpage) - return urls[0] if urls else None + @classmethod + def _extract_embed_urls(cls, url, webpage): + return cls._sanitize_urls(tuple(super()._extract_embed_urls(url, webpage))) class RCSIE(RCSBaseIE): diff --git a/yt_dlp/extractor/redtube.py b/yt_dlp/extractor/redtube.py index ab7c505da..8e767b6e4 100644 --- a/yt_dlp/extractor/redtube.py +++ b/yt_dlp/extractor/redtube.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -14,6 +12,7 @@ from ..utils import ( class RedTubeIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)'] _TESTS = [{ 'url': 'https://www.redtube.com/38864951', 'md5': '4fba70cbca3aefd25767ab4b523c9878', @@ -37,12 +36,6 @@ class RedTubeIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( diff --git a/yt_dlp/extractor/rtlnl.py b/yt_dlp/extractor/rtlnl.py index e6b450a23..3852a3a13 100644 --- a/yt_dlp/extractor/rtlnl.py +++ b/yt_dlp/extractor/rtlnl.py @@ -8,6 +8,7 @@ from ..utils import ( class RtlNlIE(InfoExtractor): IE_NAME = 'rtl.nl' IE_DESC = 'rtl.nl and rtlxl.nl' + _EMBED_REGEX = [r'<iframe[^>]+?\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)(?P=q1)'] _VALID_URL = r'''(?x) https?://(?:(?:www|static)\.)? (?: diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 924f9829f..c94ba68ee 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -15,6 +15,7 @@ from ..utils import ( class RumbleEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' + _EMBED_REGEX = [fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'https://rumble.com/embed/v5pv5f', 'md5': '36a18a049856720189f30977ccbb2c34', @@ -51,11 +52,10 @@ class RumbleEmbedIE(InfoExtractor): }] @classmethod - def _extract_urls(cls, webpage): - embeds = tuple(re.finditer( - fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{cls._VALID_URL})', webpage)) + def _extract_embed_urls(cls, url, webpage): + embeds = tuple(super()._extract_embed_urls(url, webpage)) if embeds: - return [mobj.group('url') for mobj in embeds] + return embeds return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py index ecfcea939..380c5e14e 100644 --- a/yt_dlp/extractor/rutube.py +++ b/yt_dlp/extractor/rutube.py @@ -1,4 +1,3 @@ -import re import itertools from .common import InfoExtractor @@ -94,6 +93,7 @@ class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1'] _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', @@ -128,12 +128,6 @@ class RutubeIE(RutubeBaseIE): def suitable(cls, url): return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) - @staticmethod - def _extract_urls(webpage): - return [mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) info = self._download_and_extract_info(video_id) diff --git a/yt_dlp/extractor/rutv.py b/yt_dlp/extractor/rutv.py index adf78ddb0..0b07dc5ad 100644 --- a/yt_dlp/extractor/rutv.py +++ b/yt_dlp/extractor/rutv.py @@ -20,6 +20,10 @@ class RUTVIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_URLS = [ + r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', + r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', + ] _TESTS = [ { @@ -107,19 +111,6 @@ class RUTVIE(InfoExtractor): }, ] - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) - if mobj: - return mobj.group('url') - - mobj = re.search( - r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') diff --git a/yt_dlp/extractor/ruutu.py b/yt_dlp/extractor/ruutu.py index c6d94c100..3f6d30d3c 100644 --- a/yt_dlp/extractor/ruutu.py +++ b/yt_dlp/extractor/ruutu.py @@ -135,7 +135,7 @@ class RuutuIE(InfoExtractor): _API_BASE = 'https://gatling.nelonenmedia.fi' @classmethod - def _extract_urls(cls, webpage): + def _extract_embed_urls(cls, url, webpage): # nelonen.fi settings = try_call( lambda: json.loads(re.search( diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 711524406..6bb499930 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -15,6 +15,12 @@ class SBSIE(InfoExtractor): .*?\bplay=|/watch/ )|news/(?:embeds/)?video/ )(?P<id>[0-9]+)''' + _EMBED_REGEX = [r'''(?x)] + (?: + <meta\s+property="og:video"\s+content=| + <iframe[^>]+?src= + ) + (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1'''] _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py index bced14328..6fec7c0bb 100644 --- a/yt_dlp/extractor/senategov.py +++ b/yt_dlp/extractor/senategov.py @@ -49,6 +49,7 @@ _COMMITTEES = { class SenateISVPIE(InfoExtractor): _IE_NAME = 'senate.gov:isvp' _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' + _EMBED_REGEX = [r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]"] _TESTS = [{ 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', @@ -87,14 +88,6 @@ class SenateISVPIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _search_iframe_url(webpage): - mobj = re.search( - r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) diff --git a/yt_dlp/extractor/sendtonews.py b/yt_dlp/extractor/sendtonews.py index cf4b93d45..5ff06f19d 100644 --- a/yt_dlp/extractor/sendtonews.py +++ b/yt_dlp/extractor/sendtonews.py @@ -43,14 +43,14 @@ class SendtoNewsIE(InfoExtractor): _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' @classmethod - def _extract_url(cls, webpage): + def _extract_embed_urls(cls, url, webpage): mobj = re.search(r'''(?x)<script[^>]+src=([\'"]) (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\? .*\bSC=(?P<SC>[0-9a-zA-Z-]+).* \1>''', webpage) if mobj: sc = mobj.group('SC') - return cls._URL_TEMPLATE % sc + yield cls._URL_TEMPLATE % sc def _real_extract(self, url): playlist_id = self._match_id(url) diff --git a/yt_dlp/extractor/seznamzpravy.py b/yt_dlp/extractor/seznamzpravy.py index 891bfcfee..05642a116 100644 --- a/yt_dlp/extractor/seznamzpravy.py +++ b/yt_dlp/extractor/seznamzpravy.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import ( compat_str, @@ -20,6 +18,7 @@ def _raw_id(src_url): class SeznamZpravyIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1'] _TESTS = [{ 'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5§ionPrefixPreroll=%2Fzpravy', 'info_dict': { @@ -48,13 +47,6 @@ class SeznamZpravyIE(InfoExtractor): }, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1', - webpage)] - def _extract_sdn_formats(self, sdn_url, video_id): sdn_data = self._download_json(sdn_url, video_id) @@ -162,5 +154,5 @@ class SeznamZpravyArticleIE(InfoExtractor): return self.playlist_result([ self.url_result(entry_url, ie=SeznamZpravyIE.ie_key()) - for entry_url in SeznamZpravyIE._extract_urls(webpage)], + for entry_url in SeznamZpravyIE._extract_embed_urls(url, webpage)], article_id, title, description) diff --git a/yt_dlp/extractor/sharevideos.py b/yt_dlp/extractor/sharevideos.py new file mode 100644 index 000000000..3132c7a82 --- /dev/null +++ b/yt_dlp/extractor/sharevideos.py @@ -0,0 +1,6 @@ +from .common import InfoExtractor + + +class ShareVideosEmbedIE(InfoExtractor): + _VALID_URL = False + _EMBED_REGEX = [r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1'] diff --git a/yt_dlp/extractor/simplecast.py b/yt_dlp/extractor/simplecast.py index ecbb6123b..ec349ddf9 100644 --- a/yt_dlp/extractor/simplecast.py +++ b/yt_dlp/extractor/simplecast.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( clean_podcast_url, @@ -68,6 +66,11 @@ class SimplecastBaseIE(InfoExtractor): class SimplecastIE(SimplecastBaseIE): IE_NAME = 'simplecast' _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX + _EMBED_REGEX = [rf'''(?x)<iframe[^>]+src=["\'] + (?P<url>https?://(?: + embed\.simplecast\.com/[0-9a-f]{8}| + player\.simplecast\.com/{SimplecastBaseIE._UUID_REGEX} + ))'''] _COMMON_TEST_INFO = { 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', @@ -94,15 +97,6 @@ class SimplecastIE(SimplecastBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'''(?x)<iframe[^>]+src=["\'] - ( - https?://(?:embed\.simplecast\.com/[0-9a-f]{8}| - player\.simplecast\.com/%s - ))''' % SimplecastBaseIE._UUID_REGEX, webpage) - def _real_extract(self, url): episode_id = self._match_id(url) episode = self._call_api('episodes/%s', episode_id) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 9e4c8cf25..f7e125d37 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -33,18 +33,13 @@ from ..utils import ( class SoundcloudEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1'] _TEST = { # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/ 'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey', 'only_matching': True, } - @staticmethod - def _extract_urls(webpage): - return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', - webpage)] - def _real_extract(self, url): query = parse_qs(url) api_url = query['url'][0] diff --git a/yt_dlp/extractor/spankwire.py b/yt_dlp/extractor/spankwire.py index 603f17e9d..d1990e4de 100644 --- a/yt_dlp/extractor/spankwire.py +++ b/yt_dlp/extractor/spankwire.py @@ -21,6 +21,7 @@ class SpankwireIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)'] _TESTS = [{ # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', @@ -65,12 +66,6 @@ class SpankwireIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/sportbox.py b/yt_dlp/extractor/sportbox.py index 1041cc7d1..622a81b47 100644 --- a/yt_dlp/extractor/sportbox.py +++ b/yt_dlp/extractor/sportbox.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -11,6 +9,7 @@ from ..utils import ( class SportBoxIE(InfoExtractor): _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"'] _TESTS = [{ 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', 'info_dict': { @@ -42,12 +41,6 @@ class SportBoxIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/spotify.py b/yt_dlp/extractor/spotify.py index f476b7022..4da24db9e 100644 --- a/yt_dlp/extractor/spotify.py +++ b/yt_dlp/extractor/spotify.py @@ -23,6 +23,7 @@ class SpotifyBaseIE(InfoExtractor): 'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d', } _VALID_URL_TEMPL = r'https?://open\.spotify\.com/(?:embed-podcast/|embed/|)%s/(?P<id>[^/?&#]+)' + _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://open\.spotify.com/embed/[^"]+)"'] def _real_initialize(self): self._ACCESS_TOKEN = self._download_json( @@ -97,12 +98,6 @@ class SpotifyBaseIE(InfoExtractor): 'series': series, } - @classmethod - def _extract_urls(cls, webpage): - return re.findall( - r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"', - webpage) - class SpotifyIE(SpotifyBaseIE): IE_NAME = 'spotify' diff --git a/yt_dlp/extractor/springboardplatform.py b/yt_dlp/extractor/springboardplatform.py index 8e156bf1a..539a64209 100644 --- a/yt_dlp/extractor/springboardplatform.py +++ b/yt_dlp/extractor/springboardplatform.py @@ -21,6 +21,7 @@ class SpringboardPlatformIE(InfoExtractor): xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+) ) ''' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1'] _TESTS = [{ 'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1', 'md5': '5c3cb7b5c55740d482561099e920f192', @@ -45,14 +46,6 @@ class SpringboardPlatformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1', - webpage)] - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') or mobj.group('id_2') diff --git a/yt_dlp/extractor/streamable.py b/yt_dlp/extractor/streamable.py index a2935b04b..3e60479ad 100644 --- a/yt_dlp/extractor/streamable.py +++ b/yt_dlp/extractor/streamable.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -12,6 +10,7 @@ from ..utils import ( class StreamableIE(InfoExtractor): _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//streamable\.com/.+?)(?P=q1)'] _TESTS = [ { 'url': 'https://streamable.com/dnd1', @@ -53,14 +52,6 @@ class StreamableIE(InfoExtractor): } ] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)', - webpage) - if mobj: - return mobj.group('src') - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py index 70cf10515..787b9f70d 100644 --- a/yt_dlp/extractor/substack.py +++ b/yt_dlp/extractor/substack.py @@ -46,14 +46,15 @@ class SubstackIE(InfoExtractor): }] @classmethod - def _extract_url(cls, webpage, url): + def _extract_embed_urls(cls, url, webpage): if not re.search(r'<script[^>]+src=["\']https://substackcdn.com/[^"\']+\.js', webpage): return mobj = re.search(r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P<subdomain>[^"]+)', webpage) if mobj: parsed = urllib.parse.urlparse(url) - return parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl() + yield parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl() + raise cls.StopExtraction() def _extract_video_formats(self, video_id, username): formats, subtitles = [], {} diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index e0c436b67..b422b6d93 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -101,6 +101,7 @@ class SVTBaseIE(InfoExtractor): class SVTIE(SVTBaseIE): _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)' + _EMBED_REGEX = [r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % _VALID_URL] _TEST = { 'url': 'http://www.svt.se/wd?widgetId=23991§ionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false', 'md5': '33e9a5d8f646523ce0868ecfb0eed77d', @@ -113,13 +114,6 @@ class SVTIE(SVTBaseIE): }, } - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) widget_id = mobj.group('widget_id') diff --git a/yt_dlp/extractor/teachable.py b/yt_dlp/extractor/teachable.py index e480d7610..c212a4926 100644 --- a/yt_dlp/extractor/teachable.py +++ b/yt_dlp/extractor/teachable.py @@ -140,12 +140,12 @@ class TeachableIE(TeachableBaseIE): r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com', webpage) - @staticmethod - def _extract_url(webpage, source_url): - if not TeachableIE._is_teachable(webpage): - return - if re.match(r'https?://[^/]+/(?:courses|p)', source_url): - return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) + @classmethod + def _extract_embed_urls(cls, url, webpage): + if cls._is_teachable(webpage): + if re.match(r'https?://[^/]+/(?:courses|p)', url): + yield f'{cls._URL_PREFIX}{url}' + raise cls.StopExtraction() def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -160,7 +160,7 @@ class TeachableIE(TeachableBaseIE): webpage = self._download_webpage(url, video_id) - wistia_urls = WistiaIE._extract_urls(webpage) + wistia_urls = WistiaIE._extract_embed_urls(url, webpage) if not wistia_urls: if any(re.search(p, webpage) for p in ( r'class=["\']lecture-contents-locked', diff --git a/yt_dlp/extractor/ted.py b/yt_dlp/extractor/ted.py index b5c7e35ac..0e09ec757 100644 --- a/yt_dlp/extractor/ted.py +++ b/yt_dlp/extractor/ted.py @@ -215,6 +215,7 @@ class TedPlaylistIE(TedBaseIE): class TedEmbedIE(InfoExtractor): _VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL}.+?)\1'] _TESTS = [{ 'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace', @@ -233,10 +234,5 @@ class TedEmbedIE(InfoExtractor): }, }] - @classmethod - def _extract_urls(cls, webpage): - return [mobj.group('url') for mobj in re.finditer( - fr'<iframe[^>]+?src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1', webpage)] - def _real_extract(self, url): return self.url_result(re.sub(r'://embed(-ssl)?', '://www', url), TedTalkIE.ie_key()) diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index bf7efc013..c8026d294 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -123,6 +123,13 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? |theplatform:)(?P<id>[^/\?&]+)''' + _EMBED_REGEX = [ + r'''(?x) + <meta\s+ + property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+ + content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2''', + r'(?s)<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//player\.theplatform\.com/p/.+?)\1' + ] _TESTS = [{ # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ @@ -192,22 +199,11 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): }] @classmethod - def _extract_urls(cls, webpage): - m = re.search( - r'''(?x) - <meta\s+ - property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+ - content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2 - ''', webpage) - if m: - return [m.group('url')] - + def _extract_embed_urls(cls, url, webpage): # Are whitespaces ignored in URLs? # https://github.com/ytdl-org/youtube-dl/issues/12044 - matches = re.findall( - r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) - if matches: - return [re.sub(r'\s', '', list(zip(*matches))[1][0])] + for embed_url in super()._extract_embed_urls(url, webpage): + yield re.sub(r'\s', '', embed_url) @staticmethod def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False): diff --git a/yt_dlp/extractor/threeqsdn.py b/yt_dlp/extractor/threeqsdn.py index 1c0baf5ed..a313a8dfb 100644 --- a/yt_dlp/extractor/threeqsdn.py +++ b/yt_dlp/extractor/threeqsdn.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -16,6 +14,7 @@ class ThreeQSDNIE(InfoExtractor): IE_NAME = '3qsdn' IE_DESC = '3Q SDN' _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_REGEX = [r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % _VALID_URL] _TESTS = [{ # https://player.3qsdn.com/demo.html 'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be', @@ -76,12 +75,13 @@ class ThreeQSDNIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage) - if mobj: - return mobj.group('url') + def _extract_from_webpage(self, url, webpage): + for res in super()._extract_from_webpage(url, webpage): + yield { + **res, + '_type': 'url_transparent', + 'uploader': self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader'), + } def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 680358d5e..3ac765270 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -1,7 +1,6 @@ import itertools import json import random -import re import string import time @@ -379,6 +378,7 @@ class TikTokBaseIE(InfoExtractor): class TikTokIE(TikTokBaseIE): _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)/video)/(?P<id>\d+)' + _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', @@ -529,11 +529,6 @@ class TikTokIE(TikTokBaseIE): 'only_matching': True }] - @classmethod - def _extract_urls(cls, webpage): - return [mobj.group('url') for mobj in re.finditer( - rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{cls._VALID_URL})', webpage)] - def _extract_aweme_app(self, aweme_id): try: aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, diff --git a/yt_dlp/extractor/tnaflix.py b/yt_dlp/extractor/tnaflix.py index 6b766f3cc..34361e515 100644 --- a/yt_dlp/extractor/tnaflix.py +++ b/yt_dlp/extractor/tnaflix.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -173,6 +171,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1'] _TITLE_REGEX = r'<title>([^<]+)</title>' @@ -194,12 +193,6 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1', - webpage)] - class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE): _DESCRIPTION_REGEX = r'(?s)>Description:</[^>]+>(.+?)<' diff --git a/yt_dlp/extractor/tube8.py b/yt_dlp/extractor/tube8.py index 32e80d9d2..b092ecad5 100644 --- a/yt_dlp/extractor/tube8.py +++ b/yt_dlp/extractor/tube8.py @@ -9,6 +9,7 @@ from .keezmovies import KeezMoviesIE class Tube8IE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)'] _TESTS = [{ 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', 'md5': '65e20c48e6abff62ed0c3965fff13a39', @@ -29,12 +30,6 @@ class Tube8IE(KeezMoviesIE): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)', - webpage) - def _real_extract(self, url): webpage, info = self._extract_info(url) diff --git a/yt_dlp/extractor/tunein.py b/yt_dlp/extractor/tunein.py index e3d3f2a96..f163eaf09 100644 --- a/yt_dlp/extractor/tunein.py +++ b/yt_dlp/extractor/tunein.py @@ -8,12 +8,6 @@ from ..compat import compat_urlparse class TuneInBaseIE(InfoExtractor): _API_BASE_URL = 'http://tunein.com/tuner/tune/' - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/[pst]\d+)', - webpage) - def _real_extract(self, url): content_id = self._match_id(url) @@ -86,6 +80,7 @@ class TuneInClipIE(TuneInBaseIE): class TuneInStationIE(TuneInBaseIE): IE_NAME = 'tunein:station' _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId=|embed/player/s)(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/[pst]\d+)'] _API_URL_QUERY = '?tuneType=Station&stationId=%s' @classmethod diff --git a/yt_dlp/extractor/tvc.py b/yt_dlp/extractor/tvc.py index 4ccc8f522..1ef64caf9 100644 --- a/yt_dlp/extractor/tvc.py +++ b/yt_dlp/extractor/tvc.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( clean_html, @@ -9,6 +7,7 @@ from ..utils import ( class TVCIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1'] _TEST = { 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702', 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708', @@ -21,13 +20,6 @@ class TVCIE(InfoExtractor): }, } - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/tvigle.py b/yt_dlp/extractor/tvigle.py index cc1d35dc2..9a7cb7214 100644 --- a/yt_dlp/extractor/tvigle.py +++ b/yt_dlp/extractor/tvigle.py @@ -13,6 +13,7 @@ class TvigleIE(InfoExtractor): IE_NAME = 'tvigle' IE_DESC = 'Интернет-телевидение Tvigle.ru' _VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$|cloud\.tvigle\.ru/video/(?P<id>\d+))' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1'] _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] diff --git a/yt_dlp/extractor/tvopengr.py b/yt_dlp/extractor/tvopengr.py index aded261f3..d8be12c96 100644 --- a/yt_dlp/extractor/tvopengr.py +++ b/yt_dlp/extractor/tvopengr.py @@ -1,11 +1,8 @@ -import re - from .common import InfoExtractor from ..utils import ( determine_ext, get_elements_text_and_html_by_attribute, scale_thumbnails_to_max_format_width, - unescapeHTML, ) @@ -98,7 +95,7 @@ class TVOpenGrEmbedIE(TVOpenGrBaseIE): IE_NAME = 'tvopengr:embed' IE_DESC = 'tvopen.gr embedded videos' _VALID_URL = r'(?:https?:)?//(?:www\.|cdn\.|)(?:tvopen|ethnos).gr/embed/(?P<id>\d+)' - _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''') + _EMBED_REGEX = [rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)'''] _TESTS = [{ 'url': 'https://cdn.ethnos.gr/embed/100963', @@ -115,11 +112,6 @@ class TVOpenGrEmbedIE(TVOpenGrBaseIE): }, }] - @classmethod - def _extract_urls(cls, webpage): - for mobj in cls._EMBED_RE.finditer(webpage): - yield unescapeHTML(mobj.group('url')) - def _real_extract(self, url): video_id = self._match_id(url) return self._return_canonical_url(url, video_id) diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index 69168f655..f1bc0fbba 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -310,6 +310,7 @@ class TVPEmbedIE(InfoExtractor): =) (?P<id>\d+) ''' + _EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL[4:]})'] _TESTS = [{ 'url': 'tvp:194536', @@ -340,12 +341,6 @@ class TVPEmbedIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage, **kw): - return [m.group('embed') for m in re.finditer( - r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:], - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/twentymin.py b/yt_dlp/extractor/twentymin.py index 616c3c36e..f33f15914 100644 --- a/yt_dlp/extractor/twentymin.py +++ b/yt_dlp/extractor/twentymin.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -18,6 +16,7 @@ class TwentyMinutenIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1'] _TESTS = [{ 'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2', 'md5': 'e7264320db31eed8c38364150c12496e', @@ -44,12 +43,6 @@ class TwentyMinutenIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/udn.py b/yt_dlp/extractor/udn.py index 4fa74b9e8..9fdb46faf 100644 --- a/yt_dlp/extractor/udn.py +++ b/yt_dlp/extractor/udn.py @@ -13,6 +13,7 @@ class UDNEmbedIE(InfoExtractor): IE_DESC = '聯合影音' _PROTOCOL_RELATIVE_VALID_URL = r'//video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)' _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL + _EMBED_REGEX = [r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % _PROTOCOL_RELATIVE_VALID_URL] _TESTS = [{ 'url': 'http://video.udn.com/embed/news/300040', 'info_dict': { diff --git a/yt_dlp/extractor/ustream.py b/yt_dlp/extractor/ustream.py index fff21667a..cb920bf13 100644 --- a/yt_dlp/extractor/ustream.py +++ b/yt_dlp/extractor/ustream.py @@ -20,6 +20,7 @@ from ..utils import ( class UstreamIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' IE_NAME = 'ustream' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1'] _TESTS = [{ 'url': 'http://www.ustream.tv/recorded/20274954', 'md5': '088f151799e8f572f84eb62f17d73e5c', @@ -71,13 +72,6 @@ class UstreamIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage) - if mobj is not None: - return mobj.group('url') - def _get_stream_info(self, url, video_id, app_id_ver, extra_note=None): def num_to_hex(n): return hex(n)[2:] diff --git a/yt_dlp/extractor/vbox7.py b/yt_dlp/extractor/vbox7.py index 76c844cb8..be35dad1c 100644 --- a/yt_dlp/extractor/vbox7.py +++ b/yt_dlp/extractor/vbox7.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ExtractorError @@ -17,6 +15,7 @@ class Vbox7IE(InfoExtractor): ) (?P<id>[\da-fA-F]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)'] _GEO_COUNTRIES = ['BG'] _TESTS = [{ 'url': 'http://vbox7.com/play:0946fff23c', @@ -51,14 +50,6 @@ class Vbox7IE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/vevo.py b/yt_dlp/extractor/vevo.py index 825089f47..a146be048 100644 --- a/yt_dlp/extractor/vevo.py +++ b/yt_dlp/extractor/vevo.py @@ -36,6 +36,7 @@ class VevoIE(VevoBaseIE): https?://tv\.vevo\.com/watch/artist/(?:[^/]+)/| vevo:) (?P<id>[^&?#]+)''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1'] _TESTS = [{ 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', diff --git a/yt_dlp/extractor/vice.py b/yt_dlp/extractor/vice.py index abb4a6fa0..f3ad56bf1 100644 --- a/yt_dlp/extractor/vice.py +++ b/yt_dlp/extractor/vice.py @@ -2,7 +2,6 @@ import functools import hashlib import json import random -import re import time from .adobepass import AdobePassIE @@ -38,6 +37,7 @@ class ViceBaseIE(InfoExtractor): class ViceIE(ViceBaseIE, AdobePassIE): IE_NAME = 'vice' _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})'] _TESTS = [{ 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7', 'info_dict': { @@ -103,17 +103,6 @@ class ViceIE(ViceBaseIE, AdobePassIE): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})', - webpage) - - @staticmethod - def _extract_url(webpage): - urls = ViceIE._extract_urls(webpage) - return urls[0] if urls else None - def _real_extract(self, url): locale, video_id = self._match_valid_url(url).groups() diff --git a/yt_dlp/extractor/viddler.py b/yt_dlp/extractor/viddler.py index f491b67ef..d81a31375 100644 --- a/yt_dlp/extractor/viddler.py +++ b/yt_dlp/extractor/viddler.py @@ -7,6 +7,8 @@ from ..utils import ( class ViddlerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)(?:.+?\bsecret=(\d+))?' + _EMBED_REGEX = [r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1'] + _TESTS = [{ 'url': 'http://www.viddler.com/v/43903784', 'md5': '9eee21161d2c7f5b39690c3e325fab2f', diff --git a/yt_dlp/extractor/videa.py b/yt_dlp/extractor/videa.py index 9b05c86a5..fa16da28b 100644 --- a/yt_dlp/extractor/videa.py +++ b/yt_dlp/extractor/videa.py @@ -1,5 +1,4 @@ import random -import re import string import struct @@ -29,6 +28,7 @@ class VideaIE(InfoExtractor): ) (?P<id>[^?#&]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1'] _TESTS = [{ 'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ', 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', @@ -75,12 +75,6 @@ class VideaIE(InfoExtractor): _STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', - webpage)] - - @staticmethod def rc4(cipher_text, key): res = b'' diff --git a/yt_dlp/extractor/videomore.py b/yt_dlp/extractor/videomore.py index 09d12d192..2f81860bb 100644 --- a/yt_dlp/extractor/videomore.py +++ b/yt_dlp/extractor/videomore.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import ( compat_str, @@ -47,6 +45,12 @@ class VideomoreIE(InfoExtractor): (?P<id>\d+) (?:[/?#&]|\.(?:xml|json)|$) ''' + _EMBED_REGEX = [r'''(?x) + (?: + <iframe[^>]+src=([\'"])| + <object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config= + )(?P<url>https?://videomore\.ru/[^?#"']+/\d+(?:\.xml)?) + '''] _TESTS = [{ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', 'md5': '44455a346edc0d509ac5b5a5b531dc35', @@ -126,19 +130,6 @@ class VideomoreIE(InfoExtractor): }] _GEO_BYPASS = False - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?P<url>https?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1', - webpage) - if not mobj: - mobj = re.search( - r'<iframe[^>]+src=([\'"])(?P<url>https?://videomore\.ru/embed/\d+)', - webpage) - - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('sid') or mobj.group('id') diff --git a/yt_dlp/extractor/videopress.py b/yt_dlp/extractor/videopress.py index 3c5e27a9d..16965dfb0 100644 --- a/yt_dlp/extractor/videopress.py +++ b/yt_dlp/extractor/videopress.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -17,6 +15,7 @@ class VideoPressIE(InfoExtractor): _ID_REGEX = r'[\da-zA-Z]{8}' _PATH_REGEX = r'video(?:\.word)?press\.com/embed/' _VALID_URL = r'https?://%s(?P<id>%s)' % (_PATH_REGEX, _ID_REGEX) + _EMBED_REGEX = [rf'<iframe[^>]+src=["\'](?P<url>(?:https?://)?{_PATH_REGEX}{_ID_REGEX})'] _TESTS = [{ 'url': 'https://videopress.com/embed/kUJmAcSf', 'md5': '706956a6c875873d51010921310e4bc6', @@ -39,12 +38,6 @@ class VideoPressIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX), - webpage) - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/viewlift.py b/yt_dlp/extractor/viewlift.py index d081a2f12..b630f9a6d 100644 --- a/yt_dlp/extractor/viewlift.py +++ b/yt_dlp/extractor/viewlift.py @@ -1,5 +1,4 @@ import json -import re from .common import InfoExtractor from ..compat import compat_HTTPError @@ -63,6 +62,7 @@ class ViewLiftBaseIE(InfoExtractor): class ViewLiftEmbedIE(ViewLiftBaseIE): IE_NAME = 'viewlift:embed' _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P<domain>%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX] _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -89,14 +89,6 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX, - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): domain, film_id = self._match_valid_url(url).groups() site = domain.split('.')[-2] diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 961734345..1c9e2453a 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -30,7 +30,6 @@ from ..utils import ( unsmuggle_url, urlencode_postdata, urljoin, - unescapeHTML, urlhandle_detect_ext, ) @@ -328,6 +327,14 @@ class VimeoIE(VimeoBaseInfoExtractor): /?(?:[?&].*)?(?:[#].*)?$ ''' IE_NAME = 'vimeo' + _EMBED_REGEX = [ + # iframe + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1', + # Embedded (swf embed) Vimeo player + r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1', + # Non-standard embedded Vimeo player + r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1', + ] _TESTS = [ { 'url': 'http://vimeo.com/56015672#at=0', @@ -729,29 +736,10 @@ class VimeoIE(VimeoBaseInfoExtractor): # vimeo embed with check-password page protected by Referer header ] - @staticmethod - def _extract_urls(url, webpage): - urls = [] - # Look for embedded (iframe) Vimeo player - for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1', - webpage): - urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url)) - PLAIN_EMBED_RE = ( - # Look for embedded (swf embed) Vimeo player - r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1', - # Look more for non-standard embedded Vimeo player - r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1', - ) - for embed_re in PLAIN_EMBED_RE: - for mobj in re.finditer(embed_re, webpage): - urls.append(mobj.group('url')) - return urls - - @staticmethod - def _extract_url(url, webpage): - urls = VimeoIE._extract_urls(url, webpage) - return urls[0] if urls else None + @classmethod + def _extract_embed_urls(cls, url, webpage): + for embed_url in super()._extract_embed_urls(url, webpage): + yield cls._smuggle_referrer(embed_url, url) def _verify_player_video_password(self, url, video_id, headers): password = self._get_video_password() @@ -1386,12 +1374,12 @@ class VimeoLikesIE(VimeoChannelIE): class VHXEmbedIE(VimeoBaseInfoExtractor): IE_NAME = 'vhx:embed' _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://embed\.vhx\.tv/videos/\d+[^"]*)"'] - @staticmethod - def _extract_url(url, webpage): - mobj = re.search( - r'<iframe[^>]+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage) - return VimeoIE._smuggle_referrer(unescapeHTML(mobj.group(1)), url) if mobj else None + @classmethod + def _extract_embed_urls(cls, url, webpage): + for embed_url in super()._extract_embed_urls(url, webpage): + yield cls._smuggle_referrer(embed_url, url) def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/vine.py b/yt_dlp/extractor/vine.py index 947f5cdb6..8e57201f6 100644 --- a/yt_dlp/extractor/vine.py +++ b/yt_dlp/extractor/vine.py @@ -10,6 +10,7 @@ from ..utils import ( class VineIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vine\.co/(?:v|oembed)/(?P<id>\w+)' + _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))'] _TESTS = [{ 'url': 'https://vine.co/v/b9KOOWX7HUx', 'md5': '2f36fed6235b16da96ce9b4dc890940d', diff --git a/yt_dlp/extractor/viqeo.py b/yt_dlp/extractor/viqeo.py index d214223e9..574622fa9 100644 --- a/yt_dlp/extractor/viqeo.py +++ b/yt_dlp/extractor/viqeo.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -17,6 +15,7 @@ class ViqeoIE(InfoExtractor): ) (?P<id>[\da-f]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1'] _TESTS = [{ 'url': 'https://cdn.viqeo.tv/embed/?vid=cde96f09d25f39bee837', 'md5': 'a169dd1a6426b350dca4296226f21e76', @@ -35,14 +34,6 @@ class ViqeoIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index bad0b4ff4..95ea63ffa 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -85,6 +85,7 @@ class VKBaseIE(InfoExtractor): class VKIE(VKBaseIE): IE_NAME = 'vk' IE_DESC = 'VK' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1'] _VALID_URL = r'''(?x) https?:// (?: @@ -100,6 +101,8 @@ class VKIE(VKBaseIE): (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>([\da-f]+)|(ln-[\da-zA-Z]+)))? ) ''' + # https://help.sibnet.ru/?sibnet_video_embed + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1'] _TESTS = [ { 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', @@ -344,13 +347,6 @@ class VKIE(VKBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_sibnet_urls(webpage): - # https://help.sibnet.ru/?sibnet_video_embed - return [unescapeHTML(mobj.group('url')) for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1', - webpage)] - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('videoid') @@ -451,7 +447,7 @@ class VKIE(VKBaseIE): m_rutube.group(1).replace('\\', '')) return self.url_result(rutube_url) - dailymotion_urls = DailymotionIE._extract_urls(info_page) + dailymotion_urls = DailymotionIE._extract_embed_urls(url, info_page) if dailymotion_urls: return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) @@ -459,7 +455,7 @@ class VKIE(VKBaseIE): if odnoklassniki_url: return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) - sibnet_urls = self._extract_sibnet_urls(info_page) + sibnet_urls = self._extract_embed_urls(url, info_page) if sibnet_urls: return self.url_result(sibnet_urls[0]) diff --git a/yt_dlp/extractor/vodplatform.py b/yt_dlp/extractor/vodplatform.py index 2b45dcd86..0d3e7eec2 100644 --- a/yt_dlp/extractor/vodplatform.py +++ b/yt_dlp/extractor/vodplatform.py @@ -4,6 +4,7 @@ from ..utils import unescapeHTML class VODPlatformIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/(?P<id>[^/?#]+)' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1'] _TESTS = [{ # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar 'url': 'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw', diff --git a/yt_dlp/extractor/voxmedia.py b/yt_dlp/extractor/voxmedia.py index a7bf298aa..96c782d8b 100644 --- a/yt_dlp/extractor/voxmedia.py +++ b/yt_dlp/extractor/voxmedia.py @@ -71,6 +71,7 @@ class VoxMediaVolumeIE(OnceIE): class VoxMediaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:(?:theverge|vox|sbnation|eater|polygon|curbed|racked|funnyordie)\.com|recode\.net)/(?:[^/]+/)*(?P<id>[^/?]+)' + _EMBED_REGEX = [r'<iframe[^>]+?src="(?P<url>https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"'] _TESTS = [{ # Volume embed, Youtube 'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of', diff --git a/yt_dlp/extractor/vshare.py b/yt_dlp/extractor/vshare.py index fd5226bbc..93842db79 100644 --- a/yt_dlp/extractor/vshare.py +++ b/yt_dlp/extractor/vshare.py @@ -1,11 +1,10 @@ -import re - from .common import InfoExtractor from ..utils import ExtractorError, decode_packed_codes class VShareIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)'] _TESTS = [{ 'url': 'https://vshare.io/d/0f64ce6', 'md5': '17b39f55b5497ae8b59f5fbce8e35886', @@ -19,12 +18,6 @@ class VShareIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)', - webpage) - def _extract_packed(self, webpage): packed = self._search_regex( r'(eval\(function.+)', webpage, 'packed code') diff --git a/yt_dlp/extractor/vzaar.py b/yt_dlp/extractor/vzaar.py index 7ce0ba9f5..df43caf38 100644 --- a/yt_dlp/extractor/vzaar.py +++ b/yt_dlp/extractor/vzaar.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -12,6 +10,7 @@ from ..utils import ( class VzaarIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)'] _TESTS = [{ # HTTP and HLS 'url': 'https://vzaar.com/videos/1152805', @@ -47,12 +46,6 @@ class VzaarIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( diff --git a/yt_dlp/extractor/washingtonpost.py b/yt_dlp/extractor/washingtonpost.py index 7274eaa39..74501b1d2 100644 --- a/yt_dlp/extractor/washingtonpost.py +++ b/yt_dlp/extractor/washingtonpost.py @@ -8,7 +8,7 @@ from ..utils import traverse_obj class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'] _TESTS = [{ 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', @@ -28,11 +28,6 @@ class WashingtonPostIE(InfoExtractor): 'only_matching': True, }] - @classmethod - def _extract_urls(cls, webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage) - def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( diff --git a/yt_dlp/extractor/webcaster.py b/yt_dlp/extractor/webcaster.py index 374fe35cd..a66a5f8c5 100644 --- a/yt_dlp/extractor/webcaster.py +++ b/yt_dlp/extractor/webcaster.py @@ -64,27 +64,23 @@ class WebcasterIE(InfoExtractor): class WebcasterFeedIE(InfoExtractor): _VALID_URL = r'https?://bl\.webcaster\.pro/feed/start/free_(?P<id>[^/]+)' + _EMBED_REGEX = [r'<(?:object|a[^>]+class=["\']webcaster-player["\'])[^>]+data(?:-config)?=(["\']).*?config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_.*?)(?:[?&]|\1)'] _TEST = { 'url': 'http://bl.webcaster.pro/feed/start/free_c8cefd240aa593681c8d068cff59f407_hd/q393859/eb173f99dd5f558674dae55f4ba6806d/1480289104', 'only_matching': True, } - @staticmethod - def _extract_url(ie, webpage): - mobj = re.search( - r'<(?:object|a[^>]+class=["\']webcaster-player["\'])[^>]+data(?:-config)?=(["\']).*?config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_.*?)(?:[?&]|\1)', - webpage) - if mobj: - return mobj.group('url') + def _extract_from_webpage(self, url, webpage): + yield from super()._extract_from_webpage(url, webpage) + for secure in (True, False): - video_url = ie._og_search_video_url( - webpage, secure=secure, default=None) + video_url = self._og_search_video_url(webpage, secure=secure, default=None) if video_url: mobj = re.search( r'config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_[^?&=]+)', video_url) if mobj: - return mobj.group('url') + yield self.url_result(mobj.group('url'), self) def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/wimtv.py b/yt_dlp/extractor/wimtv.py index 263844d72..d27a348d9 100644 --- a/yt_dlp/extractor/wimtv.py +++ b/yt_dlp/extractor/wimtv.py @@ -1,5 +1,3 @@ -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -20,6 +18,7 @@ class WimTVIE(InfoExtractor): ) (?P<type>vod|live|cast)[=/] (?P<id>%s).*?)''' % _UUID_RE + _EMBED_REGEX = [rf'<iframe[^>]+src=["\'](?P<url>{_VALID_URL})'] _TESTS = [{ # vod stream 'url': 'https://platform.wim.tv/embed/?vod=db29fb32-bade-47b6-a3a6-cb69fe80267a', @@ -54,14 +53,6 @@ class WimTVIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe[^>]+src=["\'](?P<url>%s)' % WimTVIE._VALID_URL, - webpage)] - def _real_initialize(self): if not self._player: self._get_player_data() diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index 3cbcb4aa0..438828624 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -5,8 +5,8 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + try_call, try_get, - unescapeHTML, ) @@ -117,7 +117,7 @@ class WistiaBaseIE(InfoExtractor): class WistiaIE(WistiaBaseIE): _VALID_URL = r'(?:wistia:|%s(?:iframe|medias)/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX) - + _EMBED_REGEX = [r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})'] _TESTS = [{ # with hls video 'url': 'wistia:807fafadvk', @@ -146,17 +146,10 @@ class WistiaIE(WistiaBaseIE): }] # https://wistia.com/support/embed-and-share/video-on-your-website - @staticmethod - def _extract_url(webpage): - urls = WistiaIE._extract_urls(webpage) - return urls[0] if urls else None - - @staticmethod - def _extract_urls(webpage): - urls = [] - for match in re.finditer( - r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage): - urls.append(unescapeHTML(match.group('url'))) + @classmethod + def _extract_embed_urls(cls, url, webpage): + urls = list(super()._extract_embed_urls(url, webpage)) + for match in re.finditer( r'''(?sx) <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1 @@ -166,6 +159,20 @@ class WistiaIE(WistiaBaseIE): urls.append('wistia:%s' % match.group('id')) return urls + @classmethod + def _extract_from_webpage(cls, url, webpage): + from .teachable import TeachableIE + + if list(TeachableIE._extract_embed_urls(url, webpage)): + return + + for entry in super()._extract_from_webpage(url, webpage): + yield { + **entry, + '_type': 'url_transparent', + 'uploader': try_call(lambda: re.match(r'(?:https?://)?([^/]+)/', url).group(1)), + } + def _real_extract(self, url): video_id = self._match_id(url) embed_config = self._download_embed_config('media', video_id, url) diff --git a/yt_dlp/extractor/xfileshare.py b/yt_dlp/extractor/xfileshare.py index 63abe4a1f..5ecd7f00f 100644 --- a/yt_dlp/extractor/xfileshare.py +++ b/yt_dlp/extractor/xfileshare.py @@ -61,6 +61,7 @@ class XFileShareIE(InfoExtractor): IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' % '|'.join(site for site in list(zip(*_SITES))[0])) + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(site for site in list(zip(*_SITES))[0])] _FILE_NOT_FOUND_REGEXES = ( r'>(?:404 - )?File Not Found<', @@ -84,15 +85,6 @@ class XFileShareIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' - % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]), - webpage)] - def _real_extract(self, url): host, video_id = self._match_valid_url(url).groups() diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py index e42eed7d8..688c6b952 100644 --- a/yt_dlp/extractor/xhamster.py +++ b/yt_dlp/extractor/xhamster.py @@ -373,6 +373,7 @@ class XHamsterIE(InfoExtractor): class XHamsterEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1'] _TEST = { 'url': 'http://xhamster.com/xembed.php?video=3328539', 'info_dict': { @@ -387,12 +388,6 @@ class XHamsterEmbedIE(InfoExtractor): } } - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py index f85990e0a..01a859556 100644 --- a/yt_dlp/extractor/yahoo.py +++ b/yt_dlp/extractor/yahoo.py @@ -21,6 +21,8 @@ from ..utils import ( class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' _VALID_URL = r'(?P<url>https?://(?:(?P<country>[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P<id>[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1'] + _TESTS = [{ 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', 'info_dict': { @@ -310,7 +312,7 @@ class YahooIE(InfoExtractor): if items.get('markup'): entries.extend( - self.url_result(yt_url) for yt_url in YoutubeIE._extract_urls(items['markup'])) + self.url_result(yt_url) for yt_url in YoutubeIE._extract_embed_urls(url, items['markup'])) return self.playlist_result( entries, item.get('uuid'), diff --git a/yt_dlp/extractor/yapfiles.py b/yt_dlp/extractor/yapfiles.py index 8fabdf81c..221df842c 100644 --- a/yt_dlp/extractor/yapfiles.py +++ b/yt_dlp/extractor/yapfiles.py @@ -1,11 +1,8 @@ -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, qualities, - unescapeHTML, url_or_none, ) @@ -13,6 +10,7 @@ from ..utils import ( class YapFilesIE(InfoExtractor): _YAPFILES_URL = r'//(?:(?:www|api)\.)?yapfiles\.ru/get_player/*\?.*?\bv=(?P<id>\w+)' _VALID_URL = r'https?:%s' % _YAPFILES_URL + _EMBED_REGEX = [rf'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?{_YAPFILES_URL}.*?)\1'] _TESTS = [{ # with hd 'url': 'http://www.yapfiles.ru/get_player/?v=vMDE1NjcyNDUt0413', @@ -30,12 +28,6 @@ class YapFilesIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [unescapeHTML(mobj.group('url')) for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.*?)\1' - % YapFilesIE._YAPFILES_URL, webpage)] - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/yt_dlp/extractor/youporn.py b/yt_dlp/extractor/youporn.py index b484e08ec..7fdb865f7 100644 --- a/yt_dlp/extractor/youporn.py +++ b/yt_dlp/extractor/youporn.py @@ -12,6 +12,7 @@ from ..utils import ( class YouPornIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)'] _TESTS = [{ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'md5': '3744d24c50438cf5b6f6d59feb5055c2', @@ -65,12 +66,6 @@ class YouPornIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)', - webpage) - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4dc8e79ac..f20b7321a 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -929,6 +929,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:\#|$)""" % { 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), } + _EMBED_REGEX = [r'''(?x) + (?: + <iframe[^>]+?src=| + data-video-url=| + <embed[^>]+?src=| + embedSWF\(?:\s*| + <object[^>]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) + \1'''] _PLAYER_INFO_RE = ( r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', @@ -2721,42 +2734,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): url, video_id, f'Marking {label}watched', 'Unable to mark watched', fatal=False) - @staticmethod - def _extract_urls(webpage): - # Embedded YouTube player - entries = [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) - \1''', webpage)] + @classmethod + def _extract_from_webpage(cls, url, webpage): + # Invidious Instances + # https://github.com/yt-dlp/yt-dlp/issues/195 + # https://github.com/iv-org/invidious/pull/1730 + mobj = re.search( + r'<link rel="alternate" href="(?P<url>https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"', + webpage) + if mobj: + yield cls.url_result(mobj.group('url'), cls) + raise cls.StopExtraction() + + yield from super()._extract_from_webpage(url, webpage) # lazyYT YouTube embed - entries.extend(list(map( - unescapeHTML, - re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + for id_ in re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage): + yield cls.url_result(unescapeHTML(id_), cls, id_) # Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - entries.extend(m[-1] for m in matches) - - return entries - - @staticmethod - def _extract_url(webpage): - urls = YoutubeIE._extract_urls(webpage) - return urls[0] if urls else None + for m in re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage): + yield cls.url_result(m[-1], cls, m[-1]) @classmethod def extract_id(cls, url): diff --git a/yt_dlp/extractor/zapiks.py b/yt_dlp/extractor/zapiks.py index a1546fd88..4b18cb86c 100644 --- a/yt_dlp/extractor/zapiks.py +++ b/yt_dlp/extractor/zapiks.py @@ -12,6 +12,7 @@ from ..utils import ( class ZapiksIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html|index\.php\?.*\bmedia_id=(?P<id>\d+))' + _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"'] _TESTS = [ { 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html', diff --git a/yt_dlp/extractor/zype.py b/yt_dlp/extractor/zype.py index 6f2fbb9e9..a705149e6 100644 --- a/yt_dlp/extractor/zype.py +++ b/yt_dlp/extractor/zype.py @@ -15,6 +15,7 @@ class ZypeIE(InfoExtractor): _ID_RE = r'[\da-fA-F]+' _COMMON_RE = r'//player\.zype\.com/embed/%s\.(?:js|json|html)\?.*?(?:access_token|(?:ap[ip]|player)_key)=' _VALID_URL = r'https?:%s[^&]+' % (_COMMON_RE % ('(?P<id>%s)' % _ID_RE)) + _EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?{_COMMON_RE % _ID_RE}.+?)\1'] _TEST = { 'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false', 'md5': 'eaee31d474c76a955bdaba02a505c595', @@ -29,14 +30,6 @@ class ZypeIE(InfoExtractor): }, } - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.+?)\1' % (ZypeIE._COMMON_RE % ZypeIE._ID_RE), - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) |