diff options
Diffstat (limited to 'youtube_dlc/extractor/generic.py')
-rw-r--r-- | youtube_dlc/extractor/generic.py | 179 |
1 files changed, 130 insertions, 49 deletions
diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index e5d29f316..6246b8a83 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -29,9 +29,11 @@ from ..utils import ( sanitized_Request, smuggle_url, unescapeHTML, - unified_strdate, + unified_timestamp, unsmuggle_url, UnsupportedError, + url_or_none, + xpath_attr, xpath_text, ) from .commonprotocols import RtmpIE @@ -48,7 +50,6 @@ from .ooyala import OoyalaIE from .rutv import RUTVIE from .tvc import TVCIE from .sportbox import SportBoxIE -from .smotri import SmotriIE from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE @@ -63,7 +64,10 @@ from .tube8 import Tube8IE from .mofosex import MofosexEmbedIE from .spankwire import SpankwireIE from .youporn import YouPornIE -from .vimeo import VimeoIE +from .vimeo import ( + VimeoIE, + VHXEmbedIE, +) from .dailymotion import DailymotionIE from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE @@ -123,6 +127,7 @@ from .kinja import KinjaEmbedIE from .gedi import GediEmbedsIE from .rcs import RCSEmbedsIE from .bitchute import BitChuteIE +from .arcpublishing import ArcPublishingIE class GenericIE(InfoExtractor): @@ -201,11 +206,46 @@ class GenericIE(InfoExtractor): { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } + 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'title': 'MSNBC Rachel Maddow (video)', + 'description': 're:.*her unique approach to storytelling.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'mov', + 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', + 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', + 'description': 're:.*her unique approach to storytelling.*', + 'upload_date': '20201204', + }, + }], + }, + # RSS feed with item with description and thumbnails + { + 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'info_dict': { + 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'title': 're:.*100% Hydrogen.*', + 'description': 're:.*In this episode.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'm4a', + 'id': 'c1c879525ce2cb640b344507e682c36d', + 'title': 're:Hydrogen!', + 'description': 're:.*In this episode we are going.*', + 'timestamp': 1567977776, + 'upload_date': '20190908', + 'duration': 459, + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 1, + 'season_number': 1, + 'age_limit': 0, + }, + }], + 'params': { + 'skip_download': True, + }, }, # RSS feed with enclosures and unsupported link URLs { @@ -1987,22 +2027,6 @@ class GenericIE(InfoExtractor): 'add_ie': [SpringboardPlatformIE.ie_key()], }, { - 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', - 'info_dict': { - 'id': 'uPDB5I9wfp8', - 'ext': 'webm', - 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', - 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', - 'upload_date': '20160219', - 'uploader': 'Pocoyo - Português (BR)', - 'uploader_id': 'PocoyoBrazil', - }, - 'add_ie': [YoutubeIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', 'info_dict': { 'id': 'vMDE4NzI1Mjgt690b', @@ -2106,23 +2130,23 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - { - # Zype embed - 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', - 'info_dict': { - 'id': '5b400b834b32992a310622b9', - 'ext': 'mp4', - 'title': 'Smoky Barbecue Favorites', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', - 'upload_date': '20170909', - 'timestamp': 1504915200, - }, - 'add_ie': [ZypeIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, + # { + # # Zype embed + # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', + # 'info_dict': { + # 'id': '5b400b834b32992a310622b9', + # 'ext': 'mp4', + # 'title': 'Smoky Barbecue Favorites', + # 'thumbnail': r're:^https?://.*\.jpe?g', + # 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', + # 'upload_date': '20170909', + # 'timestamp': 1504915200, + # }, + # 'add_ie': [ZypeIE.ie_key()], + # 'params': { + # 'skip_download': True, + # }, + # }, { # videojs embed 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', @@ -2171,7 +2195,32 @@ class GenericIE(InfoExtractor): # 'params': { # 'force_generic_extractor': True, # }, - # } + # }, + { + # VHX Embed + 'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy', + 'info_dict': { + 'id': '858208', + 'ext': 'mp4', + 'title': 'Untitled', + 'uploader_id': 'user80538407', + 'uploader': 'OTT Videos', + }, + }, + { + # ArcPublishing PoWa video player + 'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/', + 'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3', + 'info_dict': { + 'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'ext': 'mp4', + 'title': 'Senate candidates wave to voters on Anchorage streets', + 'description': 'md5:91f51a6511f090617353dc720318b20e', + 'timestamp': 1604378735, + 'upload_date': '20201103', + 'duration': 1581, + }, + }, ] def report_following_redirect(self, new_url): @@ -2183,6 +2232,10 @@ class GenericIE(InfoExtractor): playlist_desc_el = doc.find('./channel/description') playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text + NS_MAP = { + 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', + } + entries = [] for it in doc.findall('./channel/item'): next_url = None @@ -2198,10 +2251,33 @@ class GenericIE(InfoExtractor): if not next_url: continue + def itunes(key): + return xpath_text( + it, xpath_with_ns('./itunes:%s' % key, NS_MAP), + default=None) + + duration = itunes('duration') + explicit = (itunes('explicit') or '').lower() + if explicit in ('true', 'yes'): + age_limit = 18 + elif explicit in ('false', 'no'): + age_limit = 0 + else: + age_limit = None + entries.append({ '_type': 'url_transparent', 'url': next_url, 'title': it.find('title').text, + 'description': xpath_text(it, 'description', default=None), + 'timestamp': unified_timestamp( + xpath_text(it, 'pubDate', default=None)), + 'duration': int_or_none(duration) or parse_duration(duration), + 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), + 'episode': itunes('title'), + 'episode_number': int_or_none(itunes('episode')), + 'season_number': int_or_none(itunes('season')), + 'age_limit': age_limit, }) return { @@ -2321,7 +2397,7 @@ class GenericIE(InfoExtractor): info_dict = { 'id': video_id, 'title': self._generic_title(url), - 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) + 'timestamp': unified_timestamp(head_response.headers.get('Last-Modified')) } # Check for direct link to a video @@ -2427,7 +2503,9 @@ class GenericIE(InfoExtractor): # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way - webpage = compat_urllib_parse_unquote(webpage) + # FIXME: unescaping the whole page may break URLs, commenting out for now. + # There probably should be a second run of generic extractor on unescaped webpage. + # webpage = compat_urllib_parse_unquote(webpage) # Unescape squarespace embeds to be detected by generic extractor, # see https://github.com/ytdl-org/youtube-dl/issues/21294 @@ -2509,6 +2587,10 @@ class GenericIE(InfoExtractor): if tp_urls: return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') + arc_urls = ArcPublishingIE._extract_urls(webpage) + if arc_urls: + return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) + # Look for embedded rtl.nl player matches = re.findall( r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', @@ -2520,6 +2602,10 @@ class GenericIE(InfoExtractor): if vimeo_urls: return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) + vhx_url = VHXEmbedIE._extract_url(webpage) + if vhx_url: + return self.url_result(vhx_url, VHXEmbedIE.ie_key()) + vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', webpage, 'vid.me embed', default=None) @@ -2775,11 +2861,6 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) - # Look for embedded smotri.com player - smotri_url = SmotriIE._extract_url(webpage) - if smotri_url: - return self.url_result(smotri_url, 'Smotri') - # Look for embedded Myvi.ru player myvi_url = MyviIE._extract_url(webpage) if myvi_url: |