diff options
author | Jesús <heckyel@hyperbola.info> | 2022-06-27 01:25:17 +0800 |
---|---|---|
committer | Jesús <heckyel@hyperbola.info> | 2022-06-27 01:25:17 +0800 |
commit | 16e8548f6a720a78679e417a20a300db2036bf6c (patch) | |
tree | b1247bca3417ce882e4a4d80213f41c20113c1a4 /yt_dlp/extractor/generic.py | |
parent | 4bbf329feb5a820ac21269fa426c95ca14d7af25 (diff) | |
parent | e08f72e6759fb6b1102521f0bdb9457038ef7c06 (diff) | |
download | hypervideo-pre-16e8548f6a720a78679e417a20a300db2036bf6c.tar.lz hypervideo-pre-16e8548f6a720a78679e417a20a300db2036bf6c.tar.xz hypervideo-pre-16e8548f6a720a78679e417a20a300db2036bf6c.zip |
updated from upstream | 27/06/2022 at 01:25
Diffstat (limited to 'yt_dlp/extractor/generic.py')
-rw-r--r-- | yt_dlp/extractor/generic.py | 229 |
1 files changed, 162 insertions, 67 deletions
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index f594d02c2..c2f754453 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -1,5 +1,6 @@ import os import re +import urllib.parse import xml.etree.ElementTree from .ant1newsgr import Ant1NewsGrEmbedIE @@ -69,11 +70,13 @@ from .spankwire import SpankwireIE from .sportbox import SportBoxIE from .spotify import SpotifyBaseIE from .springboardplatform import SpringboardPlatformIE +from .substack import SubstackIE from .svt import SVTIE from .teachable import TeachableIE from .ted import TedEmbedIE from .theplatform import ThePlatformIE from .threeqsdn import ThreeQSDNIE +from .tiktok import TikTokIE from .tnaflix import TNAFlixNetworkEmbedIE from .tube8 import Tube8IE from .tunein import TuneInBaseIE @@ -104,12 +107,7 @@ from .yapfiles import YapFilesIE from .youporn import YouPornIE from .youtube import YoutubeIE from .zype import ZypeIE -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, -) +from ..compat import compat_etree_fromstring from ..utils import ( KNOWN_EXTENSIONS, ExtractorError, @@ -129,6 +127,7 @@ from ..utils import ( sanitized_Request, smuggle_url, str_or_none, + try_call, unescapeHTML, unified_timestamp, unsmuggle_url, @@ -2526,6 +2525,118 @@ class GenericIE(InfoExtractor): 'upload_date': '20220504', }, }, + { + # Webpage contains double BOM + 'url': 'https://www.filmarkivet.se/movies/paris-d-moll/', + 'md5': 'df02cadc719dcc63d43288366f037754', + 'info_dict': { + 'id': 'paris-d-moll', + 'ext': 'mp4', + 'upload_date': '20220518', + 'title': 'Paris d-moll', + 'description': 'md5:319e37ea5542293db37e1e13072fe330', + 'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg', + 'timestamp': 1652833414, + 'age_limit': 0, + } + }, + { + 'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details', + 'md5': '198bde8bed23d0b23c70725c83c9b6d9', + 'info_dict': { + 'id': '53602801', + 'ext': 'mpga', + 'title': 'Interstellar', + 'description': 'Listen now | Episode One', + 'thumbnail': 'md5:c30d9c83f738e16d8551d7219d321538', + 'uploader': 'Molly Movie Club', + 'uploader_id': '839621', + }, + }, + { + 'url': 'https://www.blockedandreported.org/p/episode-117-lets-talk-about-depp?s=r', + 'md5': 'c0cc44ee7415daeed13c26e5b56d6aa0', + 'info_dict': { + 'id': '57962052', + 'ext': 'mpga', + 'title': 'md5:855b2756f0ee10f6723fa00b16266f8d', + 'description': 'md5:fe512a5e94136ad260c80bde00ea4eef', + 'thumbnail': 'md5:2218f27dfe517bb5ac16c47d0aebac59', + 'uploader': 'Blocked and Reported', + 'uploader_id': '500230', + }, + }, + { + 'url': 'https://www.skimag.com/video/ski-people-1980/', + 'info_dict': { + 'id': 'ski-people-1980', + 'title': 'Ski People (1980)', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': '022a7e31c70620ebec18deeab376ee03', + 'info_dict': { + 'id': 'YTmgRiNU', + 'ext': 'mp4', + 'title': '1980 Ski People', + 'timestamp': 1610407738, + 'description': 'md5:cf9c3d101452c91e141f292b19fe4843', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720', + 'duration': 5688.0, + 'upload_date': '20210111', + } + }] + }, + { + 'note': 'Rumble embed', + 'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html', + 'md5': '53af34098a7f92c4e51cf0bd1c33f009', + 'info_dict': { + 'id': 'vb0ofn', + 'ext': 'mp4', + 'timestamp': 1612662578, + 'uploader': 'LovingMontana', + 'channel': 'LovingMontana', + 'upload_date': '20210207', + 'title': 'Winter-loving dog helps girls dig a snow fort ', + 'channel_url': 'https://rumble.com/c/c-546523', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg', + 'duration': 103, + } + }, + { + 'note': 'Rumble JS embed', + 'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it', + 'md5': '4701209ac99095592e73dbba21889690', + 'info_dict': { + 'id': 'v15eqxl', + 'ext': 'mp4', + 'channel': 'Mr Producer Media', + 'duration': 92, + 'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh', + 'channel_url': 'https://rumble.com/c/RichSementa', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg', + 'timestamp': 1654892716, + 'uploader': 'Mr Producer Media', + 'upload_date': '20220610', + } + }, + { + 'note': 'JSON LD with multiple @type', + 'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html', + 'md5': 'c7949f34f57273013fb7ccb1156393db', + 'info_dict': { + 'id': 'ipy2AcGL', + 'ext': 'mp4', + 'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d', + 'thumbnail': r're:https://media\.nu\.nl/m/.+\.jpg', + 'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen', + 'timestamp': 1586577474, + 'upload_date': '20200411', + 'age_limit': 0, + 'duration': 111.0, + } + }, ] def report_following_redirect(self, new_url): @@ -2536,66 +2647,44 @@ class GenericIE(InfoExtractor): self._downloader.write_debug(f'Identified a {name}') def _extract_rss(self, url, video_id, doc): - playlist_title = doc.find('./channel/title').text - playlist_desc_el = doc.find('./channel/description') - playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text - NS_MAP = { 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', } entries = [] for it in doc.findall('./channel/item'): - next_url = None - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break - - if not next_url: - next_url = xpath_text(it, 'link', fatal=False) - + next_url = next( + (e.attrib.get('url') for e in it.findall('./enclosure')), + xpath_text(it, 'link', fatal=False)) if not next_url: continue - if it.find('guid').text is not None: - next_url = smuggle_url(next_url, {'force_videoid': it.find('guid').text}) + guid = try_call(lambda: it.find('guid').text) + if guid: + next_url = smuggle_url(next_url, {'force_videoid': guid}) def itunes(key): - return xpath_text( - it, xpath_with_ns('./itunes:%s' % key, NS_MAP), - default=None) - - duration = itunes('duration') - explicit = (itunes('explicit') or '').lower() - if explicit in ('true', 'yes'): - age_limit = 18 - elif explicit in ('false', 'no'): - age_limit = 0 - else: - age_limit = None + return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None) entries.append({ '_type': 'url_transparent', 'url': next_url, - 'title': it.find('title').text, + 'title': try_call(lambda: it.find('title').text), 'description': xpath_text(it, 'description', default=None), - 'timestamp': unified_timestamp( - xpath_text(it, 'pubDate', default=None)), - 'duration': int_or_none(duration) or parse_duration(duration), + 'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)), + 'duration': parse_duration(itunes('duration')), 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), 'episode': itunes('title'), 'episode_number': int_or_none(itunes('episode')), 'season_number': int_or_none(itunes('season')), - 'age_limit': age_limit, + 'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()), }) return { '_type': 'playlist', 'id': url, - 'title': playlist_title, - 'description': playlist_desc, + 'title': try_call(lambda: doc.find('./channel/title').text), + 'description': try_call(lambda: doc.find('./channel/description').text), 'entries': entries, } @@ -2610,7 +2699,7 @@ class GenericIE(InfoExtractor): title = self._html_search_meta('DC.title', webpage, fatal=True) - camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) + camtasia_url = urllib.parse.urljoin(url, camtasia_cfg) camtasia_cfg = self._download_xml( camtasia_url, video_id, note='Downloading camtasia configuration', @@ -2626,7 +2715,7 @@ class GenericIE(InfoExtractor): entries.append({ 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], 'title': f'{title} - {n.tag}', - 'url': compat_urlparse.urljoin(url, url_n.text), + 'url': urllib.parse.urljoin(url, url_n.text), 'duration': float_or_none(n.find('./duration').text), }) @@ -2678,7 +2767,7 @@ class GenericIE(InfoExtractor): if url.startswith('//'): return self.url_result(self.http_scheme() + url) - parsed_url = compat_urlparse.urlparse(url) + parsed_url = urllib.parse.urlparse(url) if not parsed_url.scheme: default_search = self.get_param('default_search') if default_search is None: @@ -2754,7 +2843,7 @@ class GenericIE(InfoExtractor): m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: self.report_detected('direct video link') - format_id = compat_str(m.group('format_id')) + format_id = str(m.group('format_id')) subtitles = {} if format_id.endswith('mpegurl'): formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') @@ -2873,7 +2962,7 @@ class GenericIE(InfoExtractor): # Unescaping the whole page allows to handle those cases in a generic way # FIXME: unescaping the whole page may break URLs, commenting out for now. # There probably should be a second run of generic extractor on unescaped webpage. - # webpage = compat_urllib_parse_unquote(webpage) + # webpage = urllib.parse.unquote(webpage) # Unescape squarespace embeds to be detected by generic extractor, # see https://github.com/ytdl-org/youtube-dl/issues/21294 @@ -2975,7 +3064,7 @@ class GenericIE(InfoExtractor): if vimeo_urls: return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - vhx_url = VHXEmbedIE._extract_url(webpage) + vhx_url = VHXEmbedIE._extract_url(url, webpage) if vhx_url: return self.url_result(vhx_url, VHXEmbedIE.ie_key()) @@ -3023,6 +3112,7 @@ class GenericIE(InfoExtractor): wistia_urls = WistiaIE._extract_urls(webpage) if wistia_urls: playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) + playlist['entries'] = list(playlist['entries']) for entry in playlist['entries']: entry.update({ '_type': 'url_transparent', @@ -3042,6 +3132,11 @@ class GenericIE(InfoExtractor): # Don't set the extractor because it can be a track url or an album return self.url_result(burl) + # Check for Substack custom domains + substack_url = SubstackIE._extract_url(webpage, url) + if substack_url: + return self.url_result(substack_url, SubstackIE) + # Look for embedded Vevo player mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) @@ -3140,7 +3235,7 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage) if mobj is not None: - return self.url_result(compat_urllib_parse_unquote(mobj.group('url'))) + return self.url_result(urllib.parse.unquote(mobj.group('url'))) # Look for funnyordie embed matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) @@ -3393,7 +3488,7 @@ class GenericIE(InfoExtractor): r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) if mobj is not None: return self.url_result( - compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') + urllib.parse.urljoin(url, mobj.group('url')), 'UDNEmbed') # Look for Senate ISVP iframe senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) @@ -3626,7 +3721,7 @@ class GenericIE(InfoExtractor): if mediasite_urls: entries = [ self.url_result(smuggle_url( - compat_urlparse.urljoin(url, mediasite_url), + urllib.parse.urljoin(url, mediasite_url), {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) for mediasite_url in mediasite_urls] return self.playlist_result(entries, video_id, video_title) @@ -3762,6 +3857,11 @@ class GenericIE(InfoExtractor): if ruutu_urls: return self.playlist_from_matches(ruutu_urls, video_id, video_title) + # Look for Tiktok embeds + tiktok_urls = TikTokIE._extract_urls(webpage) + if tiktok_urls: + return self.playlist_from_matches(tiktok_urls, video_id, video_title) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: @@ -3816,11 +3916,11 @@ class GenericIE(InfoExtractor): subtitles = {} for source in sources: src = source.get('src') - if not src or not isinstance(src, compat_str): + if not src or not isinstance(src, str): continue - src = compat_urlparse.urljoin(url, src) + src = urllib.parse.urljoin(url, src) src_type = source.get('type') - if isinstance(src_type, compat_str): + if isinstance(src_type, str): src_type = src_type.lower() ext = determine_ext(src).lower() if src_type == 'video/youtube': @@ -3854,7 +3954,7 @@ class GenericIE(InfoExtractor): if not src: continue subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({ - 'url': compat_urlparse.urljoin(url, src), + 'url': urllib.parse.urljoin(url, src), 'name': sub.get('label'), 'http_headers': { 'Referer': full_response.geturl(), @@ -3871,22 +3971,17 @@ class GenericIE(InfoExtractor): json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url') not in (url, None): self.report_detected('JSON LD') - if determine_ext(json_ld['url']) == 'm3u8': - json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles( - json_ld['url'], video_id, 'mp4') - json_ld.pop('url') - self._sort_formats(json_ld['formats']) - else: - json_ld['_type'] = 'url_transparent' - json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}) - return merge_dicts(json_ld, info_dict) + return merge_dicts({ + '_type': 'url_transparent', + 'url': smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}), + }, json_ld, info_dict) def check_video(vurl): if YoutubeIE.suitable(vurl): return True if RtmpIE.suitable(vurl): return True - vpath = compat_urlparse.urlparse(vurl).path + vpath = urllib.parse.urlparse(vurl).path vext = determine_ext(vpath, None) return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') @@ -4014,7 +4109,7 @@ class GenericIE(InfoExtractor): if refresh_header: found = re.search(REDIRECT_REGEX, refresh_header) if found: - new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) + new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1))) if new_url != url: self.report_following_redirect(new_url) return { @@ -4040,8 +4135,8 @@ class GenericIE(InfoExtractor): for video_url in orderedSet(found): video_url = unescapeHTML(video_url) video_url = video_url.replace('\\/', '/') - video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) + video_url = urllib.parse.urljoin(url, video_url) + video_id = urllib.parse.unquote(os.path.basename(video_url)) # Sometimes, jwplayer extraction will result in a YouTube URL if YoutubeIE.suitable(video_url): |