diff options
Diffstat (limited to 'hypervideo_dl/extractor/generic.py')
-rw-r--r-- | hypervideo_dl/extractor/generic.py | 320 |
1 files changed, 253 insertions, 67 deletions
diff --git a/hypervideo_dl/extractor/generic.py b/hypervideo_dl/extractor/generic.py index 7da038a..8387646 100644 --- a/hypervideo_dl/extractor/generic.py +++ b/hypervideo_dl/extractor/generic.py @@ -84,7 +84,6 @@ from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE from .arkena import ArkenaIE from .instagram import InstagramIE -from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE from .kaltura import KalturaIE @@ -128,9 +127,14 @@ from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE from .vk import VKIE from .kinja import KinjaEmbedIE +from .gedidigital import GediDigitalIE +from .rcs import RCSEmbedsIE +from .bitchute import BitChuteIE +from .rumble import RumbleEmbedIE from .arcpublishing import ArcPublishingIE from .medialaan import MedialaanIE from .simplecast import SimplecastIE +from .wimtv import WimTVIE class GenericIE(InfoExtractor): @@ -216,12 +220,10 @@ class GenericIE(InfoExtractor): 'playlist': [{ 'info_dict': { 'ext': 'mov', - 'id': 'pdv_maddow_netcast_mov-12-04-2020-224335', - 'title': 're:MSNBC Rachel Maddow', + 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', + 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', 'description': 're:.*her unique approach to storytelling.*', - 'timestamp': int, - 'upload_date': compat_str, - 'duration': float, + 'upload_date': '20201204', }, }], }, @@ -1213,14 +1215,13 @@ class GenericIE(InfoExtractor): }, { # JWPlatform iframe - 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/', - 'md5': 'ca00a040364b5b439230e7ebfd02c4e9', + 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', 'info_dict': { - 'id': 'O0c5JcKT', + 'id': 'AG26UQXM', 'ext': 'mp4', - 'upload_date': '20171122', - 'timestamp': 1511366290, - 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone', + 'upload_date': '20160719', + 'timestamp': 468923808, + 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', }, 'add_ie': [JWPlatformIE.ie_key()], }, @@ -1629,31 +1630,6 @@ class GenericIE(InfoExtractor): 'upload_date': '20160409', }, }, - # LiveLeak embed - { - 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': '7619da8c820e835bef21a1efa2a0fc71', - 'info_dict': { - 'id': '874_1459135191', - 'ext': 'mp4', - 'title': 'Man shows poor quality of new apartment building', - 'description': 'The wall is like a sand pile.', - 'uploader': 'Lake8737', - }, - 'add_ie': [LiveLeakIE.ie_key()], - }, - # Another LiveLeak embed pattern (#13336) - { - 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', - 'info_dict': { - 'id': '2eb_1496309988', - 'ext': 'mp4', - 'title': 'Thief robs place where everyone was armed', - 'description': 'md5:694d73ee79e535953cf2488562288eee', - 'uploader': 'brazilwtf', - }, - 'add_ie': [LiveLeakIE.ie_key()], - }, # Duplicated embedded video URLs { 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', @@ -2253,6 +2229,95 @@ class GenericIE(InfoExtractor): # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed) 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html', 'only_matching': True, + }, { + # WimTv embed player + 'url': 'http://www.msmotor.tv/wearefmi-pt-2-2021/', + 'info_dict': { + 'id': 'wearefmi-pt-2-2021', + 'title': '#WEAREFMI – PT.2 – 2021 – MsMotorTV', + }, + 'playlist_count': 1, + }, { + # KVS Player + 'url': 'https://www.kvs-demo.com/videos/105/kelis-4th-of-july/', + 'info_dict': { + 'id': '105', + 'display_id': 'kelis-4th-of-july', + 'ext': 'mp4', + 'title': 'Kelis - 4th Of July', + 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # KVS Player + 'url': 'https://www.kvs-demo.com/embed/105/', + 'info_dict': { + 'id': '105', + 'display_id': 'kelis-4th-of-july', + 'ext': 'mp4', + 'title': 'Kelis - 4th Of July / Embed Player', + 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg', + }, + 'params': { + 'skip_download': True, + }, + }, { + # KVS Player + 'url': 'https://thisvid.com/videos/french-boy-pantsed/', + 'md5': '3397979512c682f6b85b3b04989df224', + 'info_dict': { + 'id': '2400174', + 'display_id': 'french-boy-pantsed', + 'ext': 'mp4', + 'title': 'French Boy Pantsed - ThisVid.com', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', + } + }, { + # KVS Player + 'url': 'https://thisvid.com/embed/2400174/', + 'md5': '3397979512c682f6b85b3b04989df224', + 'info_dict': { + 'id': '2400174', + 'display_id': 'french-boy-pantsed', + 'ext': 'mp4', + 'title': 'French Boy Pantsed - ThisVid.com', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', + } + }, { + # KVS Player + 'url': 'https://youix.com/video/leningrad-zoj/', + 'md5': '94f96ba95706dc3880812b27b7d8a2b8', + 'info_dict': { + 'id': '18485', + 'display_id': 'leningrad-zoj', + 'ext': 'mp4', + 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com', + 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', + } + }, { + # KVS Player + 'url': 'https://youix.com/embed/18485', + 'md5': '94f96ba95706dc3880812b27b7d8a2b8', + 'info_dict': { + 'id': '18485', + 'display_id': 'leningrad-zoj', + 'ext': 'mp4', + 'title': 'Ленинград - ЗОЖ', + 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg', + } + }, { + # KVS Player + 'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/', + 'md5': '94166bdb26b4cb1fb9214319a629fc51', + 'info_dict': { + 'id': '21217', + 'display_id': '40-nochey-40-nights-2016', + 'ext': 'mp4', + 'title': '40 ночей (2016) - BogMedia.org', + 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', + } }, ] @@ -2358,19 +2423,57 @@ class GenericIE(InfoExtractor): 'title': title, } + def _kvs_getrealurl(self, video_url, license_code): + if not video_url.startswith('function/0/'): + return video_url # not obfuscated + + url_path, _, url_query = video_url.partition('?') + urlparts = url_path.split('/')[2:] + license = self._kvs_getlicensetoken(license_code) + newmagic = urlparts[5][:32] + + for o in range(len(newmagic) - 1, -1, -1): + new = '' + l = (o + sum([int(n) for n in license[o:]])) % 32 + + for i in range(0, len(newmagic)): + if i == o: + new += newmagic[l] + elif i == l: + new += newmagic[o] + else: + new += newmagic[i] + newmagic = new + + urlparts[5] = newmagic + urlparts[5][32:] + return '/'.join(urlparts) + '?' + url_query + + def _kvs_getlicensetoken(self, license): + modlicense = license.replace('$', '').replace('0', '1') + center = int(len(modlicense) / 2) + fronthalf = int(modlicense[:center + 1]) + backhalf = int(modlicense[center:]) + + modlicense = str(4 * abs(fronthalf - backhalf)) + retval = '' + for o in range(0, center + 1): + for i in range(1, 5): + retval += str((int(license[o + i]) + int(modlicense[o])) % 10) + return retval + def _real_extract(self, url): if url.startswith('//'): return self.url_result(self.http_scheme() + url) parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: - default_search = self._downloader.params.get('default_search') + default_search = self.get_param('default_search') if default_search is None: default_search = 'fixup_error' if default_search in ('auto', 'auto_warning', 'fixup_error'): if re.match(r'^[^\s/]+\.[^\s/]+/', url): - self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') + self.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) elif default_search != 'fixup_error': if default_search == 'auto_warning': @@ -2379,7 +2482,7 @@ class GenericIE(InfoExtractor): 'Invalid URL: %r . Call hypervideo like this: hypervideo -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url, expected=True) else: - self._downloader.report_warning( + self.report_warning( 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url) return self.url_result('ytsearch:' + url) @@ -2438,8 +2541,9 @@ class GenericIE(InfoExtractor): m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: format_id = compat_str(m.group('format_id')) + subtitles = {} if format_id.endswith('mpegurl'): - formats = self._extract_m3u8_formats(url, video_id, 'mp4') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') elif format_id == 'f4m': formats = self._extract_f4m_formats(url, video_id) else: @@ -2451,11 +2555,12 @@ class GenericIE(InfoExtractor): info_dict['direct'] = True self._sort_formats(formats) info_dict['formats'] = formats + info_dict['subtitles'] = subtitles return info_dict - if not self._downloader.params.get('test', False) and not is_intentional: - force = self._downloader.params.get('force_generic_extractor', False) - self._downloader.report_warning( + if not self.get_param('test', False) and not is_intentional: + force = self.get_param('force_generic_extractor', False) + self.report_warning( '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) if not full_response: @@ -2475,14 +2580,14 @@ class GenericIE(InfoExtractor): # Is it an M3U playlist? if first_bytes.startswith(b'#EXTM3U'): - info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') + info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') self._sort_formats(info_dict['formats']) return info_dict # Maybe it's a direct link to a video? # Be careful not to download the whole thing! if not is_html(first_bytes): - self._downloader.report_warning( + self.report_warning( 'URL could be a direct video link, returning it as such.') info_dict.update({ 'direct': True, @@ -2500,11 +2605,14 @@ class GenericIE(InfoExtractor): # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? try: - doc = compat_etree_fromstring(webpage.encode('utf-8')) + try: + doc = compat_etree_fromstring(webpage) + except compat_xml_parse_error: + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif doc.tag == 'SmoothStreamingMedia': - info_dict['formats'] = self._parse_ism_formats(doc, url) + info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): @@ -2518,7 +2626,7 @@ class GenericIE(InfoExtractor): xspf_base_url=full_response.geturl()), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): - info_dict['formats'] = self._parse_mpd_formats( + info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles( doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) @@ -2647,11 +2755,14 @@ class GenericIE(InfoExtractor): if vhx_url: return self.url_result(vhx_url, VHXEmbedIE.ie_key()) - vid_me_embed_url = self._search_regex( - r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', - webpage, 'vid.me embed', default=None) - if vid_me_embed_url is not None: - return self.url_result(vid_me_embed_url, 'Vidme') + # Invidious Instances + # https://github.com/hypervideo/hypervideo/issues/195 + # https://github.com/iv-org/invidious/pull/1730 + youtube_url = self._search_regex( + r'<link rel="alternate" href="(https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"', + webpage, 'youtube link', default=None) + if youtube_url: + return self.url_result(youtube_url, YoutubeIE.ie_key()) # Look for YouTube embeds youtube_urls = YoutubeIE._extract_urls(webpage) @@ -3179,11 +3290,6 @@ class GenericIE(InfoExtractor): return self.url_result( self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) - # Look for LiveLeak embeds - liveleak_urls = LiveLeakIE._extract_urls(webpage) - if liveleak_urls: - return self.playlist_from_matches(liveleak_urls, video_id, video_title) - # Look for 3Q SDN embeds threeqsdn_url = ThreeQSDNIE._extract_url(webpage) if threeqsdn_url: @@ -3348,6 +3454,34 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + gedi_urls = GediDigitalIE._extract_urls(webpage) + if gedi_urls: + return self.playlist_from_matches( + gedi_urls, video_id, video_title, ie=GediDigitalIE.ie_key()) + + # Look for RCS media group embeds + rcs_urls = RCSEmbedsIE._extract_urls(webpage) + if rcs_urls: + return self.playlist_from_matches( + rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key()) + + wimtv_urls = WimTVIE._extract_urls(webpage) + if wimtv_urls: + return self.playlist_from_matches( + wimtv_urls, video_id, video_title, ie=WimTVIE.ie_key()) + + bitchute_urls = BitChuteIE._extract_urls(webpage) + if bitchute_urls: + return self.playlist_from_matches( + bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key()) + + rumble_urls = RumbleEmbedIE._extract_urls(webpage) + if len(rumble_urls) == 1: + return self.url_result(rumble_urls[0], RumbleEmbedIE.ie_key()) + if rumble_urls: + return self.playlist_from_matches( + rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: @@ -3388,6 +3522,7 @@ class GenericIE(InfoExtractor): if not isinstance(sources, list): sources = [sources] formats = [] + subtitles = {} for source in sources: src = source.get('src') if not src or not isinstance(src, compat_str): @@ -3400,12 +3535,16 @@ class GenericIE(InfoExtractor): if src_type == 'video/youtube': return self.url_result(src, YoutubeIE.ie_key()) if src_type == 'application/dash+xml' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + src, video_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif src_type == 'application/x-mpegurl' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: formats.append({ 'url': src, @@ -3415,9 +3554,10 @@ class GenericIE(InfoExtractor): 'Referer': full_response.geturl(), }, }) - if formats: + if formats or subtitles: self._sort_formats(formats) info_dict['formats'] = formats + info_dict['subtitles'] = subtitles return info_dict # Looking for http://schema.org/VideoObject @@ -3451,6 +3591,52 @@ class GenericIE(InfoExtractor): .*? ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) if not found: + # Look for generic KVS player + found = re.search(r'<script [^>]*?src="https://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage) + if found: + if found.group('maj_ver') not in ['4', '5']: + self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver')) + flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage) + flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json) + + # extract the part after the last / as the display_id from the + # canonical URL. + display_id = self._search_regex( + r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>' + r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)', + webpage, 'display_id', fatal=False + ) + title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title') + + thumbnail = flashvars['preview_url'] + if thumbnail.startswith('//'): + protocol, _, _ = url.partition('/') + thumbnail = protocol + thumbnail + + formats = [] + for key in ('video_url', 'video_alt_url', 'video_alt_url2'): + if key in flashvars and '/get_file/' in flashvars[key]: + next_format = { + 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']), + 'format_id': flashvars.get(key + '_text', key), + 'ext': 'mp4', + } + height = re.search(r'%s_(\d+)p\.mp4(?:/[?].*)?$' % flashvars['video_id'], flashvars[key]) + if height: + next_format['height'] = int(height.group(1)) + else: + next_format['quality'] = 1 + formats.append(next_format) + self._sort_formats(formats) + + return { + 'id': flashvars['video_id'], + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } + if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) if not found: @@ -3552,13 +3738,13 @@ class GenericIE(InfoExtractor): ext = determine_ext(video_url) if ext == 'smil': - entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id) + entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict} elif ext == 'xspf': return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) elif ext == 'm3u8': - entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4') + entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4') elif ext == 'mpd': - entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) + entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: |