diff options
author | Jesús <heckyel@hyperbola.info> | 2021-12-07 12:26:51 -0500 |
---|---|---|
committer | Jesús <heckyel@hyperbola.info> | 2021-12-07 12:26:51 -0500 |
commit | 495746b9a6d4d32ddfa39ed908092d90a7cd5f3f (patch) | |
tree | 4845e40905136556b7513b9f36e3a70e505ee4c9 /yt_dlp/extractor | |
parent | 25831c5572c6e1d45bc05a122312516e0d264f8d (diff) | |
parent | ddd24c99493483bde822944e8063064f53464ac1 (diff) | |
download | hypervideo-pre-495746b9a6d4d32ddfa39ed908092d90a7cd5f3f.tar.lz hypervideo-pre-495746b9a6d4d32ddfa39ed908092d90a7cd5f3f.tar.xz hypervideo-pre-495746b9a6d4d32ddfa39ed908092d90a7cd5f3f.zip |
updated from upstream | 07/12/2021 at 12:26
Diffstat (limited to 'yt_dlp/extractor')
-rw-r--r-- | yt_dlp/extractor/ceskatelevize.py | 15 | ||||
-rw-r--r-- | yt_dlp/extractor/common.py | 18 | ||||
-rw-r--r-- | yt_dlp/extractor/niconico.py | 8 | ||||
-rw-r--r-- | yt_dlp/extractor/ntvcojp.py | 27 | ||||
-rw-r--r-- | yt_dlp/extractor/redtube.py | 35 | ||||
-rw-r--r-- | yt_dlp/extractor/sovietscloset.py | 13 |
6 files changed, 73 insertions, 43 deletions
diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index f766dfbb7..6ca2f38b5 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -12,8 +12,7 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, - unescapeHTML, - update_url_query, + traverse_obj, urlencode_postdata, USER_AGENTS, ) @@ -99,11 +98,13 @@ class CeskaTelevizeIE(InfoExtractor): playlist_description = playlist_description.replace('\xa0', ' ') if parsed_url.path.startswith('/porady/'): - refer_url = update_url_query(unescapeHTML(self._search_regex( - (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), - webpage, 'iframe player url', group='url')), query={'autoStart': 'true'}) - webpage = self._download_webpage(refer_url, playlist_id) + next_data = self._search_nextjs_data(webpage, playlist_id) + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) + if not idec: + raise ExtractorError('Failed to find IDEC id') + iframe_hash = self._download_webpage('https://www.ceskatelevize.cz/v-api/iframe-hash/', playlist_id) + webpage = self._download_webpage('https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', playlist_id, + query={'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', 'IDEC': idec}) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s</p>' % NOT_AVAILABLE_STRING in webpage: diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 2180f879c..d8fc5272c 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1513,6 +1513,24 @@ class InfoExtractor(object): webpage, 'next.js data', **kw), video_id, **kw) + def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'): + ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. ''' + # not all website do this, but it can be changed + # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source + rectx = re.escape(context_name) + js, arg_keys, arg_vals = self._search_regex( + (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx, + r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx), + webpage, context_name, group=['js', 'arg_keys', 'arg_vals']) + + args = dict(zip(arg_keys.split(','), arg_vals.split(','))) + + for key, val in args.items(): + if val in ('undefined', 'void 0'): + args[key] = 'null' + + return self._parse_json(js_to_json(js, args), video_id)['data'][0] + @staticmethod def _hidden_inputs(html): html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 4fcf1d8ed..ee888e9d3 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -675,16 +675,16 @@ class NicovideoSearchBaseIE(InfoExtractor): if not results: break + def _search_results(self, query): + return self._entries( + self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query) + class NicovideoSearchIE(NicovideoSearchBaseIE, SearchInfoExtractor): IE_DESC = 'Nico video search' IE_NAME = 'nicovideo:search' _SEARCH_KEY = 'nicosearch' - def _search_results(self, query): - return self._entries( - self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query) - class NicovideoSearchURLIE(NicovideoSearchBaseIE): IE_NAME = f'{NicovideoSearchIE.IE_NAME}_url' diff --git a/yt_dlp/extractor/ntvcojp.py b/yt_dlp/extractor/ntvcojp.py index 0c8221b22..c9af91188 100644 --- a/yt_dlp/extractor/ntvcojp.py +++ b/yt_dlp/extractor/ntvcojp.py @@ -3,8 +3,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - js_to_json, + ExtractorError, smuggle_url, + traverse_obj, ) @@ -19,7 +20,7 @@ class NTVCoJpCUIE(InfoExtractor): 'ext': 'mp4', 'title': '桜エビと炒り卵がポイント! 「中華風 エビチリおにぎり」──『美虎』五十嵐美幸', 'upload_date': '20181213', - 'description': 'md5:211b52f4fd60f3e0e72b68b0c6ba52a9', + 'description': 'md5:1985b51a9abc285df0104d982a325f2a', 'uploader_id': '3855502814001', 'timestamp': 1544669941, }, @@ -28,22 +29,30 @@ class NTVCoJpCUIE(InfoExtractor): 'skip_download': True, }, } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - player_config = self._parse_json(self._search_regex( - r'(?s)PLAYER_CONFIG\s*=\s*({.+?})', - webpage, 'player config'), display_id, js_to_json) - video_id = player_config['videoId'] - account_id = player_config.get('account') or '3855502814001' + player_config = self._search_nuxt_data(webpage, display_id) + video_id = traverse_obj(player_config, ('movie', 'video_id')) + if not video_id: + raise ExtractorError('Failed to extract video ID for Brightcove') + account_id = traverse_obj(player_config, ('player', 'account')) or '3855502814001' + title = traverse_obj(player_config, ('movie', 'name')) + if not title: + og_title = self._og_search_title(webpage, fatal=False) or traverse_obj(player_config, ('player', 'title')) + if og_title: + title = og_title.split('(', 1)[0].strip() + description = (traverse_obj(player_config, ('movie', 'description')) + or self._html_search_meta(['description', 'og:description'], webpage)) return { '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, - 'title': self._search_regex(r'<h1[^>]+class="title"[^>]*>([^<]+)', webpage, 'title').strip(), - 'description': self._html_search_meta(['description', 'og:description'], webpage), + 'title': title, + 'description': description, 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), {'geo_countries': ['JP']}), 'ie_key': 'BrightcoveNew', } diff --git a/yt_dlp/extractor/redtube.py b/yt_dlp/extractor/redtube.py index 747ce5199..7fee54fee 100644 --- a/yt_dlp/extractor/redtube.py +++ b/yt_dlp/extractor/redtube.py @@ -17,17 +17,20 @@ from ..utils import ( class RedTubeIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://www.redtube.com/66418', - 'md5': 'fc08071233725f26b8f014dba9590005', + 'url': 'https://www.redtube.com/38864951', + 'md5': '4fba70cbca3aefd25767ab4b523c9878', 'info_dict': { - 'id': '66418', + 'id': '38864951', 'ext': 'mp4', - 'title': 'Sucked on a toilet', - 'upload_date': '20110811', - 'duration': 596, + 'title': 'Public Sex on the Balcony in Freezing Paris! Amateur Couple LeoLulu', + 'description': 'Watch video Public Sex on the Balcony in Freezing Paris! Amateur Couple LeoLulu on Redtube, home of free Blowjob porn videos and Blonde sex movies online. Video length: (10:46) - Uploaded by leolulu - Verified User - Starring Pornstar: Leolulu', + 'upload_date': '20210111', + 'timestamp': 1610343109, + 'duration': 646, 'view_count': int, 'age_limit': 18, - } + 'thumbnail': r're:https://\wi-ph\.rdtcdn\.com/videos/.+/.+\.jpg', + }, }, { 'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286', 'only_matching': True, @@ -84,15 +87,25 @@ class RedTubeIE(InfoExtractor): r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage, 'media definitions', default='{}'), video_id, fatal=False) - if medias and isinstance(medias, list): - for media in medias: + for media in medias if isinstance(medias, list) else []: + format_url = url_or_none(media.get('videoUrl')) + if not format_url: + continue + format_id = media.get('format') + quality = media.get('quality') + if format_id == 'hls' or (format_id == 'mp4' and not quality): + more_media = self._download_json(format_url, video_id, fatal=False) + else: + more_media = [media] + for media in more_media if isinstance(more_media, list) else []: format_url = url_or_none(media.get('videoUrl')) if not format_url: continue - if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8': + format_id = media.get('format') + if format_id == 'hls' or determine_ext(format_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', + entry_protocol='m3u8_native', m3u8_id=format_id or 'hls', fatal=False)) continue format_id = media.get('quality') diff --git a/yt_dlp/extractor/sovietscloset.py b/yt_dlp/extractor/sovietscloset.py index 7df23759a..daf1c7450 100644 --- a/yt_dlp/extractor/sovietscloset.py +++ b/yt_dlp/extractor/sovietscloset.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - js_to_json, try_get, unified_timestamp ) @@ -14,17 +13,7 @@ class SovietsClosetBaseIE(InfoExtractor): def parse_nuxt_jsonp(self, nuxt_jsonp_url, video_id, name): nuxt_jsonp = self._download_webpage(nuxt_jsonp_url, video_id, note=f'Downloading {name} __NUXT_JSONP__') - js, arg_keys, arg_vals = self._search_regex( - r'__NUXT_JSONP__\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)', - nuxt_jsonp, '__NUXT_JSONP__', group=['js', 'arg_keys', 'arg_vals']) - - args = dict(zip(arg_keys.split(','), arg_vals.split(','))) - - for key, val in args.items(): - if val in ('undefined', 'void 0'): - args[key] = 'null' - - return self._parse_json(js_to_json(js, args), video_id)['data'][0] + return self._search_nuxt_data(nuxt_jsonp, video_id, '__NUXT_JSONP__') def video_meta(self, video_id, game_name, category_name, episode_number, stream_date): title = game_name |