diff options
author | Jesús <heckyel@hyperbola.info> | 2021-10-18 15:24:21 -0500 |
---|---|---|
committer | Jesús <heckyel@hyperbola.info> | 2021-10-18 15:24:21 -0500 |
commit | 5122028a4bcac4ae577ef7fbd55ccad5cb34ef5e (patch) | |
tree | 65209bc739db35e31f1c9b5b868eb5df4fe12ae3 /hypervideo_dl/extractor/crunchyroll.py | |
parent | 27fe903c511691c078942bef5ee9a05a43b15c8f (diff) | |
download | hypervideo-5122028a4bcac4ae577ef7fbd55ccad5cb34ef5e.tar.lz hypervideo-5122028a4bcac4ae577ef7fbd55ccad5cb34ef5e.tar.xz hypervideo-5122028a4bcac4ae577ef7fbd55ccad5cb34ef5e.zip |
update from upstream
Diffstat (limited to 'hypervideo_dl/extractor/crunchyroll.py')
-rw-r--r-- | hypervideo_dl/extractor/crunchyroll.py | 133 |
1 files changed, 102 insertions, 31 deletions
diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py index bc2d1fa..511ac1b 100644 --- a/hypervideo_dl/extractor/crunchyroll.py +++ b/hypervideo_dl/extractor/crunchyroll.py @@ -29,6 +29,7 @@ from ..utils import ( merge_dicts, remove_end, sanitized_Request, + try_get, urlencode_postdata, xpath_text, ) @@ -120,7 +121,7 @@ class CrunchyrollBaseIE(InfoExtractor): class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): IE_NAME = 'crunchyroll' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { @@ -412,8 +413,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return subtitles def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') + mobj = self._match_valid_url(url) + video_id = mobj.group('id') if mobj.group('prefix') == 'm': mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage') @@ -428,7 +429,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') if note_m: - raise ExtractorError(note_m) + raise ExtractorError(note_m, expected=True) mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage) if mobj: @@ -458,6 +459,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text video_description = (self._parse_json(self._html_search_regex( r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, webpage, 'description', default='{}'), video_id) or media_metadata).get('description') + + thumbnails = [] + thumbnail_url = (self._parse_json(self._html_search_regex( + r'<script type="application\/ld\+json">\n\s*(.+?)<\/script>', + webpage, 'thumbnail_url', default='{}'), video_id)).get('image') + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'width': 1920, + 'height': 1080 + }) + if video_description: video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_uploader = self._html_search_regex( @@ -473,15 +486,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text stream.get('url'), video_id, stream.get('format'), audio_lang, hardsub_lang) for f in vrv_formats: - if not hardsub_lang: - f['preference'] = 1 - language_preference = 0 - if audio_lang == language: - language_preference += 1 - if hardsub_lang == language: - language_preference += 1 - if language_preference: - f['language_preference'] = language_preference + f['language_preference'] = 1 if audio_lang == language else 0 + f['quality'] = ( + 1 if not hardsub_lang + else 0 if hardsub_lang == language + else -1) formats.extend(vrv_formats) if not formats: available_fmts = [] @@ -571,7 +580,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'ext': 'flv', }) formats.append(format_info) - self._sort_formats(formats, ('preference', 'language_preference', 'height', 'width', 'tbr', 'fps')) + self._sort_formats(formats) metadata = self._call_rpc_api( 'VideoPlayer_GetMediaMetadata', video_id, @@ -596,21 +605,25 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d', webpage, 'series', fatal=False) - season = episode = episode_number = duration = thumbnail = None + season = episode = episode_number = duration = None if isinstance(metadata, compat_etree_Element): season = xpath_text(metadata, 'series_title') episode = xpath_text(metadata, 'episode_title') episode_number = int_or_none(xpath_text(metadata, 'episode_number')) duration = float_or_none(media_metadata.get('duration'), 1000) - thumbnail = xpath_text(metadata, 'episode_image_url') if not episode: episode = media_metadata.get('title') if not episode_number: episode_number = int_or_none(media_metadata.get('episode_number')) - if not thumbnail: - thumbnail = media_metadata.get('thumbnail', {}).get('url') + thumbnail_url = try_get(media, lambda x: x['thumbnail']['url']) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'width': 640, + 'height': 360 + }) season_number = int_or_none(self._search_regex( r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', @@ -623,7 +636,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'title': video_title, 'description': video_description, 'duration': duration, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'uploader': video_uploader, 'series': series, 'season': season, @@ -637,10 +650,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = 'crunchyroll:playlist' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:\w{1,2}/)?(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)' _TESTS = [{ - 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', + 'url': 'https://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', 'info_dict': { 'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' @@ -659,28 +672,86 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', 'only_matching': True, + }, { + 'url': 'http://www.crunchyroll.com/fr/ladies-versus-butlers', + 'only_matching': True, }] def _real_extract(self, url): show_id = self._match_id(url) webpage = self._download_webpage( - self._add_skip_wall(url), show_id, + # https:// gives a 403, but http:// does not + self._add_skip_wall(url).replace('https://', 'http://'), show_id, headers=self.geo_verification_headers()) title = self._html_search_meta('name', webpage, default=None) - episode_paths = re.findall( - r'(?s)<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"', - webpage) - entries = [ - self.url_result('http://www.crunchyroll.com' + ep, 'Crunchyroll', ep_id) - for ep_id, ep in episode_paths - ] - entries.reverse() + episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"' + season_re = r'<a [^>]+season-dropdown[^>]+>([^<]+)' + paths = re.findall(f'(?s){episode_re}|{season_re}', webpage) + + entries, current_season = [], None + for ep_id, ep, season in paths: + if season: + current_season = season + continue + entries.append(self.url_result( + f'http://www.crunchyroll.com{ep}', CrunchyrollIE.ie_key(), ep_id, season=current_season)) return { '_type': 'playlist', 'id': show_id, 'title': title, - 'entries': entries, + 'entries': reversed(entries), } + + +class CrunchyrollBetaIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:beta' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)' + _TESTS = [{ + 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', + 'info_dict': { + 'id': '696363', + 'ext': 'mp4', + 'timestamp': 1459610100, + 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', + 'uploader': 'Toei Animation', + 'title': 'World Trigger Episode 73 – To the Future', + 'upload_date': '20160402', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download XML'] + }] + + def _real_extract(self, url): + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id') + webpage = self._download_webpage(url, display_id) + episode_data = self._parse_json( + self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'), + display_id)['content']['byId'][internal_id] + video_id = episode_data['external_id'].split('.')[1] + series_id = episode_data['episode_metadata']['series_slug_title'] + return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', + CrunchyrollIE.ie_key(), video_id) + + +class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:playlist:beta' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)' + _TESTS = [{ + 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', + 'info_dict': { + 'id': 'girl-friend-beta', + 'title': 'Girl Friend BETA', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', + 'only_matching': True, + }] + + def _real_extract(self, url): + lang, series_id = self._match_valid_url(url).group('lang', 'id') + return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}', + CrunchyrollShowPlaylistIE.ie_key(), series_id) |