diff options
Diffstat (limited to 'hypervideo_dl/extractor/crunchyroll.py')
-rw-r--r-- | hypervideo_dl/extractor/crunchyroll.py | 359 |
1 files changed, 280 insertions, 79 deletions
diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py index 511ac1b..7edb645 100644 --- a/hypervideo_dl/extractor/crunchyroll.py +++ b/hypervideo_dl/extractor/crunchyroll.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import re import json import zlib @@ -8,7 +9,7 @@ import zlib from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor -from .vrv import VRVIE +from .vrv import VRVBaseIE from ..compat import ( compat_b64decode, compat_etree_Element, @@ -23,14 +24,17 @@ from ..utils import ( bytes_to_intlist, extract_attributes, float_or_none, + format_field, intlist_to_bytes, int_or_none, + join_nonempty, lowercase_escape, merge_dicts, + qualities, remove_end, sanitized_Request, + traverse_obj, try_get, - urlencode_postdata, xpath_text, ) from ..aes import ( @@ -39,8 +43,8 @@ from ..aes import ( class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/login' - _LOGIN_FORM = 'login_form' + _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' + _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' def _call_rpc_api(self, method, video_id, note=None, data=None): @@ -53,57 +57,50 @@ class CrunchyrollBaseIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded', }) - def _login(self): - username, password = self._get_login_info() - if username is None: + def _perform_login(self, username, password): + if self._get_cookies(self._LOGIN_URL).get('etp_rt'): return - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - def is_logged(webpage): - return 'href="/logout"' in webpage - - # Already logged in - if is_logged(login_page): - return - - login_form_str = self._search_regex( - r'(?P<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM, - login_page, 'login form', group='form') - - post_url = extract_attributes(login_form_str).get('action') - if not post_url: - post_url = self._LOGIN_URL - elif not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - login_form = self._form_hidden_inputs(self._LOGIN_FORM, login_page) - - login_form.update({ - 'login_form[name]': username, - 'login_form[password]': password, - }) - - response = self._download_webpage( - post_url, None, 'Logging in', 'Wrong login info', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - # Successful login - if is_logged(response): - return - - error = self._html_search_regex( - '(?s)<ul[^>]+class=["\']messages["\'][^>]*>(.+?)</ul>', - response, 'error message', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - - raise ExtractorError('Unable to log in') - - def _real_initialize(self): - self._login() + upsell_response = self._download_json( + f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id', + query={ + 'sess_id': 1, + 'device_id': 'whatvalueshouldbeforweb', + 'device_type': 'com.crunchyroll.static', + 'access_token': 'giKq5eY27ny3cqz', + 'referer': self._LOGIN_URL + }) + if upsell_response['code'] != 'ok': + raise ExtractorError('Could not get session id') + session_id = upsell_response['data']['session_id'] + + login_response = self._download_json( + f'{self._API_BASE}/login.1.json', None, 'Logging in', + data=compat_urllib_parse_urlencode({ + 'account': username, + 'password': password, + 'session_id': session_id + }).encode('ascii')) + if login_response['code'] != 'ok': + raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True) + if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + raise ExtractorError('Login succeeded but did not set etp_rt cookie') + + # Beta-specific, but needed for redirects + def _get_beta_embedded_json(self, webpage, display_id): + initial_state = self._parse_json(self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) + app_config = self._parse_json(self._search_regex( + r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) + return initial_state, app_config + + def _redirect_to_beta(self, webpage, iekey, video_id): + if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + raise ExtractorError('Received a beta page from non-beta url when not logged in.') + initial_state, app_config = self._get_beta_embedded_json(webpage, video_id) + url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname'] + self.to_screen(f'{video_id}: Redirected to beta site - {url}') + return self.url_result(f'{url}', iekey, video_id) @staticmethod def _add_skip_wall(url): @@ -119,7 +116,7 @@ class CrunchyrollBaseIE(InfoExtractor): parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) -class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): +class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): IE_NAME = 'crunchyroll' _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ @@ -425,6 +422,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage = self._download_webpage( self._add_skip_wall(webpage_url), video_id, headers=self.geo_verification_headers()) + if re.search(r'<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id) note_m = self._html_search_regex( r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') @@ -478,19 +477,24 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], webpage, 'video_uploader', default=False) + requested_languages = self._configuration_arg('language') + requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')] + language_preference = qualities((requested_languages or [language or ''])[::-1]) + hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1]) + formats = [] for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') - hardsub_lang = stream.get('hardsub_lang') + audio_lang = stream.get('audio_lang') or '' + hardsub_lang = stream.get('hardsub_lang') or '' + if (requested_languages and audio_lang.lower() not in requested_languages + or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs): + continue vrv_formats = self._extract_vrv_formats( stream.get('url'), video_id, stream.get('format'), audio_lang, hardsub_lang) for f in vrv_formats: - f['language_preference'] = 1 if audio_lang == language else 0 - f['quality'] = ( - 1 if not hardsub_lang - else 0 if hardsub_lang == language - else -1) + f['language_preference'] = language_preference(audio_lang) + f['quality'] = hardsub_preference(hardsub_lang) formats.extend(vrv_formats) if not formats: available_fmts = [] @@ -684,6 +688,8 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): # https:// gives a 403, but http:// does not self._add_skip_wall(url).replace('https://', 'http://'), show_id, headers=self.geo_verification_headers()) + if re.search(r'<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id) title = self._html_search_meta('name', webpage, default=None) episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"' @@ -706,9 +712,56 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): } -class CrunchyrollBetaIE(CrunchyrollBaseIE): +class CrunchyrollBetaBaseIE(CrunchyrollBaseIE): + params = None + + def _get_params(self, lang): + if not CrunchyrollBetaBaseIE.params: + initial_state, app_config = self._get_beta_embedded_json(self._download_webpage( + f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) + api_domain = app_config['cxApiParams']['apiDomain'] + basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii') + auth_response = self._download_json( + f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie', + headers={ + 'Authorization': 'Basic ' + basic_token + }, data='grant_type=etp_rt_cookie'.encode('ascii')) + policy_response = self._download_json( + f'{api_domain}/index/v2', None, note='Retrieving signed policy', + headers={ + 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] + }) + bucket = policy_response['cms']['bucket'] + params = { + 'Policy': policy_response['cms']['policy'], + 'Signature': policy_response['cms']['signature'], + 'Key-Pair-Id': policy_response['cms']['key_pair_id'] + } + locale = traverse_obj(initial_state, ('localization', 'locale')) + if locale: + params['locale'] = locale + CrunchyrollBetaBaseIE.params = (api_domain, bucket, params) + return CrunchyrollBetaBaseIE.params + + def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey): + initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id) + content_data = initial_state['content']['byId'][internal_id] + if is_episode: + video_id = content_data['external_id'].split('.')[1] + series_id = content_data['episode_metadata']['series_slug_title'] + else: + series_id = content_data['slug_title'] + series_id = re.sub(r'-{2,}', '-', series_id) + url = f'https://www.crunchyroll.com/{lang}{series_id}' + if is_episode: + url = url + f'/{display_id}-{video_id}' + self.to_screen(f'{display_id}: Not logged in. Redirecting to non-beta site - {url}') + return self.url_result(url, iekey, display_id) + + +class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { @@ -719,26 +772,129 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'uploader': 'Toei Animation', 'title': 'World Trigger Episode 73 – To the Future', 'upload_date': '20160402', + 'episode_number': 73, + 'series': 'World Trigger', + 'average_rating': 4.9, + 'episode': 'To the Future', + 'season': 'World Trigger', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg', + 'season_number': 1, + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn', + 'info_dict': { + 'id': '648781', + 'ext': 'mp4', + 'episode_number': 1, + 'timestamp': 1389173400, + 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'description': 'md5:5579d1a0355cc618558ba23d27067a62', + 'uploader': 'TBS', + 'episode': 'Wicked Lord Shingan... Reborn', + 'average_rating': 4.9, + 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... Reborn', + 'season_number': 2, + 'upload_date': '20140108', }, 'params': {'skip_download': 'm3u8'}, 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/', + 'only_matching': True, }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id') - webpage = self._download_webpage(url, display_id) - episode_data = self._parse_json( - self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'), - display_id)['content']['byId'][internal_id] - video_id = episode_data['external_id'].split('.')[1] - series_id = episode_data['episode_metadata']['series_slug_title'] - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', - CrunchyrollIE.ie_key(), video_id) - - -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + + if not self._get_cookies(url).get('etp_rt'): + return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + + episode_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, + note='Retrieving episode metadata', + query=params) + if episode_response.get('is_premium_only') and not episode_response.get('playback'): + raise ExtractorError('This video is for premium members only.', expected=True) + stream_response = self._download_json( + episode_response['playback'], display_id, + note='Retrieving stream info') + + thumbnails = [] + for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')): + for thumbnail_data in thumbnails_data: + thumbnails.append({ + 'url': thumbnail_data.get('source'), + 'width': thumbnail_data.get('width'), + 'height': thumbnail_data.get('height'), + }) + subtitles = {} + for lang, subtitle_data in stream_response.get('subtitles').items(): + subtitles[lang] = [{ + 'url': subtitle_data.get('url'), + 'ext': subtitle_data.get('format') + }] + + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] + hardsub_preference = qualities(requested_hardsubs[::-1]) + requested_formats = self._configuration_arg('format') or ['adaptive_hls'] + + formats = [] + for stream_type, streams in stream_response.get('streams', {}).items(): + if stream_type not in requested_formats: + continue + for stream in streams.values(): + hardsub_lang = stream.get('hardsub_locale') or '' + if hardsub_lang.lower() not in requested_hardsubs: + continue + format_id = join_nonempty( + stream_type, + format_field(stream, 'hardsub_locale', 'hardsub-%s')) + if not stream.get('url'): + continue + if stream_type.split('_')[-1] == 'hls': + adaptive_formats = self._extract_m3u8_formats( + stream['url'], display_id, 'mp4', m3u8_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + elif stream_type.split('_')[-1] == 'dash': + adaptive_formats = self._extract_mpd_formats( + stream['url'], display_id, mpd_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = stream_response.get('audio_locale') + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) + self._sort_formats(formats) + + return { + 'id': internal_id, + 'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), + 'description': episode_response.get('description').replace(r'\r\n', '\n'), + 'duration': float_or_none(episode_response.get('duration_ms'), 1000), + 'thumbnails': thumbnails, + 'series': episode_response.get('series_title'), + 'series_id': episode_response.get('series_id'), + 'season': episode_response.get('season_title'), + 'season_id': episode_response.get('season_id'), + 'season_number': episode_response.get('season_number'), + 'episode': episode_response.get('title'), + 'episode_number': episode_response.get('sequence_number'), + 'subtitles': subtitles, + 'formats': formats + } + + +class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:playlist:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { @@ -747,11 +903,56 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): }, 'playlist_mincount': 10, }, { + 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--', + 'info_dict': { + 'id': 'love-chunibyo-other-delusions-heart-throb-', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -', + }, + 'playlist_mincount': 10, + }, { 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', 'only_matching': True, }] def _real_extract(self, url): - lang, series_id = self._match_valid_url(url).group('lang', 'id') - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}', - CrunchyrollShowPlaylistIE.ie_key(), series_id) + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + + if not self._get_cookies(url).get('etp_rt'): + return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + + series_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, + note='Retrieving series metadata', query=params) + + seasons_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, + note='Retrieving season list', query=params) + + def entries(): + for season in seasons_response['items']: + episodes_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, + note=f'Retrieving episode list for {season.get("slug_title")}', query=params) + for episode in episodes_response['items']: + episode_id = episode['id'] + episode_display_id = episode['slug_title'] + yield { + '_type': 'url', + 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', + 'ie_key': CrunchyrollBetaIE.ie_key(), + 'id': episode_id, + 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), + 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), + 'duration': float_or_none(episode.get('duration_ms'), 1000), + 'series': episode.get('series_title'), + 'series_id': episode.get('series_id'), + 'season': episode.get('season_title'), + 'season_id': episode.get('season_id'), + 'season_number': episode.get('season_number'), + 'episode': episode.get('title'), + 'episode_number': episode.get('sequence_number') + } + + return self.playlist_result(entries(), internal_id, series_response.get('title')) |