diff options
author | Jeff Huffman <tejing@tejing.com> | 2022-01-28 16:33:51 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-29 06:03:51 +0530 |
commit | 706dfe441b3cf01c0e2b294afc7d293211a74e94 (patch) | |
tree | 6cf0c6da0d573d76ac1cb1a448e05f0930e7f404 /yt_dlp/extractor/crunchyroll.py | |
parent | c4da5ff971f22bf3c93ac521d6805f7fb561284b (diff) | |
download | hypervideo-pre-706dfe441b3cf01c0e2b294afc7d293211a74e94.tar.lz hypervideo-pre-706dfe441b3cf01c0e2b294afc7d293211a74e94.tar.xz hypervideo-pre-706dfe441b3cf01c0e2b294afc7d293211a74e94.zip |
[crunchyroll:beta] Add cookies support (#2506)
* Extract directly from the beta API when cookies are passed. If login cookie is absent, the extraction is delegated to `CrunchyrollIE`. This causes different metadata to be extracted (including formats and video id) and therefore results in a different archive entry. For now, this issue is unavoidable since the browser also redirects to the old site when not logged in.
* Adds extractor-args `format` and `hardsub` to control the source and subtitles of the extracted formats
Closes #1911
Authored by: tejing1
Diffstat (limited to 'yt_dlp/extractor/crunchyroll.py')
-rw-r--r-- | yt_dlp/extractor/crunchyroll.py | 123 |
1 files changed, 116 insertions, 7 deletions
diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index cd35728e5..5253e7e4b 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import re import json import zlib @@ -23,13 +24,16 @@ from ..utils import ( bytes_to_intlist, extract_attributes, float_or_none, + format_field, intlist_to_bytes, int_or_none, + join_nonempty, lowercase_escape, merge_dicts, qualities, remove_end, sanitized_Request, + traverse_obj, try_get, urlencode_postdata, xpath_text, @@ -733,13 +737,118 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): def _real_extract(self, url): lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id') webpage = self._download_webpage(url, display_id) - episode_data = self._parse_json( - self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'), - display_id)['content']['byId'][internal_id] - video_id = episode_data['external_id'].split('.')[1] - series_id = episode_data['episode_metadata']['series_slug_title'] - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', - CrunchyrollIE.ie_key(), video_id) + initial_state = self._parse_json( + self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), + display_id) + episode_data = initial_state['content']['byId'][internal_id] + if not self._get_cookies(url).get('etp_rt'): + video_id = episode_data['external_id'].split('.')[1] + series_id = episode_data['episode_metadata']['series_slug_title'] + return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', + CrunchyrollIE.ie_key(), video_id) + + app_config = self._parse_json( + self._search_regex(r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), + display_id) + client_id = app_config['cxApiParams']['accountAuthClientId'] + api_domain = app_config['cxApiParams']['apiDomain'] + basic_token = str(base64.b64encode(('%s:' % client_id).encode('ascii')), 'ascii') + auth_response = self._download_json( + f'{api_domain}/auth/v1/token', display_id, + note='Authenticating with cookie', + headers={ + 'Authorization': 'Basic ' + basic_token + }, data='grant_type=etp_rt_cookie'.encode('ascii')) + policy_response = self._download_json( + f'{api_domain}/index/v2', display_id, + note='Retrieving signed policy', + headers={ + 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] + }) + bucket = policy_response['cms']['bucket'] + params = { + 'Policy': policy_response['cms']['policy'], + 'Signature': policy_response['cms']['signature'], + 'Key-Pair-Id': policy_response['cms']['key_pair_id'] + } + locale = traverse_obj(initial_state, ('localization', 'locale')) + if locale: + params['locale'] = locale + episode_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, + note='Retrieving episode metadata', + query=params) + if episode_response.get('is_premium_only') and not episode_response.get('playback'): + raise ExtractorError('This video is for premium members only.', expected=True) + stream_response = self._download_json( + episode_response['playback'], display_id, + note='Retrieving stream info') + + thumbnails = [] + for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')): + for thumbnail_data in thumbnails_data: + thumbnails.append({ + 'url': thumbnail_data.get('source'), + 'width': thumbnail_data.get('width'), + 'height': thumbnail_data.get('height'), + }) + subtitles = {} + for lang, subtitle_data in stream_response.get('subtitles').items(): + subtitles[lang] = [{ + 'url': subtitle_data.get('url'), + 'ext': subtitle_data.get('format') + }] + + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] + hardsub_preference = qualities(requested_hardsubs[::-1]) + requested_formats = self._configuration_arg('format') or ['adaptive_hls'] + + formats = [] + for stream_type, streams in stream_response.get('streams', {}).items(): + if stream_type not in requested_formats: + continue + for stream in streams.values(): + hardsub_lang = stream.get('hardsub_locale') or '' + if hardsub_lang.lower() not in requested_hardsubs: + continue + format_id = join_nonempty( + stream_type, + format_field(stream, 'hardsub_locale', 'hardsub-%s')) + if not stream.get('url'): + continue + if stream_type.split('_')[-1] == 'hls': + adaptive_formats = self._extract_m3u8_formats( + stream['url'], display_id, 'mp4', m3u8_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + elif stream_type.split('_')[-1] == 'dash': + adaptive_formats = self._extract_mpd_formats( + stream['url'], display_id, mpd_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = stream_response.get('audio_locale') + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) + self._sort_formats(formats) + + return { + 'id': internal_id, + 'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), + 'description': episode_response.get('description').replace(r'\r\n', '\n'), + 'duration': float_or_none(episode_response.get('duration_ms'), 1000), + 'thumbnails': thumbnails, + 'series': episode_response.get('series_title'), + 'series_id': episode_response.get('series_id'), + 'season': episode_response.get('season_title'), + 'season_id': episode_response.get('season_id'), + 'season_number': episode_response.get('season_number'), + 'episode': episode_response.get('title'), + 'episode_number': episode_response.get('sequence_number'), + 'subtitles': subtitles, + 'formats': formats + } class CrunchyrollBetaShowIE(CrunchyrollBaseIE): |