diff options
author | James Taylor <user234683@users.noreply.github.com> | 2018-11-09 00:58:46 -0800 |
---|---|---|
committer | James Taylor <user234683@users.noreply.github.com> | 2018-11-09 00:58:46 -0800 |
commit | 630b476e42af47f7a0c8930e00c0e6c9cdc384d2 (patch) | |
tree | d5e1a67c3a6b62799737c7e678f88c22ce2657c1 /youtube_dl/extractor/youtube.py | |
parent | f6778f8c3d0f9fe5c1d496313c8c6119e4dba3bb (diff) | |
download | yt-local-630b476e42af47f7a0c8930e00c0e6c9cdc384d2.tar.lz yt-local-630b476e42af47f7a0c8930e00c0e6c9cdc384d2.tar.xz yt-local-630b476e42af47f7a0c8930e00c0e6c9cdc384d2.zip |
yt-dl: update youtube extractor file to fix js-sig extraction issues
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
-rw-r--r-- | youtube_dl/extractor/youtube.py | 66 |
1 files changed, 50 insertions, 16 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 544cfb6..3822e18 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -67,7 +67,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}' + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' def _set_language(self): self._set_cookie( @@ -262,7 +262,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True def _download_webpage_handle(self, *args, **kwargs): - kwargs.setdefault('query', {})['disable_polymer'] = 'true' + query = kwargs.get('query', {}).copy() + query['disable_polymer'] = 'true' + kwargs['query'] = query return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) @@ -350,6 +352,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| + (?:www\.)?invidio\.us/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -493,12 +496,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -581,6 +587,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -1067,6 +1074,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', 'only_matching': True, }, + { + 'url': 'https://invidio.us/watch?v=BaW_jenozKc', + 'only_matching': True, + }, ] def __init__(self, *args, **kwargs): @@ -1183,7 +1194,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('), + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) @@ -1536,6 +1548,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response = {} + + # Is it unlisted? unlisted = (self._search_regex('''<meta itemprop="unlisted" content="(\w*)">''', video_webpage, 'is_unlisted', default='False') == "True") @@ -1590,6 +1604,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): current_song[title.lower()] = value last_index = match.end() + + + # Get video info embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: @@ -1605,7 +1622,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage( video_info_url, video_id, note='Refetching age-gated info webpage', @@ -1697,7 +1713,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'token' not in video_info: if 'reason' in video_info: - print(video_info['reason']) if 'The uploader has not made this video available in your country.' in video_info['reason']: regions_allowed = self._html_search_meta( 'regionsAllowed', video_webpage, default=None) @@ -1709,7 +1724,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): unavailable_message = extract_unavailable_message() if unavailable_message: reason = unavailable_message - raise YoutubeError( + raise ExtractorError( 'YouTube said: %s' % reason, expected=True, video_id=video_id) else: @@ -1961,13 +1976,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' formats.append(a_format) else: - error_message = extract_unavailable_message() - alt_error_message = clean_html(video_info.get('reason', [None])[0]) - print(alt_error_message) + error_message = clean_html(video_info.get('reason', [None])[0]) if not error_message: - error_message = alt_error_message + error_message = extract_unavailable_message() if error_message: - raise YoutubeError(error_message) + raise ExtractorError(error_message, expected=True) raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # uploader @@ -1991,6 +2004,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: self._downloader.report_warning('unable to extract uploader nickname') + channel_id = self._html_search_meta( + 'channelId', video_webpage, 'channel id') + channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + # thumbnail image # We try first to get a high quality image: m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', @@ -2088,8 +2105,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'view count', default=None)) # subtitles - video_subtitles = self._get_subtitles(video_id, video_webpage) - automatic_captions = self._get_automatic_captions(video_id, video_webpage) + video_subtitles = self.extract_subtitles(video_id, video_webpage) + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) video_duration = try_get( video_info, lambda x: int_or_none(x['length_seconds'][0])) @@ -2169,6 +2186,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'uploader_url': video_uploader_url, + 'channel_id': channel_id, + 'channel_url': channel_url, 'upload_date': upload_date, 'license': video_license, 'creator': video_creator or artist, @@ -2210,7 +2229,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): (?:https?://)? (?:\w+\.)? (?: - youtube\.com/ + (?: + youtube\.com| + invidio\.us + ) + / (?: (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) \? (?:.*?[&;])*? (?:p|a|list)= @@ -2219,7 +2242,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) @@ -2323,6 +2346,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', 'categories': ['People & Blogs'], 'tags': list, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -2357,6 +2381,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', 'only_matching': True, + }, { + # music album playlist + 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', + 'only_matching': True, + }, { + 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'only_matching': True, }] def _real_initialize(self): @@ -2499,7 +2530,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' + _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' @@ -2520,6 +2551,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'id': 'UUs0ifCMCm1icqRbqhUINa0w', 'title': 'Uploads from Deus Ex', }, + }, { + 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'only_matching': True, }] @classmethod |