diff options
author | James Taylor <user234683@users.noreply.github.com> | 2019-12-19 21:33:54 -0800 |
---|---|---|
committer | James Taylor <user234683@users.noreply.github.com> | 2019-12-19 21:33:54 -0800 |
commit | b4406df9cf33c53b6e942e6a5c72d955f57c4b5f (patch) | |
tree | 4de0082ac9eb26a05188dd424835ea50b1483113 | |
parent | b614fcdb8579ba29fccfa47eab1e2965cfb0beaa (diff) | |
parent | 6b7a1212e30b713453aa7d2b3a7122e97689dad0 (diff) | |
download | yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.tar.lz yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.tar.xz yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.zip |
Merge branch 'modular-data-extract'
Commits in this branch are prefixed with "Extraction:"
This branch refactors data extraction. All such functionality has been moved to the yt_data_extract module.
Responses from requests are given to the module and it parses them into a consistent, more useful format.
The dependency on youtube-dl has also been dropped and this functionality has been built from scratch for these reasons:
(1) I've noticed youtube-dl breaks more often than invidious (which uses watch page extraction built from scratch) in response to changes from Youtube, so I'm hoping what I wrote will also be less brittle.
(2) Such breakage is inconvenient because I have to manually merge the fixes since I had to make changes to youtube-dl to make it do things such as extracting related videos.
(3) I have no control over error handling and request pooling with youtube-dl, since it does all the requests (these would require intrusive changes I don't want to maintain).
(4) I will now be able to finally display the number of comments and whether comments are disabled without making additional requests.
61 files changed, 1760 insertions, 32300 deletions
diff --git a/youtube/__init__.py b/youtube/__init__.py index 0137e86..534b9f8 100644 --- a/youtube/__init__.py +++ b/youtube/__init__.py @@ -23,3 +23,10 @@ def inject_theme_preference(): 'theme_path': '/youtube.com/static/' + theme_names[settings.theme] + '.css', } +@yt_app.template_filter('commatize') +def commatize(num): + if num is None: + return '' + if isinstance(num, str): + num = int(num) + return '{:,}'.format(num) diff --git a/youtube/channel.py b/youtube/channel.py index de75eaa..ad06e3f 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -137,132 +137,13 @@ def get_channel_search_json(channel_id, query, page): return polymer_json -def extract_info(polymer_json, tab): - response = polymer_json[1]['response'] - try: - microformat = response['microformat']['microformatDataRenderer'] - - # channel doesn't exist or was terminated - # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org - except KeyError: - if 'alerts' in response and len(response['alerts']) > 0: - result = '' - for alert in response['alerts']: - result += alert['alertRenderer']['text']['simpleText'] + '\n' - flask.abort(200, result) - elif 'errors' in response['responseContext']: - for error in response['responseContext']['errors']['error']: - if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': - flask.abort(404, 'This channel does not exist') - raise - - - info = {} - info['current_tab'] = tab - - - # stuff from microformat (info given by youtube for every page on channel) - info['short_description'] = microformat['description'] - info['channel_name'] = microformat['title'] - info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url'] - channel_url = microformat['urlCanonical'].rstrip('/') - channel_id = channel_url[channel_url.rfind('/')+1:] - info['channel_id'] = channel_id - info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id - - info['items'] = [] - - # empty channel - if 'contents' not in response and 'continuationContents' not in response: - return info - - - # find the tab with content - # example channel where tabs do not have definite index: https://www.youtube.com/channel/UC4gQ8i3FD7YbhOgqUkeQEJg - # TODO: maybe use the 'selected' attribute for this? - if 'continuationContents' not in response: - tab_renderer = None - tab_content = None - for tab_json in response['contents']['twoColumnBrowseResultsRenderer']['tabs']: - try: - tab_renderer = tab_json['tabRenderer'] - except KeyError: - tab_renderer = tab_json['expandableTabRenderer'] - try: - tab_content = tab_renderer['content'] - break - except KeyError: - pass - else: # didn't break - raise Exception("No tabs found with content") - assert tab == tab_renderer['title'].lower() - - - # extract tab-specific info - if tab in ('videos', 'playlists', 'search'): # find the list of items - if 'continuationContents' in response: - try: - items = response['continuationContents']['gridContinuation']['items'] - except KeyError: - items = response['continuationContents']['sectionListContinuation']['contents'] # for search - else: - contents = tab_content['sectionListRenderer']['contents'] - if 'itemSectionRenderer' in contents[0]: - item_section = contents[0]['itemSectionRenderer']['contents'][0] - try: - items = item_section['gridRenderer']['items'] - except KeyError: - if "messageRenderer" in item_section: - items = [] - else: - raise Exception('gridRenderer missing but messageRenderer not found') - else: - items = contents # for search - - # TODO: Fix this URL prefixing shit - additional_info = {'author': info['channel_name'], 'author_url': '/channel/' + channel_id} - info['items'] = [yt_data_extract.renderer_info(renderer, additional_info) for renderer in items] - - elif tab == 'about': - channel_metadata = tab_content['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['channelAboutFullMetadataRenderer'] - - - info['links'] = [] - for link_json in channel_metadata.get('primaryLinks', ()): - url = link_json['navigationEndpoint']['urlEndpoint']['url'] - if url.startswith('/redirect'): # youtube puts these on external links to do tracking - query_string = url[url.find('?')+1: ] - url = urllib.parse.parse_qs(query_string)['q'][0] - - text = yt_data_extract.get_plain_text(link_json['title']) - - info['links'].append( (text, url) ) - - - info['stats'] = [] - for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'): - try: - stat = channel_metadata[stat_name] - except KeyError: - continue - info['stats'].append(yt_data_extract.get_plain_text(stat)) - - if 'description' in channel_metadata: - info['description'] = yt_data_extract.get_text(channel_metadata['description']) - else: - info['description'] = '' - - else: - raise NotImplementedError('Unknown or unsupported channel tab: ' + tab) - - return info def post_process_channel_info(info): - info['avatar'] = '/' + info['avatar'] - info['channel_url'] = '/' + info['channel_url'] + info['avatar'] = util.prefix_url(info['avatar']) + info['channel_url'] = util.prefix_url(info['channel_url']) for item in info['items']: - yt_data_extract.prefix_urls(item) - yt_data_extract.add_extra_html_info(item) + util.prefix_urls(item) + util.add_extra_html_info(item) @@ -304,7 +185,9 @@ def get_channel_page(channel_id, tab='videos'): flask.abort(404, 'Unknown channel tab: ' + tab) - info = extract_info(json.loads(polymer_json), tab) + info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = number_of_videos @@ -344,7 +227,10 @@ def get_channel_page_general_url(base_url, tab, request): flask.abort(404, 'Unknown channel tab: ' + tab) - info = extract_info(json.loads(polymer_json), tab) + info = yt_data_extract.extract_channel_info(json.loads(polymer_json), tab) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) + post_process_channel_info(info) if tab in ('videos', 'search'): info['number_of_videos'] = 1000 diff --git a/youtube/comments.py b/youtube/comments.py index 3b1ef86..4e79d8b 100644 --- a/youtube/comments.py +++ b/youtube/comments.py @@ -48,24 +48,6 @@ def comment_replies_ctoken(video_id, comment_id, max_results=500): result = proto.nested(2, proto.string(2, video_id)) + proto.uint(3,6) + proto.nested(6, params) return base64.urlsafe_b64encode(result).decode('ascii') -def ctoken_metadata(ctoken): - result = dict() - params = proto.parse(proto.b64_to_bytes(ctoken)) - result['video_id'] = proto.parse(params[2])[2].decode('ascii') - - offset_information = proto.parse(params[6]) - result['offset'] = offset_information.get(5, 0) - - result['is_replies'] = False - if (3 in offset_information) and (2 in proto.parse(offset_information[3])): - result['is_replies'] = True - result['sort'] = None - else: - try: - result['sort'] = proto.parse(offset_information[4])[6] - except KeyError: - result['sort'] = 0 - return result mobile_headers = { @@ -91,7 +73,9 @@ def request_comments(ctoken, replies=False): print("got <!DOCTYPE>, retrying") continue break - return content + + polymer_json = json.loads(util.uppercase_escape(content.decode('utf-8'))) + return polymer_json def single_comment_ctoken(video_id, comment_id): @@ -102,112 +86,40 @@ def single_comment_ctoken(video_id, comment_id): -def parse_comments_polymer(content): - try: - video_title = '' - content = json.loads(util.uppercase_escape(content.decode('utf-8'))) - url = content[1]['url'] - ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] - metadata = ctoken_metadata(ctoken) - - try: - comments_raw = content[1]['response']['continuationContents']['commentSectionContinuation']['items'] - except KeyError: - comments_raw = content[1]['response']['continuationContents']['commentRepliesContinuation']['contents'] - - ctoken = util.default_multi_get(content, 1, 'response', 'continuationContents', 'commentSectionContinuation', 'continuations', 0, 'nextContinuationData', 'continuation', default='') - - comments = [] - for comment_json in comments_raw: - number_of_replies = 0 - try: - comment_thread = comment_json['commentThreadRenderer'] - except KeyError: - comment_renderer = comment_json['commentRenderer'] - else: - if 'commentTargetTitle' in comment_thread: - video_title = comment_thread['commentTargetTitle']['runs'][0]['text'] - - if 'replies' in comment_thread: - view_replies_text = yt_data_extract.get_plain_text(comment_thread['replies']['commentRepliesRenderer']['moreText']) - view_replies_text = view_replies_text.replace(',', '') - match = re.search(r'(\d+)', view_replies_text) - if match is None: - number_of_replies = 1 - else: - number_of_replies = int(match.group(1)) - comment_renderer = comment_thread['comment']['commentRenderer'] - - comment = { - 'author_id': comment_renderer.get('authorId', ''), - 'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'], - 'likes': comment_renderer['likeCount'], - 'published': yt_data_extract.get_plain_text(comment_renderer['publishedTimeText']), - 'text': comment_renderer['contentText'].get('runs', ''), - 'number_of_replies': number_of_replies, - 'comment_id': comment_renderer['commentId'], - } - - if 'authorText' in comment_renderer: # deleted channels have no name or channel link - comment['author'] = yt_data_extract.get_plain_text(comment_renderer['authorText']) - comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url'] - comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId'] - else: - comment['author'] = '' - comment['author_url'] = '' - comment['author_channel_id'] = '' - - comments.append(comment) - except Exception as e: - print('Error parsing comments: ' + str(e)) - comments = () - ctoken = '' - - return { - 'ctoken': ctoken, - 'comments': comments, - 'video_title': video_title, - 'video_id': metadata['video_id'], - 'offset': metadata['offset'], - 'is_replies': metadata['is_replies'], - 'sort': metadata['sort'], - } - def post_process_comments_info(comments_info): for comment in comments_info['comments']: comment['author_url'] = util.URL_ORIGIN + comment['author_url'] comment['author_avatar'] = '/' + comment['author_avatar'] - comment['permalink'] = util.URL_ORIGIN + '/watch?v=' + comments_info['video_id'] + '&lc=' + comment['comment_id'] + comment['permalink'] = util.URL_ORIGIN + '/watch?v=' + comments_info['video_id'] + '&lc=' + comment['id'] - if comment['author_channel_id'] in accounts.accounts: + if comment['author_id'] in accounts.accounts: comment['delete_url'] = (util.URL_ORIGIN + '/delete_comment?video_id=' + comments_info['video_id'] - + '&channel_id='+ comment['author_channel_id'] - + '&author_id=' + comment['author_id'] - + '&comment_id=' + comment['comment_id']) + + '&channel_id='+ comment['author_id'] + + '&comment_id=' + comment['id']) - num_replies = comment['number_of_replies'] - if num_replies == 0: - comment['replies_url'] = util.URL_ORIGIN + '/post_comment?parent_id=' + comment['comment_id'] + "&video_id=" + comments_info['video_id'] + reply_count = comment['reply_count'] + if reply_count == 0: + comment['replies_url'] = util.URL_ORIGIN + '/post_comment?parent_id=' + comment['id'] + "&video_id=" + comments_info['video_id'] else: - comment['replies_url'] = util.URL_ORIGIN + '/comments?parent_id=' + comment['comment_id'] + "&video_id=" + comments_info['video_id'] + comment['replies_url'] = util.URL_ORIGIN + '/comments?parent_id=' + comment['id'] + "&video_id=" + comments_info['video_id'] - if num_replies == 0: + if reply_count == 0: comment['view_replies_text'] = 'Reply' - elif num_replies == 1: + elif reply_count == 1: comment['view_replies_text'] = '1 reply' else: - comment['view_replies_text'] = str(num_replies) + ' replies' + comment['view_replies_text'] = str(reply_count) + ' replies' - if comment['likes'] == 1: + if comment['like_count'] == 1: comment['likes_text'] = '1 like' else: - comment['likes_text'] = str(comment['likes']) + ' likes' + comment['likes_text'] = str(comment['like_count']) + ' likes' comments_info['include_avatars'] = settings.enable_comment_avatars - if comments_info['ctoken'] != '': + if comments_info['ctoken']: comments_info['more_comments_url'] = util.URL_ORIGIN + '/comments?ctoken=' + comments_info['ctoken'] comments_info['page_number'] = page_number = str(int(comments_info['offset']/20) + 1) @@ -222,7 +134,7 @@ def post_process_comments_info(comments_info): def video_comments(video_id, sort=0, offset=0, lc='', secret_key=''): if settings.comments_mode: - comments_info = parse_comments_polymer(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key))) + comments_info = yt_data_extract.extract_comments_info(request_comments(make_comment_ctoken(video_id, sort, offset, lc, secret_key))) post_process_comments_info(comments_info) post_comment_url = util.URL_ORIGIN + "/post_comment?video_id=" + video_id @@ -247,7 +159,7 @@ def get_comments_page(): ctoken = comment_replies_ctoken(video_id, parent_id) replies = True - comments_info = parse_comments_polymer(request_comments(ctoken, replies)) + comments_info = yt_data_extract.extract_comments_info(request_comments(ctoken, replies)) post_process_comments_info(comments_info) if not replies: diff --git a/youtube/local_playlist.py b/youtube/local_playlist.py index cc6132a..3a058b3 100644 --- a/youtube/local_playlist.py +++ b/youtube/local_playlist.py @@ -57,7 +57,7 @@ def get_local_playlist_videos(name, offset=0, amount=50): info['thumbnail'] = util.get_thumbnail_url(info['id']) missing_thumbnails.append(info['id']) info['type'] = 'video' - yt_data_extract.add_extra_html_info(info) + util.add_extra_html_info(info) videos.append(info) except json.decoder.JSONDecodeError: if not video_json.strip() == '': diff --git a/youtube/playlist.py b/youtube/playlist.py index 3e5b0d2..3ca235a 100644 --- a/youtube/playlist.py +++ b/youtube/playlist.py @@ -89,28 +89,29 @@ def get_playlist_page(): ) gevent.joinall(tasks) first_page_json, this_page_json = tasks[0].value, tasks[1].value - - try: # first page - video_list = this_page_json['response']['contents']['singleColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents'] - except KeyError: # other pages - video_list = this_page_json['response']['continuationContents']['playlistVideoListContinuation']['contents'] - - parsed_video_list = [yt_data_extract.parse_info_prepare_for_html(video_json) for video_json in video_list] + info = yt_data_extract.extract_playlist_info(this_page_json) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) - metadata = yt_data_extract.renderer_info(first_page_json['response']['header']) - yt_data_extract.prefix_urls(metadata) + if page != '1': + info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json) - if 'description' not in metadata: - metadata['description'] = '' + util.prefix_urls(info['metadata']) + for item in info.get('items', ()): + util.prefix_urls(item) + util.add_extra_html_info(item) + if 'id' in item: + item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg' - video_count = int(metadata['size'].replace(',', '')) - metadata['size'] += ' videos' + video_count = yt_data_extract.deep_get(info, 'metadata', 'video_count') + if video_count is None: + video_count = 40 return flask.render_template('playlist.html', - video_list = parsed_video_list, + video_list = info.get('items', []), num_pages = math.ceil(video_count/20), parameters_dictionary = request.args, - **metadata + **info['metadata'] ).encode('utf-8') diff --git a/youtube/post_comment.py b/youtube/post_comment.py index 25d0e3a..78f080f 100644 --- a/youtube/post_comment.py +++ b/youtube/post_comment.py @@ -70,7 +70,7 @@ def _post_comment_reply(text, video_id, parent_comment_id, session_token, cookie print("Comment posting code: " + code) return code -def _delete_comment(video_id, comment_id, author_id, session_token, cookiejar): +def _delete_comment(video_id, comment_id, session_token, cookiejar): headers = { 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1', 'Accept': '*/*', @@ -79,7 +79,7 @@ def _delete_comment(video_id, comment_id, author_id, session_token, cookiejar): 'X-YouTube-Client-Version': '2.20180823', 'Content-Type': 'application/x-www-form-urlencoded', } - action = proto.uint(1,6) + proto.string(3, comment_id) + proto.string(5, video_id) + proto.string(9, author_id) + action = proto.uint(1,6) + proto.string(3, comment_id) + proto.string(5, video_id) action = proto.percent_b64encode(action).decode('ascii') sej = json.dumps({"clickTrackingParams":"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=","commandMetadata":{"webCommandMetadata":{"url":"/service_ajax","sendPost":True}},"performCommentActionEndpoint":{"action":action}}) @@ -115,7 +115,7 @@ def delete_comment(): cookiejar = accounts.account_cookiejar(request.values['channel_id']) token = get_session_token(video_id, cookiejar) - code = _delete_comment(video_id, request.values['comment_id'], request.values['author_id'], token, cookiejar) + code = _delete_comment(video_id, request.values['comment_id'], token, cookiejar) if code == "SUCCESS": return flask.redirect(util.URL_ORIGIN + '/comment_delete_success', 303) @@ -147,7 +147,7 @@ def post_comment(): @yt_app.route('/delete_comment', methods=['GET']) def get_delete_comment_page(): - parameters = [(parameter_name, request.args[parameter_name]) for parameter_name in ('video_id', 'channel_id', 'author_id', 'comment_id')] + parameters = [(parameter_name, request.args[parameter_name]) for parameter_name in ('video_id', 'channel_id', 'comment_id')] return flask.render_template('delete_comment.html', parameters = parameters) diff --git a/youtube/search.py b/youtube/search.py index e167279..0f6bbc4 100644 --- a/youtube/search.py +++ b/youtube/search.py @@ -5,7 +5,6 @@ import settings import json import urllib import base64 -from math import ceil import mimetypes from flask import request import flask @@ -74,59 +73,34 @@ def get_search_page(): filters['time'] = int(request.args.get("time", "0")) filters['type'] = int(request.args.get("type", "0")) filters['duration'] = int(request.args.get("duration", "0")) - info = get_search_json(query, page, autocorrect, sort, filters) - - estimated_results = int(info[1]['response']['estimatedResults']) - estimated_pages = ceil(estimated_results/20) - - # almost always is the first "section", but if there's an advertisement for a google product like Stadia or Home in the search results, then that becomes the first "section" and the search results are in the second. So just join all of them for resiliency - results = [] - for section in info[1]['response']['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents']: - results += section['itemSectionRenderer']['contents'] - - parsed_results = [] - corrections = {'type': None} - for renderer in results: - type = list(renderer.keys())[0] - if type == 'shelfRenderer': - continue - if type == 'didYouMeanRenderer': - renderer = renderer[type] - corrected_query_string = request.args.to_dict(flat=False) - corrected_query_string['query'] = [renderer['correctedQueryEndpoint']['searchEndpoint']['query']] - corrected_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True) - - corrections = { - 'type': 'did_you_mean', - 'corrected_query': yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']), - 'corrected_query_url': corrected_query_url, - } - continue - if type == 'showingResultsForRenderer': - renderer = renderer[type] - no_autocorrect_query_string = request.args.to_dict(flat=False) - no_autocorrect_query_string['autocorrect'] = ['0'] - no_autocorrect_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True) - - corrections = { - 'type': 'showing_results_for', - 'corrected_query': yt_data_extract.format_text_runs(renderer['correctedQuery']['runs']), - 'original_query_url': no_autocorrect_query_url, - 'original_query': renderer['originalQuery']['simpleText'], - } - continue - - info = yt_data_extract.parse_info_prepare_for_html(renderer) - if info['type'] != 'unsupported': - parsed_results.append(info) + polymer_json = get_search_json(query, page, autocorrect, sort, filters) + + search_info = yt_data_extract.extract_search_info(polymer_json) + if search_info['error']: + return flask.render_template('error.html', error_message = search_info['error']) + + for extract_item_info in search_info['items']: + util.prefix_urls(extract_item_info) + util.add_extra_html_info(extract_item_info) + + corrections = search_info['corrections'] + if corrections['type'] == 'did_you_mean': + corrected_query_string = request.args.to_dict(flat=False) + corrected_query_string['query'] = [corrections['corrected_query']] + corrections['corrected_query_url'] = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(corrected_query_string, doseq=True) + elif corrections['type'] == 'showing_results_for': + no_autocorrect_query_string = request.args.to_dict(flat=False) + no_autocorrect_query_string['autocorrect'] = ['0'] + no_autocorrect_query_url = util.URL_ORIGIN + '/search?' + urllib.parse.urlencode(no_autocorrect_query_string, doseq=True) + corrections['original_query_url'] = no_autocorrect_query_url return flask.render_template('search.html', header_playlist_names = local_playlist.get_playlist_names(), query = query, - estimated_results = estimated_results, - estimated_pages = estimated_pages, - corrections = corrections, - results = parsed_results, + estimated_results = search_info['estimated_results'], + estimated_pages = search_info['estimated_pages'], + corrections = search_info['corrections'], + results = search_info['items'], parameters_dictionary = request.args, ) diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py index c9638cf..18436e2 100644 --- a/youtube/subscriptions.py +++ b/youtube/subscriptions.py @@ -172,7 +172,7 @@ def _get_videos(cursor, number_per_page, offset, tag = None): 'id': db_video[0], 'title': db_video[1], 'duration': db_video[2], - 'published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]), + 'time_published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]), 'author': db_video[5], }) @@ -455,10 +455,17 @@ def _get_upstream_videos(channel_id): print('Failed to read atoma feed for ' + channel_status_name) traceback.print_exc() - videos = channel.extract_info(json.loads(channel_tab), 'videos')['items'] + channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos') + if channel_info['error']: + print('Error checking channel ' + channel_status_name + ': ' + channel_info['error']) + return + + videos = channel_info['items'] for i, video_item in enumerate(videos): - if 'description' not in video_item: + if not video_item.get('description'): video_item['description'] = '' + else: + video_item['description'] = ''.join(run.get('text', '') for run in video_item['description']) if video_item['id'] in times_published: video_item['time_published'] = times_published[video_item['id']] @@ -466,7 +473,7 @@ def _get_upstream_videos(channel_id): else: video_item['is_time_published_exact'] = False try: - video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i # subtract a few seconds off the videos so they will be in the right order + video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i # subtract a few seconds off the videos so they will be in the right order except KeyError: print(video_item) @@ -759,7 +766,7 @@ def get_subscriptions_page(): video['thumbnail'] = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video['id'] + '.jpg' video['type'] = 'video' video['item_size'] = 'small' - yt_data_extract.add_extra_html_info(video) + util.add_extra_html_info(video) tags = _get_all_tags(cursor) diff --git a/youtube/templates/comments.html b/youtube/templates/comments.html index 20cde4e..396852a 100644 --- a/youtube/templates/comments.html +++ b/youtube/templates/comments.html @@ -12,11 +12,11 @@ <a class="author" href="{{ comment['author_url'] }}" title="{{ comment['author'] }}">{{ comment['author'] }}</a> </address> <a class="permalink" href="{{ comment['permalink'] }}" title="permalink"> - <time datetime="">{{ comment['published'] }}</time> + <time datetime="">{{ comment['time_published'] }}</time> </a> <span class="text">{{ common_elements.text_runs(comment['text']) }}</span> - <span class="likes">{{ comment['likes_text'] if comment['likes'] else ''}}</span> + <span class="likes">{{ comment['likes_text'] if comment['like_count'] else ''}}</span> <div class="bottom-row"> <a href="{{ comment['replies_url'] }}" class="replies">{{ comment['view_replies_text'] }}</a> {% if 'delete_url' is in comment %} diff --git a/youtube/templates/common_elements.html b/youtube/templates/common_elements.html index 67655b3..7914c08 100644 --- a/youtube/templates/common_elements.html +++ b/youtube/templates/common_elements.html @@ -9,53 +9,59 @@ {{ text_run["text"] }} {%- endif -%} {%- endfor -%} - {%- else -%} + {%- elif runs -%} {{ runs }} {%- endif -%} {% endmacro %} -{% macro item(info, description=false, horizontal=true, include_author=true) %} +{% macro item(info, description=false, horizontal=true, include_author=true, include_badges=true) %} <div class="item-box {{ info['type'] + '-item-box' }} {{'horizontal-item-box' if horizontal else 'vertical-item-box'}} {{'has-description' if description else 'no-description'}}"> - <div class="item {{ info['type'] + '-item' }}"> - <a class="thumbnail-box" href="{{ info['url'] }}" title="{{ info['title'] }}"> - <img class="thumbnail-img" src="{{ info['thumbnail'] }}"> - {% if info['type'] != 'channel' %} - <div class="thumbnail-info"> - <span>{{ info['size'] if info['type'] == 'playlist' else info['duration'] }}</span> - </div> - {% endif %} - </a> + {% if info['error'] %} + {{ info['error'] }} + {% else %} + <div class="item {{ info['type'] + '-item' }}"> + <a class="thumbnail-box" href="{{ info['url'] }}" title="{{ info['title'] }}"> + <img class="thumbnail-img" src="{{ info['thumbnail'] }}"> + {% if info['type'] != 'channel' %} + <div class="thumbnail-info"> + <span>{{ (info['video_count']|string + ' videos') if info['type'] == 'playlist' else info['duration'] }}</span> + </div> + {% endif %} + </a> - <div class="title"><a class="title" href="{{ info['url'] }}" title="{{ info['title'] }}">{{ info['title'] }}</a></div> + <div class="title"><a class="title" href="{{ info['url'] }}" title="{{ info['title'] }}">{{ info['title'] }}</a></div> - <ul class="stats {{'vertical-stats' if horizontal and not description and include_author else 'horizontal-stats'}}"> - {% if info['type'] == 'channel' %} - <li><span>{{ info['subscriber_count'] }} subscribers</span></li> - <li><span>{{ info['size'] }} videos</span></li> - {% else %} - {% if include_author %} - {% if 'author_url' is in(info) %} - <li><address title="{{ info['author'] }}">By <a href="{{ info['author_url'] }}">{{ info['author'] }}</a></address></li> - {% else %} - <li><address title="{{ info['author'] }}"><b>{{ info['author'] }}</b></address></li> - {% endif %} - {% endif %} - {% if 'views' is in(info) %} - <li><span class="views">{{ info['views'] }}</span></li> - {% endif %} - {% if 'published' is in(info) %} - <li><time>{{ info['published'] }}</time></li> + {% if include_author %} + {% if info.get('author_url') %} + <address title="{{ info['author'] }}">By <a href="{{ info['author_url'] }}">{{ info['author'] }}</a></address> + {% else %} + <address title="{{ info['author'] }}"><b>{{ info['author'] }}</b></address> {% endif %} {% endif %} - </ul> + <ul class="stats {{'horizontal-stats' if horizontal else 'vertical-stats'}}"> + {% if info['type'] == 'channel' %} + <li><span>{{ info['approx_subscriber_count'] }} subscribers</span></li> + <li><span>{{ info['video_count'] }} videos</span></li> + {% else %} + {% if info.get('approx_view_count') %} + <li><span class="views">{{ info['approx_view_count'] }} views</span></li> + {% endif %} + {% if info.get('time_published') %} + <li><time>{{ info['time_published'] }}</time></li> + {% endif %} + {% endif %} + </ul> - {% if description %} - <span class="description">{{ text_runs(info.get('description', '')) }}</span> + {% if description %} + <span class="description">{{ text_runs(info.get('description', '')) }}</span> + {% endif %} + {% if include_badges %} + <span class="badges">{{ info['badges']|join(' | ') }}</span> + {% endif %} + </div> + {% if info['type'] == 'video' %} + <input class="item-checkbox" type="checkbox" name="video_info_list" value="{{ info['video_info'] }}" form="playlist-edit"> {% endif %} - <span class="badges">{{ info['badges']|join(' | ') }}</span> - </div> - {% if info['type'] == 'video' %} - <input class="item-checkbox" type="checkbox" name="video_info_list" value="{{ info['video_info'] }}" form="playlist-edit"> {% endif %} </div> diff --git a/youtube/templates/playlist.html b/youtube/templates/playlist.html index ab2640f..ebd152b 100644 --- a/youtube/templates/playlist.html +++ b/youtube/templates/playlist.html @@ -54,8 +54,9 @@ <h2 class="playlist-title">{{ title }}</h2> <a class="playlist-author" href="{{ author_url }}">{{ author }}</a> <div class="playlist-stats"> - <div>{{ views }}</div> - <div>{{ size }}</div> + <div>{{ video_count|commatize }} videos</div> + <div>{{ view_count|commatize }} views</div> + <div>Last updated {{ time_published }}</div> </div> <div class="playlist-description">{{ common_elements.text_runs(description) }}</div> </div> diff --git a/youtube/templates/search.html b/youtube/templates/search.html index aef914a..8b803e7 100644 --- a/youtube/templates/search.html +++ b/youtube/templates/search.html @@ -29,10 +29,10 @@ <div id="result-info"> <div id="number-of-results">Approximately {{ '{:,}'.format(estimated_results) }} results ({{ '{:,}'.format(estimated_pages) }} pages)</div> {% if corrections['type'] == 'showing_results_for' %} - <div>Showing results for <a>{{ corrections['corrected_query']|safe }}</a></div> - <div>Search instead for <a href="{{ corrections['original_query_url'] }}">{{ corrections['original_query'] }}</a></div> + <div>Showing results for <a>{{ common_elements.text_runs(corrections['corrected_query_text']) }}</a></div> + <div>Search instead for <a href="{{ corrections['original_query_url'] }}">{{ corrections['original_query_text'] }}</a></div> {% elif corrections['type'] == 'did_you_mean' %} - <div>Did you mean <a href="{{ corrections['corrected_query_url'] }}">{{ corrections['corrected_query']|safe }}</a></div> + <div>Did you mean <a href="{{ corrections['corrected_query_url'] }}">{{ common_elements.text_runs(corrections['corrected_query_text']) }}</a></div> {% endif %} </div> <div class="item-list"> diff --git a/youtube/templates/watch.html b/youtube/templates/watch.html index 25bd34c..a06e895 100644 --- a/youtube/templates/watch.html +++ b/youtube/templates/watch.html @@ -14,6 +14,19 @@ text-decoration: underline; } + .playability-error{ + height: 360px; + width: 640px; + grid-column: 2; + background-color: var(--video-background-color); + text-align:center; + } + .playability-error span{ + position: relative; + top: 50%; + transform: translate(-50%, -50%); + } + {% if theater_mode %} video{ grid-column: 1 / span 5; @@ -61,12 +74,21 @@ grid-column: 1 / span 2; min-width: 0; } - .video-info > .is-unlisted{ - background-color: var(--interface-color); + .video-info > .labels{ justify-self:start; - padding-left:2px; - padding-right:2px; + list-style: none; + padding: 0px; + margin: 5px 0px; + } + .video-info > .labels:empty{ + margin: 0px; } + .labels > li{ + display: inline; + margin-right:5px; + background-color: var(--interface-color); + padding: 2px 5px + } .video-info > address{ grid-column: 1; grid-row: 3; @@ -143,9 +165,13 @@ .related-videos-inner{ padding-top: 10px; display: grid; - grid-auto-rows: 94px; + grid-auto-rows: 90px; grid-row-gap: 10px; } + .thumbnail-box{ /* overides rule in shared.css */ + height: 90px !important; + width: 120px !important; + } /* Put related vids below videos when window is too small */ /* 1100px instead of 1080 because W3C is full of idiots who include scrollbar width */ @@ -187,38 +213,59 @@ .format-ext{ width: 60px; } - .format-res{ - width:90px; + .format-video-quality{ + width: 140px; + } + .format-audio-quality{ + width: 120px; + } + .format-file-size{ + width: 80px; + } + .format-codecs{ + width: 120px; } {% endblock style %} {% block main %} - <video controls autofocus> - {% for video_source in video_sources %} - <source src="{{ video_source['src'] }}" type="{{ video_source['type'] }}"> - {% endfor %} + {% if playability_error %} + <div class="playability-error"><span>{{ 'Error: ' + playability_error }}</span></div> + {% else %} + <video controls autofocus class="video"> + {% for video_source in video_sources %} + <source src="{{ video_source['src'] }}" type="{{ video_source['type'] }}"> + {% endfor %} - {% for source in subtitle_sources %} - {% if source['on'] %} - <track label="{{ source['label'] }}" src="{{ source['url'] }}" kind="subtitles" srclang="{{ source['srclang'] }}" default> - {% else %} - <track label="{{ source['label'] }}" src="{{ source['url'] }}" kind="subtitles" srclang="{{ source['srclang'] }}"> - {% endif %} - {% endfor %} + {% for source in subtitle_sources %} + {% if source['on'] %} + <track label="{{ source['label'] }}" src="{{ source['url'] }}" kind="subtitles" srclang="{{ source['srclang'] }}" default> + {% else %} + <track label="{{ source['label'] }}" src="{{ source['url'] }}" kind="subtitles" srclang="{{ source['srclang'] }}"> + {% endif %} + {% endfor %} - </video> + </video> + {% endif %} <div class="video-info"> <h2 class="title">{{ title }}</h2> - {% if unlisted %} - <span class="is-unlisted">Unlisted</span> - {% endif %} + <ul class="labels"> + {%- if unlisted -%} + <li class="is-unlisted">Unlisted</li> + {%- endif -%} + {%- if age_restricted -%} + <li class="age-restricted">Age-restricted</li> + {%- endif -%} + {%- if limited_state -%} + <li>Limited state</li> + {%- endif -%} + </ul> <address>Uploaded by <a href="{{ uploader_channel_url }}">{{ uploader }}</a></address> - <span class="views">{{ views }} views</span> + <span class="views">{{ view_count }} views</span> - <time datetime="$upload_date">Published on {{ upload_date }}</time> - <span class="likes-dislikes">{{ likes }} likes {{ dislikes }} dislikes</span> + <time datetime="$upload_date">Published on {{ time_published }}</time> + <span class="likes-dislikes">{{ like_count }} likes {{ dislike_count }} dislikes</span> <details class="download-dropdown"> <summary class="download-dropdown-label">Download</summary> <ul class="download-dropdown-content"> @@ -227,8 +274,10 @@ <a class="download-link" href="{{ format['url'] }}"> <ol class="format-attributes"> <li class="format-ext">{{ format['ext'] }}</li> - <li class="format-res">{{ format['resolution'] }}</li> - <li class="format-note">{{ format['note'] }}</li> + <li class="format-video-quality">{{ format['video_quality'] }}</li> + <li class="format-audio-quality">{{ format['audio_quality'] }}</li> + <li class="format-file-size">{{ format['file_size'] }}</li> + <li class="format-codecs">{{ format['codecs'] }}</li> </ol> </a> </li> @@ -238,7 +287,7 @@ <input class="checkbox" name="video_info_list" value="{{ video_info }}" form="playlist-edit" type="checkbox"> - <span class="description">{{ description }}</span> + <span class="description">{{ common_elements.text_runs(description) }}</span> <div class="music-list"> {% if music_list.__len__() != 0 %} <hr> @@ -266,7 +315,7 @@ <summary>Related Videos</summary> <nav class="related-videos-inner"> {% for info in related %} - {{ common_elements.item(info) }} + {{ common_elements.item(info, include_badges=false) }} {% endfor %} </nav> </details> diff --git a/youtube/util.py b/youtube/util.py index 2205645..feeec8c 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -1,4 +1,5 @@ import settings +from youtube import yt_data_extract import socks, sockshandler import gzip import brotli @@ -6,6 +7,7 @@ import urllib.parse import re import time import os +import json import gevent import gevent.queue import gevent.lock @@ -176,7 +178,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja return content, response return content -mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1' +mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36' mobile_ua = (('User-Agent', mobile_user_agent),) desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0' desktop_ua = (('User-Agent', desktop_user_agent),) @@ -277,15 +279,6 @@ def video_id(url): url_parts = urllib.parse.urlparse(url) return urllib.parse.parse_qs(url_parts.query)['v'][0] -def default_multi_get(object, *keys, default): - ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors ''' - try: - for key in keys: - object = object[key] - return object - except (IndexError, KeyError): - return default - # default, sddefault, mqdefault, hqdefault, hq720 def get_thumbnail_url(video_id): @@ -317,3 +310,52 @@ def uppercase_escape(s): return re.sub( r'\\U([0-9a-fA-F]{8})', lambda m: chr(int(m.group(1), base=16)), s) + +def prefix_url(url): + if url is None: + return None + url = url.lstrip('/') # some urls have // before them, which has a special meaning + return '/' + url + +def left_remove(string, substring): + '''removes substring from the start of string, if present''' + if string.startswith(substring): + return string[len(substring):] + return string + + +def prefix_urls(item): + try: + item['thumbnail'] = prefix_url(item['thumbnail']) + except KeyError: + pass + + try: + item['author_url'] = prefix_url(item['author_url']) + except KeyError: + pass + +def add_extra_html_info(item): + if item['type'] == 'video': + item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None + + video_info = {} + for key in ('id', 'title', 'author', 'duration'): + try: + video_info[key] = item[key] + except KeyError: + video_info[key] = '' + + item['video_info'] = json.dumps(video_info) + + elif item['type'] == 'playlist': + item['url'] = (URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None + elif item['type'] == 'channel': + item['url'] = (URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None + +def parse_info_prepare_for_html(renderer, additional_info={}): + item = yt_data_extract.extract_item_info(renderer, additional_info) + prefix_urls(item) + add_extra_html_info(item) + + return item diff --git a/youtube/watch.py b/youtube/watch.py index 41c90e4..429f272 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -5,49 +5,20 @@ import settings from flask import request import flask -from youtube_dl.YoutubeDL import YoutubeDL -from youtube_dl.extractor.youtube import YoutubeError import json import html import gevent import os +import math +import traceback +import urllib +try: + with open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'r') as f: + decrypt_cache = json.loads(f.read())['decrypt_cache'] +except FileNotFoundError: + decrypt_cache = {} -def get_related_items(info): - results = [] - for item in info['related_vids']: - if 'list' in item: # playlist: - result = watch_page_related_playlist_info(item) - else: - result = watch_page_related_video_info(item) - yt_data_extract.prefix_urls(result) - yt_data_extract.add_extra_html_info(result) - results.append(result) - return results - - -# json of related items retrieved directly from the watch page has different names for everything -# converts these to standard names -def watch_page_related_video_info(item): - result = {key: item[key] for key in ('id', 'title', 'author')} - result['duration'] = util.seconds_to_timestamp(item['length_seconds']) - try: - result['views'] = item['short_view_count_text'] - except KeyError: - result['views'] = '' - result['thumbnail'] = util.get_thumbnail_url(item['id']) - result['type'] = 'video' - return result - -def watch_page_related_playlist_info(item): - return { - 'size': item['playlist_length'] if item['playlist_length'] != "0" else "50+", - 'title': item['playlist_title'], - 'id': item['list'], - 'first_video_id': item['video_id'], - 'thumbnail': util.get_thumbnail_url(item['video_id']), - 'type': 'playlist', - } def get_video_sources(info): video_sources = [] @@ -55,9 +26,10 @@ def get_video_sources(info): max_resolution = 360 else: max_resolution = settings.default_resolution - for format in info['formats']: - if format['acodec'] != 'none' and format['vcodec'] != 'none' and format['height'] <= max_resolution: + if not all(format[attr] for attr in ('height', 'width', 'ext', 'url')): + continue + if format['acodec'] and format['vcodec'] and format['height'] <= max_resolution: video_sources.append({ 'src': format['url'], 'type': 'video/' + format['ext'], @@ -71,50 +43,108 @@ def get_video_sources(info): return video_sources +def make_caption_src(info, lang, auto=False, trans_lang=None): + label = lang + if auto: + label += ' (Automatic)' + if trans_lang: + label += ' -> ' + trans_lang + return { + 'url': '/' + yt_data_extract.get_caption_url(info, lang, 'vtt', auto, trans_lang), + 'label': label, + 'srclang': trans_lang[0:2] if trans_lang else lang[0:2], + 'on': False, + } + +def lang_in(lang, sequence): + '''Tests if the language is in sequence, with e.g. en and en-US considered the same''' + if lang is None: + return False + lang = lang[0:2] + return lang in (l[0:2] for l in sequence) + +def lang_eq(lang1, lang2): + '''Tests if two iso 639-1 codes are equal, with en and en-US considered the same. + Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model''' + if lang1 is None or lang2 is None: + return False + return lang1[0:2] == lang2[0:2] + +def equiv_lang_in(lang, sequence): + '''Extracts a language in sequence which is equivalent to lang. + e.g. if lang is en, extracts en-GB from sequence. + Necessary because if only a specific variant like en-GB is available, can't ask Youtube for simply en. Need to get the available variant.''' + lang = lang[0:2] + for l in sequence: + if l[0:2] == lang: + return l + return None + def get_subtitle_sources(info): + '''Returns these sources, ordered from least to most intelligible: + native_video_lang (Automatic) + foreign_langs (Manual) + native_video_lang (Automatic) -> pref_lang + foreign_langs (Manual) -> pref_lang + native_video_lang (Manual) -> pref_lang + pref_lang (Automatic) + pref_lang (Manual)''' sources = [] - default_found = False - default = None - for language, formats in info['subtitles'].items(): - for format in formats: - if format['ext'] == 'vtt': - source = { - 'url': '/' + format['url'], - 'label': language, - 'srclang': language, - - # set as on by default if this is the preferred language and a default-on subtitles mode is in settings - 'on': language == settings.subtitles_language and settings.subtitles_mode > 0, - } - - if language == settings.subtitles_language: - default_found = True - default = source - else: - sources.append(source) - break - - # Put it at the end to avoid browser bug when there are too many languages - # (in firefox, it is impossible to select a language near the top of the list because it is cut off) - if default_found: - sources.append(default) + pref_lang = settings.subtitles_language + native_video_lang = None + if info['automatic_caption_languages']: + native_video_lang = info['automatic_caption_languages'][0] - try: - formats = info['automatic_captions'][settings.subtitles_language] - except KeyError: - pass - else: - for format in formats: - if format['ext'] == 'vtt': - sources.append({ - 'url': '/' + format['url'], - 'label': settings.subtitles_language + ' - Automatic', - 'srclang': settings.subtitles_language, + highest_fidelity_is_manual = False - # set as on by default if this is the preferred language and a default-on subtitles mode is in settings - 'on': settings.subtitles_mode == 2 and not default_found, + # Sources are added in very specific order outlined above + # More intelligible sources are put further down to avoid browser bug when there are too many languages + # (in firefox, it is impossible to select a language near the top of the list because it is cut off) - }) + # native_video_lang (Automatic) + if native_video_lang and not lang_eq(native_video_lang, pref_lang): + sources.append(make_caption_src(info, native_video_lang, auto=True)) + + # foreign_langs (Manual) + for lang in info['manual_caption_languages']: + if not lang_eq(lang, pref_lang): + sources.append(make_caption_src(info, lang)) + + if (lang_in(pref_lang, info['translation_languages']) + and not lang_in(pref_lang, info['automatic_caption_languages']) + and not lang_in(pref_lang, info['manual_caption_languages'])): + # native_video_lang (Automatic) -> pref_lang + if native_video_lang and not lang_eq(pref_lang, native_video_lang): + sources.append(make_caption_src(info, native_video_lang, auto=True, trans_lang=pref_lang)) + + # foreign_langs (Manual) -> pref_lang + for lang in info['manual_caption_languages']: + if not lang_eq(lang, native_video_lang) and not lang_eq(lang, pref_lang): + sources.append(make_caption_src(info, lang, trans_lang=pref_lang)) + + # native_video_lang (Manual) -> pref_lang + if lang_in(native_video_lang, info['manual_caption_languages']): + sources.append(make_caption_src(info, native_video_lang, trans_lang=pref_lang)) + + # pref_lang (Automatic) + if lang_in(pref_lang, info['automatic_caption_languages']): + sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['automatic_caption_languages']), auto=True)) + + # pref_lang (Manual) + if lang_in(pref_lang, info['manual_caption_languages']): + sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['manual_caption_languages']))) + highest_fidelity_is_manual = True + + if sources and sources[-1]['srclang'] == pref_lang: + # set as on by default since it's manual a default-on subtitles mode is in settings + if highest_fidelity_is_manual and settings.subtitles_mode > 0: + sources[-1]['on'] = True + # set as on by default since settings indicate to set it as such even if it's not manual + elif settings.subtitles_mode == 2: + sources[-1]['on'] = True + + if len(sources) == 0: + assert len(info['automatic_caption_languages']) == 0 and len(info['manual_caption_languages']) == 0 return sources @@ -134,14 +164,111 @@ def get_ordered_music_list_attributes(music_list): return ordered_attributes - -def extract_info(downloader, *args, **kwargs): +def save_decrypt_cache(): try: - return downloader.extract_info(*args, **kwargs) - except YoutubeError as e: - return str(e) - - + f = open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'w') + except FileNotFoundError: + os.makedirs(settings.data_dir) + f = open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'w') + + f.write(json.dumps({'version': 1, 'decrypt_cache':decrypt_cache}, indent=4, sort_keys=True)) + f.close() + +def decrypt_signatures(info): + '''return error string, or False if no errors''' + if not yt_data_extract.requires_decryption(info): + return False + if not info['player_name']: + return 'Could not find player name' + if not info['base_js']: + return 'Failed to find base.js' + + player_name = info['player_name'] + if player_name in decrypt_cache: + print('Using cached decryption function for: ' + player_name) + info['decryption_function'] = decrypt_cache[player_name] + else: + base_js = util.fetch_url(info['base_js'], debug_name='base.js', report_text='Fetched player ' + player_name) + base_js = base_js.decode('utf-8') + err = yt_data_extract.extract_decryption_function(info, base_js) + if err: + return err + decrypt_cache[player_name] = info['decryption_function'] + save_decrypt_cache() + err = yt_data_extract.decrypt_signatures(info) + return err + +headers = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '2'), + ('X-YouTube-Client-Version', '2.20180830'), +) + util.mobile_ua + +def extract_info(video_id): + polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1&bpctr=9999999999', headers=headers, debug_name='watch').decode('utf-8') + # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info + try: + polymer_json = json.loads(polymer_json) + except json.decoder.JSONDecodeError: + traceback.print_exc() + return {'error': 'Failed to parse json response'} + info = yt_data_extract.extract_watch_info(polymer_json) + + # age restriction bypass + if info['age_restricted']: + print('Fetching age restriction bypass page') + data = { + 'video_id': video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + } + url = 'https://www.youtube.com/get_video_info?' + urllib.parse.urlencode(data) + video_info_page = util.fetch_url(url, debug_name='get_video_info', report_text='Fetched age restriction bypass page').decode('utf-8') + yt_data_extract.update_with_age_restricted_info(info, video_info_page) + + # signature decryption + decryption_error = decrypt_signatures(info) + if decryption_error: + decryption_error = 'Error decrypting url signatures: ' + decryption_error + info['playability_error'] = decryption_error + + return info + +def video_quality_string(format): + if format['vcodec']: + result =str(format['width'] or '?') + 'x' + str(format['height'] or '?') + if format['fps']: + result += ' ' + str(format['fps']) + 'fps' + return result + elif format['acodec']: + return 'audio only' + + return '?' + +def audio_quality_string(format): + if format['acodec']: + result = str(format['audio_bitrate'] or '?') + 'k' + if format['audio_sample_rate']: + result += ' ' + str(format['audio_sample_rate']) + ' Hz' + return result + elif format['vcodec']: + return 'video only' + + return '?' + +# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py +def format_bytes(bytes): + if bytes is None: + return 'N/A' + if type(bytes) is str: + bytes = float(bytes) + if bytes == 0.0: + exponent = 0 + else: + exponent = int(math.log(bytes, 1024.0)) + suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent] + converted = float(bytes) / float(1024 ** exponent) + return '%.2f%s' % (converted, suffix) @yt_app.route('/watch') @@ -152,38 +279,26 @@ def get_watch_page(): flask.abort(flask.Response('Incomplete video id (too short): ' + video_id)) lc = request.args.get('lc', '') - if settings.route_tor: - proxy = 'socks5://127.0.0.1:9150/' - else: - proxy = '' - yt_dl_downloader = YoutubeDL(params={'youtube_include_dash_manifest':False, 'proxy':proxy}) tasks = ( gevent.spawn(comments.video_comments, video_id, int(settings.default_comment_sorting), lc=lc ), - gevent.spawn(extract_info, yt_dl_downloader, "https://www.youtube.com/watch?v=" + video_id, download=False) + gevent.spawn(extract_info, video_id) ) gevent.joinall(tasks) comments_info, info = tasks[0].value, tasks[1].value - if isinstance(info, str): # youtube error - return flask.render_template('error.html', error_message = info) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) video_info = { - "duration": util.seconds_to_timestamp(info["duration"]), + "duration": util.seconds_to_timestamp(info["duration"] or 0), "id": info['id'], "title": info['title'], - "author": info['uploader'], + "author": info['author'], } - upload_year = info["upload_date"][0:4] - upload_month = info["upload_date"][4:6] - upload_day = info["upload_date"][6:8] - upload_date = upload_month + "/" + upload_day + "/" + upload_year - - if settings.related_videos_mode: - related_videos = get_related_items(info) - else: - related_videos = [] - + for item in info['related_videos']: + util.prefix_urls(item) + util.add_extra_html_info(item) if settings.gather_googlevideo_domains: with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f: @@ -195,31 +310,37 @@ def get_watch_page(): download_formats = [] for format in info['formats']: + if format['acodec'] and format['vcodec']: + codecs_string = format['acodec'] + ', ' + format['vcodec'] + else: + codecs_string = format['acodec'] or format['vcodec'] or '?' download_formats.append({ 'url': format['url'], - 'ext': format['ext'], - 'resolution': yt_dl_downloader.format_resolution(format), - 'note': yt_dl_downloader._format_note(format), + 'ext': format['ext'] or '?', + 'audio_quality': audio_quality_string(format), + 'video_quality': video_quality_string(format), + 'file_size': format_bytes(format['file_size']), + 'codecs': codecs_string, }) video_sources = get_video_sources(info) - video_height = video_sources[0]['height'] - + video_height = yt_data_extract.deep_get(video_sources, 0, 'height', default=360) + video_width = yt_data_extract.deep_get(video_sources, 0, 'width', default=640) # 1 second per pixel, or the actual video width - theater_video_target_width = max(640, info['duration'], video_sources[0]['width']) + theater_video_target_width = max(640, info['duration'] or 0, video_width) return flask.render_template('watch.html', header_playlist_names = local_playlist.get_playlist_names(), - uploader_channel_url = '/' + info['uploader_url'], - upload_date = upload_date, - views = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)), - likes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)), - dislikes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)), + uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '', + time_published = info['time_published'], + view_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)), + like_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)), + dislike_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)), download_formats = download_formats, video_info = json.dumps(video_info), video_sources = video_sources, subtitle_sources = get_subtitle_sources(info), - related = related_videos, + related = info['related_videos'], music_list = info['music_list'], music_attributes = get_ordered_music_list_attributes(info['music_list']), comments_info = comments_info, @@ -232,9 +353,12 @@ def get_watch_page(): theater_video_target_width = theater_video_target_width, title = info['title'], - uploader = info['uploader'], + uploader = info['author'], description = info['description'], unlisted = info['unlisted'], + limited_state = info['limited_state'], + age_restricted = info['age_restricted'], + playability_error = info['playability_error'], ) diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py deleted file mode 100644 index 5419084..0000000 --- a/youtube/yt_data_extract.py +++ /dev/null @@ -1,273 +0,0 @@ -from youtube import util - -import html -import json - -# videos (all of type str): - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# published -# duration -# likes -# dislikes -# views -# playlist_index - -# playlists: - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# updated -# size -# first_video_id - - - - - -def get_plain_text(node): - try: - return node['simpleText'] - except KeyError: - return ''.join(text_run['text'] for text_run in node['runs']) - -def format_text_runs(runs): - if isinstance(runs, str): - return runs - result = '' - for text_run in runs: - if text_run.get("bold", False): - result += "<b>" + html.escape(text_run["text"]) + "</b>" - elif text_run.get('italics', False): - result += "<i>" + html.escape(text_run["text"]) + "</i>" - else: - result += html.escape(text_run["text"]) - return result - - - - - - - - -def get_url(node): - try: - return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - except KeyError: - return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - - -def get_text(node): - if node == {}: - return '' - try: - return node['simpleText'] - except KeyError: - pass - try: - return node['runs'][0]['text'] - except IndexError: # empty text runs - return '' - except KeyError: - print(node) - raise - -def get_formatted_text(node): - try: - return node['runs'] - except KeyError: - return node['simpleText'] - -def get_badges(node): - badges = [] - for badge_node in node: - badge = badge_node['metadataBadgeRenderer']['label'] - badges.append(badge) - return badges - -def get_thumbnail(node): - try: - return node['thumbnails'][0]['url'] # polymer format - except KeyError: - return node['url'] # ajax format - -dispatch = { - -# polymer format - 'title': ('title', get_text), - 'publishedTimeText': ('published', get_text), - 'videoId': ('id', lambda node: node), - 'descriptionSnippet': ('description', get_formatted_text), - 'lengthText': ('duration', get_text), - 'thumbnail': ('thumbnail', get_thumbnail), - 'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']), - - 'viewCountText': ('views', get_text), - 'numVideosText': ('size', lambda node: get_text(node).split(' ')[0]), # the format is "324 videos" - 'videoCountText': ('size', get_text), - 'playlistId': ('id', lambda node: node), - 'descriptionText': ('description', get_formatted_text), - - 'subscriberCountText': ('subscriber_count', get_text), - 'channelId': ('id', lambda node: node), - 'badges': ('badges', get_badges), - -# ajax format - 'view_count_text': ('views', get_text), - 'num_videos_text': ('size', lambda node: get_text(node).split(' ')[0]), - 'owner_text': ('author', get_text), - 'owner_endpoint': ('author_url', lambda node: node['url']), - 'description': ('description', get_formatted_text), - 'index': ('playlist_index', get_text), - 'short_byline': ('author', get_text), - 'length': ('duration', get_text), - 'video_id': ('id', lambda node: node), - -} - -def ajax_info(item_json): - try: - info = {} - for key, node in item_json.items(): - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - return info - except KeyError: - print(item_json) - raise - - - -def prefix_urls(item): - try: - item['thumbnail'] = '/' + item['thumbnail'].lstrip('/') - except KeyError: - pass - - try: - item['author_url'] = util.URL_ORIGIN + item['author_url'] - except KeyError: - pass - -def add_extra_html_info(item): - if item['type'] == 'video': - item['url'] = util.URL_ORIGIN + '/watch?v=' + item['id'] - - video_info = {} - for key in ('id', 'title', 'author', 'duration'): - try: - video_info[key] = item[key] - except KeyError: - video_info[key] = '' - - item['video_info'] = json.dumps(video_info) - - elif item['type'] == 'playlist': - item['url'] = util.URL_ORIGIN + '/playlist?list=' + item['id'] - elif item['type'] == 'channel': - item['url'] = util.URL_ORIGIN + "/channel/" + item['id'] - - -def renderer_info(renderer, additional_info={}): - type = list(renderer.keys())[0] - renderer = renderer[type] - info = {} - if type == 'itemSectionRenderer': - return renderer_info(renderer['contents'][0], additional_info) - - if type in ('movieRenderer', 'clarificationRenderer'): - info['type'] = 'unsupported' - return info - - info.update(additional_info) - - - if type in ('compactVideoRenderer', 'videoRenderer', 'playlistVideoRenderer', 'gridVideoRenderer'): - info['type'] = 'video' - elif type in ('playlistRenderer', 'compactPlaylistRenderer', 'gridPlaylistRenderer', - 'radioRenderer', 'compactRadioRenderer', 'gridRadioRenderer', - 'showRenderer', 'compactShowRenderer', 'gridShowRenderer'): - info['type'] = 'playlist' - elif type == 'channelRenderer': - info['type'] = 'channel' - elif type == 'playlistHeaderRenderer': - info['type'] = 'playlist_metadata' - else: - info['type'] = 'unsupported' - return info - - try: - if 'viewCountText' in renderer: # prefer this one as it contains all the digits - info['views'] = get_text(renderer['viewCountText']) - elif 'shortViewCountText' in renderer: - info['views'] = get_text(renderer['shortViewCountText']) - - if 'ownerText' in renderer: - info['author'] = renderer['ownerText']['runs'][0]['text'] - info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - try: - overlays = renderer['thumbnailOverlays'] - except KeyError: - pass - else: - for overlay in overlays: - if 'thumbnailOverlayTimeStatusRenderer' in overlay: - info['duration'] = get_text(overlay['thumbnailOverlayTimeStatusRenderer']['text']) - # show renderers don't have videoCountText - elif 'thumbnailOverlayBottomPanelRenderer' in overlay: - info['size'] = get_text(overlay['thumbnailOverlayBottomPanelRenderer']['text']) - - # show renderers don't have playlistId, have to dig into the url to get it - try: - info['id'] = renderer['navigationEndpoint']['watchEndpoint']['playlistId'] - except KeyError: - pass - for key, node in renderer.items(): - if key in ('longBylineText', 'shortBylineText'): - info['author'] = get_text(node) - try: - info['author_url'] = get_url(node) - except KeyError: - pass - - # show renderers don't have thumbnail key at top level, dig into thumbnailRenderer - elif key == 'thumbnailRenderer' and 'showCustomThumbnailRenderer' in node: - info['thumbnail'] = node['showCustomThumbnailRenderer']['thumbnail']['thumbnails'][0]['url'] - else: - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - if info['type'] == 'video' and 'duration' not in info: - info['duration'] = 'Live' - - return info - except KeyError: - print(renderer) - raise - - -def parse_info_prepare_for_html(renderer, additional_info={}): - item = renderer_info(renderer, additional_info) - prefix_urls(item) - add_extra_html_info(item) - - return item - - diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py new file mode 100644 index 0000000..898141e --- /dev/null +++ b/youtube/yt_data_extract/__init__.py @@ -0,0 +1,11 @@ +from .common import (get, multi_get, deep_get, multi_deep_get, + liberal_update, conservative_update, remove_redirect, normalize_url, + extract_str, extract_formatted_text, extract_int, extract_approx_int, + extract_date, extract_item_info, extract_items, extract_response) + +from .everything_else import (extract_channel_info, extract_search_info, + extract_playlist_metadata, extract_playlist_info, extract_comments_info) + +from .watch_extraction import (extract_watch_info, get_caption_url, + update_with_age_restricted_info, requires_decryption, + extract_decryption_function, decrypt_signatures) diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py new file mode 100644 index 0000000..4681a86 --- /dev/null +++ b/youtube/yt_data_extract/common.py @@ -0,0 +1,415 @@ +import re +import urllib.parse +import collections + +def get(object, key, default=None, types=()): + '''Like dict.get(), but returns default if the result doesn't match one of the types. + Also works for indexing lists.''' + try: + result = object[key] + except (TypeError, IndexError, KeyError): + return default + + if not types or isinstance(result, types): + return result + else: + return default + +def multi_get(object, *keys, default=None, types=()): + '''Like get, but try other keys if the first fails''' + for key in keys: + try: + result = object[key] + except (TypeError, IndexError, KeyError): + pass + else: + if not types or isinstance(result, types): + return result + else: + continue + return default + + +def deep_get(object, *keys, default=None, types=()): + '''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. + Last argument is the default value to use in case of any IndexErrors or KeyErrors. + If types is given and the result doesn't match one of those types, default is returned''' + try: + for key in keys: + object = object[key] + except (TypeError, IndexError, KeyError): + return default + else: + if not types or isinstance(object, types): + return object + else: + return default + +def multi_deep_get(object, *key_sequences, default=None, types=()): + '''Like deep_get, but can try different key sequences in case one fails. + Return default if all of them fail. key_sequences is a list of lists''' + for key_sequence in key_sequences: + _object = object + try: + for key in key_sequence: + _object = _object[key] + except (TypeError, IndexError, KeyError): + pass + else: + if not types or isinstance(_object, types): + return _object + else: + continue + return default + +def liberal_update(obj, key, value): + '''Updates obj[key] with value as long as value is not None. + Ensures obj[key] will at least get a value of None, however''' + if (value is not None) or (key not in obj): + obj[key] = value + +def conservative_update(obj, key, value): + '''Only updates obj if it doesn't have key or obj[key] is None''' + if obj.get(key) is None: + obj[key] = value + +def remove_redirect(url): + if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking + query_string = url[url.find('?')+1: ] + return urllib.parse.parse_qs(query_string)['q'][0] + return url + +youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$') +def normalize_url(url): + if url is None: + return None + match = youtube_url_re.fullmatch(url) + if match is None: + raise Exception() + + return 'https://www.youtube.com' + match.group(1) + +def _recover_urls(runs): + for run in runs: + url = deep_get(run, 'navigationEndpoint', 'urlEndpoint', 'url') + text = run.get('text', '') + # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text + if url is not None and (text.startswith('http://') or text.startswith('https://')): + url = remove_redirect(url) + run['url'] = url + run['text'] = url # youtube truncates the url text, use actual url instead + +def extract_str(node, default=None, recover_urls=False): + '''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)''' + if isinstance(node, str): + return node + + try: + return node['simpleText'] + except (KeyError, TypeError): + pass + + if isinstance(node, dict) and 'runs' in node: + if recover_urls: + _recover_urls(node['runs']) + return ''.join(text_run.get('text', '') for text_run in node['runs']) + + return default + +def extract_formatted_text(node): + if not node: + return [] + if 'runs' in node: + _recover_urls(node['runs']) + return node['runs'] + elif 'simpleText' in node: + return [{'text': node['simpleText']}] + return [] + +def extract_int(string, default=None): + if isinstance(string, int): + return string + if not isinstance(string, str): + string = extract_str(string) + if not string: + return default + match = re.search(r'(\d+)', string.replace(',', '')) + if match is None: + return default + try: + return int(match.group(1)) + except ValueError: + return default + +def extract_approx_int(string): + '''e.g. "15M" from "15M subscribers"''' + if not isinstance(string, str): + string = extract_str(string) + if not string: + return None + match = re.search(r'(\d+[KMBTkmbt])', string.replace(',', '')) + if match is None: + return None + return match.group(1) + +def extract_date(date_text): + '''Input: "Mar 9, 2019". Output: "2019-3-9"''' + if date_text is None: + return None + + date_text = date_text.replace(',', '').lower() + parts = date_text.split() + if len(parts) >= 3: + month, day, year = parts[-3:] + month = month_abbreviations.get(month[0:3]) # slicing in case they start writing out the full month name + if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None): + return year + '-' + month + '-' + day + +def check_missing_keys(object, *key_sequences): + for key_sequence in key_sequences: + _object = object + try: + for key in key_sequence: + _object = _object[key] + except (KeyError, IndexError, TypeError): + return 'Could not find ' + key + + return None + +def extract_item_info(item, additional_info={}): + if not item: + return {'error': 'No item given'} + + type = get(list(item.keys()), 0) + if not type: + return {'error': 'Could not find type'} + item = item[type] + + info = {'error': None} + if type in ('itemSectionRenderer', 'compactAutoplayRenderer'): + return extract_item_info(deep_get(item, 'contents', 0), additional_info) + + if type in ('movieRenderer', 'clarificationRenderer'): + info['type'] = 'unsupported' + return info + + info.update(additional_info) + + # type looks like e.g. 'compactVideoRenderer' or 'gridVideoRenderer' + # camelCase split, https://stackoverflow.com/a/37697078 + type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()] + if len(type_parts) < 2: + info['type'] = 'unsupported' + return + primary_type = type_parts[-2] + if primary_type == 'video': + info['type'] = 'video' + elif primary_type in ('playlist', 'radio', 'show'): + info['type'] = 'playlist' + elif primary_type == 'channel': + info['type'] = 'channel' + else: + info['type'] = 'unsupported' + + info['title'] = extract_str(item.get('title')) + info['author'] = extract_str(multi_get(item, 'longBylineText', 'shortBylineText', 'ownerText')) + info['author_id'] = extract_str(multi_deep_get(item, + ['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], + ['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], + ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'] + )) + info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None + info['description'] = extract_formatted_text(multi_get(item, 'descriptionSnippet', 'descriptionText')) + info['thumbnail'] = multi_deep_get(item, + ['thumbnail', 'thumbnails', 0, 'url'], # videos + ['thumbnails', 0, 'thumbnails', 0, 'url'], # playlists + ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows + ) + + info['badges'] = [] + for badge_node in multi_get(item, 'badges', 'ownerBadges', default=()): + badge = deep_get(badge_node, 'metadataBadgeRenderer', 'label') + if badge: + info['badges'].append(badge) + + if primary_type in ('video', 'playlist'): + info['time_published'] = extract_str(item.get('publishedTimeText')) + + if primary_type == 'video': + info['id'] = item.get('videoId') + info['view_count'] = extract_int(item.get('viewCountText')) + + # dig into accessibility data to get view_count for videos marked as recommended, and to get time_published + accessibility_label = deep_get(item, 'title', 'accessibility', 'accessibilityData', 'label', default='') + timestamp = re.search(r'(\d+ \w+ ago)', accessibility_label) + if timestamp: + conservative_update(info, 'time_published', timestamp.group(1)) + view_count = re.search(r'(\d+) views', accessibility_label.replace(',', '')) + if view_count: + conservative_update(info, 'view_count', int(view_count.group(1))) + + if info['view_count']: + info['approx_view_count'] = '{:,}'.format(info['view_count']) + else: + info['approx_view_count'] = extract_approx_int(multi_get(item, 'shortViewCountText')) + info['duration'] = extract_str(item.get('lengthText')) + elif primary_type == 'playlist': + info['id'] = item.get('playlistId') + info['video_count'] = extract_int(item.get('videoCount')) + elif primary_type == 'channel': + info['id'] = item.get('channelId') + info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText')) + elif primary_type == 'show': + info['id'] = deep_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId') + + if primary_type in ('playlist', 'channel'): + conservative_update(info, 'video_count', extract_int(item.get('videoCountText'))) + + for overlay in item.get('thumbnailOverlays', []): + conservative_update(info, 'duration', extract_str(deep_get( + overlay, 'thumbnailOverlayTimeStatusRenderer', 'text' + ))) + # show renderers don't have videoCountText + conservative_update(info, 'video_count', extract_int(deep_get( + overlay, 'thumbnailOverlayBottomPanelRenderer', 'text' + ))) + return info + +def extract_response(polymer_json): + '''return response, error''' + response = multi_deep_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict) + if response is None: + return None, 'Failed to extract response' + else: + return response, None + + +list_types = { + 'sectionListRenderer', + 'itemSectionRenderer', + 'gridRenderer', + 'playlistVideoListRenderer', +} + +item_types = { + 'movieRenderer', + 'didYouMeanRenderer', + 'showingResultsForRenderer', + + 'videoRenderer', + 'compactVideoRenderer', + 'compactAutoplayRenderer', + 'gridVideoRenderer', + 'playlistVideoRenderer', + + 'playlistRenderer', + 'compactPlaylistRenderer', + 'gridPlaylistRenderer', + + 'radioRenderer', + 'compactRadioRenderer', + 'gridRadioRenderer', + + 'showRenderer', + 'compactShowRenderer', + 'gridShowRenderer', + + + 'channelRenderer', + 'compactChannelRenderer', + 'gridChannelRenderer', + + 'channelAboutFullMetadataRenderer', +} + +def _traverse_browse_renderer(renderer): + for tab in get(renderer, 'tabs', (), types=(list, tuple)): + tab_renderer = multi_deep_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict) + if tab_renderer is None: + continue + if tab_renderer.get('selected', False): + return get(tab_renderer, 'content', {}, types=(dict)) + print('Could not find tab with content') + return {} + +def _traverse_standard_list(renderer): + renderer_list = multi_deep_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) + continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') + return renderer_list, continuation + +# these renderers contain one inside them +nested_renderer_dispatch = { + 'singleColumnBrowseResultsRenderer': _traverse_browse_renderer, + 'twoColumnBrowseResultsRenderer': _traverse_browse_renderer, + 'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}, types=dict), +} + +# these renderers contain a list of renderers inside them +nested_renderer_list_dispatch = { + 'sectionListRenderer': _traverse_standard_list, + 'itemSectionRenderer': _traverse_standard_list, + 'gridRenderer': _traverse_standard_list, + 'playlistVideoListRenderer': _traverse_standard_list, + 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), +} + +def extract_items(response, item_types=item_types): + '''return items, ctoken''' + if 'continuationContents' in response: + # always has just the one [something]Continuation key, but do this just in case they add some tracking key or something + for key, renderer_continuation in get(response, 'continuationContents', {}, types=dict).items(): + if key.endswith('Continuation'): # e.g. commentSectionContinuation, playlistVideoListContinuation + items = multi_deep_get(renderer_continuation, ['contents'], ['items'], default=[], types=(list, tuple)) + ctoken = deep_get(renderer_continuation, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str) + return items, ctoken + return [], None + elif 'contents' in response: + ctoken = None + items = [] + + iter_stack = collections.deque() + current_iter = iter(()) + + renderer = get(response, 'contents', {}, types=dict) + + while True: + # mode 1: dig into the current renderer + # Will stay in mode 1 (via continue) if a new renderer is found inside this one + # Otherwise, after finding that it is an item renderer, + # contains a list, or contains nothing, + # falls through into mode 2 to get a new renderer + if len(renderer) != 0: + key, value = list(renderer.items())[0] + + # has a list in it, add it to the iter stack + if key in nested_renderer_list_dispatch: + renderer_list, continuation = nested_renderer_list_dispatch[key](value) + if renderer_list: + iter_stack.append(current_iter) + current_iter = iter(renderer_list) + if continuation: + ctoken = continuation + + # new renderer nested inside this one + elif key in nested_renderer_dispatch: + renderer = nested_renderer_dispatch[key](value) + continue # back to mode 1 + + # the renderer is an item + elif key in item_types: + items.append(renderer) + + + # mode 2: get a new renderer by iterating. + # goes up the stack for an iterator if one has been exhausted + while current_iter is not None: + try: + renderer = current_iter.__next__() + break + except StopIteration: + try: + current_iter = iter_stack.pop() # go back up the stack + except IndexError: + return items, ctoken + else: + return [], None diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py new file mode 100644 index 0000000..6dc5248 --- /dev/null +++ b/youtube/yt_data_extract/everything_else.py @@ -0,0 +1,273 @@ +from .common import (get, multi_get, deep_get, multi_deep_get, + liberal_update, conservative_update, remove_redirect, normalize_url, + extract_str, extract_formatted_text, extract_int, extract_approx_int, + extract_date, check_missing_keys, extract_item_info, extract_items, + extract_response) +from youtube import proto + +import re +import urllib +from math import ceil + +def extract_channel_info(polymer_json, tab): + response, err = extract_response(polymer_json) + if err: + return {'error': err} + + try: + microformat = response['microformat']['microformatDataRenderer'] + + # channel doesn't exist or was terminated + # example terminated channel: https://www.youtube.com/channel/UCnKJeK_r90jDdIuzHXC0Org + except KeyError: + if 'alerts' in response and len(response['alerts']) > 0: + return {'error': ' '.join(alert['alertRenderer']['text']['simpleText'] for alert in response['alerts']) } + elif 'errors' in response['responseContext']: + for error in response['responseContext']['errors']['error']: + if error['code'] == 'INVALID_VALUE' and error['location'] == 'browse_id': + return {'error': 'This channel does not exist'} + return {'error': 'Failure getting microformat'} + + info = {'error': None} + info['current_tab'] = tab + + + # stuff from microformat (info given by youtube for every page on channel) + info['short_description'] = microformat['description'] + info['channel_name'] = microformat['title'] + info['avatar'] = microformat['thumbnail']['thumbnails'][0]['url'] + channel_url = microformat['urlCanonical'].rstrip('/') + channel_id = channel_url[channel_url.rfind('/')+1:] + info['channel_id'] = channel_id + info['channel_url'] = 'https://www.youtube.com/channel/' + channel_id + + info['items'] = [] + + # empty channel + if 'contents' not in response and 'continuationContents' not in response: + return info + + + items, _ = extract_items(response) + if tab in ('videos', 'playlists', 'search'): + additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id} + info['items'] = [extract_item_info(renderer, additional_info) for renderer in items] + + elif tab == 'about': + for item in items: + try: + channel_metadata = item['channelAboutFullMetadataRenderer'] + break + except KeyError: + pass + else: + info['error'] = 'Could not find channelAboutFullMetadataRenderer' + return info + + info['links'] = [] + for link_json in channel_metadata.get('primaryLinks', ()): + url = remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url']) + + text = extract_str(link_json['title']) + + info['links'].append( (text, url) ) + + + info['stats'] = [] + for stat_name in ('subscriberCountText', 'joinedDateText', 'viewCountText', 'country'): + try: + stat = channel_metadata[stat_name] + except KeyError: + continue + info['stats'].append(extract_str(stat)) + + if 'description' in channel_metadata: + info['description'] = extract_str(channel_metadata['description']) + else: + info['description'] = '' + + else: + raise NotImplementedError('Unknown or unsupported channel tab: ' + tab) + + return info + +def extract_search_info(polymer_json): + response, err = extract_response(polymer_json) + if err: + return {'error': err} + info = {'error': None} + info['estimated_results'] = int(response['estimatedResults']) + info['estimated_pages'] = ceil(info['estimated_results']/20) + + + results, _ = extract_items(response) + + + info['items'] = [] + info['corrections'] = {'type': None} + for renderer in results: + type = list(renderer.keys())[0] + if type == 'shelfRenderer': + continue + if type == 'didYouMeanRenderer': + renderer = renderer[type] + + info['corrections'] = { + 'type': 'did_you_mean', + 'corrected_query': renderer['correctedQueryEndpoint']['searchEndpoint']['query'], + 'corrected_query_text': renderer['correctedQuery']['runs'], + } + continue + if type == 'showingResultsForRenderer': + renderer = renderer[type] + + info['corrections'] = { + 'type': 'showing_results_for', + 'corrected_query_text': renderer['correctedQuery']['runs'], + 'original_query_text': renderer['originalQuery']['simpleText'], + } + continue + + i_info = extract_item_info(renderer) + if i_info.get('type') != 'unsupported': + info['items'].append(i_info) + + + return info + +def extract_playlist_metadata(polymer_json): + response, err = extract_response(polymer_json) + if err: + return {'error': err} + + metadata = {'error': None} + header = deep_get(response, 'header', 'playlistHeaderRenderer', default={}) + metadata['title'] = extract_str(header.get('title')) + + metadata['first_video_id'] = deep_get(header, 'playEndpoint', 'watchEndpoint', 'videoId') + first_id = re.search(r'([a-z_\-]{11})', deep_get(header, + 'thumbnail', 'thumbnails', 0, 'url', default='')) + if first_id: + conservative_update(metadata, 'first_video_id', first_id.group(1)) + if metadata['first_video_id'] is None: + metadata['thumbnail'] = None + else: + metadata['thumbnail'] = 'https://i.ytimg.com/vi/' + metadata['first_video_id'] + '/mqdefault.jpg' + + metadata['video_count'] = extract_int(header.get('numVideosText')) + metadata['description'] = extract_str(header.get('descriptionText'), default='') + metadata['author'] = extract_str(header.get('ownerText')) + metadata['author_id'] = multi_deep_get(header, + ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'], + ['ownerEndpoint', 'browseEndpoint', 'browseId']) + if metadata['author_id']: + metadata['author_url'] = 'https://www.youtube.com/channel/' + metadata['author_id'] + else: + metadata['author_url'] = None + metadata['view_count'] = extract_int(header.get('viewCountText')) + metadata['like_count'] = extract_int(header.get('likesCountWithoutLikeText')) + for stat in header.get('stats', ()): + text = extract_str(stat) + if 'videos' in text: + conservative_update(metadata, 'video_count', extract_int(text)) + elif 'views' in text: + conservative_update(metadata, 'view_count', extract_int(text)) + elif 'updated' in text: + metadata['time_published'] = extract_date(text) + + return metadata + +def extract_playlist_info(polymer_json): + response, err = extract_response(polymer_json) + if err: + return {'error': err} + info = {'error': None} + first_page = 'continuationContents' not in response + video_list, _ = extract_items(response) + + info['items'] = [extract_item_info(renderer) for renderer in video_list] + + if first_page: + info['metadata'] = extract_playlist_metadata(polymer_json) + + return info + +def _ctoken_metadata(ctoken): + result = dict() + params = proto.parse(proto.b64_to_bytes(ctoken)) + result['video_id'] = proto.parse(params[2])[2].decode('ascii') + + offset_information = proto.parse(params[6]) + result['offset'] = offset_information.get(5, 0) + + result['is_replies'] = False + if (3 in offset_information) and (2 in proto.parse(offset_information[3])): + result['is_replies'] = True + result['sort'] = None + else: + try: + result['sort'] = proto.parse(offset_information[4])[6] + except KeyError: + result['sort'] = 0 + return result + +def extract_comments_info(polymer_json): + response, err = extract_response(polymer_json) + if err: + return {'error': err} + info = {'error': None} + + url = multi_deep_get(polymer_json, [1, 'url'], ['url']) + if url: + ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] + metadata = _ctoken_metadata(ctoken) + else: + metadata = {} + info['video_id'] = metadata.get('video_id') + info['offset'] = metadata.get('offset') + info['is_replies'] = metadata.get('is_replies') + info['sort'] = metadata.get('sort') + info['video_title'] = None + + comments, ctoken = extract_items(response) + info['comments'] = [] + info['ctoken'] = ctoken + for comment in comments: + comment_info = {} + + if 'commentThreadRenderer' in comment: # top level comments + conservative_update(info, 'is_replies', False) + comment_thread = comment['commentThreadRenderer'] + info['video_title'] = extract_str(comment_thread.get('commentTargetTitle')) + if 'replies' not in comment_thread: + comment_info['reply_count'] = 0 + else: + comment_info['reply_count'] = extract_int(deep_get(comment_thread, + 'replies', 'commentRepliesRenderer', 'moreText' + ), default=1) # With 1 reply, the text reads "View reply" + comment_renderer = deep_get(comment_thread, 'comment', 'commentRenderer', default={}) + elif 'commentRenderer' in comment: # replies + comment_info['reply_count'] = 0 # replyCount, below, not present for replies even if the reply has further replies to it + conservative_update(info, 'is_replies', True) + comment_renderer = comment['commentRenderer'] + else: + comment_renderer = {} + + # These 3 are sometimes absent, likely because the channel was deleted + comment_info['author'] = extract_str(comment_renderer.get('authorText')) + comment_info['author_url'] = deep_get(comment_renderer, + 'authorEndpoint', 'commandMetadata', 'webCommandMetadata', 'url') + comment_info['author_id'] = deep_get(comment_renderer, + 'authorEndpoint', 'browseEndpoint', 'browseId') + + comment_info['author_avatar'] = deep_get(comment_renderer, + 'authorThumbnail', 'thumbnails', 0, 'url') + comment_info['id'] = comment_renderer.get('commentId') + comment_info['text'] = extract_formatted_text(comment_renderer.get('contentText')) + comment_info['time_published'] = extract_str(comment_renderer.get('publishedTimeText')) + comment_info['like_count'] = comment_renderer.get('likeCount') + liberal_update(comment_info, 'reply_count', comment_renderer.get('replyCount')) + + info['comments'].append(comment_info) + + return info diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py new file mode 100644 index 0000000..09abbe3 --- /dev/null +++ b/youtube/yt_data_extract/watch_extraction.py @@ -0,0 +1,545 @@ +from .common import (get, multi_get, deep_get, multi_deep_get, + liberal_update, conservative_update, remove_redirect, normalize_url, + extract_str, extract_formatted_text, extract_int, extract_approx_int, + extract_date, check_missing_keys, extract_item_info, extract_items, + extract_response) + +import json +import urllib.parse +import traceback +import re + +# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py +_formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'audio_bitrate': 64, 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 96, 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), audio_bitrate varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'audio_bitrate': 192, 'vcodec': 'h264'}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 128, 'vcodec': 'vp8'}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'audio_bitrate': 192, 'vcodec': 'vp8'}, + + # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 128, 'vcodec': 'h264'}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 256, 'vcodec': 'h264'}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 48, 'vcodec': 'h264'}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'audio_bitrate': 24, 'vcodec': 'h264'}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, + + # Dash mp4 audio + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'audio_bitrate': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + + # Dash webm audio + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'audio_bitrate': 256}, + + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'audio_bitrate': 160}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'vcodec': 'av01.0.05M.08'}, + '395': {'vcodec': 'av01.0.05M.08'}, + '396': {'vcodec': 'av01.0.05M.08'}, + '397': {'vcodec': 'av01.0.05M.08'}, +} + +def _extract_metadata_row_info(video_renderer_info): + # extract category and music list + info = { + 'category': None, + 'music_list': [], + } + + current_song = {} + for row in deep_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]): + row_title = extract_str(deep_get(row, 'metadataRowRenderer', 'title'), default='') + row_content = extract_str(deep_get(row, 'metadataRowRenderer', 'contents', 0)) + if row_title == 'Category': + info['category'] = row_content + elif row_title in ('Song', 'Music'): + if current_song: + info['music_list'].append(current_song) + current_song = {'title': row_content} + elif row_title == 'Artist': + current_song['artist'] = row_content + elif row_title == 'Album': + current_song['album'] = row_content + elif row_title == 'Writers': + current_song['writers'] = row_content + elif row_title.startswith('Licensed'): + current_song['licensor'] = row_content + if current_song: + info['music_list'].append(current_song) + + return info + +def _extract_watch_info_mobile(top_level): + info = {} + microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) + + family_safe = microformat.get('isFamilySafe') + if family_safe is None: + info['age_restricted'] = None + else: + info['age_restricted'] = not family_safe + info['allowed_countries'] = microformat.get('availableCountries', []) + info['time_published'] = microformat.get('publishDate') + + response = top_level.get('response', {}) + + # video info from metadata renderers + items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'}) + if items: + video_info = items[0]['slimVideoMetadataRenderer'] + else: + print('Failed to extract video metadata') + video_info = {} + + info.update(_extract_metadata_row_info(video_info)) + info['description'] = extract_str(video_info.get('description'), recover_urls=True) + info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle'))) + info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) + info['author_id'] = deep_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') + info['title'] = extract_str(video_info.get('title')) + info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='') + info['unlisted'] = False + for badge in video_info.get('badges', []): + if deep_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted': + info['unlisted'] = True + info['like_count'] = None + info['dislike_count'] = None + if not info['time_published']: + info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) + for button in video_info.get('buttons', ()): + button_renderer = button.get('slimMetadataToggleButtonRenderer', {}) + + # all the digits can be found in the accessibility data + count = extract_int(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label')) + + # this count doesn't have all the digits, it's like 53K for instance + dumb_count = extract_int(extract_str(deep_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText'))) + + # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0 + if dumb_count == 0: + count = 0 + + if 'isLike' in button_renderer: + info['like_count'] = count + elif 'isDislike' in button_renderer: + info['dislike_count'] = count + + # comment section info + items, _ = extract_items(response, item_types={'commentSectionRenderer'}) + if items: + comment_info = items[0]['commentSectionRenderer'] + comment_count_text = extract_str(deep_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText')) + if comment_count_text == 'Comments': # just this with no number, means 0 comments + info['comment_count'] = 0 + else: + info['comment_count'] = extract_int(comment_count_text) + info['comments_disabled'] = False + else: # no comment section present means comments are disabled + info['comment_count'] = 0 + info['comments_disabled'] = True + + # check for limited state + items, _ = extract_items(response, item_types={'limitedStateMessageRenderer'}) + if items: + info['limited_state'] = True + else: + info['limited_state'] = False + + # related videos + related, _ = extract_items(response) + info['related_videos'] = [extract_item_info(renderer) for renderer in related] + + return info + +month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'} +def _extract_watch_info_desktop(top_level): + info = { + 'comment_count': None, + 'comments_disabled': None, + 'allowed_countries': None, + 'limited_state': None, + } + + video_info = {} + for renderer in deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()): + if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'): + video_info.update(list(renderer.values())[0]) + + info.update(_extract_metadata_row_info(video_info)) + info['description'] = extract_str(video_info.get('description', None), recover_urls=True) + info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) + + likes_dislikes = deep_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/') + if len(likes_dislikes) == 2: + info['like_count'] = extract_int(likes_dislikes[0]) + info['dislike_count'] = extract_int(likes_dislikes[1]) + else: + info['like_count'] = None + info['dislike_count'] = None + + info['title'] = extract_str(video_info.get('title', None)) + info['author'] = extract_str(deep_get(video_info, 'owner', 'videoOwnerRenderer', 'title')) + info['author_id'] = deep_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId') + info['view_count'] = extract_int(extract_str(deep_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount'))) + + related = deep_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[]) + info['related_videos'] = [extract_item_info(renderer) for renderer in related] + + return info + +def _extract_formats(info, player_response): + streaming_data = player_response.get('streamingData', {}) + yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) + + info['formats'] = [] + + for yt_fmt in yt_formats: + fmt = {} + fmt['ext'] = None + fmt['audio_bitrate'] = None + fmt['acodec'] = None + fmt['vcodec'] = None + fmt['width'] = yt_fmt.get('width') + fmt['height'] = yt_fmt.get('height') + fmt['file_size'] = yt_fmt.get('contentLength') + fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate') + fmt['fps'] = yt_fmt.get('fps') + cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', ''))) + if cipher: + fmt['url'] = cipher.get('url') + else: + fmt['url'] = yt_fmt.get('url') + fmt['s'] = cipher.get('s') + fmt['sp'] = cipher.get('sp') + fmt.update(_formats.get(str(yt_fmt.get('itag')), {})) + + info['formats'].append(fmt) + +def _extract_playability_error(info, player_response, error_prefix=''): + if info['formats']: + info['playability_status'] = None + info['playability_error'] = None + return + + playability_status = deep_get(player_response, 'playabilityStatus', 'status', default=None) + info['playability_status'] = playability_status + + playability_reason = extract_str(multi_deep_get(player_response, + ['playabilityStatus', 'reason'], + ['playabilityStatus', 'errorScreen', 'playerErrorMessageRenderer', 'reason'], + default='Could not find playability error') + ) + + if playability_status not in (None, 'OK'): + info['playability_error'] = error_prefix + playability_reason + else: + info['playability_error'] = error_prefix + 'Unknown playability error' + +SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') +def extract_watch_info(polymer_json): + info = {'playability_error': None, 'error': None} + + if isinstance(polymer_json, dict): + top_level = polymer_json + elif isinstance(polymer_json, (list, tuple)): + top_level = {} + for page_part in polymer_json: + if not isinstance(page_part, dict): + return {'error': 'Invalid page part'} + top_level.update(page_part) + else: + return {'error': 'Invalid top level polymer data'} + + error = check_missing_keys(top_level, + ['player', 'args'], + ['player', 'assets', 'js'], + ['playerResponse'], + ) + if error: + info['playability_error'] = error + + player_args = deep_get(top_level, 'player', 'args', default={}) + player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {} + + # captions + info['automatic_caption_languages'] = [] + info['manual_caption_languages'] = [] + info['_manual_caption_language_names'] = {} # language name written in that language, needed in some cases to create the url + info['translation_languages'] = [] + captions_info = player_response.get('captions', {}) + info['_captions_base_url'] = normalize_url(deep_get(captions_info, 'playerCaptionsRenderer', 'baseUrl')) + for caption_track in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'captionTracks', default=()): + lang_code = caption_track.get('languageCode') + if not lang_code: + continue + if caption_track.get('kind') == 'asr': + info['automatic_caption_languages'].append(lang_code) + else: + info['manual_caption_languages'].append(lang_code) + base_url = caption_track.get('baseUrl', '') + lang_name = deep_get(urllib.parse.parse_qs(urllib.parse.urlparse(base_url).query), 'name', 0) + if lang_name: + info['_manual_caption_language_names'][lang_code] = lang_name + + for translation_lang_info in deep_get(captions_info, 'playerCaptionsTracklistRenderer', 'translationLanguages', default=()): + lang_code = translation_lang_info.get('languageCode') + if lang_code: + info['translation_languages'].append(lang_code) + if translation_lang_info.get('isTranslatable') == False: + print('WARNING: Found non-translatable caption language') + + # formats + _extract_formats(info, player_response) + + # playability errors + _extract_playability_error(info, player_response) + + # check age-restriction + info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error']) + + # base_js (for decryption of signatures) + info['base_js'] = deep_get(top_level, 'player', 'assets', 'js') + if info['base_js']: + info['base_js'] = normalize_url(info['base_js']) + info['player_name'] = get(info['base_js'].split('/'), -2) + else: + info['player_name'] = None + + # extract stuff from visible parts of page + mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={}) + if mobile: + info.update(_extract_watch_info_mobile(top_level)) + else: + info.update(_extract_watch_info_desktop(top_level)) + + # stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info + vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={}) + liberal_update(info, 'title', extract_str(vd.get('title'))) + liberal_update(info, 'duration', extract_int(vd.get('lengthSeconds'))) + liberal_update(info, 'view_count', extract_int(vd.get('viewCount'))) + # videos with no description have a blank string + liberal_update(info, 'description', vd.get('shortDescription')) + liberal_update(info, 'id', vd.get('videoId')) + liberal_update(info, 'author', vd.get('author')) + liberal_update(info, 'author_id', vd.get('channelId')) + liberal_update(info, 'live', vd.get('isLiveContent')) + conservative_update(info, 'unlisted', not vd.get('isCrawlable', True)) #isCrawlable is false on limited state videos even if they aren't unlisted + liberal_update(info, 'tags', vd.get('keywords', [])) + + # fallback stuff from microformat + mf = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) + conservative_update(info, 'title', extract_str(mf.get('title'))) + conservative_update(info, 'duration', extract_int(mf.get('lengthSeconds'))) + # this gives the view count for limited state videos + conservative_update(info, 'view_count', extract_int(mf.get('viewCount'))) + conservative_update(info, 'description', extract_str(mf.get('description'), recover_urls=True)) + conservative_update(info, 'author', mf.get('ownerChannelName')) + conservative_update(info, 'author_id', mf.get('externalChannelId')) + liberal_update(info, 'unlisted', mf.get('isUnlisted')) + liberal_update(info, 'category', mf.get('category')) + liberal_update(info, 'time_published', mf.get('publishDate')) + liberal_update(info, 'time_uploaded', mf.get('uploadDate')) + + # other stuff + info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None + return info + +def get_caption_url(info, language, format, automatic=False, translation_language=None): + '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.''' + url = info['_captions_base_url'] + url += '&lang=' + language + url += '&fmt=' + format + if automatic: + url += '&kind=asr' + elif language in info['_manual_caption_language_names']: + url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='') + + if translation_language: + url += '&tlang=' + translation_language + return url + +def update_with_age_restricted_info(info, video_info_page): + ERROR_PREFIX = 'Error bypassing age-restriction: ' + + video_info = urllib.parse.parse_qs(video_info_page) + player_response = deep_get(video_info, 'player_response', 0) + if player_response is None: + info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page' + return + try: + player_response = json.loads(player_response) + except json.decoder.JSONDecodeError: + traceback.print_exc() + info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response' + return + + _extract_formats(info, player_response) + _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) + +def requires_decryption(info): + return ('formats' in info) and info['formats'] and info['formats'][0]['s'] + +# adapted from youtube-dl and invidious: +# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr +decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}]+)\}') +op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)') +def extract_decryption_function(info, base_js): + '''Insert decryption function into info. Return error string if not successful. + Decryption function is a list of list[2] of numbers. + It is advisable to cache the decryption function (uniquely identified by info['player_name']) so base.js (1 MB) doesn't need to be redownloaded each time''' + info['decryption_function'] = None + decrypt_function_match = decrypt_function_re.search(base_js) + if decrypt_function_match is None: + return 'Could not find decryption function in base.js' + + function_body = decrypt_function_match.group(1).split(';')[1:-1] + if not function_body: + return 'Empty decryption function body' + + var_name = get(function_body[0].split('.'), 0) + if var_name is None: + return 'Could not find var_name' + + var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL) + if var_body_match is None: + return 'Could not find var_body' + + operations = var_body_match.group(1).replace('\n', '').split('},') + if not operations: + return 'Did not find any definitions in var_body' + operations[-1] = operations[-1][:-1] # remove the trailing '}' since we split by '},' on the others + operation_definitions = {} + for op in operations: + colon_index = op.find(':') + opening_brace_index = op.find('{') + + if colon_index == -1 or opening_brace_index == -1: + return 'Could not parse operation' + op_name = op[:colon_index] + op_body = op[opening_brace_index+1:] + if op_body == 'a.reverse()': + operation_definitions[op_name] = 0 + elif op_body == 'a.splice(0,b)': + operation_definitions[op_name] = 1 + elif op_body.startswith('var c=a[0]'): + operation_definitions[op_name] = 2 + else: + return 'Unknown op_body: ' + op_body + + decryption_function = [] + for op_with_arg in function_body: + match = op_with_arg_re.fullmatch(op_with_arg) + if match is None: + return 'Could not parse operation with arg' + op_name = match.group(1) + if op_name not in operation_definitions: + return 'Unknown op_name: ' + op_name + op_argument = match.group(2) + decryption_function.append([operation_definitions[op_name], int(op_argument)]) + + info['decryption_function'] = decryption_function + return False + +def _operation_2(a, b): + c = a[0] + a[0] = a[b % len(a)] + a[b % len(a)] = c + +def decrypt_signatures(info): + '''Applies info['decryption_function'] to decrypt all the signatures. Return err.''' + if not info.get('decryption_function'): + return 'decryption_function not in info' + for format in info['formats']: + if not format['s'] or not format['sp'] or not format['url']: + print('Warning: s, sp, or url not in format') + continue + + a = list(format['s']) + for op, argument in info['decryption_function']: + if op == 0: + a.reverse() + elif op == 1: + a = a[argument:] + else: + _operation_2(a, argument) + + signature = ''.join(a) + format['url'] += '&' + format['sp'] + '=' + signature + return False diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py deleted file mode 100644 index 38ba43a..0000000 --- a/youtube_dl/YoutubeDL.py +++ /dev/null @@ -1,2392 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from __future__ import absolute_import, unicode_literals - -import collections -import contextlib -import copy -import datetime -import errno -import fileinput -import io -import itertools -import json -import locale -import operator -import os -import platform -import re -import shutil -import subprocess -import socket -import sys -import time -import tokenize -import traceback -import random - -from string import ascii_letters - -from .compat import ( - compat_basestring, - compat_cookiejar, - compat_get_terminal_size, - compat_http_client, - compat_kwargs, - compat_numeric_types, - compat_os_name, - compat_str, - compat_tokenize_tokenize, - compat_urllib_error, - compat_urllib_request, - compat_urllib_request_DataHandler, -) -from .utils import ( - age_restricted, - args_to_str, - ContentTooShortError, - date_from_str, - DateRange, - DEFAULT_OUTTMPL, - determine_ext, - determine_protocol, - DownloadError, - encode_compat_str, - encodeFilename, - error_to_compat_str, - expand_path, - ExtractorError, - format_bytes, - formatSeconds, - GeoRestrictedError, - int_or_none, - ISO3166Utils, - locked_file, - make_HTTPS_handler, - MaxDownloadsReached, - orderedSet, - PagedList, - parse_filesize, - PerRequestProxyHandler, - platform_name, - PostProcessingError, - preferredencoding, - prepend_extension, - register_socks_protocols, - render_table, - replace_extension, - SameFileError, - sanitize_filename, - sanitize_path, - sanitize_url, - sanitized_Request, - std_headers, - subtitles_filename, - UnavailableVideoError, - url_basename, - version_tuple, - write_json_file, - write_string, - YoutubeDLCookieProcessor, - YoutubeDLHandler, -) -from .cache import Cache -from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER -from .extractor.openload import PhantomJSwrapper -from .downloader import get_suitable_downloader -from .downloader.rtmp import rtmpdump_version -from .postprocessor import ( - FFmpegFixupM3u8PP, - FFmpegFixupM4aPP, - FFmpegFixupStretchedPP, - FFmpegMergerPP, - FFmpegPostProcessor, - get_postprocessor, -) -from .version import __version__ - -if compat_os_name == 'nt': - import ctypes - - -class YoutubeDL(object): - """YoutubeDL class. - - YoutubeDL objects are the ones responsible of downloading the - actual video file and writing it to disk if the user has requested - it, among some other tasks. In most cases there should be one per - program. As, given a video URL, the downloader doesn't know how to - extract all the needed information, task that InfoExtractors do, it - has to pass the URL to one of them. - - For this, YoutubeDL objects have a method that allows - InfoExtractors to be registered in a given order. When it is passed - a URL, the YoutubeDL object handles it to the first InfoExtractor it - finds that reports being able to handle it. The InfoExtractor extracts - all the information about the video or videos the URL refers to, and - YoutubeDL process the extracted information, possibly using a File - Downloader to download the video. - - YoutubeDL objects accept a lot of parameters. In order not to saturate - the object constructor with arguments, it receives a dictionary of - options instead. These options are available through the params - attribute for the InfoExtractors to use. The YoutubeDL also - registers itself as the downloader in charge for the InfoExtractors - that are added to it, so this is a "mutual registration". - - Available options: - - username: Username for authentication purposes. - password: Password for authentication purposes. - videopassword: Password for accessing a video. - ap_mso: Adobe Pass multiple-system operator identifier. - ap_username: Multiple-system operator account username. - ap_password: Multiple-system operator account password. - usenetrc: Use netrc for authentication instead. - verbose: Print additional info to stdout. - quiet: Do not print messages to stdout. - no_warnings: Do not print out anything for warnings. - forceurl: Force printing final URL. - forcetitle: Force printing title. - forceid: Force printing ID. - forcethumbnail: Force printing thumbnail URL. - forcedescription: Force printing description. - forcefilename: Force printing final filename. - forceduration: Force printing duration. - forcejson: Force printing info_dict as JSON. - dump_single_json: Force printing the info_dict of the whole playlist - (or video) as a single JSON line. - simulate: Do not download the video files. - format: Video format code. See options.py for more information. - outtmpl: Template for output names. - restrictfilenames: Do not allow "&" and spaces in file names - ignoreerrors: Do not stop on download errors. - force_generic_extractor: Force downloader to use the generic extractor - nooverwrites: Prevent overwriting files. - playliststart: Playlist item to start at. - playlistend: Playlist item to end at. - playlist_items: Specific indices of playlist to download. - playlistreverse: Download playlist items in reverse order. - playlistrandom: Download playlist items in random order. - matchtitle: Download only matching titles. - rejecttitle: Reject downloads for matching titles. - logger: Log messages to a logging.Logger instance. - logtostderr: Log messages to stderr instead of stdout. - writedescription: Write the video description to a .description file - writeinfojson: Write the video description to a .info.json file - writeannotations: Write the video annotations to a .annotations.xml file - writethumbnail: Write the thumbnail image to a file - write_all_thumbnails: Write all thumbnail formats to files - writesubtitles: Write the video subtitles to a file - writeautomaticsub: Write the automatically generated subtitles to a file - allsubtitles: Downloads all the subtitles of the video - (requires writesubtitles or writeautomaticsub) - listsubtitles: Lists all available subtitles for the video - subtitlesformat: The format code for subtitles - subtitleslangs: List of languages of the subtitles to download - keepvideo: Keep the video file after post-processing - daterange: A DateRange object, download only if the upload_date is in the range. - skip_download: Skip the actual download of the video file - cachedir: Location of the cache files in the filesystem. - False to disable filesystem cache. - noplaylist: Download single video instead of a playlist if in doubt. - age_limit: An integer representing the user's age in years. - Unsuitable videos for the given age are skipped. - min_views: An integer representing the minimum view count the video - must have in order to not be skipped. - Videos without view count information are always - downloaded. None for no limit. - max_views: An integer representing the maximum view count. - Videos that are more popular than that are not - downloaded. - Videos without view count information are always - downloaded. None for no limit. - download_archive: File name of a file where all downloads are recorded. - Videos already present in the file are not downloaded - again. - cookiefile: File name where cookies should be read from and dumped to. - nocheckcertificate:Do not verify SSL certificates - prefer_insecure: Use HTTP instead of HTTPS to retrieve information. - At the moment, this is only supported by YouTube. - proxy: URL of the proxy server to use - geo_verification_proxy: URL of the proxy to use for IP address verification - on geo-restricted sites. - socket_timeout: Time to wait for unresponsive hosts, in seconds - bidi_workaround: Work around buggy terminals without bidirectional text - support, using fridibi - debug_printtraffic:Print out sent and received HTTP traffic - include_ads: Download ads as well - default_search: Prepend this string if an input url is not valid. - 'auto' for elaborate guessing - encoding: Use this encoding instead of the system-specified. - extract_flat: Do not resolve URLs, return the immediate result. - Pass in 'in_playlist' to only show this behavior for - playlist items. - postprocessors: A list of dictionaries, each with an entry - * key: The name of the postprocessor. See - youtube_dl/postprocessor/__init__.py for a list. - as well as any further keyword arguments for the - postprocessor. - progress_hooks: A list of functions that get called on download - progress, with a dictionary with the entries - * status: One of "downloading", "error", or "finished". - Check this first and ignore unknown values. - - If status is one of "downloading", or "finished", the - following properties may also be present: - * filename: The final filename (always present) - * tmpfilename: The filename we're currently writing to - * downloaded_bytes: Bytes on disk - * total_bytes: Size of the whole file, None if unknown - * total_bytes_estimate: Guess of the eventual file size, - None if unavailable. - * elapsed: The number of seconds since download started. - * eta: The estimated time in seconds, None if unknown - * speed: The download speed in bytes/second, None if - unknown - * fragment_index: The counter of the currently - downloaded video fragment. - * fragment_count: The number of fragments (= individual - files that will be merged) - - Progress hooks are guaranteed to be called at least once - (with status "finished") if the download is successful. - merge_output_format: Extension to use when merging formats. - fixup: Automatically correct known faults of the file. - One of: - - "never": do nothing - - "warn": only emit a warning - - "detect_or_warn": check whether we can do anything - about it, warn otherwise (default) - source_address: Client-side IP address to bind to. - call_home: Boolean, true iff we are allowed to contact the - youtube-dl servers for debugging. - sleep_interval: Number of seconds to sleep before each download when - used alone or a lower bound of a range for randomized - sleep before each download (minimum possible number - of seconds to sleep) when used along with - max_sleep_interval. - max_sleep_interval:Upper bound of a range for randomized sleep before each - download (maximum possible number of seconds to sleep). - Must only be used along with sleep_interval. - Actual sleep time will be a random float from range - [sleep_interval; max_sleep_interval]. - listformats: Print an overview of available video formats and exit. - list_thumbnails: Print a table of all thumbnails and exit. - match_filter: A function that gets called with the info_dict of - every video. - If it returns a message, the video is ignored. - If it returns None, the video is downloaded. - match_filter_func in utils.py is one example for this. - no_color: Do not emit color codes in output. - geo_bypass: Bypass geographic restriction via faking X-Forwarded-For - HTTP header - geo_bypass_country: - Two-letter ISO 3166-2 country code that will be used for - explicit geographic restriction bypassing via faking - X-Forwarded-For HTTP header - geo_bypass_ip_block: - IP range in CIDR notation that will be used similarly to - geo_bypass_country - - The following options determine which downloader is picked: - external_downloader: Executable of the external downloader to call. - None or unset for standard (built-in) downloader. - hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv - if True, otherwise use ffmpeg/avconv if False, otherwise - use downloader suggested by extractor if None. - - The following parameters are not used by YoutubeDL itself, they are used by - the downloader (see youtube_dl/downloader/common.py): - nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, - noresizebuffer, retries, continuedl, noprogress, consoletitle, - xattr_set_filesize, external_downloader_args, hls_use_mpegts, - http_chunk_size. - - The following options are used by the post processors: - prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, - otherwise prefer ffmpeg. - postprocessor_args: A list of additional command-line arguments for the - postprocessor. - - The following options are used by the Youtube extractor: - youtube_include_dash_manifest: If True (default), DASH manifests and related - data will be downloaded and processed by extractor. - You can reduce network I/O by disabling it if you don't - care about DASH. - """ - - _NUMERIC_FIELDS = set(( - 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx', - 'timestamp', 'upload_year', 'upload_month', 'upload_day', - 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count', - 'average_rating', 'comment_count', 'age_limit', - 'start_time', 'end_time', - 'chapter_number', 'season_number', 'episode_number', - 'track_number', 'disc_number', 'release_year', - 'playlist_index', - )) - - params = None - _ies = [] - _pps = [] - _download_retcode = None - _num_downloads = None - _screen_file = None - - def __init__(self, params=None, auto_init=True): - """Create a FileDownloader object with the given options.""" - if params is None: - params = {} - self._ies = [] - self._ies_instances = {} - self._pps = [] - self._progress_hooks = [] - self._download_retcode = 0 - self._num_downloads = 0 - self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] - self._err_file = sys.stderr - self.params = { - # Default parameters - 'nocheckcertificate': False, - } - self.params.update(params) - self.cache = Cache(self) - - def check_deprecated(param, option, suggestion): - if self.params.get(param) is not None: - self.report_warning( - '%s is deprecated. Use %s instead.' % (option, suggestion)) - return True - return False - - if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'): - if self.params.get('geo_verification_proxy') is None: - self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] - - check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits') - check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"') - check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"') - - if params.get('bidi_workaround', False): - try: - import pty - master, slave = pty.openpty() - width = compat_get_terminal_size().columns - if width is None: - width_args = [] - else: - width_args = ['-w', str(width)] - sp_kwargs = dict( - stdin=subprocess.PIPE, - stdout=slave, - stderr=self._err_file) - try: - self._output_process = subprocess.Popen( - ['bidiv'] + width_args, **sp_kwargs - ) - except OSError: - self._output_process = subprocess.Popen( - ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) - self._output_channel = os.fdopen(master, 'rb') - except OSError as ose: - if ose.errno == errno.ENOENT: - self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.') - else: - raise - - if (sys.platform != 'win32' and - sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and - not params.get('restrictfilenames', False)): - # Unicode filesystem API will throw errors (#1474, #13027) - self.report_warning( - 'Assuming --restrict-filenames since file system encoding ' - 'cannot encode all characters. ' - 'Set the LC_ALL environment variable to fix this.') - self.params['restrictfilenames'] = True - - if isinstance(params.get('outtmpl'), bytes): - self.report_warning( - 'Parameter outtmpl is bytes, but should be a unicode string. ' - 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') - - self._setup_opener() - - if auto_init: - self.print_debug_header() - self.add_default_info_extractors() - - for pp_def_raw in self.params.get('postprocessors', []): - pp_class = get_postprocessor(pp_def_raw['key']) - pp_def = dict(pp_def_raw) - del pp_def['key'] - pp = pp_class(self, **compat_kwargs(pp_def)) - self.add_post_processor(pp) - - for ph in self.params.get('progress_hooks', []): - self.add_progress_hook(ph) - - register_socks_protocols() - - def warn_if_short_id(self, argv): - # short YouTube ID starting with dash? - idxs = [ - i for i, a in enumerate(argv) - if re.match(r'^-[0-9A-Za-z_-]{10}$', a)] - if idxs: - correct_argv = ( - ['youtube-dl'] + - [a for i, a in enumerate(argv) if i not in idxs] + - ['--'] + [argv[i] for i in idxs] - ) - self.report_warning( - 'Long argument string detected. ' - 'Use -- to separate parameters and URLs, like this:\n%s\n' % - args_to_str(correct_argv)) - - def add_info_extractor(self, ie): - """Add an InfoExtractor object to the end of the list.""" - self._ies.append(ie) - if not isinstance(ie, type): - self._ies_instances[ie.ie_key()] = ie - ie.set_downloader(self) - - def get_info_extractor(self, ie_key): - """ - Get an instance of an IE with name ie_key, it will try to get one from - the _ies list, if there's no instance it will create a new one and add - it to the extractor list. - """ - ie = self._ies_instances.get(ie_key) - if ie is None: - ie = get_info_extractor(ie_key)() - self.add_info_extractor(ie) - return ie - - def add_default_info_extractors(self): - """ - Add the InfoExtractors returned by gen_extractors to the end of the list - """ - for ie in gen_extractor_classes(): - self.add_info_extractor(ie) - - def add_post_processor(self, pp): - """Add a PostProcessor object to the end of the chain.""" - self._pps.append(pp) - pp.set_downloader(self) - - def add_progress_hook(self, ph): - """Add the progress hook (currently only for the file downloader)""" - self._progress_hooks.append(ph) - - def _bidi_workaround(self, message): - if not hasattr(self, '_output_channel'): - return message - - assert hasattr(self, '_output_process') - assert isinstance(message, compat_str) - line_count = message.count('\n') + 1 - self._output_process.stdin.write((message + '\n').encode('utf-8')) - self._output_process.stdin.flush() - res = ''.join(self._output_channel.readline().decode('utf-8') - for _ in range(line_count)) - return res[:-len('\n')] - - def to_screen(self, message, skip_eol=False): - """Print message to stdout if not in quiet mode.""" - return self.to_stdout(message, skip_eol, check_quiet=True) - - def _write_string(self, s, out=None): - write_string(s, out=out, encoding=self.params.get('encoding')) - - def to_stdout(self, message, skip_eol=False, check_quiet=False): - """Print message to stdout if not in quiet mode.""" - if self.params.get('logger'): - self.params['logger'].debug(message) - elif not check_quiet or not self.params.get('quiet', False): - message = self._bidi_workaround(message) - terminator = ['\n', ''][skip_eol] - output = message + terminator - - self._write_string(output, self._screen_file) - - def to_stderr(self, message): - """Print message to stderr.""" - assert isinstance(message, compat_str) - if self.params.get('logger'): - self.params['logger'].error(message) - else: - message = self._bidi_workaround(message) - output = message + '\n' - self._write_string(output, self._err_file) - - def to_console_title(self, message): - if not self.params.get('consoletitle', False): - return - if compat_os_name == 'nt': - if ctypes.windll.kernel32.GetConsoleWindow(): - # c_wchar_p() might not be necessary if `message` is - # already of type unicode() - ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) - elif 'TERM' in os.environ: - self._write_string('\033]0;%s\007' % message, self._screen_file) - - def save_console_title(self): - if not self.params.get('consoletitle', False): - return - if self.params.get('simulate', False): - return - if compat_os_name != 'nt' and 'TERM' in os.environ: - # Save the title on stack - self._write_string('\033[22;0t', self._screen_file) - - def restore_console_title(self): - if not self.params.get('consoletitle', False): - return - if self.params.get('simulate', False): - return - if compat_os_name != 'nt' and 'TERM' in os.environ: - # Restore the title from stack - self._write_string('\033[23;0t', self._screen_file) - - def __enter__(self): - self.save_console_title() - return self - - def __exit__(self, *args): - self.restore_console_title() - - if self.params.get('cookiefile') is not None: - self.cookiejar.save() - - def trouble(self, message=None, tb=None): - """Determine action to take when a download problem appears. - - Depending on if the downloader has been configured to ignore - download errors or not, this method may throw an exception or - not when errors are found, after printing the message. - - tb, if given, is additional traceback information. - """ - if message is not None: - self.to_stderr(message) - if self.params.get('verbose'): - if tb is None: - if sys.exc_info()[0]: # if .trouble has been called from an except block - tb = '' - if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: - tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) - tb += encode_compat_str(traceback.format_exc()) - else: - tb_data = traceback.format_list(traceback.extract_stack()) - tb = ''.join(tb_data) - self.to_stderr(tb) - if not self.params.get('ignoreerrors', False): - if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: - exc_info = sys.exc_info()[1].exc_info - else: - exc_info = sys.exc_info() - raise DownloadError(message, exc_info) - self._download_retcode = 1 - - def report_warning(self, message): - ''' - Print the message to stderr, it will be prefixed with 'WARNING:' - If stderr is a tty file the 'WARNING:' will be colored - ''' - if self.params.get('logger') is not None: - self.params['logger'].warning(message) - else: - if self.params.get('no_warnings'): - return - if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': - _msg_header = '\033[0;33mWARNING:\033[0m' - else: - _msg_header = 'WARNING:' - warning_message = '%s %s' % (_msg_header, message) - self.to_stderr(warning_message) - - def report_error(self, message, tb=None): - ''' - Do the same as trouble, but prefixes the message with 'ERROR:', colored - in red if stderr is a tty file. - ''' - if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt': - _msg_header = '\033[0;31mERROR:\033[0m' - else: - _msg_header = 'ERROR:' - error_message = '%s %s' % (_msg_header, message) - self.trouble(error_message, tb) - - def report_file_already_downloaded(self, file_name): - """Report file has already been fully downloaded.""" - try: - self.to_screen('[download] %s has already been downloaded' % file_name) - except UnicodeEncodeError: - self.to_screen('[download] The file has already been downloaded') - - def prepare_filename(self, info_dict): - """Generate the output filename.""" - try: - template_dict = dict(info_dict) - - template_dict['epoch'] = int(time.time()) - autonumber_size = self.params.get('autonumber_size') - if autonumber_size is None: - autonumber_size = 5 - template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads - if template_dict.get('resolution') is None: - if template_dict.get('width') and template_dict.get('height'): - template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height']) - elif template_dict.get('height'): - template_dict['resolution'] = '%sp' % template_dict['height'] - elif template_dict.get('width'): - template_dict['resolution'] = '%dx?' % template_dict['width'] - - sanitize = lambda k, v: sanitize_filename( - compat_str(v), - restricted=self.params.get('restrictfilenames'), - is_id=(k == 'id' or k.endswith('_id'))) - template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) - for k, v in template_dict.items() - if v is not None and not isinstance(v, (list, tuple, dict))) - template_dict = collections.defaultdict(lambda: 'NA', template_dict) - - outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) - - # For fields playlist_index and autonumber convert all occurrences - # of %(field)s to %(field)0Nd for backward compatibility - field_size_compat_map = { - 'playlist_index': len(str(template_dict['n_entries'])), - 'autonumber': autonumber_size, - } - FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s' - mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl) - if mobj: - outtmpl = re.sub( - FIELD_SIZE_COMPAT_RE, - r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')], - outtmpl) - - # Missing numeric fields used together with integer presentation types - # in format specification will break the argument substitution since - # string 'NA' is returned for missing fields. We will patch output - # template for missing fields to meet string presentation type. - for numeric_field in self._NUMERIC_FIELDS: - if numeric_field not in template_dict: - # As of [1] format syntax is: - # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type - # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting - FORMAT_RE = r'''(?x) - (?<!%) - % - \({0}\) # mapping key - (?:[#0\-+ ]+)? # conversion flags (optional) - (?:\d+)? # minimum field width (optional) - (?:\.\d+)? # precision (optional) - [hlL]? # length modifier (optional) - [diouxXeEfFgGcrs%] # conversion type - ''' - outtmpl = re.sub( - FORMAT_RE.format(numeric_field), - r'%({0})s'.format(numeric_field), outtmpl) - - # expand_path translates '%%' into '%' and '$$' into '$' - # correspondingly that is not what we want since we need to keep - # '%%' intact for template dict substitution step. Working around - # with boundary-alike separator hack. - sep = ''.join([random.choice(ascii_letters) for _ in range(32)]) - outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep)) - - # outtmpl should be expand_path'ed before template dict substitution - # because meta fields may contain env variables we don't want to - # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and - # title "Hello $PATH", we don't want `$PATH` to be expanded. - filename = expand_path(outtmpl).replace(sep, '') % template_dict - - # Temporary fix for #4787 - # 'Treat' all problem characters by passing filename through preferredencoding - # to workaround encoding issues with subprocess on python2 @ Windows - if sys.version_info < (3, 0) and sys.platform == 'win32': - filename = encodeFilename(filename, True).decode(preferredencoding()) - return sanitize_path(filename) - except ValueError as err: - self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') - return None - - def _match_entry(self, info_dict, incomplete): - """ Returns None iff the file should be downloaded """ - - video_title = info_dict.get('title', info_dict.get('id', 'video')) - if 'title' in info_dict: - # This can happen when we're just evaluating the playlist - title = info_dict['title'] - matchtitle = self.params.get('matchtitle', False) - if matchtitle: - if not re.search(matchtitle, title, re.IGNORECASE): - return '"' + title + '" title did not match pattern "' + matchtitle + '"' - rejecttitle = self.params.get('rejecttitle', False) - if rejecttitle: - if re.search(rejecttitle, title, re.IGNORECASE): - return '"' + title + '" title matched reject pattern "' + rejecttitle + '"' - date = info_dict.get('upload_date') - if date is not None: - dateRange = self.params.get('daterange', DateRange()) - if date not in dateRange: - return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) - view_count = info_dict.get('view_count') - if view_count is not None: - min_views = self.params.get('min_views') - if min_views is not None and view_count < min_views: - return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views) - max_views = self.params.get('max_views') - if max_views is not None and view_count > max_views: - return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) - if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): - return 'Skipping "%s" because it is age restricted' % video_title - if self.in_download_archive(info_dict): - return '%s has already been recorded in archive' % video_title - - if not incomplete: - match_filter = self.params.get('match_filter') - if match_filter is not None: - ret = match_filter(info_dict) - if ret is not None: - return ret - - return None - - @staticmethod - def add_extra_info(info_dict, extra_info): - '''Set the keys from extra_info in info dict if they are missing''' - for key, value in extra_info.items(): - info_dict.setdefault(key, value) - - def extract_info(self, url, download=True, ie_key=None, extra_info={}, - process=True, force_generic_extractor=False): - ''' - Returns a list with a dictionary for each video we find. - If 'download', also downloads the videos. - extra_info is a dict containing the extra values to add to each result - ''' - - if not ie_key and force_generic_extractor: - ie_key = 'Generic' - - if ie_key: - ies = [self.get_info_extractor(ie_key)] - else: - ies = self._ies - - for ie in ies: - if not ie.suitable(url): - continue - - ie = self.get_info_extractor(ie.ie_key()) - if not ie.working(): - self.report_warning('The program functionality for this site has been marked as broken, ' - 'and will probably not work.') - - try: - ie_result = ie.extract(url) - if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) - break - if isinstance(ie_result, list): - # Backwards compatibility: old IE result format - ie_result = { - '_type': 'compat_list', - 'entries': ie_result, - } - self.add_default_extra_info(ie_result, ie, url) - if process: - return self.process_ie_result(ie_result, download, extra_info) - else: - return ie_result - except GeoRestrictedError as e: - msg = e.msg - if e.countries: - msg += '\nThis video is available in %s.' % ', '.join( - map(ISO3166Utils.short2full, e.countries)) - msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' - self.report_error(msg) - break - except ExtractorError as e: # An error we somewhat expected - self.report_error(compat_str(e), e.format_traceback()) - break - except MaxDownloadsReached: - raise - except Exception as e: - if self.params.get('ignoreerrors', False): - self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - break - else: - raise - else: - self.report_error('no suitable InfoExtractor for URL %s' % url) - - def add_default_extra_info(self, ie_result, ie, url): - self.add_extra_info(ie_result, { - 'extractor': ie.IE_NAME, - 'webpage_url': url, - 'webpage_url_basename': url_basename(url), - 'extractor_key': ie.ie_key(), - }) - - def process_ie_result(self, ie_result, download=True, extra_info={}): - """ - Take the result of the ie(may be modified) and resolve all unresolved - references (URLs, playlist items). - - It will also download the videos if 'download'. - Returns the resolved ie_result. - """ - result_type = ie_result.get('_type', 'video') - - if result_type in ('url', 'url_transparent'): - ie_result['url'] = sanitize_url(ie_result['url']) - extract_flat = self.params.get('extract_flat', False) - if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or - extract_flat is True): - if self.params.get('forcejson', False): - self.to_stdout(json.dumps(ie_result)) - return ie_result - - if result_type == 'video': - self.add_extra_info(ie_result, extra_info) - return self.process_video_result(ie_result, download=download) - elif result_type == 'url': - # We have to add extra_info to the results because it may be - # contained in a playlist - return self.extract_info(ie_result['url'], - download, - ie_key=ie_result.get('ie_key'), - extra_info=extra_info) - elif result_type == 'url_transparent': - # Use the information from the embedding page - info = self.extract_info( - ie_result['url'], ie_key=ie_result.get('ie_key'), - extra_info=extra_info, download=False, process=False) - - # extract_info may return None when ignoreerrors is enabled and - # extraction failed with an error, don't crash and return early - # in this case - if not info: - return info - - force_properties = dict( - (k, v) for k, v in ie_result.items() if v is not None) - for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'): - if f in force_properties: - del force_properties[f] - new_result = info.copy() - new_result.update(force_properties) - - # Extracted info may not be a video result (i.e. - # info.get('_type', 'video') != video) but rather an url or - # url_transparent. In such cases outer metadata (from ie_result) - # should be propagated to inner one (info). For this to happen - # _type of info should be overridden with url_transparent. This - # fixes issue from https://github.com/rg3/youtube-dl/pull/11163. - if new_result.get('_type') == 'url': - new_result['_type'] = 'url_transparent' - - return self.process_ie_result( - new_result, download=download, extra_info=extra_info) - elif result_type in ('playlist', 'multi_video'): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - self.to_screen('[download] Downloading playlist: %s' % playlist) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - 1 - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) - - ie_entries = ie_result['entries'] - - def make_playlistitems_entries(list_ie_entries): - num_entries = len(list_ie_entries) - return [ - list_ie_entries[i - 1] for i in playlistitems - if -num_entries <= i - 1 < num_entries] - - def report_download(num_entries): - self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, num_entries)) - - if isinstance(ie_entries, list): - n_all_entries = len(ie_entries) - if playlistitems: - entries = make_playlistitems_entries(ie_entries) - else: - entries = ie_entries[playliststart:playlistend] - n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) - elif isinstance(ie_entries, PagedList): - if playlistitems: - entries = [] - for item in playlistitems: - entries.extend(ie_entries.getslice( - item - 1, item - )) - else: - entries = ie_entries.getslice( - playliststart, playlistend) - n_entries = len(entries) - report_download(n_entries) - else: # iterable - if playlistitems: - entries = make_playlistitems_entries(list(itertools.islice( - ie_entries, 0, max(playlistitems)))) - else: - entries = list(itertools.islice( - ie_entries, playliststart, playlistend)) - n_entries = len(entries) - report_download(n_entries) - - if self.params.get('playlistreverse', False): - entries = entries[::-1] - - if self.params.get('playlistrandom', False): - random.shuffle(entries) - - x_forwarded_for = ie_result.get('__x_forwarded_for_ip') - - for i, entry in enumerate(entries, 1): - self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) - # This __x_forwarded_for_ip thing is a bit ugly but requires - # minimal changes - if x_forwarded_for: - entry['__x_forwarded_for_ip'] = x_forwarded_for - extra = { - 'n_entries': n_entries, - 'playlist': playlist, - 'playlist_id': ie_result.get('id'), - 'playlist_title': ie_result.get('title'), - 'playlist_uploader': ie_result.get('uploader'), - 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': i + playliststart, - 'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': url_basename(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } - - reason = self._match_entry(entry, incomplete=True) - if reason is not None: - self.to_screen('[download] ' + reason) - continue - - entry_result = self.process_ie_result(entry, - download=download, - extra_info=extra) - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results - self.to_screen('[download] Finished downloading playlist: %s' % playlist) - return ie_result - elif result_type == 'compat_list': - self.report_warning( - 'Extractor %s returned a compat_list result. ' - 'It needs to be updated.' % ie_result.get('extractor')) - - def _fixup(r): - self.add_extra_info( - r, - { - 'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': url_basename(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } - ) - return r - ie_result['entries'] = [ - self.process_ie_result(_fixup(r), download, extra_info) - for r in ie_result['entries'] - ] - return ie_result - else: - raise Exception('Invalid result type: %s' % result_type) - - def _build_format_filter(self, filter_spec): - " Returns a function to filter the formats according to the filter_spec " - - OPERATORS = { - '<': operator.lt, - '<=': operator.le, - '>': operator.gt, - '>=': operator.ge, - '=': operator.eq, - '!=': operator.ne, - } - operator_rex = re.compile(r'''(?x)\s* - (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps) - \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* - (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) - $ - ''' % '|'.join(map(re.escape, OPERATORS.keys()))) - m = operator_rex.search(filter_spec) - if m: - try: - comparison_value = int(m.group('value')) - except ValueError: - comparison_value = parse_filesize(m.group('value')) - if comparison_value is None: - comparison_value = parse_filesize(m.group('value') + 'B') - if comparison_value is None: - raise ValueError( - 'Invalid value %r in format specification %r' % ( - m.group('value'), filter_spec)) - op = OPERATORS[m.group('op')] - - if not m: - STR_OPERATORS = { - '=': operator.eq, - '!=': operator.ne, - '^=': lambda attr, value: attr.startswith(value), - '$=': lambda attr, value: attr.endswith(value), - '*=': lambda attr, value: value in attr, - } - str_operator_rex = re.compile(r'''(?x) - \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id) - \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? - \s*(?P<value>[a-zA-Z0-9._-]+) - \s*$ - ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) - m = str_operator_rex.search(filter_spec) - if m: - comparison_value = m.group('value') - op = STR_OPERATORS[m.group('op')] - - if not m: - raise ValueError('Invalid filter specification %r' % filter_spec) - - def _filter(f): - actual_value = f.get(m.group('key')) - if actual_value is None: - return m.group('none_inclusive') - return op(actual_value, comparison_value) - return _filter - - def _default_format_spec(self, info_dict, download=True): - - def can_merge(): - merger = FFmpegMergerPP(self) - return merger.available and merger.can_merge() - - def prefer_best(): - if self.params.get('simulate', False): - return False - if not download: - return False - if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-': - return True - if info_dict.get('is_live'): - return True - if not can_merge(): - return True - return False - - req_format_list = ['bestvideo+bestaudio', 'best'] - if prefer_best(): - req_format_list.reverse() - return '/'.join(req_format_list) - - def build_format_selector(self, format_spec): - def syntax_error(note, start): - message = ( - 'Invalid format specification: ' - '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) - return SyntaxError(message) - - PICKFIRST = 'PICKFIRST' - MERGE = 'MERGE' - SINGLE = 'SINGLE' - GROUP = 'GROUP' - FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) - - def _parse_filter(tokens): - filter_parts = [] - for type, string, start, _, _ in tokens: - if type == tokenize.OP and string == ']': - return ''.join(filter_parts) - else: - filter_parts.append(string) - - def _remove_unused_ops(tokens): - # Remove operators that we don't use and join them with the surrounding strings - # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' - ALLOWED_OPS = ('/', '+', ',', '(', ')') - last_string, last_start, last_end, last_line = None, None, None, None - for type, string, start, end, line in tokens: - if type == tokenize.OP and string == '[': - if last_string: - yield tokenize.NAME, last_string, last_start, last_end, last_line - last_string = None - yield type, string, start, end, line - # everything inside brackets will be handled by _parse_filter - for type, string, start, end, line in tokens: - yield type, string, start, end, line - if type == tokenize.OP and string == ']': - break - elif type == tokenize.OP and string in ALLOWED_OPS: - if last_string: - yield tokenize.NAME, last_string, last_start, last_end, last_line - last_string = None - yield type, string, start, end, line - elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: - if not last_string: - last_string = string - last_start = start - last_end = end - else: - last_string += string - if last_string: - yield tokenize.NAME, last_string, last_start, last_end, last_line - - def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): - selectors = [] - current_selector = None - for type, string, start, _, _ in tokens: - # ENCODING is only defined in python 3.x - if type == getattr(tokenize, 'ENCODING', None): - continue - elif type in [tokenize.NAME, tokenize.NUMBER]: - current_selector = FormatSelector(SINGLE, string, []) - elif type == tokenize.OP: - if string == ')': - if not inside_group: - # ')' will be handled by the parentheses group - tokens.restore_last_token() - break - elif inside_merge and string in ['/', ',']: - tokens.restore_last_token() - break - elif inside_choice and string == ',': - tokens.restore_last_token() - break - elif string == ',': - if not current_selector: - raise syntax_error('"," must follow a format selector', start) - selectors.append(current_selector) - current_selector = None - elif string == '/': - if not current_selector: - raise syntax_error('"/" must follow a format selector', start) - first_choice = current_selector - second_choice = _parse_format_selection(tokens, inside_choice=True) - current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) - elif string == '[': - if not current_selector: - current_selector = FormatSelector(SINGLE, 'best', []) - format_filter = _parse_filter(tokens) - current_selector.filters.append(format_filter) - elif string == '(': - if current_selector: - raise syntax_error('Unexpected "("', start) - group = _parse_format_selection(tokens, inside_group=True) - current_selector = FormatSelector(GROUP, group, []) - elif string == '+': - video_selector = current_selector - audio_selector = _parse_format_selection(tokens, inside_merge=True) - if not video_selector or not audio_selector: - raise syntax_error('"+" must be between two format selectors', start) - current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) - else: - raise syntax_error('Operator not recognized: "{0}"'.format(string), start) - elif type == tokenize.ENDMARKER: - break - if current_selector: - selectors.append(current_selector) - return selectors - - def _build_selector_function(selector): - if isinstance(selector, list): - fs = [_build_selector_function(s) for s in selector] - - def selector_function(ctx): - for f in fs: - for format in f(ctx): - yield format - return selector_function - elif selector.type == GROUP: - selector_function = _build_selector_function(selector.selector) - elif selector.type == PICKFIRST: - fs = [_build_selector_function(s) for s in selector.selector] - - def selector_function(ctx): - for f in fs: - picked_formats = list(f(ctx)) - if picked_formats: - return picked_formats - return [] - elif selector.type == SINGLE: - format_spec = selector.selector - - def selector_function(ctx): - formats = list(ctx['formats']) - if not formats: - return - if format_spec == 'all': - for f in formats: - yield f - elif format_spec in ['best', 'worst', None]: - format_idx = 0 if format_spec == 'worst' else -1 - audiovideo_formats = [ - f for f in formats - if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] - if audiovideo_formats: - yield audiovideo_formats[format_idx] - # for extractors with incomplete formats (audio only (soundcloud) - # or video only (imgur)) we will fallback to best/worst - # {video,audio}-only format - elif ctx['incomplete_formats']: - yield formats[format_idx] - elif format_spec == 'bestaudio': - audio_formats = [ - f for f in formats - if f.get('vcodec') == 'none'] - if audio_formats: - yield audio_formats[-1] - elif format_spec == 'worstaudio': - audio_formats = [ - f for f in formats - if f.get('vcodec') == 'none'] - if audio_formats: - yield audio_formats[0] - elif format_spec == 'bestvideo': - video_formats = [ - f for f in formats - if f.get('acodec') == 'none'] - if video_formats: - yield video_formats[-1] - elif format_spec == 'worstvideo': - video_formats = [ - f for f in formats - if f.get('acodec') == 'none'] - if video_formats: - yield video_formats[0] - else: - extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] - if format_spec in extensions: - filter_f = lambda f: f['ext'] == format_spec - else: - filter_f = lambda f: f['format_id'] == format_spec - matches = list(filter(filter_f, formats)) - if matches: - yield matches[-1] - elif selector.type == MERGE: - def _merge(formats_info): - format_1, format_2 = [f['format_id'] for f in formats_info] - # The first format must contain the video and the - # second the audio - if formats_info[0].get('vcodec') == 'none': - self.report_error('The first format must ' - 'contain the video, try using ' - '"-f %s+%s"' % (format_2, format_1)) - return - # Formats must be opposite (video+audio) - if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none': - self.report_error( - 'Both formats %s and %s are video-only, you must specify "-f video+audio"' - % (format_1, format_2)) - return - output_ext = ( - formats_info[0]['ext'] - if self.params.get('merge_output_format') is None - else self.params['merge_output_format']) - return { - 'requested_formats': formats_info, - 'format': '%s+%s' % (formats_info[0].get('format'), - formats_info[1].get('format')), - 'format_id': '%s+%s' % (formats_info[0].get('format_id'), - formats_info[1].get('format_id')), - 'width': formats_info[0].get('width'), - 'height': formats_info[0].get('height'), - 'resolution': formats_info[0].get('resolution'), - 'fps': formats_info[0].get('fps'), - 'vcodec': formats_info[0].get('vcodec'), - 'vbr': formats_info[0].get('vbr'), - 'stretched_ratio': formats_info[0].get('stretched_ratio'), - 'acodec': formats_info[1].get('acodec'), - 'abr': formats_info[1].get('abr'), - 'ext': output_ext, - } - video_selector, audio_selector = map(_build_selector_function, selector.selector) - - def selector_function(ctx): - for pair in itertools.product( - video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))): - yield _merge(pair) - - filters = [self._build_format_filter(f) for f in selector.filters] - - def final_selector(ctx): - ctx_copy = copy.deepcopy(ctx) - for _filter in filters: - ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) - return selector_function(ctx_copy) - return final_selector - - stream = io.BytesIO(format_spec.encode('utf-8')) - try: - tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) - except tokenize.TokenError: - raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) - - class TokenIterator(object): - def __init__(self, tokens): - self.tokens = tokens - self.counter = 0 - - def __iter__(self): - return self - - def __next__(self): - if self.counter >= len(self.tokens): - raise StopIteration() - value = self.tokens[self.counter] - self.counter += 1 - return value - - next = __next__ - - def restore_last_token(self): - self.counter -= 1 - - parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) - return _build_selector_function(parsed_selector) - - def _calc_headers(self, info_dict): - res = std_headers.copy() - - add_headers = info_dict.get('http_headers') - if add_headers: - res.update(add_headers) - - cookies = self._calc_cookies(info_dict) - if cookies: - res['Cookie'] = cookies - - if 'X-Forwarded-For' not in res: - x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') - if x_forwarded_for_ip: - res['X-Forwarded-For'] = x_forwarded_for_ip - - return res - - def _calc_cookies(self, info_dict): - pr = sanitized_Request(info_dict['url']) - self.cookiejar.add_cookie_header(pr) - return pr.get_header('Cookie') - - def process_video_result(self, info_dict, download=True): - assert info_dict.get('_type', 'video') == 'video' - - if 'id' not in info_dict: - raise ExtractorError('Missing "id" field in extractor result') - if 'title' not in info_dict: - raise ExtractorError('Missing "title" field in extractor result') - - def report_force_conversion(field, field_not, conversion): - self.report_warning( - '"%s" field is not %s - forcing %s conversion, there is an error in extractor' - % (field, field_not, conversion)) - - def sanitize_string_field(info, string_field): - field = info.get(string_field) - if field is None or isinstance(field, compat_str): - return - report_force_conversion(string_field, 'a string', 'string') - info[string_field] = compat_str(field) - - def sanitize_numeric_fields(info): - for numeric_field in self._NUMERIC_FIELDS: - field = info.get(numeric_field) - if field is None or isinstance(field, compat_numeric_types): - continue - report_force_conversion(numeric_field, 'numeric', 'int') - info[numeric_field] = int_or_none(field) - - sanitize_string_field(info_dict, 'id') - sanitize_numeric_fields(info_dict) - - if 'playlist' not in info_dict: - # It isn't part of a playlist - info_dict['playlist'] = None - info_dict['playlist_index'] = None - - thumbnails = info_dict.get('thumbnails') - if thumbnails is None: - thumbnail = info_dict.get('thumbnail') - if thumbnail: - info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] - if thumbnails: - thumbnails.sort(key=lambda t: ( - t.get('preference') if t.get('preference') is not None else -1, - t.get('width') if t.get('width') is not None else -1, - t.get('height') if t.get('height') is not None else -1, - t.get('id') if t.get('id') is not None else '', t.get('url'))) - for i, t in enumerate(thumbnails): - t['url'] = sanitize_url(t['url']) - if t.get('width') and t.get('height'): - t['resolution'] = '%dx%d' % (t['width'], t['height']) - if t.get('id') is None: - t['id'] = '%d' % i - - if self.params.get('list_thumbnails'): - self.list_thumbnails(info_dict) - return - - thumbnail = info_dict.get('thumbnail') - if thumbnail: - info_dict['thumbnail'] = sanitize_url(thumbnail) - elif thumbnails: - info_dict['thumbnail'] = thumbnails[-1]['url'] - - if 'display_id' not in info_dict and 'id' in info_dict: - info_dict['display_id'] = info_dict['id'] - - if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around out-of-range timestamp values (e.g. negative ones on Windows, - # see http://bugs.python.org/issue1646728) - try: - upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass - - # Auto generate title fields corresponding to the *_number fields when missing - # in order to always have clean titles. This is very common for TV series. - for field in ('chapter', 'season', 'episode'): - if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): - info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) - - for cc_kind in ('subtitles', 'automatic_captions'): - cc = info_dict.get(cc_kind) - if cc: - for _, subtitle in cc.items(): - for subtitle_format in subtitle: - if subtitle_format.get('url'): - subtitle_format['url'] = sanitize_url(subtitle_format['url']) - if subtitle_format.get('ext') is None: - subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() - - automatic_captions = info_dict.get('automatic_captions') - subtitles = info_dict.get('subtitles') - - if self.params.get('listsubtitles', False): - if 'automatic_captions' in info_dict: - self.list_subtitles( - info_dict['id'], automatic_captions, 'automatic captions') - self.list_subtitles(info_dict['id'], subtitles, 'subtitles') - return - - info_dict['requested_subtitles'] = self.process_subtitles( - info_dict['id'], subtitles, automatic_captions) - - # We now pick which formats have to be downloaded - if info_dict.get('formats') is None: - # There's only one format available - formats = [info_dict] - else: - formats = info_dict['formats'] - - if not formats: - raise ExtractorError('No video formats found!') - - def is_wellformed(f): - url = f.get('url') - if not url: - self.report_warning( - '"url" field is missing or empty - skipping format, ' - 'there is an error in extractor') - return False - if isinstance(url, bytes): - sanitize_string_field(f, 'url') - return True - - # Filter out malformed formats for better extraction robustness - formats = list(filter(is_wellformed, formats)) - - formats_dict = {} - - # We check that all the formats have the format and format_id fields - for i, format in enumerate(formats): - sanitize_string_field(format, 'format_id') - sanitize_numeric_fields(format) - format['url'] = sanitize_url(format['url']) - if not format.get('format_id'): - format['format_id'] = compat_str(i) - else: - # Sanitize format_id from characters used in format selector expression - format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id']) - format_id = format['format_id'] - if format_id not in formats_dict: - formats_dict[format_id] = [] - formats_dict[format_id].append(format) - - # Make sure all formats have unique format_id - for format_id, ambiguous_formats in formats_dict.items(): - if len(ambiguous_formats) > 1: - for i, format in enumerate(ambiguous_formats): - format['format_id'] = '%s-%d' % (format_id, i) - - for i, format in enumerate(formats): - if format.get('format') is None: - format['format'] = '{id} - {res}{note}'.format( - id=format['format_id'], - res=self.format_resolution(format), - note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', - ) - # Automatically determine file extension if missing - if format.get('ext') is None: - format['ext'] = determine_ext(format['url']).lower() - # Automatically determine protocol if missing (useful for format - # selection purposes) - if format.get('protocol') is None: - format['protocol'] = determine_protocol(format) - # Add HTTP headers, so that external programs can use them from the - # json output - full_format_info = info_dict.copy() - full_format_info.update(format) - format['http_headers'] = self._calc_headers(full_format_info) - # Remove private housekeeping stuff - if '__x_forwarded_for_ip' in info_dict: - del info_dict['__x_forwarded_for_ip'] - - # TODO Central sorting goes here - - if formats[0] is not info_dict: - # only set the 'formats' fields if the original info_dict list them - # otherwise we end up with a circular reference, the first (and unique) - # element in the 'formats' field in info_dict is info_dict itself, - # which can't be exported to json - info_dict['formats'] = formats - if self.params.get('listformats'): - self.list_formats(info_dict) - return - - req_format = self.params.get('format') - if req_format is None: - req_format = self._default_format_spec(info_dict, download=download) - if self.params.get('verbose'): - self.to_stdout('[debug] Default format spec: %s' % req_format) - - format_selector = self.build_format_selector(req_format) - - # While in format selection we may need to have an access to the original - # format set in order to calculate some metrics or do some processing. - # For now we need to be able to guess whether original formats provided - # by extractor are incomplete or not (i.e. whether extractor provides only - # video-only or audio-only formats) for proper formats selection for - # extractors with such incomplete formats (see - # https://github.com/rg3/youtube-dl/pull/5556). - # Since formats may be filtered during format selection and may not match - # the original formats the results may be incorrect. Thus original formats - # or pre-calculated metrics should be passed to format selection routines - # as well. - # We will pass a context object containing all necessary additional data - # instead of just formats. - # This fixes incorrect format selection issue (see - # https://github.com/rg3/youtube-dl/issues/10083). - incomplete_formats = ( - # All formats are video-only or - all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or - # all formats are audio-only - all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) - - ctx = { - 'formats': formats, - 'incomplete_formats': incomplete_formats, - } - - formats_to_download = list(format_selector(ctx)) - if not formats_to_download: - raise ExtractorError('requested format not available', - expected=True) - - if download: - if len(formats_to_download) > 1: - self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download))) - for format in formats_to_download: - new_info = dict(info_dict) - new_info.update(format) - self.process_info(new_info) - # We update the info dict with the best quality format (backwards compatibility) - info_dict.update(formats_to_download[-1]) - return info_dict - - def process_subtitles(self, video_id, normal_subtitles, automatic_captions): - """Select the requested subtitles and their format""" - available_subs = {} - if normal_subtitles and self.params.get('writesubtitles'): - available_subs.update(normal_subtitles) - if automatic_captions and self.params.get('writeautomaticsub'): - for lang, cap_info in automatic_captions.items(): - if lang not in available_subs: - available_subs[lang] = cap_info - - if (not self.params.get('writesubtitles') and not - self.params.get('writeautomaticsub') or not - available_subs): - return None - - if self.params.get('allsubtitles', False): - requested_langs = available_subs.keys() - else: - if self.params.get('subtitleslangs', False): - requested_langs = self.params.get('subtitleslangs') - elif 'en' in available_subs: - requested_langs = ['en'] - else: - requested_langs = [list(available_subs.keys())[0]] - - formats_query = self.params.get('subtitlesformat', 'best') - formats_preference = formats_query.split('/') if formats_query else [] - subs = {} - for lang in requested_langs: - formats = available_subs.get(lang) - if formats is None: - self.report_warning('%s subtitles not available for %s' % (lang, video_id)) - continue - for ext in formats_preference: - if ext == 'best': - f = formats[-1] - break - matches = list(filter(lambda f: f['ext'] == ext, formats)) - if matches: - f = matches[-1] - break - else: - f = formats[-1] - self.report_warning( - 'No subtitle format found matching "%s" for language %s, ' - 'using %s' % (formats_query, lang, f['ext'])) - subs[lang] = f - return subs - - def process_info(self, info_dict): - """Process a single resolved IE result.""" - - assert info_dict.get('_type', 'video') == 'video' - - max_downloads = self.params.get('max_downloads') - if max_downloads is not None: - if self._num_downloads >= int(max_downloads): - raise MaxDownloadsReached() - - info_dict['fulltitle'] = info_dict['title'] - if len(info_dict['title']) > 200: - info_dict['title'] = info_dict['title'][:197] + '...' - - if 'format' not in info_dict: - info_dict['format'] = info_dict['ext'] - - reason = self._match_entry(info_dict, incomplete=False) - if reason is not None: - self.to_screen('[download] ' + reason) - return - - self._num_downloads += 1 - - info_dict['_filename'] = filename = self.prepare_filename(info_dict) - - # Forced printings - if self.params.get('forcetitle', False): - self.to_stdout(info_dict['fulltitle']) - if self.params.get('forceid', False): - self.to_stdout(info_dict['id']) - if self.params.get('forceurl', False): - if info_dict.get('requested_formats') is not None: - for f in info_dict['requested_formats']: - self.to_stdout(f['url'] + f.get('play_path', '')) - else: - # For RTMP URLs, also include the playpath - self.to_stdout(info_dict['url'] + info_dict.get('play_path', '')) - if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None: - self.to_stdout(info_dict['thumbnail']) - if self.params.get('forcedescription', False) and info_dict.get('description') is not None: - self.to_stdout(info_dict['description']) - if self.params.get('forcefilename', False) and filename is not None: - self.to_stdout(filename) - if self.params.get('forceduration', False) and info_dict.get('duration') is not None: - self.to_stdout(formatSeconds(info_dict['duration'])) - if self.params.get('forceformat', False): - self.to_stdout(info_dict['format']) - if self.params.get('forcejson', False): - self.to_stdout(json.dumps(info_dict)) - - # Do nothing else if in simulate mode - if self.params.get('simulate', False): - return - - if filename is None: - return - - def ensure_dir_exists(path): - try: - dn = os.path.dirname(path) - if dn and not os.path.exists(dn): - os.makedirs(dn) - return True - except (OSError, IOError) as err: - self.report_error('unable to create directory ' + error_to_compat_str(err)) - return False - - if not ensure_dir_exists(sanitize_path(encodeFilename(filename))): - return - - if self.params.get('writedescription', False): - descfn = replace_extension(filename, 'description', info_dict.get('ext')) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)): - self.to_screen('[info] Video description is already present') - elif info_dict.get('description') is None: - self.report_warning('There\'s no description to write.') - else: - try: - self.to_screen('[info] Writing video description to: ' + descfn) - with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: - descfile.write(info_dict['description']) - except (OSError, IOError): - self.report_error('Cannot write description file ' + descfn) - return - - if self.params.get('writeannotations', False): - annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext')) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)): - self.to_screen('[info] Video annotations are already present') - else: - try: - self.to_screen('[info] Writing video annotations to: ' + annofn) - with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: - annofile.write(info_dict['annotations']) - except (KeyError, TypeError): - self.report_warning('There are no annotations to write.') - except (OSError, IOError): - self.report_error('Cannot write annotations file: ' + annofn) - return - - subtitles_are_requested = any([self.params.get('writesubtitles', False), - self.params.get('writeautomaticsub')]) - - if subtitles_are_requested and info_dict.get('requested_subtitles'): - # subtitles download errors are already managed as troubles in relevant IE - # that way it will silently go on when used with unsupporting IE - subtitles = info_dict['requested_subtitles'] - ie = self.get_info_extractor(info_dict['extractor_key']) - for sub_lang, sub_info in subtitles.items(): - sub_format = sub_info['ext'] - sub_filename = subtitles_filename(filename, sub_lang, sub_format) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): - self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) - else: - self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - if sub_info.get('data') is not None: - try: - # Use newline='' to prevent conversion of newline characters - # See https://github.com/rg3/youtube-dl/issues/10268 - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: - subfile.write(sub_info['data']) - except (OSError, IOError): - self.report_error('Cannot write subtitles file ' + sub_filename) - return - else: - try: - sub_data = ie._request_webpage( - sub_info['url'], info_dict['id'], note=False).read() - with io.open(encodeFilename(sub_filename), 'wb') as subfile: - subfile.write(sub_data) - except (ExtractorError, IOError, OSError, ValueError) as err: - self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, error_to_compat_str(err))) - continue - - if self.params.get('writeinfojson', False): - infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): - self.to_screen('[info] Video description metadata is already present') - else: - self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn) - try: - write_json_file(self.filter_requested_info(info_dict), infofn) - except (OSError, IOError): - self.report_error('Cannot write metadata to JSON file ' + infofn) - return - - self._write_thumbnails(info_dict, filename) - - if not self.params.get('skip_download', False): - try: - def dl(name, info): - fd = get_suitable_downloader(info, self.params)(self, self.params) - for ph in self._progress_hooks: - fd.add_progress_hook(ph) - if self.params.get('verbose'): - self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) - return fd.download(name, info) - - if info_dict.get('requested_formats') is not None: - downloaded = [] - success = True - merger = FFmpegMergerPP(self) - if not merger.available: - postprocessors = [] - self.report_warning('You have requested multiple ' - 'formats but ffmpeg or avconv are not installed.' - ' The formats won\'t be merged.') - else: - postprocessors = [merger] - - def compatible_formats(formats): - video, audio = formats - # Check extension - video_ext, audio_ext = video.get('ext'), audio.get('ext') - if video_ext and audio_ext: - COMPATIBLE_EXTS = ( - ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'), - ('webm') - ) - for exts in COMPATIBLE_EXTS: - if video_ext in exts and audio_ext in exts: - return True - # TODO: Check acodec/vcodec - return False - - filename_real_ext = os.path.splitext(filename)[1][1:] - filename_wo_ext = ( - os.path.splitext(filename)[0] - if filename_real_ext == info_dict['ext'] - else filename) - requested_formats = info_dict['requested_formats'] - if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats): - info_dict['ext'] = 'mkv' - self.report_warning( - 'Requested formats are incompatible for merge and will be merged into mkv.') - # Ensure filename always has a correct extension for successful merge - filename = '%s.%s' % (filename_wo_ext, info_dict['ext']) - if os.path.exists(encodeFilename(filename)): - self.to_screen( - '[download] %s has already been downloaded and ' - 'merged' % filename) - else: - for f in requested_formats: - new_info = dict(info_dict) - new_info.update(f) - fname = prepend_extension( - self.prepare_filename(new_info), - 'f%s' % f['format_id'], new_info['ext']) - if not ensure_dir_exists(fname): - return - downloaded.append(fname) - partial_success = dl(fname, new_info) - success = success and partial_success - info_dict['__postprocessors'] = postprocessors - info_dict['__files_to_merge'] = downloaded - else: - # Just a single file - success = dl(filename, info_dict) - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_error('unable to download video data: %s' % error_to_compat_str(err)) - return - except (OSError, IOError) as err: - raise UnavailableVideoError(err) - except (ContentTooShortError, ) as err: - self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) - return - - if success and filename != '-': - # Fixup content - fixup_policy = self.params.get('fixup') - if fixup_policy is None: - fixup_policy = 'detect_or_warn' - - INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.' - - stretched_ratio = info_dict.get('stretched_ratio') - if stretched_ratio is not None and stretched_ratio != 1: - if fixup_policy == 'warn': - self.report_warning('%s: Non-uniform pixel ratio (%s)' % ( - info_dict['id'], stretched_ratio)) - elif fixup_policy == 'detect_or_warn': - stretched_pp = FFmpegFixupStretchedPP(self) - if stretched_pp.available: - info_dict.setdefault('__postprocessors', []) - info_dict['__postprocessors'].append(stretched_pp) - else: - self.report_warning( - '%s: Non-uniform pixel ratio (%s). %s' - % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE)) - else: - assert fixup_policy in ('ignore', 'never') - - if (info_dict.get('requested_formats') is None and - info_dict.get('container') == 'm4a_dash'): - if fixup_policy == 'warn': - self.report_warning( - '%s: writing DASH m4a. ' - 'Only some players support this container.' - % info_dict['id']) - elif fixup_policy == 'detect_or_warn': - fixup_pp = FFmpegFixupM4aPP(self) - if fixup_pp.available: - info_dict.setdefault('__postprocessors', []) - info_dict['__postprocessors'].append(fixup_pp) - else: - self.report_warning( - '%s: writing DASH m4a. ' - 'Only some players support this container. %s' - % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) - else: - assert fixup_policy in ('ignore', 'never') - - if (info_dict.get('protocol') == 'm3u8_native' or - info_dict.get('protocol') == 'm3u8' and - self.params.get('hls_prefer_native')): - if fixup_policy == 'warn': - self.report_warning('%s: malformed AAC bitstream detected.' % ( - info_dict['id'])) - elif fixup_policy == 'detect_or_warn': - fixup_pp = FFmpegFixupM3u8PP(self) - if fixup_pp.available: - info_dict.setdefault('__postprocessors', []) - info_dict['__postprocessors'].append(fixup_pp) - else: - self.report_warning( - '%s: malformed AAC bitstream detected. %s' - % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) - else: - assert fixup_policy in ('ignore', 'never') - - try: - self.post_process(filename, info_dict) - except (PostProcessingError) as err: - self.report_error('postprocessing: %s' % str(err)) - return - self.record_download_archive(info_dict) - - def download(self, url_list): - """Download a given list of URLs.""" - outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) - if (len(url_list) > 1 and - outtmpl != '-' and - '%' not in outtmpl and - self.params.get('max_downloads') != 1): - raise SameFileError(outtmpl) - - for url in url_list: - try: - # It also downloads the videos - res = self.extract_info( - url, force_generic_extractor=self.params.get('force_generic_extractor', False)) - except UnavailableVideoError: - self.report_error('unable to download video') - except MaxDownloadsReached: - self.to_screen('[info] Maximum number of downloaded files reached.') - raise - else: - if self.params.get('dump_single_json', False): - self.to_stdout(json.dumps(res)) - - return self._download_retcode - - def download_with_info_file(self, info_filename): - with contextlib.closing(fileinput.FileInput( - [info_filename], mode='r', - openhook=fileinput.hook_encoded('utf-8'))) as f: - # FileInput doesn't have a read method, we can't call json.load - info = self.filter_requested_info(json.loads('\n'.join(f))) - try: - self.process_ie_result(info, download=True) - except DownloadError: - webpage_url = info.get('webpage_url') - if webpage_url is not None: - self.report_warning('The info failed to download, trying with "%s"' % webpage_url) - return self.download([webpage_url]) - else: - raise - return self._download_retcode - - @staticmethod - def filter_requested_info(info_dict): - return dict( - (k, v) for k, v in info_dict.items() - if k not in ['requested_formats', 'requested_subtitles']) - - def post_process(self, filename, ie_info): - """Run all the postprocessors on the given file.""" - info = dict(ie_info) - info['filepath'] = filename - pps_chain = [] - if ie_info.get('__postprocessors') is not None: - pps_chain.extend(ie_info['__postprocessors']) - pps_chain.extend(self._pps) - for pp in pps_chain: - files_to_delete = [] - try: - files_to_delete, info = pp.run(info) - except PostProcessingError as e: - self.report_error(e.msg) - if files_to_delete and not self.params.get('keepvideo', False): - for old_filename in files_to_delete: - self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) - try: - os.remove(encodeFilename(old_filename)) - except (IOError, OSError): - self.report_warning('Unable to remove downloaded original file') - - def _make_archive_id(self, info_dict): - # Future-proof against any change in case - # and backwards compatibility with prior versions - extractor = info_dict.get('extractor_key') - if extractor is None: - if 'id' in info_dict: - extractor = info_dict.get('ie_key') # key in a playlist - if extractor is None: - return None # Incomplete video information - return extractor.lower() + ' ' + info_dict['id'] - - def in_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: - return False - - vid_id = self._make_archive_id(info_dict) - if vid_id is None: - return False # Incomplete video information - - try: - with locked_file(fn, 'r', encoding='utf-8') as archive_file: - for line in archive_file: - if line.strip() == vid_id: - return True - except IOError as ioe: - if ioe.errno != errno.ENOENT: - raise - return False - - def record_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: - return - vid_id = self._make_archive_id(info_dict) - assert vid_id - with locked_file(fn, 'a', encoding='utf-8') as archive_file: - archive_file.write(vid_id + '\n') - - @staticmethod - def format_resolution(format, default='unknown'): - if format.get('vcodec') == 'none': - return 'audio only' - if format.get('resolution') is not None: - return format['resolution'] - if format.get('height') is not None: - if format.get('width') is not None: - res = '%sx%s' % (format['width'], format['height']) - else: - res = '%sp' % format['height'] - elif format.get('width') is not None: - res = '%dx?' % format['width'] - else: - res = default - return res - - def _format_note(self, fdict): - res = '' - if fdict.get('ext') in ['f4f', 'f4m']: - res += '(unsupported) ' - if fdict.get('language'): - if res: - res += ' ' - res += '[%s] ' % fdict['language'] - if fdict.get('format_note') is not None: - res += fdict['format_note'] + ' ' - if fdict.get('tbr') is not None: - res += '%4dk ' % fdict['tbr'] - if fdict.get('container') is not None: - if res: - res += ', ' - res += '%s container' % fdict['container'] - if (fdict.get('vcodec') is not None and - fdict.get('vcodec') != 'none'): - if res: - res += ', ' - res += fdict['vcodec'] - if fdict.get('vbr') is not None: - res += '@' - elif fdict.get('vbr') is not None and fdict.get('abr') is not None: - res += 'video@' - if fdict.get('vbr') is not None: - res += '%4dk' % fdict['vbr'] - if fdict.get('fps') is not None: - if res: - res += ', ' - res += '%sfps' % fdict['fps'] - if fdict.get('acodec') is not None: - if res: - res += ', ' - if fdict['acodec'] == 'none': - res += 'video only' - else: - res += '%-5s' % fdict['acodec'] - elif fdict.get('abr') is not None: - if res: - res += ', ' - res += 'audio' - if fdict.get('abr') is not None: - res += '@%3dk' % fdict['abr'] - if fdict.get('asr') is not None: - res += ' (%5dHz)' % fdict['asr'] - if fdict.get('filesize') is not None: - if res: - res += ', ' - res += format_bytes(fdict['filesize']) - elif fdict.get('filesize_approx') is not None: - if res: - res += ', ' - res += '~' + format_bytes(fdict['filesize_approx']) - return res - - def list_formats(self, info_dict): - formats = info_dict.get('formats', [info_dict]) - table = [ - [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)] - for f in formats - if f.get('preference') is None or f['preference'] >= -1000] - if len(formats) > 1: - table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)' - - header_line = ['format code', 'extension', 'resolution', 'note'] - self.to_screen( - '[info] Available formats for %s:\n%s' % - (info_dict['id'], render_table(header_line, table))) - - def list_thumbnails(self, info_dict): - thumbnails = info_dict.get('thumbnails') - if not thumbnails: - self.to_screen('[info] No thumbnails present for %s' % info_dict['id']) - return - - self.to_screen( - '[info] Thumbnails for %s:' % info_dict['id']) - self.to_screen(render_table( - ['ID', 'width', 'height', 'URL'], - [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) - - def list_subtitles(self, video_id, subtitles, name='subtitles'): - if not subtitles: - self.to_screen('%s has no %s' % (video_id, name)) - return - self.to_screen( - 'Available %s for %s:' % (name, video_id)) - self.to_screen(render_table( - ['Language', 'formats'], - [[lang, ', '.join(f['ext'] for f in reversed(formats))] - for lang, formats in subtitles.items()])) - - def urlopen(self, req): - """ Start an HTTP download """ - if isinstance(req, compat_basestring): - req = sanitized_Request(req) - return self._opener.open(req, timeout=self._socket_timeout) - - def print_debug_header(self): - if not self.params.get('verbose'): - return - - if type('') is not compat_str: - # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326) - self.report_warning( - 'Your Python is broken! Update to a newer and supported version') - - stdout_encoding = getattr( - sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__) - encoding_str = ( - '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % ( - locale.getpreferredencoding(), - sys.getfilesystemencoding(), - stdout_encoding, - self.get_encoding())) - write_string(encoding_str, encoding=None) - - self._write_string('[debug] youtube-dl version ' + __version__ + '\n') - if _LAZY_LOADER: - self._write_string('[debug] Lazy loading extractors enabled' + '\n') - try: - sp = subprocess.Popen( - ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate() - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - self._write_string('[debug] Git HEAD: ' + out + '\n') - except Exception: - try: - sys.exc_clear() - except Exception: - pass - - def python_implementation(): - impl_name = platform.python_implementation() - if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'): - return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] - return impl_name - - self._write_string('[debug] Python version %s (%s) - %s\n' % ( - platform.python_version(), python_implementation(), - platform_name())) - - exe_versions = FFmpegPostProcessor.get_versions(self) - exe_versions['rtmpdump'] = rtmpdump_version() - exe_versions['phantomjs'] = PhantomJSwrapper._version() - exe_str = ', '.join( - '%s %s' % (exe, v) - for exe, v in sorted(exe_versions.items()) - if v - ) - if not exe_str: - exe_str = 'none' - self._write_string('[debug] exe versions: %s\n' % exe_str) - - proxy_map = {} - for handler in self._opener.handlers: - if hasattr(handler, 'proxies'): - proxy_map.update(handler.proxies) - self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') - - if self.params.get('call_home', False): - ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') - self._write_string('[debug] Public IP address: %s\n' % ipaddr) - latest_version = self.urlopen( - 'https://yt-dl.org/latest/version').read().decode('utf-8') - if version_tuple(latest_version) > version_tuple(__version__): - self.report_warning( - 'You are using an outdated version (newest version: %s)! ' - 'See https://yt-dl.org/update if you need help updating.' % - latest_version) - - def _setup_opener(self): - timeout_val = self.params.get('socket_timeout') - self._socket_timeout = 600 if timeout_val is None else float(timeout_val) - - opts_cookiefile = self.params.get('cookiefile') - opts_proxy = self.params.get('proxy') - - if opts_cookiefile is None: - self.cookiejar = compat_cookiejar.CookieJar() - else: - opts_cookiefile = expand_path(opts_cookiefile) - self.cookiejar = compat_cookiejar.MozillaCookieJar( - opts_cookiefile) - if os.access(opts_cookiefile, os.R_OK): - self.cookiejar.load() - - cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) - if opts_proxy is not None: - if opts_proxy == '': - proxies = {} - else: - proxies = {'http': opts_proxy, 'https': opts_proxy} - else: - proxies = compat_urllib_request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) - if 'http' in proxies and 'https' not in proxies: - proxies['https'] = proxies['http'] - proxy_handler = PerRequestProxyHandler(proxies) - - debuglevel = 1 if self.params.get('debug_printtraffic') else 0 - https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) - ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) - data_handler = compat_urllib_request_DataHandler() - - # When passing our own FileHandler instance, build_opener won't add the - # default FileHandler and allows us to disable the file protocol, which - # can be used for malicious purposes (see - # https://github.com/rg3/youtube-dl/issues/8227) - file_handler = compat_urllib_request.FileHandler() - - def file_open(*args, **kwargs): - raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons') - file_handler.file_open = file_open - - opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler) - - # Delete the default user-agent header, which would otherwise apply in - # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/rg3/youtube-dl/issues/1309 for details) - opener.addheaders = [] - self._opener = opener - - def encode(self, s): - if isinstance(s, bytes): - return s # Already encoded - - try: - return s.encode(self.get_encoding()) - except UnicodeEncodeError as err: - err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.' - raise - - def get_encoding(self): - encoding = self.params.get('encoding') - if encoding is None: - encoding = preferredencoding() - return encoding - - def _write_thumbnails(self, info_dict, filename): - if self.params.get('writethumbnail', False): - thumbnails = info_dict.get('thumbnails') - if thumbnails: - thumbnails = [thumbnails[-1]] - elif self.params.get('write_all_thumbnails', False): - thumbnails = info_dict.get('thumbnails') - else: - return - - if not thumbnails: - # No thumbnails present, so return immediately - return - - for t in thumbnails: - thumb_ext = determine_ext(t['url'], 'jpg') - suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' - thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' - t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext - - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): - self.to_screen('[%s] %s: Thumbnail %sis already present' % - (info_dict['extractor'], info_dict['id'], thumb_display_id)) - else: - self.to_screen('[%s] %s: Downloading thumbnail %s...' % - (info_dict['extractor'], info_dict['id'], thumb_display_id)) - try: - uf = self.urlopen(t['url']) - with open(encodeFilename(thumb_filename), 'wb') as thumbf: - shutil.copyfileobj(uf, thumbf) - self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % - (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_warning('Unable to download thumbnail "%s": %s' % - (t['url'], error_to_compat_str(err))) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py deleted file mode 100644 index ba435ea..0000000 --- a/youtube_dl/__init__.py +++ /dev/null @@ -1,481 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from __future__ import unicode_literals - -__license__ = 'Public Domain' - -import codecs -import io -import os -import random -import sys - - -from .options import ( - parseOpts, -) -from .compat import ( - compat_getpass, - compat_shlex_split, - workaround_optparse_bug9161, -) -from .utils import ( - DateRange, - decodeOption, - DEFAULT_OUTTMPL, - DownloadError, - expand_path, - match_filter_func, - MaxDownloadsReached, - preferredencoding, - read_batch_urls, - SameFileError, - setproctitle, - std_headers, - write_string, - render_table, -) -from .update import update_self -from .downloader import ( - FileDownloader, -) -from .extractor import gen_extractors, list_extractors -from .extractor.adobepass import MSO_INFO -from .YoutubeDL import YoutubeDL - - -def _real_main(argv=None): - # Compatibility fixes for Windows - if sys.platform == 'win32': - # https://github.com/rg3/youtube-dl/issues/820 - codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) - - workaround_optparse_bug9161() - - setproctitle('youtube-dl') - - parser, opts, args = parseOpts(argv) - - # Set user agent - if opts.user_agent is not None: - std_headers['User-Agent'] = opts.user_agent - - # Set referer - if opts.referer is not None: - std_headers['Referer'] = opts.referer - - # Custom HTTP headers - if opts.headers is not None: - for h in opts.headers: - if ':' not in h: - parser.error('wrong header formatting, it should be key:value, not "%s"' % h) - key, value = h.split(':', 1) - if opts.verbose: - write_string('[debug] Adding header from command line option %s:%s\n' % (key, value)) - std_headers[key] = value - - # Dump user agent - if opts.dump_user_agent: - write_string(std_headers['User-Agent'] + '\n', out=sys.stdout) - sys.exit(0) - - # Batch file verification - batch_urls = [] - if opts.batchfile is not None: - try: - if opts.batchfile == '-': - batchfd = sys.stdin - else: - batchfd = io.open( - expand_path(opts.batchfile), - 'r', encoding='utf-8', errors='ignore') - batch_urls = read_batch_urls(batchfd) - if opts.verbose: - write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') - except IOError: - sys.exit('ERROR: batch file could not be read') - all_urls = batch_urls + [url.strip() for url in args] # batch_urls are already striped in read_batch_urls - _enc = preferredencoding() - all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] - - if opts.list_extractors: - for ie in list_extractors(opts.age_limit): - write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout) - matchedUrls = [url for url in all_urls if ie.suitable(url)] - for mu in matchedUrls: - write_string(' ' + mu + '\n', out=sys.stdout) - sys.exit(0) - if opts.list_extractor_descriptions: - for ie in list_extractors(opts.age_limit): - if not ie._WORKING: - continue - desc = getattr(ie, 'IE_DESC', ie.IE_NAME) - if desc is False: - continue - if hasattr(ie, 'SEARCH_KEY'): - _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow') - _COUNTS = ('', '5', '10', 'all') - desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) - write_string(desc + '\n', out=sys.stdout) - sys.exit(0) - if opts.ap_list_mso: - table = [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()] - write_string('Supported TV Providers:\n' + render_table(['mso', 'mso name'], table) + '\n', out=sys.stdout) - sys.exit(0) - - # Conflicting, missing and erroneous options - if opts.usenetrc and (opts.username is not None or opts.password is not None): - parser.error('using .netrc conflicts with giving username/password') - if opts.password is not None and opts.username is None: - parser.error('account username missing\n') - if opts.ap_password is not None and opts.ap_username is None: - parser.error('TV Provider account username missing\n') - if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid): - parser.error('using output template conflicts with using title, video ID or auto number') - if opts.autonumber_size is not None: - if opts.autonumber_size <= 0: - parser.error('auto number size must be positive') - if opts.autonumber_start is not None: - if opts.autonumber_start < 0: - parser.error('auto number start must be positive or 0') - if opts.usetitle and opts.useid: - parser.error('using title conflicts with using video ID') - if opts.username is not None and opts.password is None: - opts.password = compat_getpass('Type account password and press [Return]: ') - if opts.ap_username is not None and opts.ap_password is None: - opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') - if opts.ratelimit is not None: - numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) - if numeric_limit is None: - parser.error('invalid rate limit specified') - opts.ratelimit = numeric_limit - if opts.min_filesize is not None: - numeric_limit = FileDownloader.parse_bytes(opts.min_filesize) - if numeric_limit is None: - parser.error('invalid min_filesize specified') - opts.min_filesize = numeric_limit - if opts.max_filesize is not None: - numeric_limit = FileDownloader.parse_bytes(opts.max_filesize) - if numeric_limit is None: - parser.error('invalid max_filesize specified') - opts.max_filesize = numeric_limit - if opts.sleep_interval is not None: - if opts.sleep_interval < 0: - parser.error('sleep interval must be positive or 0') - if opts.max_sleep_interval is not None: - if opts.max_sleep_interval < 0: - parser.error('max sleep interval must be positive or 0') - if opts.max_sleep_interval < opts.sleep_interval: - parser.error('max sleep interval must be greater than or equal to min sleep interval') - else: - opts.max_sleep_interval = opts.sleep_interval - if opts.ap_mso and opts.ap_mso not in MSO_INFO: - parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers') - - def parse_retries(retries): - if retries in ('inf', 'infinite'): - parsed_retries = float('inf') - else: - try: - parsed_retries = int(retries) - except (TypeError, ValueError): - parser.error('invalid retry count specified') - return parsed_retries - if opts.retries is not None: - opts.retries = parse_retries(opts.retries) - if opts.fragment_retries is not None: - opts.fragment_retries = parse_retries(opts.fragment_retries) - if opts.buffersize is not None: - numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) - if numeric_buffersize is None: - parser.error('invalid buffer size specified') - opts.buffersize = numeric_buffersize - if opts.http_chunk_size is not None: - numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size) - if not numeric_chunksize: - parser.error('invalid http chunk size specified') - opts.http_chunk_size = numeric_chunksize - if opts.playliststart <= 0: - raise ValueError('Playlist start must be positive') - if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: - raise ValueError('Playlist end must be greater than playlist start') - if opts.extractaudio: - if opts.audioformat not in ['best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: - parser.error('invalid audio format specified') - if opts.audioquality: - opts.audioquality = opts.audioquality.strip('k').strip('K') - if not opts.audioquality.isdigit(): - parser.error('invalid audio quality specified') - if opts.recodevideo is not None: - if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: - parser.error('invalid video recode format specified') - if opts.convertsubtitles is not None: - if opts.convertsubtitles not in ['srt', 'vtt', 'ass', 'lrc']: - parser.error('invalid subtitle format specified') - - if opts.date is not None: - date = DateRange.day(opts.date) - else: - date = DateRange(opts.dateafter, opts.datebefore) - - # Do not download videos when there are audio-only formats - if opts.extractaudio and not opts.keepvideo and opts.format is None: - opts.format = 'bestaudio/best' - - # --all-sub automatically sets --write-sub if --write-auto-sub is not given - # this was the old behaviour if only --all-sub was given. - if opts.allsubtitles and not opts.writeautomaticsub: - opts.writesubtitles = True - - outtmpl = ((opts.outtmpl is not None and opts.outtmpl) or - (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') or - (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') or - (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') or - (opts.usetitle and '%(title)s-%(id)s.%(ext)s') or - (opts.useid and '%(id)s.%(ext)s') or - (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') or - DEFAULT_OUTTMPL) - if not os.path.splitext(outtmpl)[1] and opts.extractaudio: - parser.error('Cannot download a video and extract audio into the same' - ' file! Use "{0}.%(ext)s" instead of "{0}" as the output' - ' template'.format(outtmpl)) - - any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json - any_printing = opts.print_json - download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive - - # PostProcessors - postprocessors = [] - if opts.metafromtitle: - postprocessors.append({ - 'key': 'MetadataFromTitle', - 'titleformat': opts.metafromtitle - }) - if opts.extractaudio: - postprocessors.append({ - 'key': 'FFmpegExtractAudio', - 'preferredcodec': opts.audioformat, - 'preferredquality': opts.audioquality, - 'nopostoverwrites': opts.nopostoverwrites, - }) - if opts.recodevideo: - postprocessors.append({ - 'key': 'FFmpegVideoConvertor', - 'preferedformat': opts.recodevideo, - }) - # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and - # FFmpegExtractAudioPP as containers before conversion may not support - # metadata (3gp, webm, etc.) - # And this post-processor should be placed before other metadata - # manipulating post-processors (FFmpegEmbedSubtitle) to prevent loss of - # extra metadata. By default ffmpeg preserves metadata applicable for both - # source and target containers. From this point the container won't change, - # so metadata can be added here. - if opts.addmetadata: - postprocessors.append({'key': 'FFmpegMetadata'}) - if opts.convertsubtitles: - postprocessors.append({ - 'key': 'FFmpegSubtitlesConvertor', - 'format': opts.convertsubtitles, - }) - if opts.embedsubtitles: - postprocessors.append({ - 'key': 'FFmpegEmbedSubtitle', - }) - if opts.embedthumbnail: - already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails - postprocessors.append({ - 'key': 'EmbedThumbnail', - 'already_have_thumbnail': already_have_thumbnail - }) - if not already_have_thumbnail: - opts.writethumbnail = True - # XAttrMetadataPP should be run after post-processors that may change file - # contents - if opts.xattrs: - postprocessors.append({'key': 'XAttrMetadata'}) - # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way. - # So if the user is able to remove the file before your postprocessor runs it might cause a few problems. - if opts.exec_cmd: - postprocessors.append({ - 'key': 'ExecAfterDownload', - 'exec_cmd': opts.exec_cmd, - }) - external_downloader_args = None - if opts.external_downloader_args: - external_downloader_args = compat_shlex_split(opts.external_downloader_args) - postprocessor_args = None - if opts.postprocessor_args: - postprocessor_args = compat_shlex_split(opts.postprocessor_args) - match_filter = ( - None if opts.match_filter is None - else match_filter_func(opts.match_filter)) - - ydl_opts = { - 'usenetrc': opts.usenetrc, - 'username': opts.username, - 'password': opts.password, - 'twofactor': opts.twofactor, - 'videopassword': opts.videopassword, - 'ap_mso': opts.ap_mso, - 'ap_username': opts.ap_username, - 'ap_password': opts.ap_password, - 'quiet': (opts.quiet or any_getting or any_printing), - 'no_warnings': opts.no_warnings, - 'forceurl': opts.geturl, - 'forcetitle': opts.gettitle, - 'forceid': opts.getid, - 'forcethumbnail': opts.getthumbnail, - 'forcedescription': opts.getdescription, - 'forceduration': opts.getduration, - 'forcefilename': opts.getfilename, - 'forceformat': opts.getformat, - 'forcejson': opts.dumpjson or opts.print_json, - 'dump_single_json': opts.dump_single_json, - 'simulate': opts.simulate or any_getting, - 'skip_download': opts.skip_download, - 'format': opts.format, - 'listformats': opts.listformats, - 'outtmpl': outtmpl, - 'autonumber_size': opts.autonumber_size, - 'autonumber_start': opts.autonumber_start, - 'restrictfilenames': opts.restrictfilenames, - 'ignoreerrors': opts.ignoreerrors, - 'force_generic_extractor': opts.force_generic_extractor, - 'ratelimit': opts.ratelimit, - 'nooverwrites': opts.nooverwrites, - 'retries': opts.retries, - 'fragment_retries': opts.fragment_retries, - 'skip_unavailable_fragments': opts.skip_unavailable_fragments, - 'keep_fragments': opts.keep_fragments, - 'buffersize': opts.buffersize, - 'noresizebuffer': opts.noresizebuffer, - 'http_chunk_size': opts.http_chunk_size, - 'continuedl': opts.continue_dl, - 'noprogress': opts.noprogress, - 'progress_with_newline': opts.progress_with_newline, - 'playliststart': opts.playliststart, - 'playlistend': opts.playlistend, - 'playlistreverse': opts.playlist_reverse, - 'playlistrandom': opts.playlist_random, - 'noplaylist': opts.noplaylist, - 'logtostderr': opts.outtmpl == '-', - 'consoletitle': opts.consoletitle, - 'nopart': opts.nopart, - 'updatetime': opts.updatetime, - 'writedescription': opts.writedescription, - 'writeannotations': opts.writeannotations, - 'writeinfojson': opts.writeinfojson, - 'writethumbnail': opts.writethumbnail, - 'write_all_thumbnails': opts.write_all_thumbnails, - 'writesubtitles': opts.writesubtitles, - 'writeautomaticsub': opts.writeautomaticsub, - 'allsubtitles': opts.allsubtitles, - 'listsubtitles': opts.listsubtitles, - 'subtitlesformat': opts.subtitlesformat, - 'subtitleslangs': opts.subtitleslangs, - 'matchtitle': decodeOption(opts.matchtitle), - 'rejecttitle': decodeOption(opts.rejecttitle), - 'max_downloads': opts.max_downloads, - 'prefer_free_formats': opts.prefer_free_formats, - 'verbose': opts.verbose, - 'dump_intermediate_pages': opts.dump_intermediate_pages, - 'write_pages': opts.write_pages, - 'test': opts.test, - 'keepvideo': opts.keepvideo, - 'min_filesize': opts.min_filesize, - 'max_filesize': opts.max_filesize, - 'min_views': opts.min_views, - 'max_views': opts.max_views, - 'daterange': date, - 'cachedir': opts.cachedir, - 'youtube_print_sig_code': opts.youtube_print_sig_code, - 'age_limit': opts.age_limit, - 'download_archive': download_archive_fn, - 'cookiefile': opts.cookiefile, - 'nocheckcertificate': opts.no_check_certificate, - 'prefer_insecure': opts.prefer_insecure, - 'proxy': opts.proxy, - 'socket_timeout': opts.socket_timeout, - 'bidi_workaround': opts.bidi_workaround, - 'debug_printtraffic': opts.debug_printtraffic, - 'prefer_ffmpeg': opts.prefer_ffmpeg, - 'include_ads': opts.include_ads, - 'default_search': opts.default_search, - 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, - 'encoding': opts.encoding, - 'extract_flat': opts.extract_flat, - 'mark_watched': opts.mark_watched, - 'merge_output_format': opts.merge_output_format, - 'postprocessors': postprocessors, - 'fixup': opts.fixup, - 'source_address': opts.source_address, - 'call_home': opts.call_home, - 'sleep_interval': opts.sleep_interval, - 'max_sleep_interval': opts.max_sleep_interval, - 'external_downloader': opts.external_downloader, - 'list_thumbnails': opts.list_thumbnails, - 'playlist_items': opts.playlist_items, - 'xattr_set_filesize': opts.xattr_set_filesize, - 'match_filter': match_filter, - 'no_color': opts.no_color, - 'ffmpeg_location': opts.ffmpeg_location, - 'hls_prefer_native': opts.hls_prefer_native, - 'hls_use_mpegts': opts.hls_use_mpegts, - 'external_downloader_args': external_downloader_args, - 'postprocessor_args': postprocessor_args, - 'cn_verification_proxy': opts.cn_verification_proxy, - 'geo_verification_proxy': opts.geo_verification_proxy, - 'config_location': opts.config_location, - 'geo_bypass': opts.geo_bypass, - 'geo_bypass_country': opts.geo_bypass_country, - 'geo_bypass_ip_block': opts.geo_bypass_ip_block, - # just for deprecation check - 'autonumber': opts.autonumber if opts.autonumber is True else None, - 'usetitle': opts.usetitle if opts.usetitle is True else None, - } - - with YoutubeDL(ydl_opts) as ydl: - # Update version - if opts.update_self: - update_self(ydl.to_screen, opts.verbose, ydl._opener) - - # Remove cache dir - if opts.rm_cachedir: - ydl.cache.remove() - - # Maybe do nothing - if (len(all_urls) < 1) and (opts.load_info_filename is None): - if opts.update_self or opts.rm_cachedir: - sys.exit() - - ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv) - parser.error( - 'You must provide at least one URL.\n' - 'Type youtube-dl --help to see a list of all options.') - - try: - if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) - else: - retcode = ydl.download(all_urls) - except MaxDownloadsReached: - ydl.to_screen('--max-download limit reached, aborting.') - retcode = 101 - - sys.exit(retcode) - - -def main(argv=None): - try: - _real_main(argv) - except DownloadError: - sys.exit(1) - except SameFileError: - sys.exit('ERROR: fixed output name but more than one file to download') - except KeyboardInterrupt: - sys.exit('\nERROR: Interrupted by user') - - -__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/youtube_dl/__main__.py b/youtube_dl/__main__.py deleted file mode 100644 index 138f5fb..0000000 --- a/youtube_dl/__main__.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python -from __future__ import unicode_literals - -# Execute with -# $ python youtube_dl/__main__.py (2.6+) -# $ python -m youtube_dl (2.7+) - -import sys - -if __package__ is None and not hasattr(sys, 'frozen'): - # direct call of __main__.py - import os.path - path = os.path.realpath(os.path.abspath(__file__)) - sys.path.insert(0, os.path.dirname(os.path.dirname(path))) - -import youtube_dl - -if __name__ == '__main__': - youtube_dl.main() diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py deleted file mode 100644 index 461bb6d..0000000 --- a/youtube_dl/aes.py +++ /dev/null @@ -1,361 +0,0 @@ -from __future__ import unicode_literals - -from math import ceil - -from .compat import compat_b64decode -from .utils import bytes_to_intlist, intlist_to_bytes - -BLOCK_SIZE_BYTES = 16 - - -def aes_ctr_decrypt(data, key, counter): - """ - Decrypt with aes in counter mode - - @param {int[]} data cipher - @param {int[]} key 16/24/32-Byte cipher key - @param {instance} counter Instance whose next_value function (@returns {int[]} 16-Byte block) - returns the next counter block - @returns {int[]} decrypted data - """ - expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) - - decrypted_data = [] - for i in range(block_count): - counter_block = counter.next_value() - block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - block += [0] * (BLOCK_SIZE_BYTES - len(block)) - - cipher_counter_block = aes_encrypt(counter_block, expanded_key) - decrypted_data += xor(block, cipher_counter_block) - decrypted_data = decrypted_data[:len(data)] - - return decrypted_data - - -def aes_cbc_decrypt(data, key, iv): - """ - Decrypt with aes in CBC mode - - @param {int[]} data cipher - @param {int[]} key 16/24/32-Byte cipher key - @param {int[]} iv 16-Byte IV - @returns {int[]} decrypted data - """ - expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) - - decrypted_data = [] - previous_cipher_block = iv - for i in range(block_count): - block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - block += [0] * (BLOCK_SIZE_BYTES - len(block)) - - decrypted_block = aes_decrypt(block, expanded_key) - decrypted_data += xor(decrypted_block, previous_cipher_block) - previous_cipher_block = block - decrypted_data = decrypted_data[:len(data)] - - return decrypted_data - - -def aes_cbc_encrypt(data, key, iv): - """ - Encrypt with aes in CBC mode. Using PKCS#7 padding - - @param {int[]} data cleartext - @param {int[]} key 16/24/32-Byte cipher key - @param {int[]} iv 16-Byte IV - @returns {int[]} encrypted data - """ - expanded_key = key_expansion(key) - block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) - - encrypted_data = [] - previous_cipher_block = iv - for i in range(block_count): - block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - remaining_length = BLOCK_SIZE_BYTES - len(block) - block += [remaining_length] * remaining_length - mixed_block = xor(block, previous_cipher_block) - - encrypted_block = aes_encrypt(mixed_block, expanded_key) - encrypted_data += encrypted_block - - previous_cipher_block = encrypted_block - - return encrypted_data - - -def key_expansion(data): - """ - Generate key schedule - - @param {int[]} data 16/24/32-Byte cipher key - @returns {int[]} 176/208/240-Byte expanded key - """ - data = data[:] # copy - rcon_iteration = 1 - key_size_bytes = len(data) - expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES - - while len(data) < expanded_key_size_bytes: - temp = data[-4:] - temp = key_schedule_core(temp, rcon_iteration) - rcon_iteration += 1 - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) - - for _ in range(3): - temp = data[-4:] - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) - - if key_size_bytes == 32: - temp = data[-4:] - temp = sub_bytes(temp) - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) - - for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0): - temp = data[-4:] - data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) - data = data[:expanded_key_size_bytes] - - return data - - -def aes_encrypt(data, expanded_key): - """ - Encrypt one block with aes - - @param {int[]} data 16-Byte state - @param {int[]} expanded_key 176/208/240-Byte expanded key - @returns {int[]} 16-Byte cipher - """ - rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 - - data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) - for i in range(1, rounds + 1): - data = sub_bytes(data) - data = shift_rows(data) - if i != rounds: - data = mix_columns(data) - data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) - - return data - - -def aes_decrypt(data, expanded_key): - """ - Decrypt one block with aes - - @param {int[]} data 16-Byte cipher - @param {int[]} expanded_key 176/208/240-Byte expanded key - @returns {int[]} 16-Byte state - """ - rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 - - for i in range(rounds, 0, -1): - data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]) - if i != rounds: - data = mix_columns_inv(data) - data = shift_rows_inv(data) - data = sub_bytes_inv(data) - data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) - - return data - - -def aes_decrypt_text(data, password, key_size_bytes): - """ - Decrypt text - - The first 8 Bytes of decoded 'data' are the 8 high Bytes of the counter - - The cipher key is retrieved by encrypting the first 16 Byte of 'password' - with the first 'key_size_bytes' Bytes from 'password' (if necessary filled with 0's) - - Mode of operation is 'counter' - - @param {str} data Base64 encoded string - @param {str,unicode} password Password (will be encoded with utf-8) - @param {int} key_size_bytes Possible values: 16 for 128-Bit, 24 for 192-Bit or 32 for 256-Bit - @returns {str} Decrypted data - """ - NONCE_LENGTH_BYTES = 8 - - data = bytes_to_intlist(compat_b64decode(data)) - password = bytes_to_intlist(password.encode('utf-8')) - - key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) - key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) - - nonce = data[:NONCE_LENGTH_BYTES] - cipher = data[NONCE_LENGTH_BYTES:] - - class Counter(object): - __value = nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES) - - def next_value(self): - temp = self.__value - self.__value = inc(self.__value) - return temp - - decrypted_data = aes_ctr_decrypt(cipher, key, Counter()) - plaintext = intlist_to_bytes(decrypted_data) - - return plaintext - - -RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36) -SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, - 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, - 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, - 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, - 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, - 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, - 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, - 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, - 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, - 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, - 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, - 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, - 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, - 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, - 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, - 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16) -SBOX_INV = (0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, - 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, - 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, - 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, - 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, - 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, - 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, - 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, - 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, - 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, - 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, - 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, - 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, - 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, - 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, - 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d) -MIX_COLUMN_MATRIX = ((0x2, 0x3, 0x1, 0x1), - (0x1, 0x2, 0x3, 0x1), - (0x1, 0x1, 0x2, 0x3), - (0x3, 0x1, 0x1, 0x2)) -MIX_COLUMN_MATRIX_INV = ((0xE, 0xB, 0xD, 0x9), - (0x9, 0xE, 0xB, 0xD), - (0xD, 0x9, 0xE, 0xB), - (0xB, 0xD, 0x9, 0xE)) -RIJNDAEL_EXP_TABLE = (0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35, - 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA, - 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31, - 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD, - 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88, - 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F, 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A, - 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3, - 0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0, - 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41, - 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75, - 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80, - 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54, - 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA, - 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E, - 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17, - 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01) -RIJNDAEL_LOG_TABLE = (0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, - 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, - 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, - 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, - 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, - 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, - 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, - 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, - 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, - 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, - 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, - 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, - 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, - 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, - 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, - 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07) - - -def sub_bytes(data): - return [SBOX[x] for x in data] - - -def sub_bytes_inv(data): - return [SBOX_INV[x] for x in data] - - -def rotate(data): - return data[1:] + [data[0]] - - -def key_schedule_core(data, rcon_iteration): - data = rotate(data) - data = sub_bytes(data) - data[0] = data[0] ^ RCON[rcon_iteration] - - return data - - -def xor(data1, data2): - return [x ^ y for x, y in zip(data1, data2)] - - -def rijndael_mul(a, b): - if(a == 0 or b == 0): - return 0 - return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF] - - -def mix_column(data, matrix): - data_mixed = [] - for row in range(4): - mixed = 0 - for column in range(4): - # xor is (+) and (-) - mixed ^= rijndael_mul(data[column], matrix[row][column]) - data_mixed.append(mixed) - return data_mixed - - -def mix_columns(data, matrix=MIX_COLUMN_MATRIX): - data_mixed = [] - for i in range(4): - column = data[i * 4: (i + 1) * 4] - data_mixed += mix_column(column, matrix) - return data_mixed - - -def mix_columns_inv(data): - return mix_columns(data, MIX_COLUMN_MATRIX_INV) - - -def shift_rows(data): - data_shifted = [] - for column in range(4): - for row in range(4): - data_shifted.append(data[((column + row) & 0b11) * 4 + row]) - return data_shifted - - -def shift_rows_inv(data): - data_shifted = [] - for column in range(4): - for row in range(4): - data_shifted.append(data[((column - row) & 0b11) * 4 + row]) - return data_shifted - - -def inc(data): - data = data[:] # copy - for i in range(len(data) - 1, -1, -1): - if data[i] == 255: - data[i] = 0 - else: - data[i] = data[i] + 1 - break - return data - - -__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_decrypt_text'] diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py deleted file mode 100644 index 7bdade1..0000000 --- a/youtube_dl/cache.py +++ /dev/null @@ -1,96 +0,0 @@ -from __future__ import unicode_literals - -import errno -import io -import json -import os -import re -import shutil -import traceback - -from .compat import compat_getenv -from .utils import ( - expand_path, - write_json_file, -) - - -class Cache(object): - def __init__(self, ydl): - self._ydl = ydl - - def _get_root_dir(self): - res = self._ydl.params.get('cachedir') - if res is None: - cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') - res = os.path.join(cache_root, 'youtube-dl') - return expand_path(res) - - def _get_cache_fn(self, section, key, dtype): - assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \ - 'invalid section %r' % section - assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key - return os.path.join( - self._get_root_dir(), section, '%s.%s' % (key, dtype)) - - @property - def enabled(self): - return self._ydl.params.get('cachedir') is not False - - def store(self, section, key, data, dtype='json'): - assert dtype in ('json',) - - if not self.enabled: - return - - fn = self._get_cache_fn(section, key, dtype) - try: - try: - os.makedirs(os.path.dirname(fn)) - except OSError as ose: - if ose.errno != errno.EEXIST: - raise - write_json_file(data, fn) - except Exception: - tb = traceback.format_exc() - self._ydl.report_warning( - 'Writing cache to %r failed: %s' % (fn, tb)) - - def load(self, section, key, dtype='json', default=None): - assert dtype in ('json',) - - if not self.enabled: - return default - - cache_fn = self._get_cache_fn(section, key, dtype) - try: - try: - with io.open(cache_fn, 'r', encoding='utf-8') as cachef: - return json.load(cachef) - except ValueError: - try: - file_size = os.path.getsize(cache_fn) - except (OSError, IOError) as oe: - file_size = str(oe) - self._ydl.report_warning( - 'Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) - except IOError: - pass # No cache available - - return default - - def remove(self): - if not self.enabled: - self._ydl.to_screen('Cache is disabled (Did you combine --no-cache-dir and --rm-cache-dir?)') - return - - cachedir = self._get_root_dir() - if not any((term in cachedir) for term in ('cache', 'tmp')): - raise Exception('Not removing directory %s - this does not look like a cache dir' % cachedir) - - self._ydl.to_screen( - 'Removing cache dir %s .' % cachedir, skip_eol=True) - if os.path.exists(cachedir): - self._ydl.to_screen('.', skip_eol=True) - shutil.rmtree(cachedir) - self._ydl.to_screen('.') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py deleted file mode 100644 index 7b77034..0000000 --- a/youtube_dl/compat.py +++ /dev/null @@ -1,3016 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import binascii -import collections -import ctypes -import email -import getpass -import io -import itertools -import optparse -import os -import platform -import re -import shlex -import shutil -import socket -import struct -import subprocess -import sys -import xml.etree.ElementTree - - -try: - import urllib.request as compat_urllib_request -except ImportError: # Python 2 - import urllib2 as compat_urllib_request - -try: - import urllib.error as compat_urllib_error -except ImportError: # Python 2 - import urllib2 as compat_urllib_error - -try: - import urllib.parse as compat_urllib_parse -except ImportError: # Python 2 - import urllib as compat_urllib_parse - -try: - from urllib.parse import urlparse as compat_urllib_parse_urlparse -except ImportError: # Python 2 - from urlparse import urlparse as compat_urllib_parse_urlparse - -try: - import urllib.parse as compat_urlparse -except ImportError: # Python 2 - import urlparse as compat_urlparse - -try: - import urllib.response as compat_urllib_response -except ImportError: # Python 2 - import urllib as compat_urllib_response - -try: - import http.cookiejar as compat_cookiejar -except ImportError: # Python 2 - import cookielib as compat_cookiejar - -try: - import http.cookies as compat_cookies -except ImportError: # Python 2 - import Cookie as compat_cookies - -try: - import html.entities as compat_html_entities -except ImportError: # Python 2 - import htmlentitydefs as compat_html_entities - -try: # Python >= 3.3 - compat_html_entities_html5 = compat_html_entities.html5 -except AttributeError: - # Copied from CPython 3.5.1 html/entities.py - compat_html_entities_html5 = { - 'Aacute': '\xc1', - 'aacute': '\xe1', - 'Aacute;': '\xc1', - 'aacute;': '\xe1', - 'Abreve;': '\u0102', - 'abreve;': '\u0103', - 'ac;': '\u223e', - 'acd;': '\u223f', - 'acE;': '\u223e\u0333', - 'Acirc': '\xc2', - 'acirc': '\xe2', - 'Acirc;': '\xc2', - 'acirc;': '\xe2', - 'acute': '\xb4', - 'acute;': '\xb4', - 'Acy;': '\u0410', - 'acy;': '\u0430', - 'AElig': '\xc6', - 'aelig': '\xe6', - 'AElig;': '\xc6', - 'aelig;': '\xe6', - 'af;': '\u2061', - 'Afr;': '\U0001d504', - 'afr;': '\U0001d51e', - 'Agrave': '\xc0', - 'agrave': '\xe0', - 'Agrave;': '\xc0', - 'agrave;': '\xe0', - 'alefsym;': '\u2135', - 'aleph;': '\u2135', - 'Alpha;': '\u0391', - 'alpha;': '\u03b1', - 'Amacr;': '\u0100', - 'amacr;': '\u0101', - 'amalg;': '\u2a3f', - 'AMP': '&', - 'amp': '&', - 'AMP;': '&', - 'amp;': '&', - 'And;': '\u2a53', - 'and;': '\u2227', - 'andand;': '\u2a55', - 'andd;': '\u2a5c', - 'andslope;': '\u2a58', - 'andv;': '\u2a5a', - 'ang;': '\u2220', - 'ange;': '\u29a4', - 'angle;': '\u2220', - 'angmsd;': '\u2221', - 'angmsdaa;': '\u29a8', - 'angmsdab;': '\u29a9', - 'angmsdac;': '\u29aa', - 'angmsdad;': '\u29ab', - 'angmsdae;': '\u29ac', - 'angmsdaf;': '\u29ad', - 'angmsdag;': '\u29ae', - 'angmsdah;': '\u29af', - 'angrt;': '\u221f', - 'angrtvb;': '\u22be', - 'angrtvbd;': '\u299d', - 'angsph;': '\u2222', - 'angst;': '\xc5', - 'angzarr;': '\u237c', - 'Aogon;': '\u0104', - 'aogon;': '\u0105', - 'Aopf;': '\U0001d538', - 'aopf;': '\U0001d552', - 'ap;': '\u2248', - 'apacir;': '\u2a6f', - 'apE;': '\u2a70', - 'ape;': '\u224a', - 'apid;': '\u224b', - 'apos;': "'", - 'ApplyFunction;': '\u2061', - 'approx;': '\u2248', - 'approxeq;': '\u224a', - 'Aring': '\xc5', - 'aring': '\xe5', - 'Aring;': '\xc5', - 'aring;': '\xe5', - 'Ascr;': '\U0001d49c', - 'ascr;': '\U0001d4b6', - 'Assign;': '\u2254', - 'ast;': '*', - 'asymp;': '\u2248', - 'asympeq;': '\u224d', - 'Atilde': '\xc3', - 'atilde': '\xe3', - 'Atilde;': '\xc3', - 'atilde;': '\xe3', - 'Auml': '\xc4', - 'auml': '\xe4', - 'Auml;': '\xc4', - 'auml;': '\xe4', - 'awconint;': '\u2233', - 'awint;': '\u2a11', - 'backcong;': '\u224c', - 'backepsilon;': '\u03f6', - 'backprime;': '\u2035', - 'backsim;': '\u223d', - 'backsimeq;': '\u22cd', - 'Backslash;': '\u2216', - 'Barv;': '\u2ae7', - 'barvee;': '\u22bd', - 'Barwed;': '\u2306', - 'barwed;': '\u2305', - 'barwedge;': '\u2305', - 'bbrk;': '\u23b5', - 'bbrktbrk;': '\u23b6', - 'bcong;': '\u224c', - 'Bcy;': '\u0411', - 'bcy;': '\u0431', - 'bdquo;': '\u201e', - 'becaus;': '\u2235', - 'Because;': '\u2235', - 'because;': '\u2235', - 'bemptyv;': '\u29b0', - 'bepsi;': '\u03f6', - 'bernou;': '\u212c', - 'Bernoullis;': '\u212c', - 'Beta;': '\u0392', - 'beta;': '\u03b2', - 'beth;': '\u2136', - 'between;': '\u226c', - 'Bfr;': '\U0001d505', - 'bfr;': '\U0001d51f', - 'bigcap;': '\u22c2', - 'bigcirc;': '\u25ef', - 'bigcup;': '\u22c3', - 'bigodot;': '\u2a00', - 'bigoplus;': '\u2a01', - 'bigotimes;': '\u2a02', - 'bigsqcup;': '\u2a06', - 'bigstar;': '\u2605', - 'bigtriangledown;': '\u25bd', - 'bigtriangleup;': '\u25b3', - 'biguplus;': '\u2a04', - 'bigvee;': '\u22c1', - 'bigwedge;': '\u22c0', - 'bkarow;': '\u290d', - 'blacklozenge;': '\u29eb', - 'blacksquare;': '\u25aa', - 'blacktriangle;': '\u25b4', - 'blacktriangledown;': '\u25be', - 'blacktriangleleft;': '\u25c2', - 'blacktriangleright;': '\u25b8', - 'blank;': '\u2423', - 'blk12;': '\u2592', - 'blk14;': '\u2591', - 'blk34;': '\u2593', - 'block;': '\u2588', - 'bne;': '=\u20e5', - 'bnequiv;': '\u2261\u20e5', - 'bNot;': '\u2aed', - 'bnot;': '\u2310', - 'Bopf;': '\U0001d539', - 'bopf;': '\U0001d553', - 'bot;': '\u22a5', - 'bottom;': '\u22a5', - 'bowtie;': '\u22c8', - 'boxbox;': '\u29c9', - 'boxDL;': '\u2557', - 'boxDl;': '\u2556', - 'boxdL;': '\u2555', - 'boxdl;': '\u2510', - 'boxDR;': '\u2554', - 'boxDr;': '\u2553', - 'boxdR;': '\u2552', - 'boxdr;': '\u250c', - 'boxH;': '\u2550', - 'boxh;': '\u2500', - 'boxHD;': '\u2566', - 'boxHd;': '\u2564', - 'boxhD;': '\u2565', - 'boxhd;': '\u252c', - 'boxHU;': '\u2569', - 'boxHu;': '\u2567', - 'boxhU;': '\u2568', - 'boxhu;': '\u2534', - 'boxminus;': '\u229f', - 'boxplus;': '\u229e', - 'boxtimes;': '\u22a0', - 'boxUL;': '\u255d', - 'boxUl;': '\u255c', - 'boxuL;': '\u255b', - 'boxul;': '\u2518', - 'boxUR;': '\u255a', - 'boxUr;': '\u2559', - 'boxuR;': '\u2558', - 'boxur;': '\u2514', - 'boxV;': '\u2551', - 'boxv;': '\u2502', - 'boxVH;': '\u256c', - 'boxVh;': '\u256b', - 'boxvH;': '\u256a', - 'boxvh;': '\u253c', - 'boxVL;': '\u2563', - 'boxVl;': '\u2562', - 'boxvL;': '\u2561', - 'boxvl;': '\u2524', - 'boxVR;': '\u2560', - 'boxVr;': '\u255f', - 'boxvR;': '\u255e', - 'boxvr;': '\u251c', - 'bprime;': '\u2035', - 'Breve;': '\u02d8', - 'breve;': '\u02d8', - 'brvbar': '\xa6', - 'brvbar;': '\xa6', - 'Bscr;': '\u212c', - 'bscr;': '\U0001d4b7', - 'bsemi;': '\u204f', - 'bsim;': '\u223d', - 'bsime;': '\u22cd', - 'bsol;': '\\', - 'bsolb;': '\u29c5', - 'bsolhsub;': '\u27c8', - 'bull;': '\u2022', - 'bullet;': '\u2022', - 'bump;': '\u224e', - 'bumpE;': '\u2aae', - 'bumpe;': '\u224f', - 'Bumpeq;': '\u224e', - 'bumpeq;': '\u224f', - 'Cacute;': '\u0106', - 'cacute;': '\u0107', - 'Cap;': '\u22d2', - 'cap;': '\u2229', - 'capand;': '\u2a44', - 'capbrcup;': '\u2a49', - 'capcap;': '\u2a4b', - 'capcup;': '\u2a47', - 'capdot;': '\u2a40', - 'CapitalDifferentialD;': '\u2145', - 'caps;': '\u2229\ufe00', - 'caret;': '\u2041', - 'caron;': '\u02c7', - 'Cayleys;': '\u212d', - 'ccaps;': '\u2a4d', - 'Ccaron;': '\u010c', - 'ccaron;': '\u010d', - 'Ccedil': '\xc7', - 'ccedil': '\xe7', - 'Ccedil;': '\xc7', - 'ccedil;': '\xe7', - 'Ccirc;': '\u0108', - 'ccirc;': '\u0109', - 'Cconint;': '\u2230', - 'ccups;': '\u2a4c', - 'ccupssm;': '\u2a50', - 'Cdot;': '\u010a', - 'cdot;': '\u010b', - 'cedil': '\xb8', - 'cedil;': '\xb8', - 'Cedilla;': '\xb8', - 'cemptyv;': '\u29b2', - 'cent': '\xa2', - 'cent;': '\xa2', - 'CenterDot;': '\xb7', - 'centerdot;': '\xb7', - 'Cfr;': '\u212d', - 'cfr;': '\U0001d520', - 'CHcy;': '\u0427', - 'chcy;': '\u0447', - 'check;': '\u2713', - 'checkmark;': '\u2713', - 'Chi;': '\u03a7', - 'chi;': '\u03c7', - 'cir;': '\u25cb', - 'circ;': '\u02c6', - 'circeq;': '\u2257', - 'circlearrowleft;': '\u21ba', - 'circlearrowright;': '\u21bb', - 'circledast;': '\u229b', - 'circledcirc;': '\u229a', - 'circleddash;': '\u229d', - 'CircleDot;': '\u2299', - 'circledR;': '\xae', - 'circledS;': '\u24c8', - 'CircleMinus;': '\u2296', - 'CirclePlus;': '\u2295', - 'CircleTimes;': '\u2297', - 'cirE;': '\u29c3', - 'cire;': '\u2257', - 'cirfnint;': '\u2a10', - 'cirmid;': '\u2aef', - 'cirscir;': '\u29c2', - 'ClockwiseContourIntegral;': '\u2232', - 'CloseCurlyDoubleQuote;': '\u201d', - 'CloseCurlyQuote;': '\u2019', - 'clubs;': '\u2663', - 'clubsuit;': '\u2663', - 'Colon;': '\u2237', - 'colon;': ':', - 'Colone;': '\u2a74', - 'colone;': '\u2254', - 'coloneq;': '\u2254', - 'comma;': ',', - 'commat;': '@', - 'comp;': '\u2201', - 'compfn;': '\u2218', - 'complement;': '\u2201', - 'complexes;': '\u2102', - 'cong;': '\u2245', - 'congdot;': '\u2a6d', - 'Congruent;': '\u2261', - 'Conint;': '\u222f', - 'conint;': '\u222e', - 'ContourIntegral;': '\u222e', - 'Copf;': '\u2102', - 'copf;': '\U0001d554', - 'coprod;': '\u2210', - 'Coproduct;': '\u2210', - 'COPY': '\xa9', - 'copy': '\xa9', - 'COPY;': '\xa9', - 'copy;': '\xa9', - 'copysr;': '\u2117', - 'CounterClockwiseContourIntegral;': '\u2233', - 'crarr;': '\u21b5', - 'Cross;': '\u2a2f', - 'cross;': '\u2717', - 'Cscr;': '\U0001d49e', - 'cscr;': '\U0001d4b8', - 'csub;': '\u2acf', - 'csube;': '\u2ad1', - 'csup;': '\u2ad0', - 'csupe;': '\u2ad2', - 'ctdot;': '\u22ef', - 'cudarrl;': '\u2938', - 'cudarrr;': '\u2935', - 'cuepr;': '\u22de', - 'cuesc;': '\u22df', - 'cularr;': '\u21b6', - 'cularrp;': '\u293d', - 'Cup;': '\u22d3', - 'cup;': '\u222a', - 'cupbrcap;': '\u2a48', - 'CupCap;': '\u224d', - 'cupcap;': '\u2a46', - 'cupcup;': '\u2a4a', - 'cupdot;': '\u228d', - 'cupor;': '\u2a45', - 'cups;': '\u222a\ufe00', - 'curarr;': '\u21b7', - 'curarrm;': '\u293c', - 'curlyeqprec;': '\u22de', - 'curlyeqsucc;': '\u22df', - 'curlyvee;': '\u22ce', - 'curlywedge;': '\u22cf', - 'curren': '\xa4', - 'curren;': '\xa4', - 'curvearrowleft;': '\u21b6', - 'curvearrowright;': '\u21b7', - 'cuvee;': '\u22ce', - 'cuwed;': '\u22cf', - 'cwconint;': '\u2232', - 'cwint;': '\u2231', - 'cylcty;': '\u232d', - 'Dagger;': '\u2021', - 'dagger;': '\u2020', - 'daleth;': '\u2138', - 'Darr;': '\u21a1', - 'dArr;': '\u21d3', - 'darr;': '\u2193', - 'dash;': '\u2010', - 'Dashv;': '\u2ae4', - 'dashv;': '\u22a3', - 'dbkarow;': '\u290f', - 'dblac;': '\u02dd', - 'Dcaron;': '\u010e', - 'dcaron;': '\u010f', - 'Dcy;': '\u0414', - 'dcy;': '\u0434', - 'DD;': '\u2145', - 'dd;': '\u2146', - 'ddagger;': '\u2021', - 'ddarr;': '\u21ca', - 'DDotrahd;': '\u2911', - 'ddotseq;': '\u2a77', - 'deg': '\xb0', - 'deg;': '\xb0', - 'Del;': '\u2207', - 'Delta;': '\u0394', - 'delta;': '\u03b4', - 'demptyv;': '\u29b1', - 'dfisht;': '\u297f', - 'Dfr;': '\U0001d507', - 'dfr;': '\U0001d521', - 'dHar;': '\u2965', - 'dharl;': '\u21c3', - 'dharr;': '\u21c2', - 'DiacriticalAcute;': '\xb4', - 'DiacriticalDot;': '\u02d9', - 'DiacriticalDoubleAcute;': '\u02dd', - 'DiacriticalGrave;': '`', - 'DiacriticalTilde;': '\u02dc', - 'diam;': '\u22c4', - 'Diamond;': '\u22c4', - 'diamond;': '\u22c4', - 'diamondsuit;': '\u2666', - 'diams;': '\u2666', - 'die;': '\xa8', - 'DifferentialD;': '\u2146', - 'digamma;': '\u03dd', - 'disin;': '\u22f2', - 'div;': '\xf7', - 'divide': '\xf7', - 'divide;': '\xf7', - 'divideontimes;': '\u22c7', - 'divonx;': '\u22c7', - 'DJcy;': '\u0402', - 'djcy;': '\u0452', - 'dlcorn;': '\u231e', - 'dlcrop;': '\u230d', - 'dollar;': '$', - 'Dopf;': '\U0001d53b', - 'dopf;': '\U0001d555', - 'Dot;': '\xa8', - 'dot;': '\u02d9', - 'DotDot;': '\u20dc', - 'doteq;': '\u2250', - 'doteqdot;': '\u2251', - 'DotEqual;': '\u2250', - 'dotminus;': '\u2238', - 'dotplus;': '\u2214', - 'dotsquare;': '\u22a1', - 'doublebarwedge;': '\u2306', - 'DoubleContourIntegral;': '\u222f', - 'DoubleDot;': '\xa8', - 'DoubleDownArrow;': '\u21d3', - 'DoubleLeftArrow;': '\u21d0', - 'DoubleLeftRightArrow;': '\u21d4', - 'DoubleLeftTee;': '\u2ae4', - 'DoubleLongLeftArrow;': '\u27f8', - 'DoubleLongLeftRightArrow;': '\u27fa', - 'DoubleLongRightArrow;': '\u27f9', - 'DoubleRightArrow;': '\u21d2', - 'DoubleRightTee;': '\u22a8', - 'DoubleUpArrow;': '\u21d1', - 'DoubleUpDownArrow;': '\u21d5', - 'DoubleVerticalBar;': '\u2225', - 'DownArrow;': '\u2193', - 'Downarrow;': '\u21d3', - 'downarrow;': '\u2193', - 'DownArrowBar;': '\u2913', - 'DownArrowUpArrow;': '\u21f5', - 'DownBreve;': '\u0311', - 'downdownarrows;': '\u21ca', - 'downharpoonleft;': '\u21c3', - 'downharpoonright;': '\u21c2', - 'DownLeftRightVector;': '\u2950', - 'DownLeftTeeVector;': '\u295e', - 'DownLeftVector;': '\u21bd', - 'DownLeftVectorBar;': '\u2956', - 'DownRightTeeVector;': '\u295f', - 'DownRightVector;': '\u21c1', - 'DownRightVectorBar;': '\u2957', - 'DownTee;': '\u22a4', - 'DownTeeArrow;': '\u21a7', - 'drbkarow;': '\u2910', - 'drcorn;': '\u231f', - 'drcrop;': '\u230c', - 'Dscr;': '\U0001d49f', - 'dscr;': '\U0001d4b9', - 'DScy;': '\u0405', - 'dscy;': '\u0455', - 'dsol;': '\u29f6', - 'Dstrok;': '\u0110', - 'dstrok;': '\u0111', - 'dtdot;': '\u22f1', - 'dtri;': '\u25bf', - 'dtrif;': '\u25be', - 'duarr;': '\u21f5', - 'duhar;': '\u296f', - 'dwangle;': '\u29a6', - 'DZcy;': '\u040f', - 'dzcy;': '\u045f', - 'dzigrarr;': '\u27ff', - 'Eacute': '\xc9', - 'eacute': '\xe9', - 'Eacute;': '\xc9', - 'eacute;': '\xe9', - 'easter;': '\u2a6e', - 'Ecaron;': '\u011a', - 'ecaron;': '\u011b', - 'ecir;': '\u2256', - 'Ecirc': '\xca', - 'ecirc': '\xea', - 'Ecirc;': '\xca', - 'ecirc;': '\xea', - 'ecolon;': '\u2255', - 'Ecy;': '\u042d', - 'ecy;': '\u044d', - 'eDDot;': '\u2a77', - 'Edot;': '\u0116', - 'eDot;': '\u2251', - 'edot;': '\u0117', - 'ee;': '\u2147', - 'efDot;': '\u2252', - 'Efr;': '\U0001d508', - 'efr;': '\U0001d522', - 'eg;': '\u2a9a', - 'Egrave': '\xc8', - 'egrave': '\xe8', - 'Egrave;': '\xc8', - 'egrave;': '\xe8', - 'egs;': '\u2a96', - 'egsdot;': '\u2a98', - 'el;': '\u2a99', - 'Element;': '\u2208', - 'elinters;': '\u23e7', - 'ell;': '\u2113', - 'els;': '\u2a95', - 'elsdot;': '\u2a97', - 'Emacr;': '\u0112', - 'emacr;': '\u0113', - 'empty;': '\u2205', - 'emptyset;': '\u2205', - 'EmptySmallSquare;': '\u25fb', - 'emptyv;': '\u2205', - 'EmptyVerySmallSquare;': '\u25ab', - 'emsp13;': '\u2004', - 'emsp14;': '\u2005', - 'emsp;': '\u2003', - 'ENG;': '\u014a', - 'eng;': '\u014b', - 'ensp;': '\u2002', - 'Eogon;': '\u0118', - 'eogon;': '\u0119', - 'Eopf;': '\U0001d53c', - 'eopf;': '\U0001d556', - 'epar;': '\u22d5', - 'eparsl;': '\u29e3', - 'eplus;': '\u2a71', - 'epsi;': '\u03b5', - 'Epsilon;': '\u0395', - 'epsilon;': '\u03b5', - 'epsiv;': '\u03f5', - 'eqcirc;': '\u2256', - 'eqcolon;': '\u2255', - 'eqsim;': '\u2242', - 'eqslantgtr;': '\u2a96', - 'eqslantless;': '\u2a95', - 'Equal;': '\u2a75', - 'equals;': '=', - 'EqualTilde;': '\u2242', - 'equest;': '\u225f', - 'Equilibrium;': '\u21cc', - 'equiv;': '\u2261', - 'equivDD;': '\u2a78', - 'eqvparsl;': '\u29e5', - 'erarr;': '\u2971', - 'erDot;': '\u2253', - 'Escr;': '\u2130', - 'escr;': '\u212f', - 'esdot;': '\u2250', - 'Esim;': '\u2a73', - 'esim;': '\u2242', - 'Eta;': '\u0397', - 'eta;': '\u03b7', - 'ETH': '\xd0', - 'eth': '\xf0', - 'ETH;': '\xd0', - 'eth;': '\xf0', - 'Euml': '\xcb', - 'euml': '\xeb', - 'Euml;': '\xcb', - 'euml;': '\xeb', - 'euro;': '\u20ac', - 'excl;': '!', - 'exist;': '\u2203', - 'Exists;': '\u2203', - 'expectation;': '\u2130', - 'ExponentialE;': '\u2147', - 'exponentiale;': '\u2147', - 'fallingdotseq;': '\u2252', - 'Fcy;': '\u0424', - 'fcy;': '\u0444', - 'female;': '\u2640', - 'ffilig;': '\ufb03', - 'fflig;': '\ufb00', - 'ffllig;': '\ufb04', - 'Ffr;': '\U0001d509', - 'ffr;': '\U0001d523', - 'filig;': '\ufb01', - 'FilledSmallSquare;': '\u25fc', - 'FilledVerySmallSquare;': '\u25aa', - 'fjlig;': 'fj', - 'flat;': '\u266d', - 'fllig;': '\ufb02', - 'fltns;': '\u25b1', - 'fnof;': '\u0192', - 'Fopf;': '\U0001d53d', - 'fopf;': '\U0001d557', - 'ForAll;': '\u2200', - 'forall;': '\u2200', - 'fork;': '\u22d4', - 'forkv;': '\u2ad9', - 'Fouriertrf;': '\u2131', - 'fpartint;': '\u2a0d', - 'frac12': '\xbd', - 'frac12;': '\xbd', - 'frac13;': '\u2153', - 'frac14': '\xbc', - 'frac14;': '\xbc', - 'frac15;': '\u2155', - 'frac16;': '\u2159', - 'frac18;': '\u215b', - 'frac23;': '\u2154', - 'frac25;': '\u2156', - 'frac34': '\xbe', - 'frac34;': '\xbe', - 'frac35;': '\u2157', - 'frac38;': '\u215c', - 'frac45;': '\u2158', - 'frac56;': '\u215a', - 'frac58;': '\u215d', - 'frac78;': '\u215e', - 'frasl;': '\u2044', - 'frown;': '\u2322', - 'Fscr;': '\u2131', - 'fscr;': '\U0001d4bb', - 'gacute;': '\u01f5', - 'Gamma;': '\u0393', - 'gamma;': '\u03b3', - 'Gammad;': '\u03dc', - 'gammad;': '\u03dd', - 'gap;': '\u2a86', - 'Gbreve;': '\u011e', - 'gbreve;': '\u011f', - 'Gcedil;': '\u0122', - 'Gcirc;': '\u011c', - 'gcirc;': '\u011d', - 'Gcy;': '\u0413', - 'gcy;': '\u0433', - 'Gdot;': '\u0120', - 'gdot;': '\u0121', - 'gE;': '\u2267', - 'ge;': '\u2265', - 'gEl;': '\u2a8c', - 'gel;': '\u22db', - 'geq;': '\u2265', - 'geqq;': '\u2267', - 'geqslant;': '\u2a7e', - 'ges;': '\u2a7e', - 'gescc;': '\u2aa9', - 'gesdot;': '\u2a80', - 'gesdoto;': '\u2a82', - 'gesdotol;': '\u2a84', - 'gesl;': '\u22db\ufe00', - 'gesles;': '\u2a94', - 'Gfr;': '\U0001d50a', - 'gfr;': '\U0001d524', - 'Gg;': '\u22d9', - 'gg;': '\u226b', - 'ggg;': '\u22d9', - 'gimel;': '\u2137', - 'GJcy;': '\u0403', - 'gjcy;': '\u0453', - 'gl;': '\u2277', - 'gla;': '\u2aa5', - 'glE;': '\u2a92', - 'glj;': '\u2aa4', - 'gnap;': '\u2a8a', - 'gnapprox;': '\u2a8a', - 'gnE;': '\u2269', - 'gne;': '\u2a88', - 'gneq;': '\u2a88', - 'gneqq;': '\u2269', - 'gnsim;': '\u22e7', - 'Gopf;': '\U0001d53e', - 'gopf;': '\U0001d558', - 'grave;': '`', - 'GreaterEqual;': '\u2265', - 'GreaterEqualLess;': '\u22db', - 'GreaterFullEqual;': '\u2267', - 'GreaterGreater;': '\u2aa2', - 'GreaterLess;': '\u2277', - 'GreaterSlantEqual;': '\u2a7e', - 'GreaterTilde;': '\u2273', - 'Gscr;': '\U0001d4a2', - 'gscr;': '\u210a', - 'gsim;': '\u2273', - 'gsime;': '\u2a8e', - 'gsiml;': '\u2a90', - 'GT': '>', - 'gt': '>', - 'GT;': '>', - 'Gt;': '\u226b', - 'gt;': '>', - 'gtcc;': '\u2aa7', - 'gtcir;': '\u2a7a', - 'gtdot;': '\u22d7', - 'gtlPar;': '\u2995', - 'gtquest;': '\u2a7c', - 'gtrapprox;': '\u2a86', - 'gtrarr;': '\u2978', - 'gtrdot;': '\u22d7', - 'gtreqless;': '\u22db', - 'gtreqqless;': '\u2a8c', - 'gtrless;': '\u2277', - 'gtrsim;': '\u2273', - 'gvertneqq;': '\u2269\ufe00', - 'gvnE;': '\u2269\ufe00', - 'Hacek;': '\u02c7', - 'hairsp;': '\u200a', - 'half;': '\xbd', - 'hamilt;': '\u210b', - 'HARDcy;': '\u042a', - 'hardcy;': '\u044a', - 'hArr;': '\u21d4', - 'harr;': '\u2194', - 'harrcir;': '\u2948', - 'harrw;': '\u21ad', - 'Hat;': '^', - 'hbar;': '\u210f', - 'Hcirc;': '\u0124', - 'hcirc;': '\u0125', - 'hearts;': '\u2665', - 'heartsuit;': '\u2665', - 'hellip;': '\u2026', - 'hercon;': '\u22b9', - 'Hfr;': '\u210c', - 'hfr;': '\U0001d525', - 'HilbertSpace;': '\u210b', - 'hksearow;': '\u2925', - 'hkswarow;': '\u2926', - 'hoarr;': '\u21ff', - 'homtht;': '\u223b', - 'hookleftarrow;': '\u21a9', - 'hookrightarrow;': '\u21aa', - 'Hopf;': '\u210d', - 'hopf;': '\U0001d559', - 'horbar;': '\u2015', - 'HorizontalLine;': '\u2500', - 'Hscr;': '\u210b', - 'hscr;': '\U0001d4bd', - 'hslash;': '\u210f', - 'Hstrok;': '\u0126', - 'hstrok;': '\u0127', - 'HumpDownHump;': '\u224e', - 'HumpEqual;': '\u224f', - 'hybull;': '\u2043', - 'hyphen;': '\u2010', - 'Iacute': '\xcd', - 'iacute': '\xed', - 'Iacute;': '\xcd', - 'iacute;': '\xed', - 'ic;': '\u2063', - 'Icirc': '\xce', - 'icirc': '\xee', - 'Icirc;': '\xce', - 'icirc;': '\xee', - 'Icy;': '\u0418', - 'icy;': '\u0438', - 'Idot;': '\u0130', - 'IEcy;': '\u0415', - 'iecy;': '\u0435', - 'iexcl': '\xa1', - 'iexcl;': '\xa1', - 'iff;': '\u21d4', - 'Ifr;': '\u2111', - 'ifr;': '\U0001d526', - 'Igrave': '\xcc', - 'igrave': '\xec', - 'Igrave;': '\xcc', - 'igrave;': '\xec', - 'ii;': '\u2148', - 'iiiint;': '\u2a0c', - 'iiint;': '\u222d', - 'iinfin;': '\u29dc', - 'iiota;': '\u2129', - 'IJlig;': '\u0132', - 'ijlig;': '\u0133', - 'Im;': '\u2111', - 'Imacr;': '\u012a', - 'imacr;': '\u012b', - 'image;': '\u2111', - 'ImaginaryI;': '\u2148', - 'imagline;': '\u2110', - 'imagpart;': '\u2111', - 'imath;': '\u0131', - 'imof;': '\u22b7', - 'imped;': '\u01b5', - 'Implies;': '\u21d2', - 'in;': '\u2208', - 'incare;': '\u2105', - 'infin;': '\u221e', - 'infintie;': '\u29dd', - 'inodot;': '\u0131', - 'Int;': '\u222c', - 'int;': '\u222b', - 'intcal;': '\u22ba', - 'integers;': '\u2124', - 'Integral;': '\u222b', - 'intercal;': '\u22ba', - 'Intersection;': '\u22c2', - 'intlarhk;': '\u2a17', - 'intprod;': '\u2a3c', - 'InvisibleComma;': '\u2063', - 'InvisibleTimes;': '\u2062', - 'IOcy;': '\u0401', - 'iocy;': '\u0451', - 'Iogon;': '\u012e', - 'iogon;': '\u012f', - 'Iopf;': '\U0001d540', - 'iopf;': '\U0001d55a', - 'Iota;': '\u0399', - 'iota;': '\u03b9', - 'iprod;': '\u2a3c', - 'iquest': '\xbf', - 'iquest;': '\xbf', - 'Iscr;': '\u2110', - 'iscr;': '\U0001d4be', - 'isin;': '\u2208', - 'isindot;': '\u22f5', - 'isinE;': '\u22f9', - 'isins;': '\u22f4', - 'isinsv;': '\u22f3', - 'isinv;': '\u2208', - 'it;': '\u2062', - 'Itilde;': '\u0128', - 'itilde;': '\u0129', - 'Iukcy;': '\u0406', - 'iukcy;': '\u0456', - 'Iuml': '\xcf', - 'iuml': '\xef', - 'Iuml;': '\xcf', - 'iuml;': '\xef', - 'Jcirc;': '\u0134', - 'jcirc;': '\u0135', - 'Jcy;': '\u0419', - 'jcy;': '\u0439', - 'Jfr;': '\U0001d50d', - 'jfr;': '\U0001d527', - 'jmath;': '\u0237', - 'Jopf;': '\U0001d541', - 'jopf;': '\U0001d55b', - 'Jscr;': '\U0001d4a5', - 'jscr;': '\U0001d4bf', - 'Jsercy;': '\u0408', - 'jsercy;': '\u0458', - 'Jukcy;': '\u0404', - 'jukcy;': '\u0454', - 'Kappa;': '\u039a', - 'kappa;': '\u03ba', - 'kappav;': '\u03f0', - 'Kcedil;': '\u0136', - 'kcedil;': '\u0137', - 'Kcy;': '\u041a', - 'kcy;': '\u043a', - 'Kfr;': '\U0001d50e', - 'kfr;': '\U0001d528', - 'kgreen;': '\u0138', - 'KHcy;': '\u0425', - 'khcy;': '\u0445', - 'KJcy;': '\u040c', - 'kjcy;': '\u045c', - 'Kopf;': '\U0001d542', - 'kopf;': '\U0001d55c', - 'Kscr;': '\U0001d4a6', - 'kscr;': '\U0001d4c0', - 'lAarr;': '\u21da', - 'Lacute;': '\u0139', - 'lacute;': '\u013a', - 'laemptyv;': '\u29b4', - 'lagran;': '\u2112', - 'Lambda;': '\u039b', - 'lambda;': '\u03bb', - 'Lang;': '\u27ea', - 'lang;': '\u27e8', - 'langd;': '\u2991', - 'langle;': '\u27e8', - 'lap;': '\u2a85', - 'Laplacetrf;': '\u2112', - 'laquo': '\xab', - 'laquo;': '\xab', - 'Larr;': '\u219e', - 'lArr;': '\u21d0', - 'larr;': '\u2190', - 'larrb;': '\u21e4', - 'larrbfs;': '\u291f', - 'larrfs;': '\u291d', - 'larrhk;': '\u21a9', - 'larrlp;': '\u21ab', - 'larrpl;': '\u2939', - 'larrsim;': '\u2973', - 'larrtl;': '\u21a2', - 'lat;': '\u2aab', - 'lAtail;': '\u291b', - 'latail;': '\u2919', - 'late;': '\u2aad', - 'lates;': '\u2aad\ufe00', - 'lBarr;': '\u290e', - 'lbarr;': '\u290c', - 'lbbrk;': '\u2772', - 'lbrace;': '{', - 'lbrack;': '[', - 'lbrke;': '\u298b', - 'lbrksld;': '\u298f', - 'lbrkslu;': '\u298d', - 'Lcaron;': '\u013d', - 'lcaron;': '\u013e', - 'Lcedil;': '\u013b', - 'lcedil;': '\u013c', - 'lceil;': '\u2308', - 'lcub;': '{', - 'Lcy;': '\u041b', - 'lcy;': '\u043b', - 'ldca;': '\u2936', - 'ldquo;': '\u201c', - 'ldquor;': '\u201e', - 'ldrdhar;': '\u2967', - 'ldrushar;': '\u294b', - 'ldsh;': '\u21b2', - 'lE;': '\u2266', - 'le;': '\u2264', - 'LeftAngleBracket;': '\u27e8', - 'LeftArrow;': '\u2190', - 'Leftarrow;': '\u21d0', - 'leftarrow;': '\u2190', - 'LeftArrowBar;': '\u21e4', - 'LeftArrowRightArrow;': '\u21c6', - 'leftarrowtail;': '\u21a2', - 'LeftCeiling;': '\u2308', - 'LeftDoubleBracket;': '\u27e6', - 'LeftDownTeeVector;': '\u2961', - 'LeftDownVector;': '\u21c3', - 'LeftDownVectorBar;': '\u2959', - 'LeftFloor;': '\u230a', - 'leftharpoondown;': '\u21bd', - 'leftharpoonup;': '\u21bc', - 'leftleftarrows;': '\u21c7', - 'LeftRightArrow;': '\u2194', - 'Leftrightarrow;': '\u21d4', - 'leftrightarrow;': '\u2194', - 'leftrightarrows;': '\u21c6', - 'leftrightharpoons;': '\u21cb', - 'leftrightsquigarrow;': '\u21ad', - 'LeftRightVector;': '\u294e', - 'LeftTee;': '\u22a3', - 'LeftTeeArrow;': '\u21a4', - 'LeftTeeVector;': '\u295a', - 'leftthreetimes;': '\u22cb', - 'LeftTriangle;': '\u22b2', - 'LeftTriangleBar;': '\u29cf', - 'LeftTriangleEqual;': '\u22b4', - 'LeftUpDownVector;': '\u2951', - 'LeftUpTeeVector;': '\u2960', - 'LeftUpVector;': '\u21bf', - 'LeftUpVectorBar;': '\u2958', - 'LeftVector;': '\u21bc', - 'LeftVectorBar;': '\u2952', - 'lEg;': '\u2a8b', - 'leg;': '\u22da', - 'leq;': '\u2264', - 'leqq;': '\u2266', - 'leqslant;': '\u2a7d', - 'les;': '\u2a7d', - 'lescc;': '\u2aa8', - 'lesdot;': '\u2a7f', - 'lesdoto;': '\u2a81', - 'lesdotor;': '\u2a83', - 'lesg;': '\u22da\ufe00', - 'lesges;': '\u2a93', - 'lessapprox;': '\u2a85', - 'lessdot;': '\u22d6', - 'lesseqgtr;': '\u22da', - 'lesseqqgtr;': '\u2a8b', - 'LessEqualGreater;': '\u22da', - 'LessFullEqual;': '\u2266', - 'LessGreater;': '\u2276', - 'lessgtr;': '\u2276', - 'LessLess;': '\u2aa1', - 'lesssim;': '\u2272', - 'LessSlantEqual;': '\u2a7d', - 'LessTilde;': '\u2272', - 'lfisht;': '\u297c', - 'lfloor;': '\u230a', - 'Lfr;': '\U0001d50f', - 'lfr;': '\U0001d529', - 'lg;': '\u2276', - 'lgE;': '\u2a91', - 'lHar;': '\u2962', - 'lhard;': '\u21bd', - 'lharu;': '\u21bc', - 'lharul;': '\u296a', - 'lhblk;': '\u2584', - 'LJcy;': '\u0409', - 'ljcy;': '\u0459', - 'Ll;': '\u22d8', - 'll;': '\u226a', - 'llarr;': '\u21c7', - 'llcorner;': '\u231e', - 'Lleftarrow;': '\u21da', - 'llhard;': '\u296b', - 'lltri;': '\u25fa', - 'Lmidot;': '\u013f', - 'lmidot;': '\u0140', - 'lmoust;': '\u23b0', - 'lmoustache;': '\u23b0', - 'lnap;': '\u2a89', - 'lnapprox;': '\u2a89', - 'lnE;': '\u2268', - 'lne;': '\u2a87', - 'lneq;': '\u2a87', - 'lneqq;': '\u2268', - 'lnsim;': '\u22e6', - 'loang;': '\u27ec', - 'loarr;': '\u21fd', - 'lobrk;': '\u27e6', - 'LongLeftArrow;': '\u27f5', - 'Longleftarrow;': '\u27f8', - 'longleftarrow;': '\u27f5', - 'LongLeftRightArrow;': '\u27f7', - 'Longleftrightarrow;': '\u27fa', - 'longleftrightarrow;': '\u27f7', - 'longmapsto;': '\u27fc', - 'LongRightArrow;': '\u27f6', - 'Longrightarrow;': '\u27f9', - 'longrightarrow;': '\u27f6', - 'looparrowleft;': '\u21ab', - 'looparrowright;': '\u21ac', - 'lopar;': '\u2985', - 'Lopf;': '\U0001d543', - 'lopf;': '\U0001d55d', - 'loplus;': '\u2a2d', - 'lotimes;': '\u2a34', - 'lowast;': '\u2217', - 'lowbar;': '_', - 'LowerLeftArrow;': '\u2199', - 'LowerRightArrow;': '\u2198', - 'loz;': '\u25ca', - 'lozenge;': '\u25ca', - 'lozf;': '\u29eb', - 'lpar;': '(', - 'lparlt;': '\u2993', - 'lrarr;': '\u21c6', - 'lrcorner;': '\u231f', - 'lrhar;': '\u21cb', - 'lrhard;': '\u296d', - 'lrm;': '\u200e', - 'lrtri;': '\u22bf', - 'lsaquo;': '\u2039', - 'Lscr;': '\u2112', - 'lscr;': '\U0001d4c1', - 'Lsh;': '\u21b0', - 'lsh;': '\u21b0', - 'lsim;': '\u2272', - 'lsime;': '\u2a8d', - 'lsimg;': '\u2a8f', - 'lsqb;': '[', - 'lsquo;': '\u2018', - 'lsquor;': '\u201a', - 'Lstrok;': '\u0141', - 'lstrok;': '\u0142', - 'LT': '<', - 'lt': '<', - 'LT;': '<', - 'Lt;': '\u226a', - 'lt;': '<', - 'ltcc;': '\u2aa6', - 'ltcir;': '\u2a79', - 'ltdot;': '\u22d6', - 'lthree;': '\u22cb', - 'ltimes;': '\u22c9', - 'ltlarr;': '\u2976', - 'ltquest;': '\u2a7b', - 'ltri;': '\u25c3', - 'ltrie;': '\u22b4', - 'ltrif;': '\u25c2', - 'ltrPar;': '\u2996', - 'lurdshar;': '\u294a', - 'luruhar;': '\u2966', - 'lvertneqq;': '\u2268\ufe00', - 'lvnE;': '\u2268\ufe00', - 'macr': '\xaf', - 'macr;': '\xaf', - 'male;': '\u2642', - 'malt;': '\u2720', - 'maltese;': '\u2720', - 'Map;': '\u2905', - 'map;': '\u21a6', - 'mapsto;': '\u21a6', - 'mapstodown;': '\u21a7', - 'mapstoleft;': '\u21a4', - 'mapstoup;': '\u21a5', - 'marker;': '\u25ae', - 'mcomma;': '\u2a29', - 'Mcy;': '\u041c', - 'mcy;': '\u043c', - 'mdash;': '\u2014', - 'mDDot;': '\u223a', - 'measuredangle;': '\u2221', - 'MediumSpace;': '\u205f', - 'Mellintrf;': '\u2133', - 'Mfr;': '\U0001d510', - 'mfr;': '\U0001d52a', - 'mho;': '\u2127', - 'micro': '\xb5', - 'micro;': '\xb5', - 'mid;': '\u2223', - 'midast;': '*', - 'midcir;': '\u2af0', - 'middot': '\xb7', - 'middot;': '\xb7', - 'minus;': '\u2212', - 'minusb;': '\u229f', - 'minusd;': '\u2238', - 'minusdu;': '\u2a2a', - 'MinusPlus;': '\u2213', - 'mlcp;': '\u2adb', - 'mldr;': '\u2026', - 'mnplus;': '\u2213', - 'models;': '\u22a7', - 'Mopf;': '\U0001d544', - 'mopf;': '\U0001d55e', - 'mp;': '\u2213', - 'Mscr;': '\u2133', - 'mscr;': '\U0001d4c2', - 'mstpos;': '\u223e', - 'Mu;': '\u039c', - 'mu;': '\u03bc', - 'multimap;': '\u22b8', - 'mumap;': '\u22b8', - 'nabla;': '\u2207', - 'Nacute;': '\u0143', - 'nacute;': '\u0144', - 'nang;': '\u2220\u20d2', - 'nap;': '\u2249', - 'napE;': '\u2a70\u0338', - 'napid;': '\u224b\u0338', - 'napos;': '\u0149', - 'napprox;': '\u2249', - 'natur;': '\u266e', - 'natural;': '\u266e', - 'naturals;': '\u2115', - 'nbsp': '\xa0', - 'nbsp;': '\xa0', - 'nbump;': '\u224e\u0338', - 'nbumpe;': '\u224f\u0338', - 'ncap;': '\u2a43', - 'Ncaron;': '\u0147', - 'ncaron;': '\u0148', - 'Ncedil;': '\u0145', - 'ncedil;': '\u0146', - 'ncong;': '\u2247', - 'ncongdot;': '\u2a6d\u0338', - 'ncup;': '\u2a42', - 'Ncy;': '\u041d', - 'ncy;': '\u043d', - 'ndash;': '\u2013', - 'ne;': '\u2260', - 'nearhk;': '\u2924', - 'neArr;': '\u21d7', - 'nearr;': '\u2197', - 'nearrow;': '\u2197', - 'nedot;': '\u2250\u0338', - 'NegativeMediumSpace;': '\u200b', - 'NegativeThickSpace;': '\u200b', - 'NegativeThinSpace;': '\u200b', - 'NegativeVeryThinSpace;': '\u200b', - 'nequiv;': '\u2262', - 'nesear;': '\u2928', - 'nesim;': '\u2242\u0338', - 'NestedGreaterGreater;': '\u226b', - 'NestedLessLess;': '\u226a', - 'NewLine;': '\n', - 'nexist;': '\u2204', - 'nexists;': '\u2204', - 'Nfr;': '\U0001d511', - 'nfr;': '\U0001d52b', - 'ngE;': '\u2267\u0338', - 'nge;': '\u2271', - 'ngeq;': '\u2271', - 'ngeqq;': '\u2267\u0338', - 'ngeqslant;': '\u2a7e\u0338', - 'nges;': '\u2a7e\u0338', - 'nGg;': '\u22d9\u0338', - 'ngsim;': '\u2275', - 'nGt;': '\u226b\u20d2', - 'ngt;': '\u226f', - 'ngtr;': '\u226f', - 'nGtv;': '\u226b\u0338', - 'nhArr;': '\u21ce', - 'nharr;': '\u21ae', - 'nhpar;': '\u2af2', - 'ni;': '\u220b', - 'nis;': '\u22fc', - 'nisd;': '\u22fa', - 'niv;': '\u220b', - 'NJcy;': '\u040a', - 'njcy;': '\u045a', - 'nlArr;': '\u21cd', - 'nlarr;': '\u219a', - 'nldr;': '\u2025', - 'nlE;': '\u2266\u0338', - 'nle;': '\u2270', - 'nLeftarrow;': '\u21cd', - 'nleftarrow;': '\u219a', - 'nLeftrightarrow;': '\u21ce', - 'nleftrightarrow;': '\u21ae', - 'nleq;': '\u2270', - 'nleqq;': '\u2266\u0338', - 'nleqslant;': '\u2a7d\u0338', - 'nles;': '\u2a7d\u0338', - 'nless;': '\u226e', - 'nLl;': '\u22d8\u0338', - 'nlsim;': '\u2274', - 'nLt;': '\u226a\u20d2', - 'nlt;': '\u226e', - 'nltri;': '\u22ea', - 'nltrie;': '\u22ec', - 'nLtv;': '\u226a\u0338', - 'nmid;': '\u2224', - 'NoBreak;': '\u2060', - 'NonBreakingSpace;': '\xa0', - 'Nopf;': '\u2115', - 'nopf;': '\U0001d55f', - 'not': '\xac', - 'Not;': '\u2aec', - 'not;': '\xac', - 'NotCongruent;': '\u2262', - 'NotCupCap;': '\u226d', - 'NotDoubleVerticalBar;': '\u2226', - 'NotElement;': '\u2209', - 'NotEqual;': '\u2260', - 'NotEqualTilde;': '\u2242\u0338', - 'NotExists;': '\u2204', - 'NotGreater;': '\u226f', - 'NotGreaterEqual;': '\u2271', - 'NotGreaterFullEqual;': '\u2267\u0338', - 'NotGreaterGreater;': '\u226b\u0338', - 'NotGreaterLess;': '\u2279', - 'NotGreaterSlantEqual;': '\u2a7e\u0338', - 'NotGreaterTilde;': '\u2275', - 'NotHumpDownHump;': '\u224e\u0338', - 'NotHumpEqual;': '\u224f\u0338', - 'notin;': '\u2209', - 'notindot;': '\u22f5\u0338', - 'notinE;': '\u22f9\u0338', - 'notinva;': '\u2209', - 'notinvb;': '\u22f7', - 'notinvc;': '\u22f6', - 'NotLeftTriangle;': '\u22ea', - 'NotLeftTriangleBar;': '\u29cf\u0338', - 'NotLeftTriangleEqual;': '\u22ec', - 'NotLess;': '\u226e', - 'NotLessEqual;': '\u2270', - 'NotLessGreater;': '\u2278', - 'NotLessLess;': '\u226a\u0338', - 'NotLessSlantEqual;': '\u2a7d\u0338', - 'NotLessTilde;': '\u2274', - 'NotNestedGreaterGreater;': '\u2aa2\u0338', - 'NotNestedLessLess;': '\u2aa1\u0338', - 'notni;': '\u220c', - 'notniva;': '\u220c', - 'notnivb;': '\u22fe', - 'notnivc;': '\u22fd', - 'NotPrecedes;': '\u2280', - 'NotPrecedesEqual;': '\u2aaf\u0338', - 'NotPrecedesSlantEqual;': '\u22e0', - 'NotReverseElement;': '\u220c', - 'NotRightTriangle;': '\u22eb', - 'NotRightTriangleBar;': '\u29d0\u0338', - 'NotRightTriangleEqual;': '\u22ed', - 'NotSquareSubset;': '\u228f\u0338', - 'NotSquareSubsetEqual;': '\u22e2', - 'NotSquareSuperset;': '\u2290\u0338', - 'NotSquareSupersetEqual;': '\u22e3', - 'NotSubset;': '\u2282\u20d2', - 'NotSubsetEqual;': '\u2288', - 'NotSucceeds;': '\u2281', - 'NotSucceedsEqual;': '\u2ab0\u0338', - 'NotSucceedsSlantEqual;': '\u22e1', - 'NotSucceedsTilde;': '\u227f\u0338', - 'NotSuperset;': '\u2283\u20d2', - 'NotSupersetEqual;': '\u2289', - 'NotTilde;': '\u2241', - 'NotTildeEqual;': '\u2244', - 'NotTildeFullEqual;': '\u2247', - 'NotTildeTilde;': '\u2249', - 'NotVerticalBar;': '\u2224', - 'npar;': '\u2226', - 'nparallel;': '\u2226', - 'nparsl;': '\u2afd\u20e5', - 'npart;': '\u2202\u0338', - 'npolint;': '\u2a14', - 'npr;': '\u2280', - 'nprcue;': '\u22e0', - 'npre;': '\u2aaf\u0338', - 'nprec;': '\u2280', - 'npreceq;': '\u2aaf\u0338', - 'nrArr;': '\u21cf', - 'nrarr;': '\u219b', - 'nrarrc;': '\u2933\u0338', - 'nrarrw;': '\u219d\u0338', - 'nRightarrow;': '\u21cf', - 'nrightarrow;': '\u219b', - 'nrtri;': '\u22eb', - 'nrtrie;': '\u22ed', - 'nsc;': '\u2281', - 'nsccue;': '\u22e1', - 'nsce;': '\u2ab0\u0338', - 'Nscr;': '\U0001d4a9', - 'nscr;': '\U0001d4c3', - 'nshortmid;': '\u2224', - 'nshortparallel;': '\u2226', - 'nsim;': '\u2241', - 'nsime;': '\u2244', - 'nsimeq;': '\u2244', - 'nsmid;': '\u2224', - 'nspar;': '\u2226', - 'nsqsube;': '\u22e2', - 'nsqsupe;': '\u22e3', - 'nsub;': '\u2284', - 'nsubE;': '\u2ac5\u0338', - 'nsube;': '\u2288', - 'nsubset;': '\u2282\u20d2', - 'nsubseteq;': '\u2288', - 'nsubseteqq;': '\u2ac5\u0338', - 'nsucc;': '\u2281', - 'nsucceq;': '\u2ab0\u0338', - 'nsup;': '\u2285', - 'nsupE;': '\u2ac6\u0338', - 'nsupe;': '\u2289', - 'nsupset;': '\u2283\u20d2', - 'nsupseteq;': '\u2289', - 'nsupseteqq;': '\u2ac6\u0338', - 'ntgl;': '\u2279', - 'Ntilde': '\xd1', - 'ntilde': '\xf1', - 'Ntilde;': '\xd1', - 'ntilde;': '\xf1', - 'ntlg;': '\u2278', - 'ntriangleleft;': '\u22ea', - 'ntrianglelefteq;': '\u22ec', - 'ntriangleright;': '\u22eb', - 'ntrianglerighteq;': '\u22ed', - 'Nu;': '\u039d', - 'nu;': '\u03bd', - 'num;': '#', - 'numero;': '\u2116', - 'numsp;': '\u2007', - 'nvap;': '\u224d\u20d2', - 'nVDash;': '\u22af', - 'nVdash;': '\u22ae', - 'nvDash;': '\u22ad', - 'nvdash;': '\u22ac', - 'nvge;': '\u2265\u20d2', - 'nvgt;': '>\u20d2', - 'nvHarr;': '\u2904', - 'nvinfin;': '\u29de', - 'nvlArr;': '\u2902', - 'nvle;': '\u2264\u20d2', - 'nvlt;': '<\u20d2', - 'nvltrie;': '\u22b4\u20d2', - 'nvrArr;': '\u2903', - 'nvrtrie;': '\u22b5\u20d2', - 'nvsim;': '\u223c\u20d2', - 'nwarhk;': '\u2923', - 'nwArr;': '\u21d6', - 'nwarr;': '\u2196', - 'nwarrow;': '\u2196', - 'nwnear;': '\u2927', - 'Oacute': '\xd3', - 'oacute': '\xf3', - 'Oacute;': '\xd3', - 'oacute;': '\xf3', - 'oast;': '\u229b', - 'ocir;': '\u229a', - 'Ocirc': '\xd4', - 'ocirc': '\xf4', - 'Ocirc;': '\xd4', - 'ocirc;': '\xf4', - 'Ocy;': '\u041e', - 'ocy;': '\u043e', - 'odash;': '\u229d', - 'Odblac;': '\u0150', - 'odblac;': '\u0151', - 'odiv;': '\u2a38', - 'odot;': '\u2299', - 'odsold;': '\u29bc', - 'OElig;': '\u0152', - 'oelig;': '\u0153', - 'ofcir;': '\u29bf', - 'Ofr;': '\U0001d512', - 'ofr;': '\U0001d52c', - 'ogon;': '\u02db', - 'Ograve': '\xd2', - 'ograve': '\xf2', - 'Ograve;': '\xd2', - 'ograve;': '\xf2', - 'ogt;': '\u29c1', - 'ohbar;': '\u29b5', - 'ohm;': '\u03a9', - 'oint;': '\u222e', - 'olarr;': '\u21ba', - 'olcir;': '\u29be', - 'olcross;': '\u29bb', - 'oline;': '\u203e', - 'olt;': '\u29c0', - 'Omacr;': '\u014c', - 'omacr;': '\u014d', - 'Omega;': '\u03a9', - 'omega;': '\u03c9', - 'Omicron;': '\u039f', - 'omicron;': '\u03bf', - 'omid;': '\u29b6', - 'ominus;': '\u2296', - 'Oopf;': '\U0001d546', - 'oopf;': '\U0001d560', - 'opar;': '\u29b7', - 'OpenCurlyDoubleQuote;': '\u201c', - 'OpenCurlyQuote;': '\u2018', - 'operp;': '\u29b9', - 'oplus;': '\u2295', - 'Or;': '\u2a54', - 'or;': '\u2228', - 'orarr;': '\u21bb', - 'ord;': '\u2a5d', - 'order;': '\u2134', - 'orderof;': '\u2134', - 'ordf': '\xaa', - 'ordf;': '\xaa', - 'ordm': '\xba', - 'ordm;': '\xba', - 'origof;': '\u22b6', - 'oror;': '\u2a56', - 'orslope;': '\u2a57', - 'orv;': '\u2a5b', - 'oS;': '\u24c8', - 'Oscr;': '\U0001d4aa', - 'oscr;': '\u2134', - 'Oslash': '\xd8', - 'oslash': '\xf8', - 'Oslash;': '\xd8', - 'oslash;': '\xf8', - 'osol;': '\u2298', - 'Otilde': '\xd5', - 'otilde': '\xf5', - 'Otilde;': '\xd5', - 'otilde;': '\xf5', - 'Otimes;': '\u2a37', - 'otimes;': '\u2297', - 'otimesas;': '\u2a36', - 'Ouml': '\xd6', - 'ouml': '\xf6', - 'Ouml;': '\xd6', - 'ouml;': '\xf6', - 'ovbar;': '\u233d', - 'OverBar;': '\u203e', - 'OverBrace;': '\u23de', - 'OverBracket;': '\u23b4', - 'OverParenthesis;': '\u23dc', - 'par;': '\u2225', - 'para': '\xb6', - 'para;': '\xb6', - 'parallel;': '\u2225', - 'parsim;': '\u2af3', - 'parsl;': '\u2afd', - 'part;': '\u2202', - 'PartialD;': '\u2202', - 'Pcy;': '\u041f', - 'pcy;': '\u043f', - 'percnt;': '%', - 'period;': '.', - 'permil;': '\u2030', - 'perp;': '\u22a5', - 'pertenk;': '\u2031', - 'Pfr;': '\U0001d513', - 'pfr;': '\U0001d52d', - 'Phi;': '\u03a6', - 'phi;': '\u03c6', - 'phiv;': '\u03d5', - 'phmmat;': '\u2133', - 'phone;': '\u260e', - 'Pi;': '\u03a0', - 'pi;': '\u03c0', - 'pitchfork;': '\u22d4', - 'piv;': '\u03d6', - 'planck;': '\u210f', - 'planckh;': '\u210e', - 'plankv;': '\u210f', - 'plus;': '+', - 'plusacir;': '\u2a23', - 'plusb;': '\u229e', - 'pluscir;': '\u2a22', - 'plusdo;': '\u2214', - 'plusdu;': '\u2a25', - 'pluse;': '\u2a72', - 'PlusMinus;': '\xb1', - 'plusmn': '\xb1', - 'plusmn;': '\xb1', - 'plussim;': '\u2a26', - 'plustwo;': '\u2a27', - 'pm;': '\xb1', - 'Poincareplane;': '\u210c', - 'pointint;': '\u2a15', - 'Popf;': '\u2119', - 'popf;': '\U0001d561', - 'pound': '\xa3', - 'pound;': '\xa3', - 'Pr;': '\u2abb', - 'pr;': '\u227a', - 'prap;': '\u2ab7', - 'prcue;': '\u227c', - 'prE;': '\u2ab3', - 'pre;': '\u2aaf', - 'prec;': '\u227a', - 'precapprox;': '\u2ab7', - 'preccurlyeq;': '\u227c', - 'Precedes;': '\u227a', - 'PrecedesEqual;': '\u2aaf', - 'PrecedesSlantEqual;': '\u227c', - 'PrecedesTilde;': '\u227e', - 'preceq;': '\u2aaf', - 'precnapprox;': '\u2ab9', - 'precneqq;': '\u2ab5', - 'precnsim;': '\u22e8', - 'precsim;': '\u227e', - 'Prime;': '\u2033', - 'prime;': '\u2032', - 'primes;': '\u2119', - 'prnap;': '\u2ab9', - 'prnE;': '\u2ab5', - 'prnsim;': '\u22e8', - 'prod;': '\u220f', - 'Product;': '\u220f', - 'profalar;': '\u232e', - 'profline;': '\u2312', - 'profsurf;': '\u2313', - 'prop;': '\u221d', - 'Proportion;': '\u2237', - 'Proportional;': '\u221d', - 'propto;': '\u221d', - 'prsim;': '\u227e', - 'prurel;': '\u22b0', - 'Pscr;': '\U0001d4ab', - 'pscr;': '\U0001d4c5', - 'Psi;': '\u03a8', - 'psi;': '\u03c8', - 'puncsp;': '\u2008', - 'Qfr;': '\U0001d514', - 'qfr;': '\U0001d52e', - 'qint;': '\u2a0c', - 'Qopf;': '\u211a', - 'qopf;': '\U0001d562', - 'qprime;': '\u2057', - 'Qscr;': '\U0001d4ac', - 'qscr;': '\U0001d4c6', - 'quaternions;': '\u210d', - 'quatint;': '\u2a16', - 'quest;': '?', - 'questeq;': '\u225f', - 'QUOT': '"', - 'quot': '"', - 'QUOT;': '"', - 'quot;': '"', - 'rAarr;': '\u21db', - 'race;': '\u223d\u0331', - 'Racute;': '\u0154', - 'racute;': '\u0155', - 'radic;': '\u221a', - 'raemptyv;': '\u29b3', - 'Rang;': '\u27eb', - 'rang;': '\u27e9', - 'rangd;': '\u2992', - 'range;': '\u29a5', - 'rangle;': '\u27e9', - 'raquo': '\xbb', - 'raquo;': '\xbb', - 'Rarr;': '\u21a0', - 'rArr;': '\u21d2', - 'rarr;': '\u2192', - 'rarrap;': '\u2975', - 'rarrb;': '\u21e5', - 'rarrbfs;': '\u2920', - 'rarrc;': '\u2933', - 'rarrfs;': '\u291e', - 'rarrhk;': '\u21aa', - 'rarrlp;': '\u21ac', - 'rarrpl;': '\u2945', - 'rarrsim;': '\u2974', - 'Rarrtl;': '\u2916', - 'rarrtl;': '\u21a3', - 'rarrw;': '\u219d', - 'rAtail;': '\u291c', - 'ratail;': '\u291a', - 'ratio;': '\u2236', - 'rationals;': '\u211a', - 'RBarr;': '\u2910', - 'rBarr;': '\u290f', - 'rbarr;': '\u290d', - 'rbbrk;': '\u2773', - 'rbrace;': '}', - 'rbrack;': ']', - 'rbrke;': '\u298c', - 'rbrksld;': '\u298e', - 'rbrkslu;': '\u2990', - 'Rcaron;': '\u0158', - 'rcaron;': '\u0159', - 'Rcedil;': '\u0156', - 'rcedil;': '\u0157', - 'rceil;': '\u2309', - 'rcub;': '}', - 'Rcy;': '\u0420', - 'rcy;': '\u0440', - 'rdca;': '\u2937', - 'rdldhar;': '\u2969', - 'rdquo;': '\u201d', - 'rdquor;': '\u201d', - 'rdsh;': '\u21b3', - 'Re;': '\u211c', - 'real;': '\u211c', - 'realine;': '\u211b', - 'realpart;': '\u211c', - 'reals;': '\u211d', - 'rect;': '\u25ad', - 'REG': '\xae', - 'reg': '\xae', - 'REG;': '\xae', - 'reg;': '\xae', - 'ReverseElement;': '\u220b', - 'ReverseEquilibrium;': '\u21cb', - 'ReverseUpEquilibrium;': '\u296f', - 'rfisht;': '\u297d', - 'rfloor;': '\u230b', - 'Rfr;': '\u211c', - 'rfr;': '\U0001d52f', - 'rHar;': '\u2964', - 'rhard;': '\u21c1', - 'rharu;': '\u21c0', - 'rharul;': '\u296c', - 'Rho;': '\u03a1', - 'rho;': '\u03c1', - 'rhov;': '\u03f1', - 'RightAngleBracket;': '\u27e9', - 'RightArrow;': '\u2192', - 'Rightarrow;': '\u21d2', - 'rightarrow;': '\u2192', - 'RightArrowBar;': '\u21e5', - 'RightArrowLeftArrow;': '\u21c4', - 'rightarrowtail;': '\u21a3', - 'RightCeiling;': '\u2309', - 'RightDoubleBracket;': '\u27e7', - 'RightDownTeeVector;': '\u295d', - 'RightDownVector;': '\u21c2', - 'RightDownVectorBar;': '\u2955', - 'RightFloor;': '\u230b', - 'rightharpoondown;': '\u21c1', - 'rightharpoonup;': '\u21c0', - 'rightleftarrows;': '\u21c4', - 'rightleftharpoons;': '\u21cc', - 'rightrightarrows;': '\u21c9', - 'rightsquigarrow;': '\u219d', - 'RightTee;': '\u22a2', - 'RightTeeArrow;': '\u21a6', - 'RightTeeVector;': '\u295b', - 'rightthreetimes;': '\u22cc', - 'RightTriangle;': '\u22b3', - 'RightTriangleBar;': '\u29d0', - 'RightTriangleEqual;': '\u22b5', - 'RightUpDownVector;': '\u294f', - 'RightUpTeeVector;': '\u295c', - 'RightUpVector;': '\u21be', - 'RightUpVectorBar;': '\u2954', - 'RightVector;': '\u21c0', - 'RightVectorBar;': '\u2953', - 'ring;': '\u02da', - 'risingdotseq;': '\u2253', - 'rlarr;': '\u21c4', - 'rlhar;': '\u21cc', - 'rlm;': '\u200f', - 'rmoust;': '\u23b1', - 'rmoustache;': '\u23b1', - 'rnmid;': '\u2aee', - 'roang;': '\u27ed', - 'roarr;': '\u21fe', - 'robrk;': '\u27e7', - 'ropar;': '\u2986', - 'Ropf;': '\u211d', - 'ropf;': '\U0001d563', - 'roplus;': '\u2a2e', - 'rotimes;': '\u2a35', - 'RoundImplies;': '\u2970', - 'rpar;': ')', - 'rpargt;': '\u2994', - 'rppolint;': '\u2a12', - 'rrarr;': '\u21c9', - 'Rrightarrow;': '\u21db', - 'rsaquo;': '\u203a', - 'Rscr;': '\u211b', - 'rscr;': '\U0001d4c7', - 'Rsh;': '\u21b1', - 'rsh;': '\u21b1', - 'rsqb;': ']', - 'rsquo;': '\u2019', - 'rsquor;': '\u2019', - 'rthree;': '\u22cc', - 'rtimes;': '\u22ca', - 'rtri;': '\u25b9', - 'rtrie;': '\u22b5', - 'rtrif;': '\u25b8', - 'rtriltri;': '\u29ce', - 'RuleDelayed;': '\u29f4', - 'ruluhar;': '\u2968', - 'rx;': '\u211e', - 'Sacute;': '\u015a', - 'sacute;': '\u015b', - 'sbquo;': '\u201a', - 'Sc;': '\u2abc', - 'sc;': '\u227b', - 'scap;': '\u2ab8', - 'Scaron;': '\u0160', - 'scaron;': '\u0161', - 'sccue;': '\u227d', - 'scE;': '\u2ab4', - 'sce;': '\u2ab0', - 'Scedil;': '\u015e', - 'scedil;': '\u015f', - 'Scirc;': '\u015c', - 'scirc;': '\u015d', - 'scnap;': '\u2aba', - 'scnE;': '\u2ab6', - 'scnsim;': '\u22e9', - 'scpolint;': '\u2a13', - 'scsim;': '\u227f', - 'Scy;': '\u0421', - 'scy;': '\u0441', - 'sdot;': '\u22c5', - 'sdotb;': '\u22a1', - 'sdote;': '\u2a66', - 'searhk;': '\u2925', - 'seArr;': '\u21d8', - 'searr;': '\u2198', - 'searrow;': '\u2198', - 'sect': '\xa7', - 'sect;': '\xa7', - 'semi;': ';', - 'seswar;': '\u2929', - 'setminus;': '\u2216', - 'setmn;': '\u2216', - 'sext;': '\u2736', - 'Sfr;': '\U0001d516', - 'sfr;': '\U0001d530', - 'sfrown;': '\u2322', - 'sharp;': '\u266f', - 'SHCHcy;': '\u0429', - 'shchcy;': '\u0449', - 'SHcy;': '\u0428', - 'shcy;': '\u0448', - 'ShortDownArrow;': '\u2193', - 'ShortLeftArrow;': '\u2190', - 'shortmid;': '\u2223', - 'shortparallel;': '\u2225', - 'ShortRightArrow;': '\u2192', - 'ShortUpArrow;': '\u2191', - 'shy': '\xad', - 'shy;': '\xad', - 'Sigma;': '\u03a3', - 'sigma;': '\u03c3', - 'sigmaf;': '\u03c2', - 'sigmav;': '\u03c2', - 'sim;': '\u223c', - 'simdot;': '\u2a6a', - 'sime;': '\u2243', - 'simeq;': '\u2243', - 'simg;': '\u2a9e', - 'simgE;': '\u2aa0', - 'siml;': '\u2a9d', - 'simlE;': '\u2a9f', - 'simne;': '\u2246', - 'simplus;': '\u2a24', - 'simrarr;': '\u2972', - 'slarr;': '\u2190', - 'SmallCircle;': '\u2218', - 'smallsetminus;': '\u2216', - 'smashp;': '\u2a33', - 'smeparsl;': '\u29e4', - 'smid;': '\u2223', - 'smile;': '\u2323', - 'smt;': '\u2aaa', - 'smte;': '\u2aac', - 'smtes;': '\u2aac\ufe00', - 'SOFTcy;': '\u042c', - 'softcy;': '\u044c', - 'sol;': '/', - 'solb;': '\u29c4', - 'solbar;': '\u233f', - 'Sopf;': '\U0001d54a', - 'sopf;': '\U0001d564', - 'spades;': '\u2660', - 'spadesuit;': '\u2660', - 'spar;': '\u2225', - 'sqcap;': '\u2293', - 'sqcaps;': '\u2293\ufe00', - 'sqcup;': '\u2294', - 'sqcups;': '\u2294\ufe00', - 'Sqrt;': '\u221a', - 'sqsub;': '\u228f', - 'sqsube;': '\u2291', - 'sqsubset;': '\u228f', - 'sqsubseteq;': '\u2291', - 'sqsup;': '\u2290', - 'sqsupe;': '\u2292', - 'sqsupset;': '\u2290', - 'sqsupseteq;': '\u2292', - 'squ;': '\u25a1', - 'Square;': '\u25a1', - 'square;': '\u25a1', - 'SquareIntersection;': '\u2293', - 'SquareSubset;': '\u228f', - 'SquareSubsetEqual;': '\u2291', - 'SquareSuperset;': '\u2290', - 'SquareSupersetEqual;': '\u2292', - 'SquareUnion;': '\u2294', - 'squarf;': '\u25aa', - 'squf;': '\u25aa', - 'srarr;': '\u2192', - 'Sscr;': '\U0001d4ae', - 'sscr;': '\U0001d4c8', - 'ssetmn;': '\u2216', - 'ssmile;': '\u2323', - 'sstarf;': '\u22c6', - 'Star;': '\u22c6', - 'star;': '\u2606', - 'starf;': '\u2605', - 'straightepsilon;': '\u03f5', - 'straightphi;': '\u03d5', - 'strns;': '\xaf', - 'Sub;': '\u22d0', - 'sub;': '\u2282', - 'subdot;': '\u2abd', - 'subE;': '\u2ac5', - 'sube;': '\u2286', - 'subedot;': '\u2ac3', - 'submult;': '\u2ac1', - 'subnE;': '\u2acb', - 'subne;': '\u228a', - 'subplus;': '\u2abf', - 'subrarr;': '\u2979', - 'Subset;': '\u22d0', - 'subset;': '\u2282', - 'subseteq;': '\u2286', - 'subseteqq;': '\u2ac5', - 'SubsetEqual;': '\u2286', - 'subsetneq;': '\u228a', - 'subsetneqq;': '\u2acb', - 'subsim;': '\u2ac7', - 'subsub;': '\u2ad5', - 'subsup;': '\u2ad3', - 'succ;': '\u227b', - 'succapprox;': '\u2ab8', - 'succcurlyeq;': '\u227d', - 'Succeeds;': '\u227b', - 'SucceedsEqual;': '\u2ab0', - 'SucceedsSlantEqual;': '\u227d', - 'SucceedsTilde;': '\u227f', - 'succeq;': '\u2ab0', - 'succnapprox;': '\u2aba', - 'succneqq;': '\u2ab6', - 'succnsim;': '\u22e9', - 'succsim;': '\u227f', - 'SuchThat;': '\u220b', - 'Sum;': '\u2211', - 'sum;': '\u2211', - 'sung;': '\u266a', - 'sup1': '\xb9', - 'sup1;': '\xb9', - 'sup2': '\xb2', - 'sup2;': '\xb2', - 'sup3': '\xb3', - 'sup3;': '\xb3', - 'Sup;': '\u22d1', - 'sup;': '\u2283', - 'supdot;': '\u2abe', - 'supdsub;': '\u2ad8', - 'supE;': '\u2ac6', - 'supe;': '\u2287', - 'supedot;': '\u2ac4', - 'Superset;': '\u2283', - 'SupersetEqual;': '\u2287', - 'suphsol;': '\u27c9', - 'suphsub;': '\u2ad7', - 'suplarr;': '\u297b', - 'supmult;': '\u2ac2', - 'supnE;': '\u2acc', - 'supne;': '\u228b', - 'supplus;': '\u2ac0', - 'Supset;': '\u22d1', - 'supset;': '\u2283', - 'supseteq;': '\u2287', - 'supseteqq;': '\u2ac6', - 'supsetneq;': '\u228b', - 'supsetneqq;': '\u2acc', - 'supsim;': '\u2ac8', - 'supsub;': '\u2ad4', - 'supsup;': '\u2ad6', - 'swarhk;': '\u2926', - 'swArr;': '\u21d9', - 'swarr;': '\u2199', - 'swarrow;': '\u2199', - 'swnwar;': '\u292a', - 'szlig': '\xdf', - 'szlig;': '\xdf', - 'Tab;': '\t', - 'target;': '\u2316', - 'Tau;': '\u03a4', - 'tau;': '\u03c4', - 'tbrk;': '\u23b4', - 'Tcaron;': '\u0164', - 'tcaron;': '\u0165', - 'Tcedil;': '\u0162', - 'tcedil;': '\u0163', - 'Tcy;': '\u0422', - 'tcy;': '\u0442', - 'tdot;': '\u20db', - 'telrec;': '\u2315', - 'Tfr;': '\U0001d517', - 'tfr;': '\U0001d531', - 'there4;': '\u2234', - 'Therefore;': '\u2234', - 'therefore;': '\u2234', - 'Theta;': '\u0398', - 'theta;': '\u03b8', - 'thetasym;': '\u03d1', - 'thetav;': '\u03d1', - 'thickapprox;': '\u2248', - 'thicksim;': '\u223c', - 'ThickSpace;': '\u205f\u200a', - 'thinsp;': '\u2009', - 'ThinSpace;': '\u2009', - 'thkap;': '\u2248', - 'thksim;': '\u223c', - 'THORN': '\xde', - 'thorn': '\xfe', - 'THORN;': '\xde', - 'thorn;': '\xfe', - 'Tilde;': '\u223c', - 'tilde;': '\u02dc', - 'TildeEqual;': '\u2243', - 'TildeFullEqual;': '\u2245', - 'TildeTilde;': '\u2248', - 'times': '\xd7', - 'times;': '\xd7', - 'timesb;': '\u22a0', - 'timesbar;': '\u2a31', - 'timesd;': '\u2a30', - 'tint;': '\u222d', - 'toea;': '\u2928', - 'top;': '\u22a4', - 'topbot;': '\u2336', - 'topcir;': '\u2af1', - 'Topf;': '\U0001d54b', - 'topf;': '\U0001d565', - 'topfork;': '\u2ada', - 'tosa;': '\u2929', - 'tprime;': '\u2034', - 'TRADE;': '\u2122', - 'trade;': '\u2122', - 'triangle;': '\u25b5', - 'triangledown;': '\u25bf', - 'triangleleft;': '\u25c3', - 'trianglelefteq;': '\u22b4', - 'triangleq;': '\u225c', - 'triangleright;': '\u25b9', - 'trianglerighteq;': '\u22b5', - 'tridot;': '\u25ec', - 'trie;': '\u225c', - 'triminus;': '\u2a3a', - 'TripleDot;': '\u20db', - 'triplus;': '\u2a39', - 'trisb;': '\u29cd', - 'tritime;': '\u2a3b', - 'trpezium;': '\u23e2', - 'Tscr;': '\U0001d4af', - 'tscr;': '\U0001d4c9', - 'TScy;': '\u0426', - 'tscy;': '\u0446', - 'TSHcy;': '\u040b', - 'tshcy;': '\u045b', - 'Tstrok;': '\u0166', - 'tstrok;': '\u0167', - 'twixt;': '\u226c', - 'twoheadleftarrow;': '\u219e', - 'twoheadrightarrow;': '\u21a0', - 'Uacute': '\xda', - 'uacute': '\xfa', - 'Uacute;': '\xda', - 'uacute;': '\xfa', - 'Uarr;': '\u219f', - 'uArr;': '\u21d1', - 'uarr;': '\u2191', - 'Uarrocir;': '\u2949', - 'Ubrcy;': '\u040e', - 'ubrcy;': '\u045e', - 'Ubreve;': '\u016c', - 'ubreve;': '\u016d', - 'Ucirc': '\xdb', - 'ucirc': '\xfb', - 'Ucirc;': '\xdb', - 'ucirc;': '\xfb', - 'Ucy;': '\u0423', - 'ucy;': '\u0443', - 'udarr;': '\u21c5', - 'Udblac;': '\u0170', - 'udblac;': '\u0171', - 'udhar;': '\u296e', - 'ufisht;': '\u297e', - 'Ufr;': '\U0001d518', - 'ufr;': '\U0001d532', - 'Ugrave': '\xd9', - 'ugrave': '\xf9', - 'Ugrave;': '\xd9', - 'ugrave;': '\xf9', - 'uHar;': '\u2963', - 'uharl;': '\u21bf', - 'uharr;': '\u21be', - 'uhblk;': '\u2580', - 'ulcorn;': '\u231c', - 'ulcorner;': '\u231c', - 'ulcrop;': '\u230f', - 'ultri;': '\u25f8', - 'Umacr;': '\u016a', - 'umacr;': '\u016b', - 'uml': '\xa8', - 'uml;': '\xa8', - 'UnderBar;': '_', - 'UnderBrace;': '\u23df', - 'UnderBracket;': '\u23b5', - 'UnderParenthesis;': '\u23dd', - 'Union;': '\u22c3', - 'UnionPlus;': '\u228e', - 'Uogon;': '\u0172', - 'uogon;': '\u0173', - 'Uopf;': '\U0001d54c', - 'uopf;': '\U0001d566', - 'UpArrow;': '\u2191', - 'Uparrow;': '\u21d1', - 'uparrow;': '\u2191', - 'UpArrowBar;': '\u2912', - 'UpArrowDownArrow;': '\u21c5', - 'UpDownArrow;': '\u2195', - 'Updownarrow;': '\u21d5', - 'updownarrow;': '\u2195', - 'UpEquilibrium;': '\u296e', - 'upharpoonleft;': '\u21bf', - 'upharpoonright;': '\u21be', - 'uplus;': '\u228e', - 'UpperLeftArrow;': '\u2196', - 'UpperRightArrow;': '\u2197', - 'Upsi;': '\u03d2', - 'upsi;': '\u03c5', - 'upsih;': '\u03d2', - 'Upsilon;': '\u03a5', - 'upsilon;': '\u03c5', - 'UpTee;': '\u22a5', - 'UpTeeArrow;': '\u21a5', - 'upuparrows;': '\u21c8', - 'urcorn;': '\u231d', - 'urcorner;': '\u231d', - 'urcrop;': '\u230e', - 'Uring;': '\u016e', - 'uring;': '\u016f', - 'urtri;': '\u25f9', - 'Uscr;': '\U0001d4b0', - 'uscr;': '\U0001d4ca', - 'utdot;': '\u22f0', - 'Utilde;': '\u0168', - 'utilde;': '\u0169', - 'utri;': '\u25b5', - 'utrif;': '\u25b4', - 'uuarr;': '\u21c8', - 'Uuml': '\xdc', - 'uuml': '\xfc', - 'Uuml;': '\xdc', - 'uuml;': '\xfc', - 'uwangle;': '\u29a7', - 'vangrt;': '\u299c', - 'varepsilon;': '\u03f5', - 'varkappa;': '\u03f0', - 'varnothing;': '\u2205', - 'varphi;': '\u03d5', - 'varpi;': '\u03d6', - 'varpropto;': '\u221d', - 'vArr;': '\u21d5', - 'varr;': '\u2195', - 'varrho;': '\u03f1', - 'varsigma;': '\u03c2', - 'varsubsetneq;': '\u228a\ufe00', - 'varsubsetneqq;': '\u2acb\ufe00', - 'varsupsetneq;': '\u228b\ufe00', - 'varsupsetneqq;': '\u2acc\ufe00', - 'vartheta;': '\u03d1', - 'vartriangleleft;': '\u22b2', - 'vartriangleright;': '\u22b3', - 'Vbar;': '\u2aeb', - 'vBar;': '\u2ae8', - 'vBarv;': '\u2ae9', - 'Vcy;': '\u0412', - 'vcy;': '\u0432', - 'VDash;': '\u22ab', - 'Vdash;': '\u22a9', - 'vDash;': '\u22a8', - 'vdash;': '\u22a2', - 'Vdashl;': '\u2ae6', - 'Vee;': '\u22c1', - 'vee;': '\u2228', - 'veebar;': '\u22bb', - 'veeeq;': '\u225a', - 'vellip;': '\u22ee', - 'Verbar;': '\u2016', - 'verbar;': '|', - 'Vert;': '\u2016', - 'vert;': '|', - 'VerticalBar;': '\u2223', - 'VerticalLine;': '|', - 'VerticalSeparator;': '\u2758', - 'VerticalTilde;': '\u2240', - 'VeryThinSpace;': '\u200a', - 'Vfr;': '\U0001d519', - 'vfr;': '\U0001d533', - 'vltri;': '\u22b2', - 'vnsub;': '\u2282\u20d2', - 'vnsup;': '\u2283\u20d2', - 'Vopf;': '\U0001d54d', - 'vopf;': '\U0001d567', - 'vprop;': '\u221d', - 'vrtri;': '\u22b3', - 'Vscr;': '\U0001d4b1', - 'vscr;': '\U0001d4cb', - 'vsubnE;': '\u2acb\ufe00', - 'vsubne;': '\u228a\ufe00', - 'vsupnE;': '\u2acc\ufe00', - 'vsupne;': '\u228b\ufe00', - 'Vvdash;': '\u22aa', - 'vzigzag;': '\u299a', - 'Wcirc;': '\u0174', - 'wcirc;': '\u0175', - 'wedbar;': '\u2a5f', - 'Wedge;': '\u22c0', - 'wedge;': '\u2227', - 'wedgeq;': '\u2259', - 'weierp;': '\u2118', - 'Wfr;': '\U0001d51a', - 'wfr;': '\U0001d534', - 'Wopf;': '\U0001d54e', - 'wopf;': '\U0001d568', - 'wp;': '\u2118', - 'wr;': '\u2240', - 'wreath;': '\u2240', - 'Wscr;': '\U0001d4b2', - 'wscr;': '\U0001d4cc', - 'xcap;': '\u22c2', - 'xcirc;': '\u25ef', - 'xcup;': '\u22c3', - 'xdtri;': '\u25bd', - 'Xfr;': '\U0001d51b', - 'xfr;': '\U0001d535', - 'xhArr;': '\u27fa', - 'xharr;': '\u27f7', - 'Xi;': '\u039e', - 'xi;': '\u03be', - 'xlArr;': '\u27f8', - 'xlarr;': '\u27f5', - 'xmap;': '\u27fc', - 'xnis;': '\u22fb', - 'xodot;': '\u2a00', - 'Xopf;': '\U0001d54f', - 'xopf;': '\U0001d569', - 'xoplus;': '\u2a01', - 'xotime;': '\u2a02', - 'xrArr;': '\u27f9', - 'xrarr;': '\u27f6', - 'Xscr;': '\U0001d4b3', - 'xscr;': '\U0001d4cd', - 'xsqcup;': '\u2a06', - 'xuplus;': '\u2a04', - 'xutri;': '\u25b3', - 'xvee;': '\u22c1', - 'xwedge;': '\u22c0', - 'Yacute': '\xdd', - 'yacute': '\xfd', - 'Yacute;': '\xdd', - 'yacute;': '\xfd', - 'YAcy;': '\u042f', - 'yacy;': '\u044f', - 'Ycirc;': '\u0176', - 'ycirc;': '\u0177', - 'Ycy;': '\u042b', - 'ycy;': '\u044b', - 'yen': '\xa5', - 'yen;': '\xa5', - 'Yfr;': '\U0001d51c', - 'yfr;': '\U0001d536', - 'YIcy;': '\u0407', - 'yicy;': '\u0457', - 'Yopf;': '\U0001d550', - 'yopf;': '\U0001d56a', - 'Yscr;': '\U0001d4b4', - 'yscr;': '\U0001d4ce', - 'YUcy;': '\u042e', - 'yucy;': '\u044e', - 'yuml': '\xff', - 'Yuml;': '\u0178', - 'yuml;': '\xff', - 'Zacute;': '\u0179', - 'zacute;': '\u017a', - 'Zcaron;': '\u017d', - 'zcaron;': '\u017e', - 'Zcy;': '\u0417', - 'zcy;': '\u0437', - 'Zdot;': '\u017b', - 'zdot;': '\u017c', - 'zeetrf;': '\u2128', - 'ZeroWidthSpace;': '\u200b', - 'Zeta;': '\u0396', - 'zeta;': '\u03b6', - 'Zfr;': '\u2128', - 'zfr;': '\U0001d537', - 'ZHcy;': '\u0416', - 'zhcy;': '\u0436', - 'zigrarr;': '\u21dd', - 'Zopf;': '\u2124', - 'zopf;': '\U0001d56b', - 'Zscr;': '\U0001d4b5', - 'zscr;': '\U0001d4cf', - 'zwj;': '\u200d', - 'zwnj;': '\u200c', - } - -try: - import http.client as compat_http_client -except ImportError: # Python 2 - import httplib as compat_http_client - -try: - from urllib.error import HTTPError as compat_HTTPError -except ImportError: # Python 2 - from urllib2 import HTTPError as compat_HTTPError - -try: - from urllib.request import urlretrieve as compat_urlretrieve -except ImportError: # Python 2 - from urllib import urlretrieve as compat_urlretrieve - -try: - from html.parser import HTMLParser as compat_HTMLParser -except ImportError: # Python 2 - from HTMLParser import HTMLParser as compat_HTMLParser - -try: # Python 2 - from HTMLParser import HTMLParseError as compat_HTMLParseError -except ImportError: # Python <3.4 - try: - from html.parser import HTMLParseError as compat_HTMLParseError - except ImportError: # Python >3.4 - - # HTMLParseError has been deprecated in Python 3.3 and removed in - # Python 3.5. Introducing dummy exception for Python >3.5 for compatible - # and uniform cross-version exceptiong handling - class compat_HTMLParseError(Exception): - pass - -try: - from subprocess import DEVNULL - compat_subprocess_get_DEVNULL = lambda: DEVNULL -except ImportError: - compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') - -try: - import http.server as compat_http_server -except ImportError: - import BaseHTTPServer as compat_http_server - -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str - -try: - from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes - from urllib.parse import unquote as compat_urllib_parse_unquote - from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus -except ImportError: # Python 2 - _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') - else re.compile(r'([\x00-\x7f]+)')) - - # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus - # implementations from cpython 3.4.3's stdlib. Python 2's version - # is apparently broken (see https://github.com/rg3/youtube-dl/pull/6244) - - def compat_urllib_parse_unquote_to_bytes(string): - """unquote_to_bytes('abc%20def') -> b'abc def'.""" - # Note: strings are encoded as UTF-8. This is only an issue if it contains - # unescaped non-ASCII characters, which URIs should not. - if not string: - # Is it a string-like object? - string.split - return b'' - if isinstance(string, compat_str): - string = string.encode('utf-8') - bits = string.split(b'%') - if len(bits) == 1: - return string - res = [bits[0]] - append = res.append - for item in bits[1:]: - try: - append(compat_urllib_parse._hextochr[item[:2]]) - append(item[2:]) - except KeyError: - append(b'%') - append(item) - return b''.join(res) - - def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): - """Replace %xx escapes by their single-character equivalent. The optional - encoding and errors parameters specify how to decode percent-encoded - sequences into Unicode characters, as accepted by the bytes.decode() - method. - By default, percent-encoded sequences are decoded with UTF-8, and invalid - sequences are replaced by a placeholder character. - - unquote('abc%20def') -> 'abc def'. - """ - if '%' not in string: - string.split - return string - if encoding is None: - encoding = 'utf-8' - if errors is None: - errors = 'replace' - bits = _asciire.split(string) - res = [bits[0]] - append = res.append - for i in range(1, len(bits), 2): - append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors)) - append(bits[i + 1]) - return ''.join(res) - - def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'): - """Like unquote(), but also replace plus signs by spaces, as required for - unquoting HTML form values. - - unquote_plus('%7e/abc+def') -> '~/abc def' - """ - string = string.replace('+', ' ') - return compat_urllib_parse_unquote(string, encoding, errors) - -try: - from urllib.parse import urlencode as compat_urllib_parse_urlencode -except ImportError: # Python 2 - # Python 2 will choke in urlencode on mixture of byte and unicode strings. - # Possible solutions are to either port it from python 3 with all - # the friends or manually ensure input query contains only byte strings. - # We will stick with latter thus recursively encoding the whole query. - def compat_urllib_parse_urlencode(query, doseq=0, encoding='utf-8'): - def encode_elem(e): - if isinstance(e, dict): - e = encode_dict(e) - elif isinstance(e, (list, tuple,)): - list_e = encode_list(e) - e = tuple(list_e) if isinstance(e, tuple) else list_e - elif isinstance(e, compat_str): - e = e.encode(encoding) - return e - - def encode_dict(d): - return dict((encode_elem(k), encode_elem(v)) for k, v in d.items()) - - def encode_list(l): - return [encode_elem(e) for e in l] - - return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) - -try: - from urllib.request import DataHandler as compat_urllib_request_DataHandler -except ImportError: # Python < 3.4 - # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py - class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler): - def data_open(self, req): - # data URLs as specified in RFC 2397. - # - # ignores POSTed data - # - # syntax: - # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data - # mediatype := [ type "/" subtype ] *( ";" parameter ) - # data := *urlchar - # parameter := attribute "=" value - url = req.get_full_url() - - scheme, data = url.split(':', 1) - mediatype, data = data.split(',', 1) - - # even base64 encoded data URLs might be quoted so unquote in any case: - data = compat_urllib_parse_unquote_to_bytes(data) - if mediatype.endswith(';base64'): - data = binascii.a2b_base64(data) - mediatype = mediatype[:-7] - - if not mediatype: - mediatype = 'text/plain;charset=US-ASCII' - - headers = email.message_from_string( - 'Content-type: %s\nContent-length: %d\n' % (mediatype, len(data))) - - return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) - -try: - compat_basestring = basestring # Python 2 -except NameError: - compat_basestring = str - -try: - compat_chr = unichr # Python 2 -except NameError: - compat_chr = chr - -try: - from xml.etree.ElementTree import ParseError as compat_xml_parse_error -except ImportError: # Python 2.6 - from xml.parsers.expat import ExpatError as compat_xml_parse_error - - -etree = xml.etree.ElementTree - - -class _TreeBuilder(etree.TreeBuilder): - def doctype(self, name, pubid, system): - pass - - -if sys.version_info[0] >= 3: - def compat_etree_fromstring(text): - return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) -else: - # python 2.x tries to encode unicode strings with ascii (see the - # XMLParser._fixtext method) - try: - _etree_iter = etree.Element.iter - except AttributeError: # Python <=2.6 - def _etree_iter(root): - for el in root.findall('*'): - yield el - for sub in _etree_iter(el): - yield sub - - # on 2.6 XML doesn't have a parser argument, function copied from CPython - # 2.7 source - def _XML(text, parser=None): - if not parser: - parser = etree.XMLParser(target=_TreeBuilder()) - parser.feed(text) - return parser.close() - - def _element_factory(*args, **kwargs): - el = etree.Element(*args, **kwargs) - for k, v in el.items(): - if isinstance(v, bytes): - el.set(k, v.decode('utf-8')) - return el - - def compat_etree_fromstring(text): - doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory))) - for el in _etree_iter(doc): - if el.text is not None and isinstance(el.text, bytes): - el.text = el.text.decode('utf-8') - return doc - -if hasattr(etree, 'register_namespace'): - compat_etree_register_namespace = etree.register_namespace -else: - def compat_etree_register_namespace(prefix, uri): - """Register a namespace prefix. - The registry is global, and any existing mapping for either the - given prefix or the namespace URI will be removed. - *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and - attributes in this namespace will be serialized with prefix if possible. - ValueError is raised if prefix is reserved or is invalid. - """ - if re.match(r"ns\d+$", prefix): - raise ValueError("Prefix format reserved for internal use") - for k, v in list(etree._namespace_map.items()): - if k == uri or v == prefix: - del etree._namespace_map[k] - etree._namespace_map[uri] = prefix - -if sys.version_info < (2, 7): - # Here comes the crazy part: In 2.6, if the xpath is a unicode, - # .//node does not match if a node is a direct child of . ! - def compat_xpath(xpath): - if isinstance(xpath, compat_str): - xpath = xpath.encode('ascii') - return xpath -else: - compat_xpath = lambda xpath: xpath - -try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, compat_str - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError('bad query field: %r' % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = compat_urllib_parse_unquote( - name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = compat_urllib_parse_unquote( - value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result - - -compat_os_name = os._name if os.name == 'java' else os.name - - -if compat_os_name == 'nt': - def compat_shlex_quote(s): - return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') -else: - try: - from shlex import quote as compat_shlex_quote - except ImportError: # Python < 3.3 - def compat_shlex_quote(s): - if re.match(r'^[-_\w./]+$', s): - return s - else: - return "'" + s.replace("'", "'\"'\"'") + "'" - - -try: - args = shlex.split('中文') - assert (isinstance(args, list) and - isinstance(args[0], compat_str) and - args[0] == '中文') - compat_shlex_split = shlex.split -except (AssertionError, UnicodeEncodeError): - # Working around shlex issue with unicode strings on some python 2 - # versions (see http://bugs.python.org/issue1548891) - def compat_shlex_split(s, comments=False, posix=True): - if isinstance(s, compat_str): - s = s.encode('utf-8') - return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix))) - - -def compat_ord(c): - if type(c) is int: - return c - else: - return ord(c) - - -if sys.version_info >= (3, 0): - compat_getenv = os.getenv - compat_expanduser = os.path.expanduser - - def compat_setenv(key, value, env=os.environ): - env[key] = value -else: - # Environment variables should be decoded with filesystem encoding. - # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) - - def compat_getenv(key, default=None): - from .utils import get_filesystem_encoding - env = os.getenv(key, default) - if env: - env = env.decode(get_filesystem_encoding()) - return env - - def compat_setenv(key, value, env=os.environ): - def encode(v): - from .utils import get_filesystem_encoding - return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v - env[encode(key)] = encode(value) - - # HACK: The default implementations of os.path.expanduser from cpython do not decode - # environment variables with filesystem encoding. We will work around this by - # providing adjusted implementations. - # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib - # for different platforms with correct environment variables decoding. - - if compat_os_name == 'posix': - def compat_expanduser(path): - """Expand ~ and ~user constructions. If user or $HOME is unknown, - do nothing.""" - if not path.startswith('~'): - return path - i = path.find('/', 1) - if i < 0: - i = len(path) - if i == 1: - if 'HOME' not in os.environ: - import pwd - userhome = pwd.getpwuid(os.getuid()).pw_dir - else: - userhome = compat_getenv('HOME') - else: - import pwd - try: - pwent = pwd.getpwnam(path[1:i]) - except KeyError: - return path - userhome = pwent.pw_dir - userhome = userhome.rstrip('/') - return (userhome + path[i:]) or '/' - elif compat_os_name in ('nt', 'ce'): - def compat_expanduser(path): - """Expand ~ and ~user constructs. - - If user or $HOME is unknown, do nothing.""" - if path[:1] != '~': - return path - i, n = 1, len(path) - while i < n and path[i] not in '/\\': - i = i + 1 - - if 'HOME' in os.environ: - userhome = compat_getenv('HOME') - elif 'USERPROFILE' in os.environ: - userhome = compat_getenv('USERPROFILE') - elif 'HOMEPATH' not in os.environ: - return path - else: - try: - drive = compat_getenv('HOMEDRIVE') - except KeyError: - drive = '' - userhome = os.path.join(drive, compat_getenv('HOMEPATH')) - - if i != 1: # ~user - userhome = os.path.join(os.path.dirname(userhome), path[1:i]) - - return userhome + path[i:] - else: - compat_expanduser = os.path.expanduser - - -if sys.version_info < (3, 0): - def compat_print(s): - from .utils import preferredencoding - print(s.encode(preferredencoding(), 'xmlcharrefreplace')) -else: - def compat_print(s): - assert isinstance(s, compat_str) - print(s) - - -if sys.version_info < (3, 0) and sys.platform == 'win32': - def compat_getpass(prompt, *args, **kwargs): - if isinstance(prompt, compat_str): - from .utils import preferredencoding - prompt = prompt.encode(preferredencoding()) - return getpass.getpass(prompt, *args, **kwargs) -else: - compat_getpass = getpass.getpass - -try: - compat_input = raw_input -except NameError: # Python 3 - compat_input = input - -# Python < 2.6.5 require kwargs to be bytes -try: - def _testfunc(x): - pass - _testfunc(**{'x': 0}) -except TypeError: - def compat_kwargs(kwargs): - return dict((bytes(k), v) for k, v in kwargs.items()) -else: - compat_kwargs = lambda kwargs: kwargs - - -try: - compat_numeric_types = (int, float, long, complex) -except NameError: # Python 3 - compat_numeric_types = (int, float, complex) - - -try: - compat_integer_types = (int, long) -except NameError: # Python 3 - compat_integer_types = (int, ) - - -if sys.version_info < (2, 7): - def compat_socket_create_connection(address, timeout, source_address=None): - host, port = address - err = None - for res in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM): - af, socktype, proto, canonname, sa = res - sock = None - try: - sock = socket.socket(af, socktype, proto) - sock.settimeout(timeout) - if source_address: - sock.bind(source_address) - sock.connect(sa) - return sock - except socket.error as _: - err = _ - if sock is not None: - sock.close() - if err is not None: - raise err - else: - raise socket.error('getaddrinfo returns an empty list') -else: - compat_socket_create_connection = socket.create_connection - - -# Fix https://github.com/rg3/youtube-dl/issues/4223 -# See http://bugs.python.org/issue9161 for what is broken -def workaround_optparse_bug9161(): - op = optparse.OptionParser() - og = optparse.OptionGroup(op, 'foo') - try: - og.add_option('-t') - except TypeError: - real_add_option = optparse.OptionGroup.add_option - - def _compat_add_option(self, *args, **kwargs): - enc = lambda v: ( - v.encode('ascii', 'replace') if isinstance(v, compat_str) - else v) - bargs = [enc(a) for a in args] - bkwargs = dict( - (k, enc(v)) for k, v in kwargs.items()) - return real_add_option(self, *bargs, **bkwargs) - optparse.OptionGroup.add_option = _compat_add_option - - -if hasattr(shutil, 'get_terminal_size'): # Python >= 3.3 - compat_get_terminal_size = shutil.get_terminal_size -else: - _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) - - def compat_get_terminal_size(fallback=(80, 24)): - columns = compat_getenv('COLUMNS') - if columns: - columns = int(columns) - else: - columns = None - lines = compat_getenv('LINES') - if lines: - lines = int(lines) - else: - lines = None - - if columns is None or lines is None or columns <= 0 or lines <= 0: - try: - sp = subprocess.Popen( - ['stty', 'size'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = sp.communicate() - _lines, _columns = map(int, out.split()) - except Exception: - _columns, _lines = _terminal_size(*fallback) - - if columns is None or columns <= 0: - columns = _columns - if lines is None or lines <= 0: - lines = _lines - return _terminal_size(columns, lines) - -try: - itertools.count(start=0, step=1) - compat_itertools_count = itertools.count -except TypeError: # Python 2.6 - def compat_itertools_count(start=0, step=1): - n = start - while True: - yield n - n += step - -if sys.version_info >= (3, 0): - from tokenize import tokenize as compat_tokenize_tokenize -else: - from tokenize import generate_tokens as compat_tokenize_tokenize - - -try: - struct.pack('!I', 0) -except TypeError: - # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument - # See https://bugs.python.org/issue19099 - def compat_struct_pack(spec, *args): - if isinstance(spec, compat_str): - spec = spec.encode('ascii') - return struct.pack(spec, *args) - - def compat_struct_unpack(spec, *args): - if isinstance(spec, compat_str): - spec = spec.encode('ascii') - return struct.unpack(spec, *args) - - class compat_Struct(struct.Struct): - def __init__(self, fmt): - if isinstance(fmt, compat_str): - fmt = fmt.encode('ascii') - super(compat_Struct, self).__init__(fmt) -else: - compat_struct_pack = struct.pack - compat_struct_unpack = struct.unpack - if platform.python_implementation() == 'IronPython' and sys.version_info < (2, 7, 8): - class compat_Struct(struct.Struct): - def unpack(self, string): - if not isinstance(string, buffer): # noqa: F821 - string = buffer(string) # noqa: F821 - return super(compat_Struct, self).unpack(string) - else: - compat_Struct = struct.Struct - - -try: - from future_builtins import zip as compat_zip -except ImportError: # not 2.6+ or is 3.x - try: - from itertools import izip as compat_zip # < 2.5 or 3.x - except ImportError: - compat_zip = zip - - -if sys.version_info < (3, 3): - def compat_b64decode(s, *args, **kwargs): - if isinstance(s, compat_str): - s = s.encode('ascii') - return base64.b64decode(s, *args, **kwargs) -else: - compat_b64decode = base64.b64decode - - -if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): - # PyPy2 prior to version 5.4.0 expects byte strings as Windows function - # names, see the original PyPy issue [1] and the youtube-dl one [2]. - # 1. https://bitbucket.org/pypy/pypy/issues/2360/windows-ctypescdll-typeerror-function-name - # 2. https://github.com/rg3/youtube-dl/pull/4392 - def compat_ctypes_WINFUNCTYPE(*args, **kwargs): - real = ctypes.WINFUNCTYPE(*args, **kwargs) - - def resf(tpl, *args, **kwargs): - funcname, dll = tpl - return real((str(funcname), dll), *args, **kwargs) - - return resf -else: - def compat_ctypes_WINFUNCTYPE(*args, **kwargs): - return ctypes.WINFUNCTYPE(*args, **kwargs) - - -__all__ = [ - 'compat_HTMLParseError', - 'compat_HTMLParser', - 'compat_HTTPError', - 'compat_Struct', - 'compat_b64decode', - 'compat_basestring', - 'compat_chr', - 'compat_cookiejar', - 'compat_cookies', - 'compat_ctypes_WINFUNCTYPE', - 'compat_etree_fromstring', - 'compat_etree_register_namespace', - 'compat_expanduser', - 'compat_get_terminal_size', - 'compat_getenv', - 'compat_getpass', - 'compat_html_entities', - 'compat_html_entities_html5', - 'compat_http_client', - 'compat_http_server', - 'compat_input', - 'compat_integer_types', - 'compat_itertools_count', - 'compat_kwargs', - 'compat_numeric_types', - 'compat_ord', - 'compat_os_name', - 'compat_parse_qs', - 'compat_print', - 'compat_setenv', - 'compat_shlex_quote', - 'compat_shlex_split', - 'compat_socket_create_connection', - 'compat_str', - 'compat_struct_pack', - 'compat_struct_unpack', - 'compat_subprocess_get_DEVNULL', - 'compat_tokenize_tokenize', - 'compat_urllib_error', - 'compat_urllib_parse', - 'compat_urllib_parse_unquote', - 'compat_urllib_parse_unquote_plus', - 'compat_urllib_parse_unquote_to_bytes', - 'compat_urllib_parse_urlencode', - 'compat_urllib_parse_urlparse', - 'compat_urllib_request', - 'compat_urllib_request_DataHandler', - 'compat_urllib_response', - 'compat_urlparse', - 'compat_urlretrieve', - 'compat_xml_parse_error', - 'compat_xpath', - 'compat_zip', - 'workaround_optparse_bug9161', -] diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py deleted file mode 100644 index 2e485df..0000000 --- a/youtube_dl/downloader/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -from __future__ import unicode_literals - -from .common import FileDownloader -from .f4m import F4mFD -from .hls import HlsFD -from .http import HttpFD -from .rtmp import RtmpFD -from .dash import DashSegmentsFD -from .rtsp import RtspFD -from .ism import IsmFD -from .external import ( - get_external_downloader, - FFmpegFD, -) - -from ..utils import ( - determine_protocol, -) - -PROTOCOL_MAP = { - 'rtmp': RtmpFD, - 'm3u8_native': HlsFD, - 'm3u8': FFmpegFD, - 'mms': RtspFD, - 'rtsp': RtspFD, - 'f4m': F4mFD, - 'http_dash_segments': DashSegmentsFD, - 'ism': IsmFD, -} - - -def get_suitable_downloader(info_dict, params={}): - """Get the downloader class that can handle the info dict.""" - protocol = determine_protocol(info_dict) - info_dict['protocol'] = protocol - - # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): - # return FFmpegFD - - external_downloader = params.get('external_downloader') - if external_downloader is not None: - ed = get_external_downloader(external_downloader) - if ed.can_download(info_dict): - return ed - - if protocol.startswith('m3u8') and info_dict.get('is_live'): - return FFmpegFD - - if protocol == 'm3u8' and params.get('hls_prefer_native') is True: - return HlsFD - - if protocol == 'm3u8_native' and params.get('hls_prefer_native') is False: - return FFmpegFD - - return PROTOCOL_MAP.get(protocol, HttpFD) - - -__all__ = [ - 'get_suitable_downloader', - 'FileDownloader', -] diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py deleted file mode 100644 index 5979833..0000000 --- a/youtube_dl/downloader/common.py +++ /dev/null @@ -1,389 +0,0 @@ -from __future__ import division, unicode_literals - -import os -import re -import sys -import time -import random - -from ..compat import compat_os_name -from ..utils import ( - decodeArgument, - encodeFilename, - error_to_compat_str, - format_bytes, - shell_quote, - timeconvert, -) - - -class FileDownloader(object): - """File Downloader class. - - File downloader objects are the ones responsible of downloading the - actual video file and writing it to disk. - - File downloaders accept a lot of parameters. In order not to saturate - the object constructor with arguments, it receives a dictionary of - options instead. - - Available options: - - verbose: Print additional info to stdout. - quiet: Do not print messages to stdout. - ratelimit: Download speed limit, in bytes/sec. - retries: Number of times to retry for HTTP error 5xx - buffersize: Size of download buffer in bytes. - noresizebuffer: Do not automatically resize the download buffer. - continuedl: Try to continue downloads if possible. - noprogress: Do not print the progress bar. - logtostderr: Log messages to stderr instead of stdout. - consoletitle: Display progress in console window's titlebar. - nopart: Do not use temporary .part files. - updatetime: Use the Last-modified header to set output file timestamps. - test: Download only first bytes to test the downloader. - min_filesize: Skip files smaller than this size - max_filesize: Skip files larger than this size - xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. - external_downloader_args: A list of additional command-line arguments for the - external downloader. - hls_use_mpegts: Use the mpegts container for HLS videos. - http_chunk_size: Size of a chunk for chunk-based HTTP downloading. May be - useful for bypassing bandwidth throttling imposed by - a webserver (experimental) - - Subclasses of this one must re-define the real_download method. - """ - - _TEST_FILE_SIZE = 10241 - params = None - - def __init__(self, ydl, params): - """Create a FileDownloader object with the given options.""" - self.ydl = ydl - self._progress_hooks = [] - self.params = params - self.add_progress_hook(self.report_progress) - - @staticmethod - def format_seconds(seconds): - (mins, secs) = divmod(seconds, 60) - (hours, mins) = divmod(mins, 60) - if hours > 99: - return '--:--:--' - if hours == 0: - return '%02d:%02d' % (mins, secs) - else: - return '%02d:%02d:%02d' % (hours, mins, secs) - - @staticmethod - def calc_percent(byte_counter, data_len): - if data_len is None: - return None - return float(byte_counter) / float(data_len) * 100.0 - - @staticmethod - def format_percent(percent): - if percent is None: - return '---.-%' - return '%6s' % ('%3.1f%%' % percent) - - @staticmethod - def calc_eta(start, now, total, current): - if total is None: - return None - if now is None: - now = time.time() - dif = now - start - if current == 0 or dif < 0.001: # One millisecond - return None - rate = float(current) / dif - return int((float(total) - float(current)) / rate) - - @staticmethod - def format_eta(eta): - if eta is None: - return '--:--' - return FileDownloader.format_seconds(eta) - - @staticmethod - def calc_speed(start, now, bytes): - dif = now - start - if bytes == 0 or dif < 0.001: # One millisecond - return None - return float(bytes) / dif - - @staticmethod - def format_speed(speed): - if speed is None: - return '%10s' % '---b/s' - return '%10s' % ('%s/s' % format_bytes(speed)) - - @staticmethod - def format_retries(retries): - return 'inf' if retries == float('inf') else '%.0f' % retries - - @staticmethod - def best_block_size(elapsed_time, bytes): - new_min = max(bytes / 2.0, 1.0) - new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB - if elapsed_time < 0.001: - return int(new_max) - rate = bytes / elapsed_time - if rate > new_max: - return int(new_max) - if rate < new_min: - return int(new_min) - return int(rate) - - @staticmethod - def parse_bytes(bytestr): - """Parse a string indicating a byte quantity into an integer.""" - matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) - if matchobj is None: - return None - number = float(matchobj.group(1)) - multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) - return int(round(number * multiplier)) - - def to_screen(self, *args, **kargs): - self.ydl.to_screen(*args, **kargs) - - def to_stderr(self, message): - self.ydl.to_screen(message) - - def to_console_title(self, message): - self.ydl.to_console_title(message) - - def trouble(self, *args, **kargs): - self.ydl.trouble(*args, **kargs) - - def report_warning(self, *args, **kargs): - self.ydl.report_warning(*args, **kargs) - - def report_error(self, *args, **kargs): - self.ydl.report_error(*args, **kargs) - - def slow_down(self, start_time, now, byte_counter): - """Sleep if the download speed is over the rate limit.""" - rate_limit = self.params.get('ratelimit') - if rate_limit is None or byte_counter == 0: - return - if now is None: - now = time.time() - elapsed = now - start_time - if elapsed <= 0.0: - return - speed = float(byte_counter) / elapsed - if speed > rate_limit: - time.sleep(max((byte_counter // rate_limit) - elapsed, 0)) - - def temp_name(self, filename): - """Returns a temporary filename for the given filename.""" - if self.params.get('nopart', False) or filename == '-' or \ - (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))): - return filename - return filename + '.part' - - def undo_temp_name(self, filename): - if filename.endswith('.part'): - return filename[:-len('.part')] - return filename - - def ytdl_filename(self, filename): - return filename + '.ytdl' - - def try_rename(self, old_filename, new_filename): - try: - if old_filename == new_filename: - return - os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) - except (IOError, OSError) as err: - self.report_error('unable to rename file: %s' % error_to_compat_str(err)) - - def try_utime(self, filename, last_modified_hdr): - """Try to set the last-modified time of the given file.""" - if last_modified_hdr is None: - return - if not os.path.isfile(encodeFilename(filename)): - return - timestr = last_modified_hdr - if timestr is None: - return - filetime = timeconvert(timestr) - if filetime is None: - return filetime - # Ignore obviously invalid dates - if filetime == 0: - return - try: - os.utime(filename, (time.time(), filetime)) - except Exception: - pass - return filetime - - def report_destination(self, filename): - """Report destination filename.""" - self.to_screen('[download] Destination: ' + filename) - - def _report_progress_status(self, msg, is_last_line=False): - fullmsg = '[download] ' + msg - if self.params.get('progress_with_newline', False): - self.to_screen(fullmsg) - else: - if compat_os_name == 'nt': - prev_len = getattr(self, '_report_progress_prev_line_length', - 0) - if prev_len > len(fullmsg): - fullmsg += ' ' * (prev_len - len(fullmsg)) - self._report_progress_prev_line_length = len(fullmsg) - clear_line = '\r' - else: - clear_line = ('\r\x1b[K' if sys.stderr.isatty() else '\r') - self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line) - self.to_console_title('youtube-dl ' + msg) - - def report_progress(self, s): - if s['status'] == 'finished': - if self.params.get('noprogress', False): - self.to_screen('[download] Download completed') - else: - msg_template = '100%%' - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template += ' of %(_total_bytes_str)s' - if s.get('elapsed') is not None: - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template += ' in %(_elapsed_str)s' - self._report_progress_status( - msg_template % s, is_last_line=True) - - if self.params.get('noprogress'): - return - - if s['status'] != 'downloading': - return - - if s.get('eta') is not None: - s['_eta_str'] = self.format_eta(s['eta']) - else: - s['_eta_str'] = 'Unknown ETA' - - if s.get('total_bytes') and s.get('downloaded_bytes') is not None: - s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes']) - elif s.get('total_bytes_estimate') and s.get('downloaded_bytes') is not None: - s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes_estimate']) - else: - if s.get('downloaded_bytes') == 0: - s['_percent_str'] = self.format_percent(0) - else: - s['_percent_str'] = 'Unknown %' - - if s.get('speed') is not None: - s['_speed_str'] = self.format_speed(s['speed']) - else: - s['_speed_str'] = 'Unknown speed' - - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template = '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s' - elif s.get('total_bytes_estimate') is not None: - s['_total_bytes_estimate_str'] = format_bytes(s['total_bytes_estimate']) - msg_template = '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s' - else: - if s.get('downloaded_bytes') is not None: - s['_downloaded_bytes_str'] = format_bytes(s['downloaded_bytes']) - if s.get('elapsed'): - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)' - else: - msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s' - else: - msg_template = '%(_percent_str)s % at %(_speed_str)s ETA %(_eta_str)s' - - self._report_progress_status(msg_template % s) - - def report_resuming_byte(self, resume_len): - """Report attempt to resume at given byte.""" - self.to_screen('[download] Resuming download at byte %s' % resume_len) - - def report_retry(self, err, count, retries): - """Report retry in case of HTTP error 5xx""" - self.to_screen( - '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...' - % (error_to_compat_str(err), count, self.format_retries(retries))) - - def report_file_already_downloaded(self, file_name): - """Report file has already been fully downloaded.""" - try: - self.to_screen('[download] %s has already been downloaded' % file_name) - except UnicodeEncodeError: - self.to_screen('[download] The file has already been downloaded') - - def report_unable_to_resume(self): - """Report it was impossible to resume download.""" - self.to_screen('[download] Unable to resume') - - def download(self, filename, info_dict): - """Download to a filename using the info from info_dict - Return True on success and False otherwise - """ - - nooverwrites_and_exists = ( - self.params.get('nooverwrites', False) and - os.path.exists(encodeFilename(filename)) - ) - - if not hasattr(filename, 'write'): - continuedl_and_exists = ( - self.params.get('continuedl', True) and - os.path.isfile(encodeFilename(filename)) and - not self.params.get('nopart', False) - ) - - # Check file already present - if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists): - self.report_file_already_downloaded(filename) - self._hook_progress({ - 'filename': filename, - 'status': 'finished', - 'total_bytes': os.path.getsize(encodeFilename(filename)), - }) - return True - - min_sleep_interval = self.params.get('sleep_interval') - if min_sleep_interval: - max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) - sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) - self.to_screen( - '[download] Sleeping %s seconds...' % ( - int(sleep_interval) if sleep_interval.is_integer() - else '%.2f' % sleep_interval)) - time.sleep(sleep_interval) - - return self.real_download(filename, info_dict) - - def real_download(self, filename, info_dict): - """Real download process. Redefine in subclasses.""" - raise NotImplementedError('This method must be implemented by subclasses') - - def _hook_progress(self, status): - for ph in self._progress_hooks: - ph(status) - - def add_progress_hook(self, ph): - # See YoutubeDl.py (search for progress_hooks) for a description of - # this interface - self._progress_hooks.append(ph) - - def _debug_cmd(self, args, exe=None): - if not self.params.get('verbose', False): - return - - str_args = [decodeArgument(a) for a in args] - - if exe is None: - exe = os.path.basename(str_args[0]) - - self.to_screen('[debug] %s command line: %s' % ( - exe, shell_quote(str_args))) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py deleted file mode 100644 index eaa7adf..0000000 --- a/youtube_dl/downloader/dash.py +++ /dev/null @@ -1,80 +0,0 @@ -from __future__ import unicode_literals - -from .fragment import FragmentFD -from ..compat import compat_urllib_error -from ..utils import ( - DownloadError, - urljoin, -) - - -class DashSegmentsFD(FragmentFD): - """ - Download segments in a DASH manifest - """ - - FD_NAME = 'dashsegments' - - def real_download(self, filename, info_dict): - fragment_base_url = info_dict.get('fragment_base_url') - fragments = info_dict['fragments'][:1] if self.params.get( - 'test', False) else info_dict['fragments'] - - ctx = { - 'filename': filename, - 'total_frags': len(fragments), - } - - self._prepare_and_start_frag_download(ctx) - - fragment_retries = self.params.get('fragment_retries', 0) - skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - - frag_index = 0 - for i, fragment in enumerate(fragments): - frag_index += 1 - if frag_index <= ctx['fragment_index']: - continue - # In DASH, the first segment contains necessary headers to - # generate a valid MP4 file, so always abort for the first segment - fatal = i == 0 or not skip_unavailable_fragments - count = 0 - while count <= fragment_retries: - try: - fragment_url = fragment.get('url') - if not fragment_url: - assert fragment_base_url - fragment_url = urljoin(fragment_base_url, fragment['path']) - success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) - if not success: - return False - self._append_fragment(ctx, frag_content) - break - except compat_urllib_error.HTTPError as err: - # YouTube may often return 404 HTTP error for a fragment causing the - # whole download to fail. However if the same fragment is immediately - # retried with the same request data this usually succeeds (1-2 attemps - # is usually enough) thus allowing to download the whole file successfully. - # To be future-proof we will retry all fragments that fail with any - # HTTP error. - count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) - except DownloadError: - # Don't retry fragment if error occurred during HTTP downloading - # itself since it has own retry settings - if not fatal: - self.report_skip_fragment(frag_index) - break - raise - - if count > fragment_retries: - if not fatal: - self.report_skip_fragment(frag_index) - continue - self.report_error('giving up after %s fragment retries' % fragment_retries) - return False - - self._finish_frag_download(ctx) - - return True diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py deleted file mode 100644 index 958d00a..0000000 --- a/youtube_dl/downloader/external.py +++ /dev/null @@ -1,354 +0,0 @@ -from __future__ import unicode_literals - -import os.path -import re -import subprocess -import sys -import time - -from .common import FileDownloader -from ..compat import ( - compat_setenv, - compat_str, -) -from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS -from ..utils import ( - cli_option, - cli_valueless_option, - cli_bool_option, - cli_configuration_args, - encodeFilename, - encodeArgument, - handle_youtubedl_headers, - check_executable, - is_outdated_version, -) - - -class ExternalFD(FileDownloader): - def real_download(self, filename, info_dict): - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - - try: - started = time.time() - retval = self._call_downloader(tmpfilename, info_dict) - except KeyboardInterrupt: - if not info_dict.get('is_live'): - raise - # Live stream downloading cancellation should be considered as - # correct and expected termination thus all postprocessing - # should take place - retval = 0 - self.to_screen('[%s] Interrupted by user' % self.get_basename()) - - if retval == 0: - status = { - 'filename': filename, - 'status': 'finished', - 'elapsed': time.time() - started, - } - if filename != '-': - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) - self.try_rename(tmpfilename, filename) - status.update({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - }) - self._hook_progress(status) - return True - else: - self.to_stderr('\n') - self.report_error('%s exited with code %d' % ( - self.get_basename(), retval)) - return False - - @classmethod - def get_basename(cls): - return cls.__name__[:-2].lower() - - @property - def exe(self): - return self.params.get('external_downloader') - - @classmethod - def available(cls): - return check_executable(cls.get_basename(), [cls.AVAILABLE_OPT]) - - @classmethod - def supports(cls, info_dict): - return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps') - - @classmethod - def can_download(cls, info_dict): - return cls.available() and cls.supports(info_dict) - - def _option(self, command_option, param): - return cli_option(self.params, command_option, param) - - def _bool_option(self, command_option, param, true_value='true', false_value='false', separator=None): - return cli_bool_option(self.params, command_option, param, true_value, false_value, separator) - - def _valueless_option(self, command_option, param, expected_value=True): - return cli_valueless_option(self.params, command_option, param, expected_value) - - def _configuration_args(self, default=[]): - return cli_configuration_args(self.params, 'external_downloader_args', default) - - def _call_downloader(self, tmpfilename, info_dict): - """ Either overwrite this or implement _make_cmd """ - cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] - - self._debug_cmd(cmd) - - p = subprocess.Popen( - cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate() - if p.returncode != 0: - self.to_stderr(stderr.decode('utf-8', 'replace')) - return p.returncode - - -class CurlFD(ExternalFD): - AVAILABLE_OPT = '-V' - - def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '--location', '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') - cmd += self._valueless_option('--silent', 'noprogress') - cmd += self._valueless_option('--verbose', 'verbose') - cmd += self._option('--limit-rate', 'ratelimit') - cmd += self._option('--retry', 'retries') - cmd += self._option('--max-filesize', 'max_filesize') - cmd += self._option('--interface', 'source_address') - cmd += self._option('--proxy', 'proxy') - cmd += self._valueless_option('--insecure', 'nocheckcertificate') - cmd += self._configuration_args() - cmd += ['--', info_dict['url']] - return cmd - - def _call_downloader(self, tmpfilename, info_dict): - cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] - - self._debug_cmd(cmd) - - # curl writes the progress to stderr so don't capture it. - p = subprocess.Popen(cmd) - p.communicate() - return p.returncode - - -class AxelFD(ExternalFD): - AVAILABLE_OPT = '-V' - - def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): - cmd += ['-H', '%s: %s' % (key, val)] - cmd += self._configuration_args() - cmd += ['--', info_dict['url']] - return cmd - - -class WgetFD(ExternalFD): - AVAILABLE_OPT = '--version' - - def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._option('--bind-address', 'source_address') - cmd += self._option('--proxy', 'proxy') - cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') - cmd += self._configuration_args() - cmd += ['--', info_dict['url']] - return cmd - - -class Aria2cFD(ExternalFD): - AVAILABLE_OPT = '-v' - - def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-c'] - cmd += self._configuration_args([ - '--min-split-size', '1M', '--max-connection-per-server', '4']) - dn = os.path.dirname(tmpfilename) - if dn: - cmd += ['--dir', dn] - cmd += ['--out', os.path.basename(tmpfilename)] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._option('--interface', 'source_address') - cmd += self._option('--all-proxy', 'proxy') - cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') - cmd += ['--', info_dict['url']] - return cmd - - -class HttpieFD(ExternalFD): - @classmethod - def available(cls): - return check_executable('http', ['--version']) - - def _make_cmd(self, tmpfilename, info_dict): - cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] - for key, val in info_dict['http_headers'].items(): - cmd += ['%s:%s' % (key, val)] - return cmd - - -class FFmpegFD(ExternalFD): - @classmethod - def supports(cls, info_dict): - return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') - - @classmethod - def available(cls): - return FFmpegPostProcessor().available - - def _call_downloader(self, tmpfilename, info_dict): - url = info_dict['url'] - ffpp = FFmpegPostProcessor(downloader=self) - if not ffpp.available: - self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.') - return False - ffpp.check_version() - - args = [ffpp.executable, '-y'] - - for log_level in ('quiet', 'verbose'): - if self.params.get(log_level, False): - args += ['-loglevel', log_level] - break - - seekable = info_dict.get('_seekable') - if seekable is not None: - # setting -seekable prevents ffmpeg from guessing if the server - # supports seeking(by adding the header `Range: bytes=0-`), which - # can cause problems in some cases - # https://github.com/rg3/youtube-dl/issues/11800#issuecomment-275037127 - # http://trac.ffmpeg.org/ticket/6125#comment:10 - args += ['-seekable', '1' if seekable else '0'] - - args += self._configuration_args() - - # start_time = info_dict.get('start_time') or 0 - # if start_time: - # args += ['-ss', compat_str(start_time)] - # end_time = info_dict.get('end_time') - # if end_time: - # args += ['-t', compat_str(end_time - start_time)] - - if info_dict['http_headers'] and re.match(r'^https?://', url): - # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: - # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. - headers = handle_youtubedl_headers(info_dict['http_headers']) - args += [ - '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] - - env = None - proxy = self.params.get('proxy') - if proxy: - if not re.match(r'^[\da-zA-Z]+://', proxy): - proxy = 'http://%s' % proxy - - if proxy.startswith('socks'): - self.report_warning( - '%s does not support SOCKS proxies. Downloading is likely to fail. ' - 'Consider adding --hls-prefer-native to your command.' % self.get_basename()) - - # Since December 2015 ffmpeg supports -http_proxy option (see - # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) - # We could switch to the following code if we are able to detect version properly - # args += ['-http_proxy', proxy] - env = os.environ.copy() - compat_setenv('HTTP_PROXY', proxy, env=env) - compat_setenv('http_proxy', proxy, env=env) - - protocol = info_dict.get('protocol') - - if protocol == 'rtmp': - player_url = info_dict.get('player_url') - page_url = info_dict.get('page_url') - app = info_dict.get('app') - play_path = info_dict.get('play_path') - tc_url = info_dict.get('tc_url') - flash_version = info_dict.get('flash_version') - live = info_dict.get('rtmp_live', False) - if player_url is not None: - args += ['-rtmp_swfverify', player_url] - if page_url is not None: - args += ['-rtmp_pageurl', page_url] - if app is not None: - args += ['-rtmp_app', app] - if play_path is not None: - args += ['-rtmp_playpath', play_path] - if tc_url is not None: - args += ['-rtmp_tcurl', tc_url] - if flash_version is not None: - args += ['-rtmp_flashver', flash_version] - if live: - args += ['-rtmp_live', 'live'] - - args += ['-i', url, '-c', 'copy'] - - if self.params.get('test', False): - args += ['-fs', compat_str(self._TEST_FILE_SIZE)] - - if protocol in ('m3u8', 'm3u8_native'): - if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': - args += ['-f', 'mpegts'] - else: - args += ['-f', 'mp4'] - if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2', False)) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): - args += ['-bsf:a', 'aac_adtstoasc'] - elif protocol == 'rtmp': - args += ['-f', 'flv'] - else: - args += ['-f', EXT_TO_OUT_FORMATS.get(info_dict['ext'], info_dict['ext'])] - - args = [encodeArgument(opt) for opt in args] - args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) - - self._debug_cmd(args) - - proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) - try: - retval = proc.wait() - except KeyboardInterrupt: - # subprocces.run would send the SIGKILL signal to ffmpeg and the - # mp4 file couldn't be played, but if we ask ffmpeg to quit it - # produces a file that is playable (this is mostly useful for live - # streams). Note that Windows is not affected and produces playable - # files (see https://github.com/rg3/youtube-dl/issues/8300). - if sys.platform != 'win32': - proc.communicate(b'q') - raise - return retval - - -class AVconvFD(FFmpegFD): - pass - - -_BY_NAME = dict( - (klass.get_basename(), klass) - for name, klass in globals().items() - if name.endswith('FD') and name != 'ExternalFD' -) - - -def list_external_downloaders(): - return sorted(_BY_NAME.keys()) - - -def get_external_downloader(external_downloader): - """ Given the name of the executable, see whether we support the given - downloader . """ - # Drop .exe extension on Windows - bn = os.path.splitext(os.path.basename(external_downloader))[0] - return _BY_NAME[bn] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py deleted file mode 100644 index 15e71be..0000000 --- a/youtube_dl/downloader/f4m.py +++ /dev/null @@ -1,438 +0,0 @@ -from __future__ import division, unicode_literals - -import io -import itertools -import time - -from .fragment import FragmentFD -from ..compat import ( - compat_b64decode, - compat_etree_fromstring, - compat_urlparse, - compat_urllib_error, - compat_urllib_parse_urlparse, - compat_struct_pack, - compat_struct_unpack, -) -from ..utils import ( - fix_xml_ampersands, - xpath_text, -) - - -class DataTruncatedError(Exception): - pass - - -class FlvReader(io.BytesIO): - """ - Reader for Flv files - The file format is documented in https://www.adobe.com/devnet/f4v.html - """ - - def read_bytes(self, n): - data = self.read(n) - if len(data) < n: - raise DataTruncatedError( - 'FlvReader error: need %d bytes while only %d bytes got' % ( - n, len(data))) - return data - - # Utility functions for reading numbers and strings - def read_unsigned_long_long(self): - return compat_struct_unpack('!Q', self.read_bytes(8))[0] - - def read_unsigned_int(self): - return compat_struct_unpack('!I', self.read_bytes(4))[0] - - def read_unsigned_char(self): - return compat_struct_unpack('!B', self.read_bytes(1))[0] - - def read_string(self): - res = b'' - while True: - char = self.read_bytes(1) - if char == b'\x00': - break - res += char - return res - - def read_box_info(self): - """ - Read a box and return the info as a tuple: (box_size, box_type, box_data) - """ - real_size = size = self.read_unsigned_int() - box_type = self.read_bytes(4) - header_end = 8 - if size == 1: - real_size = self.read_unsigned_long_long() - header_end = 16 - return real_size, box_type, self.read_bytes(real_size - header_end) - - def read_asrt(self): - # version - self.read_unsigned_char() - # flags - self.read_bytes(3) - quality_entry_count = self.read_unsigned_char() - # QualityEntryCount - for i in range(quality_entry_count): - self.read_string() - - segment_run_count = self.read_unsigned_int() - segments = [] - for i in range(segment_run_count): - first_segment = self.read_unsigned_int() - fragments_per_segment = self.read_unsigned_int() - segments.append((first_segment, fragments_per_segment)) - - return { - 'segment_run': segments, - } - - def read_afrt(self): - # version - self.read_unsigned_char() - # flags - self.read_bytes(3) - # time scale - self.read_unsigned_int() - - quality_entry_count = self.read_unsigned_char() - # QualitySegmentUrlModifiers - for i in range(quality_entry_count): - self.read_string() - - fragments_count = self.read_unsigned_int() - fragments = [] - for i in range(fragments_count): - first = self.read_unsigned_int() - first_ts = self.read_unsigned_long_long() - duration = self.read_unsigned_int() - if duration == 0: - discontinuity_indicator = self.read_unsigned_char() - else: - discontinuity_indicator = None - fragments.append({ - 'first': first, - 'ts': first_ts, - 'duration': duration, - 'discontinuity_indicator': discontinuity_indicator, - }) - - return { - 'fragments': fragments, - } - - def read_abst(self): - # version - self.read_unsigned_char() - # flags - self.read_bytes(3) - - self.read_unsigned_int() # BootstrapinfoVersion - # Profile,Live,Update,Reserved - flags = self.read_unsigned_char() - live = flags & 0x20 != 0 - # time scale - self.read_unsigned_int() - # CurrentMediaTime - self.read_unsigned_long_long() - # SmpteTimeCodeOffset - self.read_unsigned_long_long() - - self.read_string() # MovieIdentifier - server_count = self.read_unsigned_char() - # ServerEntryTable - for i in range(server_count): - self.read_string() - quality_count = self.read_unsigned_char() - # QualityEntryTable - for i in range(quality_count): - self.read_string() - # DrmData - self.read_string() - # MetaData - self.read_string() - - segments_count = self.read_unsigned_char() - segments = [] - for i in range(segments_count): - box_size, box_type, box_data = self.read_box_info() - assert box_type == b'asrt' - segment = FlvReader(box_data).read_asrt() - segments.append(segment) - fragments_run_count = self.read_unsigned_char() - fragments = [] - for i in range(fragments_run_count): - box_size, box_type, box_data = self.read_box_info() - assert box_type == b'afrt' - fragments.append(FlvReader(box_data).read_afrt()) - - return { - 'segments': segments, - 'fragments': fragments, - 'live': live, - } - - def read_bootstrap_info(self): - total_size, box_type, box_data = self.read_box_info() - assert box_type == b'abst' - return FlvReader(box_data).read_abst() - - -def read_bootstrap_info(bootstrap_bytes): - return FlvReader(bootstrap_bytes).read_bootstrap_info() - - -def build_fragments_list(boot_info): - """ Return a list of (segment, fragment) for each fragment in the video """ - res = [] - segment_run_table = boot_info['segments'][0] - fragment_run_entry_table = boot_info['fragments'][0]['fragments'] - first_frag_number = fragment_run_entry_table[0]['first'] - fragments_counter = itertools.count(first_frag_number) - for segment, fragments_count in segment_run_table['segment_run']: - # In some live HDS streams (for example Rai), `fragments_count` is - # abnormal and causing out-of-memory errors. It's OK to change the - # number of fragments for live streams as they are updated periodically - if fragments_count == 4294967295 and boot_info['live']: - fragments_count = 2 - for _ in range(fragments_count): - res.append((segment, next(fragments_counter))) - - if boot_info['live']: - res = res[-2:] - - return res - - -def write_unsigned_int(stream, val): - stream.write(compat_struct_pack('!I', val)) - - -def write_unsigned_int_24(stream, val): - stream.write(compat_struct_pack('!I', val)[1:]) - - -def write_flv_header(stream): - """Writes the FLV header to stream""" - # FLV header - stream.write(b'FLV\x01') - stream.write(b'\x05') - stream.write(b'\x00\x00\x00\x09') - stream.write(b'\x00\x00\x00\x00') - - -def write_metadata_tag(stream, metadata): - """Writes optional metadata tag to stream""" - SCRIPT_TAG = b'\x12' - FLV_TAG_HEADER_LEN = 11 - - if metadata: - stream.write(SCRIPT_TAG) - write_unsigned_int_24(stream, len(metadata)) - stream.write(b'\x00\x00\x00\x00\x00\x00\x00') - stream.write(metadata) - write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata)) - - -def remove_encrypted_media(media): - return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and - 'drmAdditionalHeaderSetId' not in e.attrib, - media)) - - -def _add_ns(prop, ver=1): - return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) - - -def get_base_url(manifest): - base_url = xpath_text( - manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], - 'base URL', default=None) - if base_url: - base_url = base_url.strip() - return base_url - - -class F4mFD(FragmentFD): - """ - A downloader for f4m manifests or AdobeHDS. - """ - - FD_NAME = 'f4m' - - def _get_unencrypted_media(self, doc): - media = doc.findall(_add_ns('media')) - if not media: - self.report_error('No media found') - for e in (doc.findall(_add_ns('drmAdditionalHeader')) + - doc.findall(_add_ns('drmAdditionalHeaderSet'))): - # If id attribute is missing it's valid for all media nodes - # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute - if 'id' not in e.attrib: - self.report_error('Missing ID in f4m DRM') - media = remove_encrypted_media(media) - if not media: - self.report_error('Unsupported DRM') - return media - - def _get_bootstrap_from_url(self, bootstrap_url): - bootstrap = self.ydl.urlopen(bootstrap_url).read() - return read_bootstrap_info(bootstrap) - - def _update_live_fragments(self, bootstrap_url, latest_fragment): - fragments_list = [] - retries = 30 - while (not fragments_list) and (retries > 0): - boot_info = self._get_bootstrap_from_url(bootstrap_url) - fragments_list = build_fragments_list(boot_info) - fragments_list = [f for f in fragments_list if f[1] > latest_fragment] - if not fragments_list: - # Retry after a while - time.sleep(5.0) - retries -= 1 - - if not fragments_list: - self.report_error('Failed to update fragments') - - return fragments_list - - def _parse_bootstrap_node(self, node, base_url): - # Sometimes non empty inline bootstrap info can be specified along - # with bootstrap url attribute (e.g. dummy inline bootstrap info - # contains whitespace characters in [1]). We will prefer bootstrap - # url over inline bootstrap info when present. - # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m - bootstrap_url = node.get('url') - if bootstrap_url: - bootstrap_url = compat_urlparse.urljoin( - base_url, bootstrap_url) - boot_info = self._get_bootstrap_from_url(bootstrap_url) - else: - bootstrap_url = None - bootstrap = compat_b64decode(node.text) - boot_info = read_bootstrap_info(bootstrap) - return boot_info, bootstrap_url - - def real_download(self, filename, info_dict): - man_url = info_dict['url'] - requested_bitrate = info_dict.get('tbr') - self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) - - urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) - man_url = urlh.geturl() - # Some manifests may be malformed, e.g. prosiebensat1 generated manifests - # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244 - # and https://github.com/rg3/youtube-dl/issues/7823) - manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip() - - doc = compat_etree_fromstring(manifest) - formats = [(int(f.attrib.get('bitrate', -1)), f) - for f in self._get_unencrypted_media(doc)] - if requested_bitrate is None or len(formats) == 1: - # get the best format - formats = sorted(formats, key=lambda f: f[0]) - rate, media = formats[-1] - else: - rate, media = list(filter( - lambda f: int(f[0]) == requested_bitrate, formats))[0] - - # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. - man_base_url = get_base_url(doc) or man_url - - base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) - bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - boot_info, bootstrap_url = self._parse_bootstrap_node( - bootstrap_node, man_base_url) - live = boot_info['live'] - metadata_node = media.find(_add_ns('metadata')) - if metadata_node is not None: - metadata = compat_b64decode(metadata_node.text) - else: - metadata = None - - fragments_list = build_fragments_list(boot_info) - test = self.params.get('test', False) - if test: - # We only download the first fragment - fragments_list = fragments_list[:1] - total_frags = len(fragments_list) - # For some akamai manifests we'll need to add a query to the fragment url - akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) - - ctx = { - 'filename': filename, - 'total_frags': total_frags, - 'live': live, - } - - self._prepare_frag_download(ctx) - - dest_stream = ctx['dest_stream'] - - if ctx['complete_frags_downloaded_bytes'] == 0: - write_flv_header(dest_stream) - if not live: - write_metadata_tag(dest_stream, metadata) - - base_url_parsed = compat_urllib_parse_urlparse(base_url) - - self._start_frag_download(ctx) - - frag_index = 0 - while fragments_list: - seg_i, frag_i = fragments_list.pop(0) - frag_index += 1 - if frag_index <= ctx['fragment_index']: - continue - name = 'Seg%d-Frag%d' % (seg_i, frag_i) - query = [] - if base_url_parsed.query: - query.append(base_url_parsed.query) - if akamai_pv: - query.append(akamai_pv.strip(';')) - if info_dict.get('extra_param_to_segment_url'): - query.append(info_dict['extra_param_to_segment_url']) - url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) - try: - success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict) - if not success: - return False - reader = FlvReader(down_data) - while True: - try: - _, box_type, box_data = reader.read_box_info() - except DataTruncatedError: - if test: - # In tests, segments may be truncated, and thus - # FlvReader may not be able to parse the whole - # chunk. If so, write the segment as is - # See https://github.com/rg3/youtube-dl/issues/9214 - dest_stream.write(down_data) - break - raise - if box_type == b'mdat': - self._append_fragment(ctx, box_data) - break - except (compat_urllib_error.HTTPError, ) as err: - if live and (err.code == 404 or err.code == 410): - # We didn't keep up with the live window. Continue - # with the next available fragment. - msg = 'Fragment %d unavailable' % frag_i - self.report_warning(msg) - fragments_list = [] - else: - raise - - if not fragments_list and not test and live and bootstrap_url: - fragments_list = self._update_live_fragments(bootstrap_url, frag_i) - total_frags += len(fragments_list) - if fragments_list and (fragments_list[0][1] > frag_i + 1): - msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) - self.report_warning(msg) - - self._finish_frag_download(ctx) - - return True diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py deleted file mode 100644 index 917f6dc..0000000 --- a/youtube_dl/downloader/fragment.py +++ /dev/null @@ -1,268 +0,0 @@ -from __future__ import division, unicode_literals - -import os -import time -import json - -from .common import FileDownloader -from .http import HttpFD -from ..utils import ( - error_to_compat_str, - encodeFilename, - sanitize_open, - sanitized_Request, -) - - -class HttpQuietDownloader(HttpFD): - def to_screen(self, *args, **kargs): - pass - - -class FragmentFD(FileDownloader): - """ - A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). - - Available options: - - fragment_retries: Number of times to retry a fragment for HTTP error (DASH - and hlsnative only) - skip_unavailable_fragments: - Skip unavailable fragments (DASH and hlsnative only) - keep_fragments: Keep downloaded fragments on disk after downloading is - finished - - For each incomplete fragment download youtube-dl keeps on disk a special - bookkeeping file with download state and metadata (in future such files will - be used for any incomplete download handled by youtube-dl). This file is - used to properly handle resuming, check download file consistency and detect - potential errors. The file has a .ytdl extension and represents a standard - JSON file of the following format: - - extractor: - Dictionary of extractor related data. TBD. - - downloader: - Dictionary of downloader related data. May contain following data: - current_fragment: - Dictionary with current (being downloaded) fragment data: - index: 0-based index of current fragment among all fragments - fragment_count: - Total count of fragments - - This feature is experimental and file format may change in future. - """ - - def report_retry_fragment(self, err, frag_index, count, retries): - self.to_screen( - '[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s)...' - % (error_to_compat_str(err), frag_index, count, self.format_retries(retries))) - - def report_skip_fragment(self, frag_index): - self.to_screen('[download] Skipping fragment %d...' % frag_index) - - def _prepare_url(self, info_dict, url): - headers = info_dict.get('http_headers') - return sanitized_Request(url, None, headers) if headers else url - - def _prepare_and_start_frag_download(self, ctx): - self._prepare_frag_download(ctx) - self._start_frag_download(ctx) - - @staticmethod - def __do_ytdl_file(ctx): - return not ctx['live'] and not ctx['tmpfilename'] == '-' - - def _read_ytdl_file(self, ctx): - assert 'ytdl_corrupt' not in ctx - stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') - try: - ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index'] - except Exception: - ctx['ytdl_corrupt'] = True - finally: - stream.close() - - def _write_ytdl_file(self, ctx): - frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w') - downloader = { - 'current_fragment': { - 'index': ctx['fragment_index'], - }, - } - if ctx.get('fragment_count') is not None: - downloader['fragment_count'] = ctx['fragment_count'] - frag_index_stream.write(json.dumps({'downloader': downloader})) - frag_index_stream.close() - - def _download_fragment(self, ctx, frag_url, info_dict, headers=None): - fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) - success = ctx['dl'].download(fragment_filename, { - 'url': frag_url, - 'http_headers': headers or info_dict.get('http_headers'), - }) - if not success: - return False, None - down, frag_sanitized = sanitize_open(fragment_filename, 'rb') - ctx['fragment_filename_sanitized'] = frag_sanitized - frag_content = down.read() - down.close() - return True, frag_content - - def _append_fragment(self, ctx, frag_content): - try: - ctx['dest_stream'].write(frag_content) - ctx['dest_stream'].flush() - finally: - if self.__do_ytdl_file(ctx): - self._write_ytdl_file(ctx) - if not self.params.get('keep_fragments', False): - os.remove(encodeFilename(ctx['fragment_filename_sanitized'])) - del ctx['fragment_filename_sanitized'] - - def _prepare_frag_download(self, ctx): - if 'live' not in ctx: - ctx['live'] = False - if not ctx['live']: - total_frags_str = '%d' % ctx['total_frags'] - ad_frags = ctx.get('ad_frags', 0) - if ad_frags: - total_frags_str += ' (not including %d ad)' % ad_frags - else: - total_frags_str = 'unknown (live)' - self.to_screen( - '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) - self.report_destination(ctx['filename']) - dl = HttpQuietDownloader( - self.ydl, - { - 'continuedl': True, - 'quiet': True, - 'noprogress': True, - 'ratelimit': self.params.get('ratelimit'), - 'retries': self.params.get('retries', 0), - 'nopart': self.params.get('nopart', False), - 'test': self.params.get('test', False), - } - ) - tmpfilename = self.temp_name(ctx['filename']) - open_mode = 'wb' - resume_len = 0 - - # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): - open_mode = 'ab' - resume_len = os.path.getsize(encodeFilename(tmpfilename)) - - # Should be initialized before ytdl file check - ctx.update({ - 'tmpfilename': tmpfilename, - 'fragment_index': 0, - }) - - if self.__do_ytdl_file(ctx): - if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): - self._read_ytdl_file(ctx) - is_corrupt = ctx.get('ytdl_corrupt') is True - is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 - if is_corrupt or is_inconsistent: - message = ( - '.ytdl file is corrupt' if is_corrupt else - 'Inconsistent state of incomplete fragment download') - self.report_warning( - '%s. Restarting from the beginning...' % message) - ctx['fragment_index'] = resume_len = 0 - if 'ytdl_corrupt' in ctx: - del ctx['ytdl_corrupt'] - self._write_ytdl_file(ctx) - else: - self._write_ytdl_file(ctx) - assert ctx['fragment_index'] == 0 - - dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) - - ctx.update({ - 'dl': dl, - 'dest_stream': dest_stream, - 'tmpfilename': tmpfilename, - # Total complete fragments downloaded so far in bytes - 'complete_frags_downloaded_bytes': resume_len, - }) - - def _start_frag_download(self, ctx): - total_frags = ctx['total_frags'] - # This dict stores the download progress, it's updated by the progress - # hook - state = { - 'status': 'downloading', - 'downloaded_bytes': ctx['complete_frags_downloaded_bytes'], - 'fragment_index': ctx['fragment_index'], - 'fragment_count': total_frags, - 'filename': ctx['filename'], - 'tmpfilename': ctx['tmpfilename'], - } - - start = time.time() - ctx.update({ - 'started': start, - # Amount of fragment's bytes downloaded by the time of the previous - # frag progress hook invocation - 'prev_frag_downloaded_bytes': 0, - }) - - def frag_progress_hook(s): - if s['status'] not in ('downloading', 'finished'): - return - - time_now = time.time() - state['elapsed'] = time_now - start - frag_total_bytes = s.get('total_bytes') or 0 - if not ctx['live']: - estimated_size = ( - (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) / - (state['fragment_index'] + 1) * total_frags) - state['total_bytes_estimate'] = estimated_size - - if s['status'] == 'finished': - state['fragment_index'] += 1 - ctx['fragment_index'] = state['fragment_index'] - state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] - ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] - ctx['prev_frag_downloaded_bytes'] = 0 - else: - frag_downloaded_bytes = s['downloaded_bytes'] - state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] - if not ctx['live']: - state['eta'] = self.calc_eta( - start, time_now, estimated_size, - state['downloaded_bytes']) - state['speed'] = s.get('speed') or ctx.get('speed') - ctx['speed'] = state['speed'] - ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes - self._hook_progress(state) - - ctx['dl'].add_progress_hook(frag_progress_hook) - - return start - - def _finish_frag_download(self, ctx): - ctx['dest_stream'].close() - if self.__do_ytdl_file(ctx): - ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename'])) - if os.path.isfile(ytdl_filename): - os.remove(ytdl_filename) - elapsed = time.time() - ctx['started'] - - if ctx['tmpfilename'] == '-': - downloaded_bytes = ctx['complete_frags_downloaded_bytes'] - else: - self.try_rename(ctx['tmpfilename'], ctx['filename']) - downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) - - self._hook_progress({ - 'downloaded_bytes': downloaded_bytes, - 'total_bytes': downloaded_bytes, - 'filename': ctx['filename'], - 'status': 'finished', - 'elapsed': elapsed, - }) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py deleted file mode 100644 index fd30452..0000000 --- a/youtube_dl/downloader/hls.py +++ /dev/null @@ -1,204 +0,0 @@ -from __future__ import unicode_literals - -import re -import binascii -try: - from Crypto.Cipher import AES - can_decrypt_frag = True -except ImportError: - can_decrypt_frag = False - -from .fragment import FragmentFD -from .external import FFmpegFD - -from ..compat import ( - compat_urllib_error, - compat_urlparse, - compat_struct_pack, -) -from ..utils import ( - parse_m3u8_attributes, - update_url_query, -) - - -class HlsFD(FragmentFD): - """ A limited implementation that does not require ffmpeg """ - - FD_NAME = 'hlsnative' - - @staticmethod - def can_download(manifest, info_dict): - UNSUPPORTED_FEATURES = ( - r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1] - # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] - - # Live streams heuristic does not always work (e.g. geo restricted to Germany - # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) - # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] - - # This heuristic also is not correct since segments may not be appended as well. - # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite - # no segments will definitely be appended to the end of the playlist. - # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of - # # event media playlists [4] - - # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 - # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 - # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 - # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 - ) - check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] - is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest - check_results.append(can_decrypt_frag or not is_aes128_enc) - check_results.append(not (is_aes128_enc and r'#EXT-X-BYTERANGE' in manifest)) - check_results.append(not info_dict.get('is_live')) - return all(check_results) - - def real_download(self, filename, info_dict): - man_url = info_dict['url'] - self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) - - urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) - man_url = urlh.geturl() - s = urlh.read().decode('utf-8', 'ignore') - - if not self.can_download(s, info_dict): - if info_dict.get('extra_param_to_segment_url'): - self.report_error('pycrypto not found. Please install it.') - return False - self.report_warning( - 'hlsnative has detected features it does not support, ' - 'extraction will be delegated to ffmpeg') - fd = FFmpegFD(self.ydl, self.params) - for ph in self._progress_hooks: - fd.add_progress_hook(ph) - return fd.real_download(filename, info_dict) - - def is_ad_fragment(s): - return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s or - s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) - - media_frags = 0 - ad_frags = 0 - ad_frag_next = False - for line in s.splitlines(): - line = line.strip() - if not line: - continue - if line.startswith('#'): - if is_ad_fragment(line): - ad_frags += 1 - ad_frag_next = True - continue - if ad_frag_next: - ad_frag_next = False - continue - media_frags += 1 - - ctx = { - 'filename': filename, - 'total_frags': media_frags, - 'ad_frags': ad_frags, - } - - self._prepare_and_start_frag_download(ctx) - - fragment_retries = self.params.get('fragment_retries', 0) - skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - test = self.params.get('test', False) - - extra_query = None - extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') - if extra_param_to_segment_url: - extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url) - i = 0 - media_sequence = 0 - decrypt_info = {'METHOD': 'NONE'} - byte_range = {} - frag_index = 0 - ad_frag_next = False - for line in s.splitlines(): - line = line.strip() - if line: - if not line.startswith('#'): - if ad_frag_next: - ad_frag_next = False - continue - frag_index += 1 - if frag_index <= ctx['fragment_index']: - continue - frag_url = ( - line - if re.match(r'^https?://', line) - else compat_urlparse.urljoin(man_url, line)) - if extra_query: - frag_url = update_url_query(frag_url, extra_query) - count = 0 - headers = info_dict.get('http_headers', {}) - if byte_range: - headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end']) - while count <= fragment_retries: - try: - success, frag_content = self._download_fragment( - ctx, frag_url, info_dict, headers) - if not success: - return False - break - except compat_urllib_error.HTTPError as err: - # Unavailable (possibly temporary) fragments may be served. - # First we try to retry then either skip or abort. - # See https://github.com/rg3/youtube-dl/issues/10165, - # https://github.com/rg3/youtube-dl/issues/10448). - count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) - if count > fragment_retries: - if skip_unavailable_fragments: - i += 1 - media_sequence += 1 - self.report_skip_fragment(frag_index) - continue - self.report_error( - 'giving up after %s fragment retries' % fragment_retries) - return False - if decrypt_info['METHOD'] == 'AES-128': - iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) - decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( - self._prepare_url(info_dict, decrypt_info['URI'])).read() - frag_content = AES.new( - decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) - self._append_fragment(ctx, frag_content) - # We only download the first fragment during the test - if test: - break - i += 1 - media_sequence += 1 - elif line.startswith('#EXT-X-KEY'): - decrypt_url = decrypt_info.get('URI') - decrypt_info = parse_m3u8_attributes(line[11:]) - if decrypt_info['METHOD'] == 'AES-128': - if 'IV' in decrypt_info: - decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) - if not re.match(r'^https?://', decrypt_info['URI']): - decrypt_info['URI'] = compat_urlparse.urljoin( - man_url, decrypt_info['URI']) - if extra_query: - decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) - if decrypt_url != decrypt_info['URI']: - decrypt_info['KEY'] = None - elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): - media_sequence = int(line[22:]) - elif line.startswith('#EXT-X-BYTERANGE'): - splitted_byte_range = line[17:].split('@') - sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] - byte_range = { - 'start': sub_range_start, - 'end': sub_range_start + int(splitted_byte_range[0]), - } - elif is_ad_fragment(line): - ad_frag_next = True - - self._finish_frag_download(ctx) - - return True diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py deleted file mode 100644 index 5b1e960..0000000 --- a/youtube_dl/downloader/http.py +++ /dev/null @@ -1,354 +0,0 @@ -from __future__ import unicode_literals - -import errno -import os -import socket -import time -import random -import re - -from .common import FileDownloader -from ..compat import ( - compat_str, - compat_urllib_error, -) -from ..utils import ( - ContentTooShortError, - encodeFilename, - int_or_none, - sanitize_open, - sanitized_Request, - write_xattr, - XAttrMetadataError, - XAttrUnavailableError, -) - - -class HttpFD(FileDownloader): - def real_download(self, filename, info_dict): - url = info_dict['url'] - - class DownloadContext(dict): - __getattr__ = dict.get - __setattr__ = dict.__setitem__ - __delattr__ = dict.__delitem__ - - ctx = DownloadContext() - ctx.filename = filename - ctx.tmpfilename = self.temp_name(filename) - ctx.stream = None - - # Do not include the Accept-Encoding header - headers = {'Youtubedl-no-compression': 'True'} - add_headers = info_dict.get('http_headers') - if add_headers: - headers.update(add_headers) - - is_test = self.params.get('test', False) - chunk_size = self._TEST_FILE_SIZE if is_test else ( - info_dict.get('downloader_options', {}).get('http_chunk_size') or - self.params.get('http_chunk_size') or 0) - - ctx.open_mode = 'wb' - ctx.resume_len = 0 - ctx.data_len = None - ctx.block_size = self.params.get('buffersize', 1024) - ctx.start_time = time.time() - ctx.chunk_size = None - - if self.params.get('continuedl', True): - # Establish possible resume length - if os.path.isfile(encodeFilename(ctx.tmpfilename)): - ctx.resume_len = os.path.getsize( - encodeFilename(ctx.tmpfilename)) - - ctx.is_resume = ctx.resume_len > 0 - - count = 0 - retries = self.params.get('retries', 0) - - class SucceedDownload(Exception): - pass - - class RetryDownload(Exception): - def __init__(self, source_error): - self.source_error = source_error - - class NextFragment(Exception): - pass - - def set_range(req, start, end): - range_header = 'bytes=%d-' % start - if end: - range_header += compat_str(end) - req.add_header('Range', range_header) - - def establish_connection(): - ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size) - if not is_test and chunk_size else chunk_size) - if ctx.resume_len > 0: - range_start = ctx.resume_len - if ctx.is_resume: - self.report_resuming_byte(ctx.resume_len) - ctx.open_mode = 'ab' - elif ctx.chunk_size > 0: - range_start = 0 - else: - range_start = None - ctx.is_resume = False - range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None - if range_end and ctx.data_len is not None and range_end >= ctx.data_len: - range_end = ctx.data_len - 1 - has_range = range_start is not None - ctx.has_range = has_range - request = sanitized_Request(url, None, headers) - if has_range: - set_range(request, range_start, range_end) - # Establish connection - try: - ctx.data = self.ydl.urlopen(request) - # When trying to resume, Content-Range HTTP header of response has to be checked - # to match the value of requested Range HTTP header. This is due to a webservers - # that don't support resuming and serve a whole file with no Content-Range - # set in response despite of requested Range (see - # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) - if has_range: - content_range = ctx.data.headers.get('Content-Range') - if content_range: - content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range) - # Content-Range is present and matches requested Range, resume is possible - if content_range_m: - if range_start == int(content_range_m.group(1)): - content_range_end = int_or_none(content_range_m.group(2)) - content_len = int_or_none(content_range_m.group(3)) - accept_content_len = ( - # Non-chunked download - not ctx.chunk_size or - # Chunked download and requested piece or - # its part is promised to be served - content_range_end == range_end or - content_len < range_end) - if accept_content_len: - ctx.data_len = content_len - return - # Content-Range is either not present or invalid. Assuming remote webserver is - # trying to send the whole file, resume is not possible, so wiping the local file - # and performing entire redownload - self.report_unable_to_resume() - ctx.resume_len = 0 - ctx.open_mode = 'wb' - ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) - return - except (compat_urllib_error.HTTPError, ) as err: - if err.code == 416: - # Unable to resume (requested range not satisfiable) - try: - # Open the connection again without the range header - ctx.data = self.ydl.urlopen( - sanitized_Request(url, None, headers)) - content_length = ctx.data.info()['Content-Length'] - except (compat_urllib_error.HTTPError, ) as err: - if err.code < 500 or err.code >= 600: - raise - else: - # Examine the reported length - if (content_length is not None and - (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): - # The file had already been fully downloaded. - # Explanation to the above condition: in issue #175 it was revealed that - # YouTube sometimes adds or removes a few bytes from the end of the file, - # changing the file size slightly and causing problems for some users. So - # I decided to implement a suggested change and consider the file - # completely downloaded if the file size differs less than 100 bytes from - # the one in the hard drive. - self.report_file_already_downloaded(ctx.filename) - self.try_rename(ctx.tmpfilename, ctx.filename) - self._hook_progress({ - 'filename': ctx.filename, - 'status': 'finished', - 'downloaded_bytes': ctx.resume_len, - 'total_bytes': ctx.resume_len, - }) - raise SucceedDownload() - else: - # The length does not match, we start the download over - self.report_unable_to_resume() - ctx.resume_len = 0 - ctx.open_mode = 'wb' - return - elif err.code < 500 or err.code >= 600: - # Unexpected HTTP error - raise - raise RetryDownload(err) - except socket.error as err: - if err.errno != errno.ECONNRESET: - # Connection reset is no problem, just retry - raise - raise RetryDownload(err) - - def download(): - data_len = ctx.data.info().get('Content-length', None) - - # Range HTTP header may be ignored/unsupported by a webserver - # (e.g. extractor/scivee.py, extractor/bambuser.py). - # However, for a test we still would like to download just a piece of a file. - # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control - # block size when downloading a file. - if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): - data_len = self._TEST_FILE_SIZE - - if data_len is not None: - data_len = int(data_len) + ctx.resume_len - min_data_len = self.params.get('min_filesize') - max_data_len = self.params.get('max_filesize') - if min_data_len is not None and data_len < min_data_len: - self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) - return False - if max_data_len is not None and data_len > max_data_len: - self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) - return False - - byte_counter = 0 + ctx.resume_len - block_size = ctx.block_size - start = time.time() - - # measure time over whole while-loop, so slow_down() and best_block_size() work together properly - now = None # needed for slow_down() in the first loop run - before = start # start measuring - - def retry(e): - to_stdout = ctx.tmpfilename == '-' - if not to_stdout: - ctx.stream.close() - ctx.stream = None - ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) - raise RetryDownload(e) - - while True: - try: - # Download and write - data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) - # socket.timeout is a subclass of socket.error but may not have - # errno set - except socket.timeout as e: - retry(e) - except socket.error as e: - if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT): - raise - retry(e) - - byte_counter += len(data_block) - - # exit loop when download is finished - if len(data_block) == 0: - break - - # Open destination file just in time - if ctx.stream is None: - try: - ctx.stream, ctx.tmpfilename = sanitize_open( - ctx.tmpfilename, ctx.open_mode) - assert ctx.stream is not None - ctx.filename = self.undo_temp_name(ctx.tmpfilename) - self.report_destination(ctx.filename) - except (OSError, IOError) as err: - self.report_error('unable to open for writing: %s' % str(err)) - return False - - if self.params.get('xattr_set_filesize', False) and data_len is not None: - try: - write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) - except (XAttrUnavailableError, XAttrMetadataError) as err: - self.report_error('unable to set filesize xattr: %s' % str(err)) - - try: - ctx.stream.write(data_block) - except (IOError, OSError) as err: - self.to_stderr('\n') - self.report_error('unable to write data: %s' % str(err)) - return False - - # Apply rate limit - self.slow_down(start, now, byte_counter - ctx.resume_len) - - # end measuring of one loop run - now = time.time() - after = now - - # Adjust block size - if not self.params.get('noresizebuffer', False): - block_size = self.best_block_size(after - before, len(data_block)) - - before = after - - # Progress message - speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) - if ctx.data_len is None: - eta = None - else: - eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) - - self._hook_progress({ - 'status': 'downloading', - 'downloaded_bytes': byte_counter, - 'total_bytes': ctx.data_len, - 'tmpfilename': ctx.tmpfilename, - 'filename': ctx.filename, - 'eta': eta, - 'speed': speed, - 'elapsed': now - ctx.start_time, - }) - - if is_test and byte_counter == data_len: - break - - if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: - ctx.resume_len = byte_counter - # ctx.block_size = block_size - raise NextFragment() - - if ctx.stream is None: - self.to_stderr('\n') - self.report_error('Did not get any data blocks') - return False - if ctx.tmpfilename != '-': - ctx.stream.close() - - if data_len is not None and byte_counter != data_len: - err = ContentTooShortError(byte_counter, int(data_len)) - if count <= retries: - retry(err) - raise err - - self.try_rename(ctx.tmpfilename, ctx.filename) - - # Update file modification time - if self.params.get('updatetime', True): - info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) - - self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, - 'filename': ctx.filename, - 'status': 'finished', - 'elapsed': time.time() - ctx.start_time, - }) - - return True - - while count <= retries: - try: - establish_connection() - return download() - except RetryDownload as e: - count += 1 - if count <= retries: - self.report_retry(e.source_error, count, retries) - continue - except NextFragment: - continue - except SucceedDownload: - return True - - self.report_error('giving up after %s retries' % retries) - return False diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py deleted file mode 100644 index 063fcf4..0000000 --- a/youtube_dl/downloader/ism.py +++ /dev/null @@ -1,259 +0,0 @@ -from __future__ import unicode_literals - -import time -import binascii -import io - -from .fragment import FragmentFD -from ..compat import ( - compat_Struct, - compat_urllib_error, -) - - -u8 = compat_Struct('>B') -u88 = compat_Struct('>Bx') -u16 = compat_Struct('>H') -u1616 = compat_Struct('>Hxx') -u32 = compat_Struct('>I') -u64 = compat_Struct('>Q') - -s88 = compat_Struct('>bx') -s16 = compat_Struct('>h') -s1616 = compat_Struct('>hxx') -s32 = compat_Struct('>i') - -unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000) - -TRACK_ENABLED = 0x1 -TRACK_IN_MOVIE = 0x2 -TRACK_IN_PREVIEW = 0x4 - -SELF_CONTAINED = 0x1 - - -def box(box_type, payload): - return u32.pack(8 + len(payload)) + box_type + payload - - -def full_box(box_type, version, flags, payload): - return box(box_type, u8.pack(version) + u32.pack(flags)[1:] + payload) - - -def write_piff_header(stream, params): - track_id = params['track_id'] - fourcc = params['fourcc'] - duration = params['duration'] - timescale = params.get('timescale', 10000000) - language = params.get('language', 'und') - height = params.get('height', 0) - width = params.get('width', 0) - is_audio = width == 0 and height == 0 - creation_time = modification_time = int(time.time()) - - ftyp_payload = b'isml' # major brand - ftyp_payload += u32.pack(1) # minor version - ftyp_payload += b'piff' + b'iso2' # compatible brands - stream.write(box(b'ftyp', ftyp_payload)) # File Type Box - - mvhd_payload = u64.pack(creation_time) - mvhd_payload += u64.pack(modification_time) - mvhd_payload += u32.pack(timescale) - mvhd_payload += u64.pack(duration) - mvhd_payload += s1616.pack(1) # rate - mvhd_payload += s88.pack(1) # volume - mvhd_payload += u16.pack(0) # reserved - mvhd_payload += u32.pack(0) * 2 # reserved - mvhd_payload += unity_matrix - mvhd_payload += u32.pack(0) * 6 # pre defined - mvhd_payload += u32.pack(0xffffffff) # next track id - moov_payload = full_box(b'mvhd', 1, 0, mvhd_payload) # Movie Header Box - - tkhd_payload = u64.pack(creation_time) - tkhd_payload += u64.pack(modification_time) - tkhd_payload += u32.pack(track_id) # track id - tkhd_payload += u32.pack(0) # reserved - tkhd_payload += u64.pack(duration) - tkhd_payload += u32.pack(0) * 2 # reserved - tkhd_payload += s16.pack(0) # layer - tkhd_payload += s16.pack(0) # alternate group - tkhd_payload += s88.pack(1 if is_audio else 0) # volume - tkhd_payload += u16.pack(0) # reserved - tkhd_payload += unity_matrix - tkhd_payload += u1616.pack(width) - tkhd_payload += u1616.pack(height) - trak_payload = full_box(b'tkhd', 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_payload) # Track Header Box - - mdhd_payload = u64.pack(creation_time) - mdhd_payload += u64.pack(modification_time) - mdhd_payload += u32.pack(timescale) - mdhd_payload += u64.pack(duration) - mdhd_payload += u16.pack(((ord(language[0]) - 0x60) << 10) | ((ord(language[1]) - 0x60) << 5) | (ord(language[2]) - 0x60)) - mdhd_payload += u16.pack(0) # pre defined - mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload) # Media Header Box - - hdlr_payload = u32.pack(0) # pre defined - hdlr_payload += b'soun' if is_audio else b'vide' # handler type - hdlr_payload += u32.pack(0) * 3 # reserved - hdlr_payload += (b'Sound' if is_audio else b'Video') + b'Handler\0' # name - mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload) # Handler Reference Box - - if is_audio: - smhd_payload = s88.pack(0) # balance - smhd_payload += u16.pack(0) # reserved - media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header - else: - vmhd_payload = u16.pack(0) # graphics mode - vmhd_payload += u16.pack(0) * 3 # opcolor - media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload) # Video Media Header - minf_payload = media_header_box - - dref_payload = u32.pack(1) # entry count - dref_payload += full_box(b'url ', 0, SELF_CONTAINED, b'') # Data Entry URL Box - dinf_payload = full_box(b'dref', 0, 0, dref_payload) # Data Reference Box - minf_payload += box(b'dinf', dinf_payload) # Data Information Box - - stsd_payload = u32.pack(1) # entry count - - sample_entry_payload = u8.pack(0) * 6 # reserved - sample_entry_payload += u16.pack(1) # data reference index - if is_audio: - sample_entry_payload += u32.pack(0) * 2 # reserved - sample_entry_payload += u16.pack(params.get('channels', 2)) - sample_entry_payload += u16.pack(params.get('bits_per_sample', 16)) - sample_entry_payload += u16.pack(0) # pre defined - sample_entry_payload += u16.pack(0) # reserved - sample_entry_payload += u1616.pack(params['sampling_rate']) - - if fourcc == 'AACL': - sample_entry_box = box(b'mp4a', sample_entry_payload) - else: - sample_entry_payload += u16.pack(0) # pre defined - sample_entry_payload += u16.pack(0) # reserved - sample_entry_payload += u32.pack(0) * 3 # pre defined - sample_entry_payload += u16.pack(width) - sample_entry_payload += u16.pack(height) - sample_entry_payload += u1616.pack(0x48) # horiz resolution 72 dpi - sample_entry_payload += u1616.pack(0x48) # vert resolution 72 dpi - sample_entry_payload += u32.pack(0) # reserved - sample_entry_payload += u16.pack(1) # frame count - sample_entry_payload += u8.pack(0) * 32 # compressor name - sample_entry_payload += u16.pack(0x18) # depth - sample_entry_payload += s16.pack(-1) # pre defined - - codec_private_data = binascii.unhexlify(params['codec_private_data'].encode('utf-8')) - if fourcc in ('H264', 'AVC1'): - sps, pps = codec_private_data.split(u32.pack(1))[1:] - avcc_payload = u8.pack(1) # configuration version - avcc_payload += sps[1:4] # avc profile indication + profile compatibility + avc level indication - avcc_payload += u8.pack(0xfc | (params.get('nal_unit_length_field', 4) - 1)) # complete represenation (1) + reserved (11111) + length size minus one - avcc_payload += u8.pack(1) # reserved (0) + number of sps (0000001) - avcc_payload += u16.pack(len(sps)) - avcc_payload += sps - avcc_payload += u8.pack(1) # number of pps - avcc_payload += u16.pack(len(pps)) - avcc_payload += pps - sample_entry_payload += box(b'avcC', avcc_payload) # AVC Decoder Configuration Record - sample_entry_box = box(b'avc1', sample_entry_payload) # AVC Simple Entry - stsd_payload += sample_entry_box - - stbl_payload = full_box(b'stsd', 0, 0, stsd_payload) # Sample Description Box - - stts_payload = u32.pack(0) # entry count - stbl_payload += full_box(b'stts', 0, 0, stts_payload) # Decoding Time to Sample Box - - stsc_payload = u32.pack(0) # entry count - stbl_payload += full_box(b'stsc', 0, 0, stsc_payload) # Sample To Chunk Box - - stco_payload = u32.pack(0) # entry count - stbl_payload += full_box(b'stco', 0, 0, stco_payload) # Chunk Offset Box - - minf_payload += box(b'stbl', stbl_payload) # Sample Table Box - - mdia_payload += box(b'minf', minf_payload) # Media Information Box - - trak_payload += box(b'mdia', mdia_payload) # Media Box - - moov_payload += box(b'trak', trak_payload) # Track Box - - mehd_payload = u64.pack(duration) - mvex_payload = full_box(b'mehd', 1, 0, mehd_payload) # Movie Extends Header Box - - trex_payload = u32.pack(track_id) # track id - trex_payload += u32.pack(1) # default sample description index - trex_payload += u32.pack(0) # default sample duration - trex_payload += u32.pack(0) # default sample size - trex_payload += u32.pack(0) # default sample flags - mvex_payload += full_box(b'trex', 0, 0, trex_payload) # Track Extends Box - - moov_payload += box(b'mvex', mvex_payload) # Movie Extends Box - stream.write(box(b'moov', moov_payload)) # Movie Box - - -def extract_box_data(data, box_sequence): - data_reader = io.BytesIO(data) - while True: - box_size = u32.unpack(data_reader.read(4))[0] - box_type = data_reader.read(4) - if box_type == box_sequence[0]: - box_data = data_reader.read(box_size - 8) - if len(box_sequence) == 1: - return box_data - return extract_box_data(box_data, box_sequence[1:]) - data_reader.seek(box_size - 8, 1) - - -class IsmFD(FragmentFD): - """ - Download segments in a ISM manifest - """ - - FD_NAME = 'ism' - - def real_download(self, filename, info_dict): - segments = info_dict['fragments'][:1] if self.params.get( - 'test', False) else info_dict['fragments'] - - ctx = { - 'filename': filename, - 'total_frags': len(segments), - } - - self._prepare_and_start_frag_download(ctx) - - fragment_retries = self.params.get('fragment_retries', 0) - skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - - track_written = False - frag_index = 0 - for i, segment in enumerate(segments): - frag_index += 1 - if frag_index <= ctx['fragment_index']: - continue - count = 0 - while count <= fragment_retries: - try: - success, frag_content = self._download_fragment(ctx, segment['url'], info_dict) - if not success: - return False - if not track_written: - tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd']) - info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0] - write_piff_header(ctx['dest_stream'], info_dict['_download_params']) - track_written = True - self._append_fragment(ctx, frag_content) - break - except compat_urllib_error.HTTPError as err: - count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) - if count > fragment_retries: - if skip_unavailable_fragments: - self.report_skip_fragment(frag_index) - continue - self.report_error('giving up after %s fragment retries' % fragment_retries) - return False - - self._finish_frag_download(ctx) - - return True diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py deleted file mode 100644 index fbb7f51..0000000 --- a/youtube_dl/downloader/rtmp.py +++ /dev/null @@ -1,214 +0,0 @@ -from __future__ import unicode_literals - -import os -import re -import subprocess -import time - -from .common import FileDownloader -from ..compat import compat_str -from ..utils import ( - check_executable, - encodeFilename, - encodeArgument, - get_exe_version, -) - - -def rtmpdump_version(): - return get_exe_version( - 'rtmpdump', ['--help'], r'(?i)RTMPDump\s*v?([0-9a-zA-Z._-]+)') - - -class RtmpFD(FileDownloader): - def real_download(self, filename, info_dict): - def run_rtmpdump(args): - start = time.time() - resume_percent = None - resume_downloaded_data_len = None - proc = subprocess.Popen(args, stderr=subprocess.PIPE) - cursor_in_new_line = True - proc_stderr_closed = False - try: - while not proc_stderr_closed: - # read line from stderr - line = '' - while True: - char = proc.stderr.read(1) - if not char: - proc_stderr_closed = True - break - if char in [b'\r', b'\n']: - break - line += char.decode('ascii', 'replace') - if not line: - # proc_stderr_closed is True - continue - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) - if mobj: - downloaded_data_len = int(float(mobj.group(1)) * 1024) - percent = float(mobj.group(2)) - if not resume_percent: - resume_percent = percent - resume_downloaded_data_len = downloaded_data_len - time_now = time.time() - eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) - speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) - data_len = None - if percent > 0: - data_len = int(downloaded_data_len * 100 / percent) - self._hook_progress({ - 'status': 'downloading', - 'downloaded_bytes': downloaded_data_len, - 'total_bytes_estimate': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'elapsed': time_now - start, - 'speed': speed, - }) - cursor_in_new_line = False - else: - # no percent for live streams - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) - if mobj: - downloaded_data_len = int(float(mobj.group(1)) * 1024) - time_now = time.time() - speed = self.calc_speed(start, time_now, downloaded_data_len) - self._hook_progress({ - 'downloaded_bytes': downloaded_data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'status': 'downloading', - 'elapsed': time_now - start, - 'speed': speed, - }) - cursor_in_new_line = False - elif self.params.get('verbose', False): - if not cursor_in_new_line: - self.to_screen('') - cursor_in_new_line = True - self.to_screen('[rtmpdump] ' + line) - finally: - proc.wait() - if not cursor_in_new_line: - self.to_screen('') - return proc.returncode - - url = info_dict['url'] - player_url = info_dict.get('player_url') - page_url = info_dict.get('page_url') - app = info_dict.get('app') - play_path = info_dict.get('play_path') - tc_url = info_dict.get('tc_url') - flash_version = info_dict.get('flash_version') - live = info_dict.get('rtmp_live', False) - conn = info_dict.get('rtmp_conn') - protocol = info_dict.get('rtmp_protocol') - real_time = info_dict.get('rtmp_real_time', False) - no_resume = info_dict.get('no_resume', False) - continue_dl = self.params.get('continuedl', True) - - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - test = self.params.get('test', False) - - # Check for rtmpdump first - if not check_executable('rtmpdump', ['-h']): - self.report_error('RTMP download detected but "rtmpdump" could not be run. Please install it.') - return False - - # Download using rtmpdump. rtmpdump returns exit code 2 when - # the connection was interrupted and resuming appears to be - # possible. This is part of rtmpdump's normal usage, AFAIK. - basic_args = [ - 'rtmpdump', '--verbose', '-r', url, - '-o', tmpfilename] - if player_url is not None: - basic_args += ['--swfVfy', player_url] - if page_url is not None: - basic_args += ['--pageUrl', page_url] - if app is not None: - basic_args += ['--app', app] - if play_path is not None: - basic_args += ['--playpath', play_path] - if tc_url is not None: - basic_args += ['--tcUrl', tc_url] - if test: - basic_args += ['--stop', '1'] - if flash_version is not None: - basic_args += ['--flashVer', flash_version] - if live: - basic_args += ['--live'] - if isinstance(conn, list): - for entry in conn: - basic_args += ['--conn', entry] - elif isinstance(conn, compat_str): - basic_args += ['--conn', conn] - if protocol is not None: - basic_args += ['--protocol', protocol] - if real_time: - basic_args += ['--realtime'] - - args = basic_args - if not no_resume and continue_dl and not live: - args += ['--resume'] - if not live and continue_dl: - args += ['--skip', '1'] - - args = [encodeArgument(a) for a in args] - - self._debug_cmd(args, exe='rtmpdump') - - RD_SUCCESS = 0 - RD_FAILED = 1 - RD_INCOMPLETE = 2 - RD_NO_CONNECT = 3 - - started = time.time() - - try: - retval = run_rtmpdump(args) - except KeyboardInterrupt: - if not info_dict.get('is_live'): - raise - retval = RD_SUCCESS - self.to_screen('\n[rtmpdump] Interrupted by user') - - if retval == RD_NO_CONNECT: - self.report_error('[rtmpdump] Could not connect to RTMP server.') - return False - - while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: - prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize) - time.sleep(5.0) # This seems to be needed - args = basic_args + ['--resume'] - if retval == RD_FAILED: - args += ['--skip', '1'] - args = [encodeArgument(a) for a in args] - retval = run_rtmpdump(args) - cursize = os.path.getsize(encodeFilename(tmpfilename)) - if prevsize == cursize and retval == RD_FAILED: - break - # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those - if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024: - self.to_screen('[rtmpdump] Could not download the whole video. This can happen for some advertisements.') - retval = RD_SUCCESS - break - if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - 'elapsed': time.time() - started, - }) - return True - else: - self.to_stderr('\n') - self.report_error('rtmpdump exited with code %d' % retval) - return False diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py deleted file mode 100644 index 939358b..0000000 --- a/youtube_dl/downloader/rtsp.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import unicode_literals - -import os -import subprocess - -from .common import FileDownloader -from ..utils import ( - check_executable, - encodeFilename, -) - - -class RtspFD(FileDownloader): - def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - - if check_executable('mplayer', ['-h']): - args = [ - 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', - '-dumpstream', '-dumpfile', tmpfilename, url] - elif check_executable('mpv', ['-h']): - args = [ - 'mpv', '-really-quiet', '--vo=null', '--stream-dump=' + tmpfilename, url] - else: - self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install any.') - return False - - self._debug_cmd(args) - - retval = subprocess.call(args) - if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - }) - return True - else: - self.to_stderr('\n') - self.report_error('%s exited with code %d' % (args[0], retval)) - return False diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py deleted file mode 100644 index d5a4418..0000000 --- a/youtube_dl/extractor/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import unicode_literals - -try: - from .lazy_extractors import * - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True -except ImportError: - _LAZY_LOADER = False - from .extractors import * - - _ALL_CLASSES = [ - klass - for name, klass in globals().items() - if name.endswith('IE') and name != 'GenericIE' - ] - #_ALL_CLASSES.append(GenericIE) - - -def gen_extractor_classes(): - """ Return a list of supported extractors. - The order does matter; the first extractor matched is the one handling the URL. - """ - return _ALL_CLASSES - - -def gen_extractors(): - """ Return a list of an instance of every supported extractor. - The order does matter; the first extractor matched is the one handling the URL. - """ - return [klass() for klass in gen_extractor_classes()] - - -def list_extractors(age_limit): - """ - Return a list of extractors that are suitable for the given age, - sorted by extractor ID. - """ - - return sorted( - filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), - key=lambda ie: ie.IE_NAME.lower()) - - -def get_info_extractor(ie_name): - """Returns the info extractor class with the given ie_name""" - return globals()[ie_name + 'IE'] diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py deleted file mode 100644 index b83b51e..0000000 --- a/youtube_dl/extractor/adobepass.py +++ /dev/null @@ -1,1567 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import time -import xml.etree.ElementTree as etree - -from .common import InfoExtractor -from ..compat import ( - compat_kwargs, - compat_urlparse, -) -from ..utils import ( - unescapeHTML, - urlencode_postdata, - unified_timestamp, - ExtractorError, - NO_DEFAULT, -) - - -MSO_INFO = { - 'DTV': { - 'name': 'DIRECTV', - 'username_field': 'username', - 'password_field': 'password', - }, - 'ATTOTT': { - 'name': 'DIRECTV NOW', - 'username_field': 'email', - 'password_field': 'loginpassword', - }, - 'Rogers': { - 'name': 'Rogers', - 'username_field': 'UserName', - 'password_field': 'UserPassword', - }, - 'Comcast_SSO': { - 'name': 'Comcast XFINITY', - 'username_field': 'user', - 'password_field': 'passwd', - }, - 'TWC': { - 'name': 'Time Warner Cable | Spectrum', - 'username_field': 'Ecom_User_ID', - 'password_field': 'Ecom_Password', - }, - 'Brighthouse': { - 'name': 'Bright House Networks | Spectrum', - 'username_field': 'j_username', - 'password_field': 'j_password', - }, - 'Charter_Direct': { - 'name': 'Charter Spectrum', - 'username_field': 'IDToken1', - 'password_field': 'IDToken2', - }, - 'Verizon': { - 'name': 'Verizon FiOS', - 'username_field': 'IDToken1', - 'password_field': 'IDToken2', - }, - 'thr030': { - 'name': '3 Rivers Communications' - }, - 'com140': { - 'name': 'Access Montana' - }, - 'acecommunications': { - 'name': 'AcenTek' - }, - 'acm010': { - 'name': 'Acme Communications' - }, - 'ada020': { - 'name': 'Adams Cable Service' - }, - 'alb020': { - 'name': 'Albany Mutual Telephone' - }, - 'algona': { - 'name': 'Algona Municipal Utilities' - }, - 'allwest': { - 'name': 'All West Communications' - }, - 'all025': { - 'name': 'Allen\'s Communications' - }, - 'spl010': { - 'name': 'Alliance Communications' - }, - 'all070': { - 'name': 'ALLO Communications' - }, - 'alpine': { - 'name': 'Alpine Communications' - }, - 'hun015': { - 'name': 'American Broadband' - }, - 'nwc010': { - 'name': 'American Broadband Missouri' - }, - 'com130-02': { - 'name': 'American Community Networks' - }, - 'com130-01': { - 'name': 'American Warrior Networks' - }, - 'tom020': { - 'name': 'Amherst Telephone/Tomorrow Valley' - }, - 'tvc020': { - 'name': 'Andycable' - }, - 'arkwest': { - 'name': 'Arkwest Communications' - }, - 'art030': { - 'name': 'Arthur Mutual Telephone Company' - }, - 'arvig': { - 'name': 'Arvig' - }, - 'nttcash010': { - 'name': 'Ashland Home Net' - }, - 'astound': { - 'name': 'Astound (now Wave)' - }, - 'dix030': { - 'name': 'ATC Broadband' - }, - 'ara010': { - 'name': 'ATC Communications' - }, - 'she030-02': { - 'name': 'Ayersville Communications' - }, - 'baldwin': { - 'name': 'Baldwin Lightstream' - }, - 'bal040': { - 'name': 'Ballard TV' - }, - 'cit025': { - 'name': 'Bardstown Cable TV' - }, - 'bay030': { - 'name': 'Bay Country Communications' - }, - 'tel095': { - 'name': 'Beaver Creek Cooperative Telephone' - }, - 'bea020': { - 'name': 'Beaver Valley Cable' - }, - 'bee010': { - 'name': 'Bee Line Cable' - }, - 'wir030': { - 'name': 'Beehive Broadband' - }, - 'bra020': { - 'name': 'BELD' - }, - 'bel020': { - 'name': 'Bellevue Municipal Cable' - }, - 'vol040-01': { - 'name': 'Ben Lomand Connect / BLTV' - }, - 'bev010': { - 'name': 'BEVCOMM' - }, - 'big020': { - 'name': 'Big Sandy Broadband' - }, - 'ble020': { - 'name': 'Bledsoe Telephone Cooperative' - }, - 'bvt010': { - 'name': 'Blue Valley Tele-Communications' - }, - 'bra050': { - 'name': 'Brandenburg Telephone Co.' - }, - 'bte010': { - 'name': 'Bristol Tennessee Essential Services' - }, - 'annearundel': { - 'name': 'Broadstripe' - }, - 'btc010': { - 'name': 'BTC Communications' - }, - 'btc040': { - 'name': 'BTC Vision - Nahunta' - }, - 'bul010': { - 'name': 'Bulloch Telephone Cooperative' - }, - 'but010': { - 'name': 'Butler-Bremer Communications' - }, - 'tel160-csp': { - 'name': 'C Spire SNAP' - }, - 'csicable': { - 'name': 'Cable Services Inc.' - }, - 'cableamerica': { - 'name': 'CableAmerica' - }, - 'cab038': { - 'name': 'CableSouth Media 3' - }, - 'weh010-camtel': { - 'name': 'Cam-Tel Company' - }, - 'car030': { - 'name': 'Cameron Communications' - }, - 'canbytel': { - 'name': 'Canby Telcom' - }, - 'crt020': { - 'name': 'CapRock Tv' - }, - 'car050': { - 'name': 'Carnegie Cable' - }, - 'cas': { - 'name': 'CAS Cable' - }, - 'casscomm': { - 'name': 'CASSCOMM' - }, - 'mid180-02': { - 'name': 'Catalina Broadband Solutions' - }, - 'cccomm': { - 'name': 'CC Communications' - }, - 'nttccde010': { - 'name': 'CDE Lightband' - }, - 'cfunet': { - 'name': 'Cedar Falls Utilities' - }, - 'dem010-01': { - 'name': 'Celect-Bloomer Telephone Area' - }, - 'dem010-02': { - 'name': 'Celect-Bruce Telephone Area' - }, - 'dem010-03': { - 'name': 'Celect-Citizens Connected Area' - }, - 'dem010-04': { - 'name': 'Celect-Elmwood/Spring Valley Area' - }, - 'dem010-06': { - 'name': 'Celect-Mosaic Telecom' - }, - 'dem010-05': { - 'name': 'Celect-West WI Telephone Area' - }, - 'net010-02': { - 'name': 'Cellcom/Nsight Telservices' - }, - 'cen100': { - 'name': 'CentraCom' - }, - 'nttccst010': { - 'name': 'Central Scott / CSTV' - }, - 'cha035': { - 'name': 'Chaparral CableVision' - }, - 'cha050': { - 'name': 'Chariton Valley Communication Corporation, Inc.' - }, - 'cha060': { - 'name': 'Chatmoss Cablevision' - }, - 'nttcche010': { - 'name': 'Cherokee Communications' - }, - 'che050': { - 'name': 'Chesapeake Bay Communications' - }, - 'cimtel': { - 'name': 'Cim-Tel Cable, LLC.' - }, - 'cit180': { - 'name': 'Citizens Cablevision - Floyd, VA' - }, - 'cit210': { - 'name': 'Citizens Cablevision, Inc.' - }, - 'cit040': { - 'name': 'Citizens Fiber' - }, - 'cit250': { - 'name': 'Citizens Mutual' - }, - 'war040': { - 'name': 'Citizens Telephone Corporation' - }, - 'wat025': { - 'name': 'City Of Monroe' - }, - 'wadsworth': { - 'name': 'CityLink' - }, - 'nor100': { - 'name': 'CL Tel' - }, - 'cla010': { - 'name': 'Clarence Telephone and Cedar Communications' - }, - 'ser060': { - 'name': 'Clear Choice Communications' - }, - 'tac020': { - 'name': 'Click! Cable TV' - }, - 'war020': { - 'name': 'CLICK1.NET' - }, - 'cml010': { - 'name': 'CML Telephone Cooperative Association' - }, - 'cns': { - 'name': 'CNS' - }, - 'com160': { - 'name': 'Co-Mo Connect' - }, - 'coa020': { - 'name': 'Coast Communications' - }, - 'coa030': { - 'name': 'Coaxial Cable TV' - }, - 'mid055': { - 'name': 'Cobalt TV (Mid-State Community TV)' - }, - 'col070': { - 'name': 'Columbia Power & Water Systems' - }, - 'col080': { - 'name': 'Columbus Telephone' - }, - 'nor105': { - 'name': 'Communications 1 Cablevision, Inc.' - }, - 'com150': { - 'name': 'Community Cable & Broadband' - }, - 'com020': { - 'name': 'Community Communications Company' - }, - 'coy010': { - 'name': 'commZoom' - }, - 'com025': { - 'name': 'Complete Communication Services' - }, - 'cat020': { - 'name': 'Comporium' - }, - 'com071': { - 'name': 'ComSouth Telesys' - }, - 'consolidatedcable': { - 'name': 'Consolidated' - }, - 'conwaycorp': { - 'name': 'Conway Corporation' - }, - 'coo050': { - 'name': 'Coon Valley Telecommunications Inc' - }, - 'coo080': { - 'name': 'Cooperative Telephone Company' - }, - 'cpt010': { - 'name': 'CP-TEL' - }, - 'cra010': { - 'name': 'Craw-Kan Telephone' - }, - 'crestview': { - 'name': 'Crestview Cable Communications' - }, - 'cross': { - 'name': 'Cross TV' - }, - 'cro030': { - 'name': 'Crosslake Communications' - }, - 'ctc040': { - 'name': 'CTC - Brainerd MN' - }, - 'phe030': { - 'name': 'CTV-Beam - East Alabama' - }, - 'cun010': { - 'name': 'Cunningham Telephone & Cable' - }, - 'dpc010': { - 'name': 'D & P Communications' - }, - 'dak030': { - 'name': 'Dakota Central Telecommunications' - }, - 'nttcdel010': { - 'name': 'Delcambre Telephone LLC' - }, - 'tel160-del': { - 'name': 'Delta Telephone Company' - }, - 'sal040': { - 'name': 'DiamondNet' - }, - 'ind060-dc': { - 'name': 'Direct Communications' - }, - 'doy010': { - 'name': 'Doylestown Cable TV' - }, - 'dic010': { - 'name': 'DRN' - }, - 'dtc020': { - 'name': 'DTC' - }, - 'dtc010': { - 'name': 'DTC Cable (Delhi)' - }, - 'dum010': { - 'name': 'Dumont Telephone Company' - }, - 'dun010': { - 'name': 'Dunkerton Telephone Cooperative' - }, - 'cci010': { - 'name': 'Duo County Telecom' - }, - 'eagle': { - 'name': 'Eagle Communications' - }, - 'weh010-east': { - 'name': 'East Arkansas Cable TV' - }, - 'eatel': { - 'name': 'EATEL Video, LLC' - }, - 'ell010': { - 'name': 'ECTA' - }, - 'emerytelcom': { - 'name': 'Emery Telcom Video LLC' - }, - 'nor200': { - 'name': 'Empire Access' - }, - 'endeavor': { - 'name': 'Endeavor Communications' - }, - 'sun045': { - 'name': 'Enhanced Telecommunications Corporation' - }, - 'mid030': { - 'name': 'enTouch' - }, - 'epb020': { - 'name': 'EPB Smartnet' - }, - 'jea010': { - 'name': 'EPlus Broadband' - }, - 'com065': { - 'name': 'ETC' - }, - 'ete010': { - 'name': 'Etex Communications' - }, - 'fbc-tele': { - 'name': 'F&B Communications' - }, - 'fal010': { - 'name': 'Falcon Broadband' - }, - 'fam010': { - 'name': 'FamilyView CableVision' - }, - 'far020': { - 'name': 'Farmers Mutual Telephone Company' - }, - 'fay010': { - 'name': 'Fayetteville Public Utilities' - }, - 'sal060': { - 'name': 'fibrant' - }, - 'fid010': { - 'name': 'Fidelity Communications' - }, - 'for030': { - 'name': 'FJ Communications' - }, - 'fli020': { - 'name': 'Flint River Communications' - }, - 'far030': { - 'name': 'FMT - Jesup' - }, - 'foo010': { - 'name': 'Foothills Communications' - }, - 'for080': { - 'name': 'Forsyth CableNet' - }, - 'fbcomm': { - 'name': 'Frankfort Plant Board' - }, - 'tel160-fra': { - 'name': 'Franklin Telephone Company' - }, - 'nttcftc010': { - 'name': 'FTC' - }, - 'fullchannel': { - 'name': 'Full Channel, Inc.' - }, - 'gar040': { - 'name': 'Gardonville Cooperative Telephone Association' - }, - 'gbt010': { - 'name': 'GBT Communications, Inc.' - }, - 'tec010': { - 'name': 'Genuine Telecom' - }, - 'clr010': { - 'name': 'Giant Communications' - }, - 'gla010': { - 'name': 'Glasgow EPB' - }, - 'gle010': { - 'name': 'Glenwood Telecommunications' - }, - 'gra060': { - 'name': 'GLW Broadband Inc.' - }, - 'goldenwest': { - 'name': 'Golden West Cablevision' - }, - 'vis030': { - 'name': 'Grantsburg Telcom' - }, - 'gpcom': { - 'name': 'Great Plains Communications' - }, - 'gri010': { - 'name': 'Gridley Cable Inc' - }, - 'hbc010': { - 'name': 'H&B Cable Services' - }, - 'hae010': { - 'name': 'Haefele TV Inc.' - }, - 'htc010': { - 'name': 'Halstad Telephone Company' - }, - 'har005': { - 'name': 'Harlan Municipal Utilities' - }, - 'har020': { - 'name': 'Hart Communications' - }, - 'ced010': { - 'name': 'Hartelco TV' - }, - 'hea040': { - 'name': 'Heart of Iowa Communications Cooperative' - }, - 'htc020': { - 'name': 'Hickory Telephone Company' - }, - 'nttchig010': { - 'name': 'Highland Communication Services' - }, - 'hig030': { - 'name': 'Highland Media' - }, - 'spc010': { - 'name': 'Hilliary Communications' - }, - 'hin020': { - 'name': 'Hinton CATV Co.' - }, - 'hometel': { - 'name': 'HomeTel Entertainment, Inc.' - }, - 'hoodcanal': { - 'name': 'Hood Canal Communications' - }, - 'weh010-hope': { - 'name': 'Hope - Prescott Cable TV' - }, - 'horizoncable': { - 'name': 'Horizon Cable TV, Inc.' - }, - 'hor040': { - 'name': 'Horizon Chillicothe Telephone' - }, - 'htc030': { - 'name': 'HTC Communications Co. - IL' - }, - 'htccomm': { - 'name': 'HTC Communications, Inc. - IA' - }, - 'wal005': { - 'name': 'Huxley Communications' - }, - 'imon': { - 'name': 'ImOn Communications' - }, - 'ind040': { - 'name': 'Independence Telecommunications' - }, - 'rrc010': { - 'name': 'Inland Networks' - }, - 'stc020': { - 'name': 'Innovative Cable TV St Croix' - }, - 'car100': { - 'name': 'Innovative Cable TV St Thomas-St John' - }, - 'icc010': { - 'name': 'Inside Connect Cable' - }, - 'int100': { - 'name': 'Integra Telecom' - }, - 'int050': { - 'name': 'Interstate Telecommunications Coop' - }, - 'irv010': { - 'name': 'Irvine Cable' - }, - 'k2c010': { - 'name': 'K2 Communications' - }, - 'kal010': { - 'name': 'Kalida Telephone Company, Inc.' - }, - 'kal030': { - 'name': 'Kalona Cooperative Telephone Company' - }, - 'kmt010': { - 'name': 'KMTelecom' - }, - 'kpu010': { - 'name': 'KPU Telecommunications' - }, - 'kuh010': { - 'name': 'Kuhn Communications, Inc.' - }, - 'lak130': { - 'name': 'Lakeland Communications' - }, - 'lan010': { - 'name': 'Langco' - }, - 'lau020': { - 'name': 'Laurel Highland Total Communications, Inc.' - }, - 'leh010': { - 'name': 'Lehigh Valley Cooperative Telephone' - }, - 'bra010': { - 'name': 'Limestone Cable/Bracken Cable' - }, - 'loc020': { - 'name': 'LISCO' - }, - 'lit020': { - 'name': 'Litestream' - }, - 'tel140': { - 'name': 'LivCom' - }, - 'loc010': { - 'name': 'LocalTel Communications' - }, - 'weh010-longview': { - 'name': 'Longview - Kilgore Cable TV' - }, - 'lon030': { - 'name': 'Lonsdale Video Ventures, LLC' - }, - 'lns010': { - 'name': 'Lost Nation-Elwood Telephone Co.' - }, - 'nttclpc010': { - 'name': 'LPC Connect' - }, - 'lumos': { - 'name': 'Lumos Networks' - }, - 'madison': { - 'name': 'Madison Communications' - }, - 'mad030': { - 'name': 'Madison County Cable Inc.' - }, - 'nttcmah010': { - 'name': 'Mahaska Communication Group' - }, - 'mar010': { - 'name': 'Marne & Elk Horn Telephone Company' - }, - 'mcc040': { - 'name': 'McClure Telephone Co.' - }, - 'mctv': { - 'name': 'MCTV' - }, - 'merrimac': { - 'name': 'Merrimac Communications Ltd.' - }, - 'metronet': { - 'name': 'Metronet' - }, - 'mhtc': { - 'name': 'MHTC' - }, - 'midhudson': { - 'name': 'Mid-Hudson Cable' - }, - 'midrivers': { - 'name': 'Mid-Rivers Communications' - }, - 'mid045': { - 'name': 'Midstate Communications' - }, - 'mil080': { - 'name': 'Milford Communications' - }, - 'min030': { - 'name': 'MINET' - }, - 'nttcmin010': { - 'name': 'Minford TV' - }, - 'san040-02': { - 'name': 'Mitchell Telecom' - }, - 'mlg010': { - 'name': 'MLGC' - }, - 'mon060': { - 'name': 'Mon-Cre TVE' - }, - 'mou110': { - 'name': 'Mountain Telephone' - }, - 'mou050': { - 'name': 'Mountain Village Cable' - }, - 'mtacomm': { - 'name': 'MTA Communications, LLC' - }, - 'mtc010': { - 'name': 'MTC Cable' - }, - 'med040': { - 'name': 'MTC Technologies' - }, - 'man060': { - 'name': 'MTCC' - }, - 'mtc030': { - 'name': 'MTCO Communications' - }, - 'mul050': { - 'name': 'Mulberry Telecommunications' - }, - 'mur010': { - 'name': 'Murray Electric System' - }, - 'musfiber': { - 'name': 'MUS FiberNET' - }, - 'mpw': { - 'name': 'Muscatine Power & Water' - }, - 'nttcsli010': { - 'name': 'myEVTV.com' - }, - 'nor115': { - 'name': 'NCC' - }, - 'nor260': { - 'name': 'NDTC' - }, - 'nctc': { - 'name': 'Nebraska Central Telecom, Inc.' - }, - 'nel020': { - 'name': 'Nelsonville TV Cable' - }, - 'nem010': { - 'name': 'Nemont' - }, - 'new075': { - 'name': 'New Hope Telephone Cooperative' - }, - 'nor240': { - 'name': 'NICP' - }, - 'cic010': { - 'name': 'NineStar Connect' - }, - 'nktelco': { - 'name': 'NKTelco' - }, - 'nortex': { - 'name': 'Nortex Communications' - }, - 'nor140': { - 'name': 'North Central Telephone Cooperative' - }, - 'nor030': { - 'name': 'Northland Communications' - }, - 'nor075': { - 'name': 'Northwest Communications' - }, - 'nor125': { - 'name': 'Norwood Light Broadband' - }, - 'net010': { - 'name': 'Nsight Telservices' - }, - 'dur010': { - 'name': 'Ntec' - }, - 'nts010': { - 'name': 'NTS Communications' - }, - 'new045': { - 'name': 'NU-Telecom' - }, - 'nulink': { - 'name': 'NuLink' - }, - 'jam030': { - 'name': 'NVC' - }, - 'far035': { - 'name': 'OmniTel Communications' - }, - 'onesource': { - 'name': 'OneSource Communications' - }, - 'cit230': { - 'name': 'Opelika Power Services' - }, - 'daltonutilities': { - 'name': 'OptiLink' - }, - 'mid140': { - 'name': 'OPTURA' - }, - 'ote010': { - 'name': 'OTEC Communication Company' - }, - 'cci020': { - 'name': 'Packerland Broadband' - }, - 'pan010': { - 'name': 'Panora Telco/Guthrie Center Communications' - }, - 'otter': { - 'name': 'Park Region Telephone & Otter Tail Telcom' - }, - 'mid050': { - 'name': 'Partner Communications Cooperative' - }, - 'fib010': { - 'name': 'Pathway' - }, - 'paulbunyan': { - 'name': 'Paul Bunyan Communications' - }, - 'pem020': { - 'name': 'Pembroke Telephone Company' - }, - 'mck010': { - 'name': 'Peoples Rural Telephone Cooperative' - }, - 'pul010': { - 'name': 'PES Energize' - }, - 'phi010': { - 'name': 'Philippi Communications System' - }, - 'phonoscope': { - 'name': 'Phonoscope Cable' - }, - 'pin070': { - 'name': 'Pine Belt Communications, Inc.' - }, - 'weh010-pine': { - 'name': 'Pine Bluff Cable TV' - }, - 'pin060': { - 'name': 'Pineland Telephone Cooperative' - }, - 'cam010': { - 'name': 'Pinpoint Communications' - }, - 'pio060': { - 'name': 'Pioneer Broadband' - }, - 'pioncomm': { - 'name': 'Pioneer Communications' - }, - 'pioneer': { - 'name': 'Pioneer DTV' - }, - 'pla020': { - 'name': 'Plant TiftNet, Inc.' - }, - 'par010': { - 'name': 'PLWC' - }, - 'pro035': { - 'name': 'PMT' - }, - 'vik011': { - 'name': 'Polar Cablevision' - }, - 'pottawatomie': { - 'name': 'Pottawatomie Telephone Co.' - }, - 'premiercomm': { - 'name': 'Premier Communications' - }, - 'psc010': { - 'name': 'PSC' - }, - 'pan020': { - 'name': 'PTCI' - }, - 'qco010': { - 'name': 'QCOL' - }, - 'qua010': { - 'name': 'Quality Cablevision' - }, - 'rad010': { - 'name': 'Radcliffe Telephone Company' - }, - 'car040': { - 'name': 'Rainbow Communications' - }, - 'rai030': { - 'name': 'Rainier Connect' - }, - 'ral010': { - 'name': 'Ralls Technologies' - }, - 'rct010': { - 'name': 'RC Technologies' - }, - 'red040': { - 'name': 'Red River Communications' - }, - 'ree010': { - 'name': 'Reedsburg Utility Commission' - }, - 'mol010': { - 'name': 'Reliance Connects- Oregon' - }, - 'res020': { - 'name': 'Reserve Telecommunications' - }, - 'weh010-resort': { - 'name': 'Resort TV Cable' - }, - 'rld010': { - 'name': 'Richland Grant Telephone Cooperative, Inc.' - }, - 'riv030': { - 'name': 'River Valley Telecommunications Coop' - }, - 'rockportcable': { - 'name': 'Rock Port Cablevision' - }, - 'rsf010': { - 'name': 'RS Fiber' - }, - 'rtc': { - 'name': 'RTC Communication Corp' - }, - 'res040': { - 'name': 'RTC-Reservation Telephone Coop.' - }, - 'rte010': { - 'name': 'RTEC Communications' - }, - 'stc010': { - 'name': 'S&T' - }, - 'san020': { - 'name': 'San Bruno Cable TV' - }, - 'san040-01': { - 'name': 'Santel' - }, - 'sav010': { - 'name': 'SCI Broadband-Savage Communications Inc.' - }, - 'sco050': { - 'name': 'Scottsboro Electric Power Board' - }, - 'scr010': { - 'name': 'Scranton Telephone Company' - }, - 'selco': { - 'name': 'SELCO' - }, - 'she010': { - 'name': 'Shentel' - }, - 'she030': { - 'name': 'Sherwood Mutual Telephone Association, Inc.' - }, - 'ind060-ssc': { - 'name': 'Silver Star Communications' - }, - 'sjoberg': { - 'name': 'Sjoberg\'s Inc.' - }, - 'sou025': { - 'name': 'SKT' - }, - 'sky050': { - 'name': 'SkyBest TV' - }, - 'nttcsmi010': { - 'name': 'Smithville Communications' - }, - 'woo010': { - 'name': 'Solarus' - }, - 'sou075': { - 'name': 'South Central Rural Telephone Cooperative' - }, - 'sou065': { - 'name': 'South Holt Cablevision, Inc.' - }, - 'sou035': { - 'name': 'South Slope Cooperative Communications' - }, - 'spa020': { - 'name': 'Spanish Fork Community Network' - }, - 'spe010': { - 'name': 'Spencer Municipal Utilities' - }, - 'spi005': { - 'name': 'Spillway Communications, Inc.' - }, - 'srt010': { - 'name': 'SRT' - }, - 'cccsmc010': { - 'name': 'St. Maarten Cable TV' - }, - 'sta025': { - 'name': 'Star Communications' - }, - 'sco020': { - 'name': 'STE' - }, - 'uin010': { - 'name': 'STRATA Networks' - }, - 'sum010': { - 'name': 'Sumner Cable TV' - }, - 'pie010': { - 'name': 'Surry TV/PCSI TV' - }, - 'swa010': { - 'name': 'Swayzee Communications' - }, - 'sweetwater': { - 'name': 'Sweetwater Cable Television Co' - }, - 'weh010-talequah': { - 'name': 'Tahlequah Cable TV' - }, - 'tct': { - 'name': 'TCT' - }, - 'tel050': { - 'name': 'Tele-Media Company' - }, - 'com050': { - 'name': 'The Community Agency' - }, - 'thr020': { - 'name': 'Three River' - }, - 'cab140': { - 'name': 'Town & Country Technologies' - }, - 'tra010': { - 'name': 'Trans-Video' - }, - 'tre010': { - 'name': 'Trenton TV Cable Company' - }, - 'tcc': { - 'name': 'Tri County Communications Cooperative' - }, - 'tri025': { - 'name': 'TriCounty Telecom' - }, - 'tri110': { - 'name': 'TrioTel Communications, Inc.' - }, - 'tro010': { - 'name': 'Troy Cablevision, Inc.' - }, - 'tsc': { - 'name': 'TSC' - }, - 'cit220': { - 'name': 'Tullahoma Utilities Board' - }, - 'tvc030': { - 'name': 'TV Cable of Rensselaer' - }, - 'tvc015': { - 'name': 'TVC Cable' - }, - 'cab180': { - 'name': 'TVision' - }, - 'twi040': { - 'name': 'Twin Lakes' - }, - 'tvtinc': { - 'name': 'Twin Valley' - }, - 'uis010': { - 'name': 'Union Telephone Company' - }, - 'uni110': { - 'name': 'United Communications - TN' - }, - 'uni120': { - 'name': 'United Services' - }, - 'uss020': { - 'name': 'US Sonet' - }, - 'cab060': { - 'name': 'USA Communications' - }, - 'she005': { - 'name': 'USA Communications/Shellsburg, IA' - }, - 'val040': { - 'name': 'Valley TeleCom Group' - }, - 'val025': { - 'name': 'Valley Telecommunications' - }, - 'val030': { - 'name': 'Valparaiso Broadband' - }, - 'cla050': { - 'name': 'Vast Broadband' - }, - 'sul015': { - 'name': 'Venture Communications Cooperative, Inc.' - }, - 'ver025': { - 'name': 'Vernon Communications Co-op' - }, - 'weh010-vicksburg': { - 'name': 'Vicksburg Video' - }, - 'vis070': { - 'name': 'Vision Communications' - }, - 'volcanotel': { - 'name': 'Volcano Vision, Inc.' - }, - 'vol040-02': { - 'name': 'VolFirst / BLTV' - }, - 'ver070': { - 'name': 'VTel' - }, - 'nttcvtx010': { - 'name': 'VTX1' - }, - 'bci010-02': { - 'name': 'Vyve Broadband' - }, - 'wab020': { - 'name': 'Wabash Mutual Telephone' - }, - 'waitsfield': { - 'name': 'Waitsfield Cable' - }, - 'wal010': { - 'name': 'Walnut Communications' - }, - 'wavebroadband': { - 'name': 'Wave' - }, - 'wav030': { - 'name': 'Waverly Communications Utility' - }, - 'wbi010': { - 'name': 'WBI' - }, - 'web020': { - 'name': 'Webster-Calhoun Cooperative Telephone Association' - }, - 'wes005': { - 'name': 'West Alabama TV Cable' - }, - 'carolinata': { - 'name': 'West Carolina Communications' - }, - 'wct010': { - 'name': 'West Central Telephone Association' - }, - 'wes110': { - 'name': 'West River Cooperative Telephone Company' - }, - 'ani030': { - 'name': 'WesTel Systems' - }, - 'westianet': { - 'name': 'Western Iowa Networks' - }, - 'nttcwhi010': { - 'name': 'Whidbey Telecom' - }, - 'weh010-white': { - 'name': 'White County Cable TV' - }, - 'wes130': { - 'name': 'Wiatel' - }, - 'wik010': { - 'name': 'Wiktel' - }, - 'wil070': { - 'name': 'Wilkes Communications, Inc./RiverStreet Networks' - }, - 'wil015': { - 'name': 'Wilson Communications' - }, - 'win010': { - 'name': 'Windomnet/SMBS' - }, - 'win090': { - 'name': 'Windstream Cable TV' - }, - 'wcta': { - 'name': 'Winnebago Cooperative Telecom Association' - }, - 'wtc010': { - 'name': 'WTC' - }, - 'wil040': { - 'name': 'WTC Communications, Inc.' - }, - 'wya010': { - 'name': 'Wyandotte Cable' - }, - 'hin020-02': { - 'name': 'X-Stream Services' - }, - 'xit010': { - 'name': 'XIT Communications' - }, - 'yel010': { - 'name': 'Yelcot Communications' - }, - 'mid180-01': { - 'name': 'yondoo' - }, - 'cou060': { - 'name': 'Zito Media' - }, -} - - -class AdobePassIE(InfoExtractor): - _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' - _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' - _MVPD_CACHE = 'ap-mvpd' - - _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' - - def _download_webpage_handle(self, *args, **kwargs): - headers = kwargs.get('headers', {}) - headers.update(self.geo_verification_headers()) - kwargs['headers'] = headers - return super(AdobePassIE, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) - - @staticmethod - def _get_mvpd_resource(provider_id, title, guid, rating): - channel = etree.Element('channel') - channel_title = etree.SubElement(channel, 'title') - channel_title.text = provider_id - item = etree.SubElement(channel, 'item') - resource_title = etree.SubElement(item, 'title') - resource_title.text = title - resource_guid = etree.SubElement(item, 'guid') - resource_guid.text = guid - resource_rating = etree.SubElement(item, 'media:rating') - resource_rating.attrib = {'scheme': 'urn:v-chip'} - resource_rating.text = rating - return '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">' + etree.tostring(channel).decode() + '</rss>' - - def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): - def xml_text(xml_str, tag): - return self._search_regex( - '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag) - - def is_expired(token, date_ele): - token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) - return token_expires and token_expires <= int(time.time()) - - def post_form(form_page_res, note, data={}): - form_page, urlh = form_page_res - post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') - if not re.match(r'https?://', post_url): - post_url = compat_urlparse.urljoin(urlh.geturl(), post_url) - form_data = self._hidden_inputs(form_page) - form_data.update(data) - return self._download_webpage_handle( - post_url, video_id, note, data=urlencode_postdata(form_data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - - def raise_mvpd_required(): - raise ExtractorError( - 'This video is only available for users of participating TV providers. ' - 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier ' - 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True) - - def extract_redirect_url(html, url=None, fatal=False): - # TODO: eliminate code duplication with generic extractor and move - # redirection code into _download_webpage_handle - REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' - redirect_url = self._search_regex( - r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' - r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX, - html, 'meta refresh redirect', - default=NO_DEFAULT if fatal else None, fatal=fatal) - if not redirect_url: - return None - if url: - redirect_url = compat_urlparse.urljoin(url, unescapeHTML(redirect_url)) - return redirect_url - - mvpd_headers = { - 'ap_42': 'anonymous', - 'ap_11': 'Linux i686', - 'ap_z': self._USER_AGENT, - 'User-Agent': self._USER_AGENT, - } - - guid = xml_text(resource, 'guid') if '<' in resource else resource - count = 0 - while count < 2: - requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {} - authn_token = requestor_info.get('authn_token') - if authn_token and is_expired(authn_token, 'simpleTokenExpires'): - authn_token = None - if not authn_token: - # TODO add support for other TV Providers - mso_id = self._downloader.params.get('ap_mso') - if not mso_id: - raise_mvpd_required() - username, password = self._get_login_info('ap_username', 'ap_password', mso_id) - if not username or not password: - raise_mvpd_required() - mso_info = MSO_INFO[mso_id] - - provider_redirect_page_res = self._download_webpage_handle( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }) - - if mso_id == 'Comcast_SSO': - # Comcast page flow varies by video site and whether you - # are on Comcast's network. - provider_redirect_page, urlh = provider_redirect_page_res - if 'automatically signing you in' in provider_redirect_page: - oauth_redirect_url = self._html_search_regex( - r'window\.location\s*=\s*[\'"]([^\'"]+)', - provider_redirect_page, 'oauth redirect') - self._download_webpage( - oauth_redirect_url, video_id, 'Confirming auto login') - else: - if '<form name="signin"' in provider_redirect_page: - provider_login_page_res = provider_redirect_page_res - elif 'http-equiv="refresh"' in provider_redirect_page: - oauth_redirect_url = extract_redirect_url( - provider_redirect_page, fatal=True) - provider_login_page_res = self._download_webpage_handle( - oauth_redirect_url, video_id, - self._DOWNLOADING_LOGIN_PAGE) - else: - provider_login_page_res = post_form( - provider_redirect_page_res, - self._DOWNLOADING_LOGIN_PAGE) - - mvpd_confirm_page_res = post_form( - provider_login_page_res, 'Logging in', { - mso_info['username_field']: username, - mso_info['password_field']: password, - }) - mvpd_confirm_page, urlh = mvpd_confirm_page_res - if '<button class="submit" value="Resume">Resume</button>' in mvpd_confirm_page: - post_form(mvpd_confirm_page_res, 'Confirming Login') - elif mso_id == 'Verizon': - # In general, if you're connecting from a Verizon-assigned IP, - # you will not actually pass your credentials. - provider_redirect_page, urlh = provider_redirect_page_res - if 'Please wait ...' in provider_redirect_page: - saml_redirect_url = self._html_search_regex( - r'self\.parent\.location=(["\'])(?P<url>.+?)\1', - provider_redirect_page, - 'SAML Redirect URL', group='url') - saml_login_page = self._download_webpage( - saml_redirect_url, video_id, - 'Downloading SAML Login Page') - else: - saml_login_page_res = post_form( - provider_redirect_page_res, 'Logging in', { - mso_info['username_field']: username, - mso_info['password_field']: password, - }) - saml_login_page, urlh = saml_login_page_res - if 'Please try again.' in saml_login_page: - raise ExtractorError( - 'We\'re sorry, but either the User ID or Password entered is not correct.') - saml_login_url = self._search_regex( - r'xmlHttp\.open\("POST"\s*,\s*(["\'])(?P<url>.+?)\1', - saml_login_page, 'SAML Login URL', group='url') - saml_response_json = self._download_json( - saml_login_url, video_id, 'Downloading SAML Response', - headers={'Content-Type': 'text/xml'}) - self._download_webpage( - saml_response_json['targetValue'], video_id, - 'Confirming Login', data=urlencode_postdata({ - 'SAMLResponse': saml_response_json['SAMLResponse'], - 'RelayState': saml_response_json['RelayState'] - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded' - }) - else: - # Some providers (e.g. DIRECTV NOW) have another meta refresh - # based redirect that should be followed. - provider_redirect_page, urlh = provider_redirect_page_res - provider_refresh_redirect_url = extract_redirect_url( - provider_redirect_page, url=urlh.geturl()) - if provider_refresh_redirect_url: - provider_redirect_page_res = self._download_webpage_handle( - provider_refresh_redirect_url, video_id, - 'Downloading Provider Redirect Page (meta refresh)') - provider_login_page_res = post_form( - provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE) - mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { - mso_info.get('username_field', 'username'): username, - mso_info.get('password_field', 'password'): password, - }) - if mso_id != 'Rogers': - post_form(mvpd_confirm_page_res, 'Confirming Login') - - session = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, - 'Retrieving Session', data=urlencode_postdata({ - '_method': 'GET', - 'requestor_id': requestor_id, - }), headers=mvpd_headers) - if '<pendingLogout' in session: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) - count += 1 - continue - authn_token = unescapeHTML(xml_text(session, 'authnToken')) - requestor_info['authn_token'] = authn_token - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) - - authz_token = requestor_info.get(guid) - if authz_token and is_expired(authz_token, 'simpleTokenTTL'): - authz_token = None - if not authz_token: - authorize = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, - 'Retrieving Authorization Token', data=urlencode_postdata({ - 'resource_id': resource, - 'requestor_id': requestor_id, - 'authentication_token': authn_token, - 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), - 'userMeta': '1', - }), headers=mvpd_headers) - if '<pendingLogout' in authorize: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) - count += 1 - continue - if '<error' in authorize: - raise ExtractorError(xml_text(authorize, 'details'), expected=True) - authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) - requestor_info[guid] = authz_token - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) - - mvpd_headers.update({ - 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), - 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), - }) - - short_authorize = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', - video_id, 'Retrieving Media Token', data=urlencode_postdata({ - 'authz_token': authz_token, - 'requestor_id': requestor_id, - 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), - 'hashed_guid': 'false', - }), headers=mvpd_headers) - if '<pendingLogout' in short_authorize: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) - count += 1 - continue - return short_authorize diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py deleted file mode 100644 index 5d4db54..0000000 --- a/youtube_dl/extractor/common.py +++ /dev/null @@ -1,2862 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import base64 -import datetime -import hashlib -import json -import netrc -import os -import random -import re -import socket -import sys -import time -import math - -from ..compat import ( - compat_cookiejar, - compat_cookies, - compat_etree_fromstring, - compat_getpass, - compat_integer_types, - compat_http_client, - compat_os_name, - compat_str, - compat_urllib_error, - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, - compat_urllib_request, - compat_urlparse, - compat_xml_parse_error, -) -from ..downloader.f4m import ( - get_base_url, - remove_encrypted_media, -) -from ..utils import ( - NO_DEFAULT, - age_restricted, - base_url, - bug_reports_message, - clean_html, - compiled_regex_type, - determine_ext, - determine_protocol, - error_to_compat_str, - ExtractorError, - extract_attributes, - fix_xml_ampersands, - float_or_none, - GeoRestrictedError, - GeoUtils, - int_or_none, - js_to_json, - JSON_LD_RE, - mimetype2ext, - orderedSet, - parse_codecs, - parse_duration, - parse_iso8601, - parse_m3u8_attributes, - RegexNotFoundError, - sanitized_Request, - sanitize_filename, - unescapeHTML, - unified_strdate, - unified_timestamp, - update_Request, - update_url_query, - urljoin, - url_basename, - xpath_element, - xpath_text, - xpath_with_ns, -) - - -class InfoExtractor(object): - """Information Extractor class. - - Information extractors are the classes that, given a URL, extract - information about the video (or videos) the URL refers to. This - information includes the real video URL, the video title, author and - others. The information is stored in a dictionary which is then - passed to the YoutubeDL. The YoutubeDL processes this - information possibly downloading the video to the file system, among - other possible outcomes. - - The type field determines the type of the result. - By far the most common value (and the default if _type is missing) is - "video", which indicates a single video. - - For a video, the dictionaries must include the following fields: - - id: Video identifier. - title: Video title, unescaped. - - Additionally, it must contain either a formats entry or a url one: - - formats: A list of dictionaries for each format available, ordered - from worst to best quality. - - Potential fields: - * url Mandatory. The URL of the video file - * manifest_url - The URL of the manifest file in case of - fragmented media (DASH, hls, hds) - * ext Will be calculated from URL if missing - * format A human-readable description of the format - ("mp4 container with h264/opus"). - Calculated from the format_id, width, height. - and format_note fields if missing. - * format_id A short description of the format - ("mp4_h264_opus" or "19"). - Technically optional, but strongly recommended. - * format_note Additional info about the format - ("3D" or "DASH video") - * width Width of the video, if known - * height Height of the video, if known - * resolution Textual description of width and height - * tbr Average bitrate of audio and video in KBit/s - * abr Average audio bitrate in KBit/s - * acodec Name of the audio codec in use - * asr Audio sampling rate in Hertz - * vbr Average video bitrate in KBit/s - * fps Frame rate - * vcodec Name of the video codec in use - * container Name of the container format - * filesize The number of bytes, if known in advance - * filesize_approx An estimate for the number of bytes - * player_url SWF Player URL (used for rtmpdump). - * protocol The protocol that will be used for the actual - download, lower-case. - "http", "https", "rtsp", "rtmp", "rtmpe", - "m3u8", "m3u8_native" or "http_dash_segments". - * fragment_base_url - Base URL for fragments. Each fragment's path - value (if present) will be relative to - this URL. - * fragments A list of fragments of a fragmented media. - Each fragment entry must contain either an url - or a path. If an url is present it should be - considered by a client. Otherwise both path and - fragment_base_url must be present. Here is - the list of all potential fields: - * "url" - fragment's URL - * "path" - fragment's path relative to - fragment_base_url - * "duration" (optional, int or float) - * "filesize" (optional, int) - * preference Order number of this format. If this field is - present and not None, the formats get sorted - by this field, regardless of all other values. - -1 for default (order by other properties), - -2 or smaller for less than default. - < -1000 to hide the format (if there is - another one which is strictly better) - * language Language code, e.g. "de" or "en-US". - * language_preference Is this in the language mentioned in - the URL? - 10 if it's what the URL is about, - -1 for default (don't know), - -10 otherwise, other values reserved for now. - * quality Order number of the video quality of this - format, irrespective of the file format. - -1 for default (order by other properties), - -2 or smaller for less than default. - * source_preference Order number for this video source - (quality takes higher priority) - -1 for default (order by other properties), - -2 or smaller for less than default. - * http_headers A dictionary of additional HTTP headers - to add to the request. - * stretched_ratio If given and not 1, indicates that the - video's pixels are not square. - width : height ratio as float. - * no_resume The server does not support resuming the - (HTTP or RTMP) download. Boolean. - * downloader_options A dictionary of downloader options as - described in FileDownloader - - url: Final video URL. - ext: Video filename extension. - format: The video format, defaults to ext (used for --get-format) - player_url: SWF Player URL (used for rtmpdump). - - The following fields are optional: - - alt_title: A secondary title of the video. - display_id An alternative identifier for the video, not necessarily - unique, but available before title. Typically, id is - something like "4234987", title "Dancing naked mole rats", - and display_id "dancing-naked-mole-rats" - thumbnails: A list of dictionaries, with the following entries: - * "id" (optional, string) - Thumbnail format ID - * "url" - * "preference" (optional, int) - quality of the image - * "width" (optional, int) - * "height" (optional, int) - * "resolution" (optional, string "{width}x{height"}, - deprecated) - * "filesize" (optional, int) - thumbnail: Full URL to a video thumbnail image. - description: Full video description. - uploader: Full name of the video uploader. - license: License name the video is licensed under. - creator: The creator of the video. - release_date: The date (YYYYMMDD) when the video was released. - timestamp: UNIX timestamp of the moment the video became available. - upload_date: Video upload date (YYYYMMDD). - If not explicitly set, calculated from timestamp. - uploader_id: Nickname or id of the video uploader. - uploader_url: Full URL to a personal webpage of the video uploader. - location: Physical location where the video was filmed. - subtitles: The available subtitles as a dictionary in the format - {tag: subformats}. "tag" is usually a language code, and - "subformats" is a list sorted from lower to higher - preference, each element is a dictionary with the "ext" - entry and one of: - * "data": The subtitles file contents - * "url": A URL pointing to the subtitles file - "ext" will be calculated from URL if missing - automatic_captions: Like 'subtitles', used by the YoutubeIE for - automatically generated captions - duration: Length of the video in seconds, as an integer or float. - view_count: How many users have watched the video on the platform. - like_count: Number of positive ratings of the video - dislike_count: Number of negative ratings of the video - repost_count: Number of reposts of the video - average_rating: Average rating give by users, the scale used depends on the webpage - comment_count: Number of comments on the video - comments: A list of comments, each with one or more of the following - properties (all but one of text or html optional): - * "author" - human-readable name of the comment author - * "author_id" - user ID of the comment author - * "id" - Comment ID - * "html" - Comment as HTML - * "text" - Plain text of the comment - * "timestamp" - UNIX timestamp of comment - * "parent" - ID of the comment this one is replying to. - Set to "root" to indicate that this is a - comment to the original video. - age_limit: Age restriction for the video, as an integer (years) - webpage_url: The URL to the video webpage, if given to youtube-dl it - should allow to get the same result again. (It will be set - by YoutubeDL if it's missing) - categories: A list of categories that the video falls in, for example - ["Sports", "Berlin"] - tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] - is_live: True, False, or None (=unknown). Whether this video is a - live stream that goes on instead of a fixed-length video. - start_time: Time in seconds where the reproduction should start, as - specified in the URL. - end_time: Time in seconds where the reproduction should end, as - specified in the URL. - chapters: A list of dictionaries, with the following entries: - * "start_time" - The start time of the chapter in seconds - * "end_time" - The end time of the chapter in seconds - * "title" (optional, string) - - The following fields should only be used when the video belongs to some logical - chapter or section: - - chapter: Name or title of the chapter the video belongs to. - chapter_number: Number of the chapter the video belongs to, as an integer. - chapter_id: Id of the chapter the video belongs to, as a unicode string. - - The following fields should only be used when the video is an episode of some - series, programme or podcast: - - series: Title of the series or programme the video episode belongs to. - season: Title of the season the video episode belongs to. - season_number: Number of the season the video episode belongs to, as an integer. - season_id: Id of the season the video episode belongs to, as a unicode string. - episode: Title of the video episode. Unlike mandatory video title field, - this field should denote the exact title of the video episode - without any kind of decoration. - episode_number: Number of the video episode within a season, as an integer. - episode_id: Id of the video episode, as a unicode string. - - The following fields should only be used when the media is a track or a part of - a music album: - - track: Title of the track. - track_number: Number of the track within an album or a disc, as an integer. - track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), - as a unicode string. - artist: Artist(s) of the track. - genre: Genre(s) of the track. - album: Title of the album the track belongs to. - album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). - album_artist: List of all artists appeared on the album (e.g. - "Ash Borer / Fell Voices" or "Various Artists", useful for splits - and compilations). - disc_number: Number of the disc or other physical medium the track belongs to, - as an integer. - release_year: Year (YYYY) when the album was released. - - Unless mentioned otherwise, the fields should be Unicode strings. - - Unless mentioned otherwise, None is equivalent to absence of information. - - - _type "playlist" indicates multiple videos. - There must be a key "entries", which is a list, an iterable, or a PagedList - object, each element of which is a valid dictionary by this specification. - - Additionally, playlists can have "id", "title", "description", "uploader", - "uploader_id", "uploader_url" attributes with the same semantics as videos - (see above). - - - _type "multi_video" indicates that there are multiple videos that - form a single show, for examples multiple acts of an opera or TV episode. - It must have an entries key like a playlist and contain all the keys - required for a video at the same time. - - - _type "url" indicates that the video must be extracted from another - location, possibly by a different extractor. Its only required key is: - "url" - the next URL to extract. - The key "ie_key" can be set to the class name (minus the trailing "IE", - e.g. "Youtube") if the extractor class is known in advance. - Additionally, the dictionary may have any properties of the resolved entity - known in advance, for example "title" if the title of the referred video is - known ahead of time. - - - _type "url_transparent" entities have the same specification as "url", but - indicate that the given additional information is more precise than the one - associated with the resolved URL. - This is useful when a site employs a video service that hosts the video and - its technical metadata, but that video service does not embed a useful - title, description etc. - - - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. - Probably, they should also be added to the list of extractors. - - _GEO_BYPASS attribute may be set to False in order to disable - geo restriction bypass mechanisms for a particular extractor. - Though it won't disable explicit geo restriction bypass based on - country code provided with geo_bypass_country. - - _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted - countries for this extractor. One of these countries will be used by - geo restriction bypass mechanism right away in order to bypass - geo restriction, of course, if the mechanism is not disabled. - - _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted - IP blocks in CIDR notation for this extractor. One of these IP blocks - will be used by geo restriction bypass mechanism similarly - to _GEO_COUNTRIES. - - Finally, the _WORKING attribute should be set to False for broken IEs - in order to warn the users and skip the tests. - """ - - _ready = False - _downloader = None - _x_forwarded_for_ip = None - _GEO_BYPASS = True - _GEO_COUNTRIES = None - _GEO_IP_BLOCKS = None - _WORKING = True - - def __init__(self, downloader=None): - """Constructor. Receives an optional downloader.""" - self._ready = False - self._x_forwarded_for_ip = None - self.set_downloader(downloader) - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - - # This does not use has/getattr intentionally - we want to know whether - # we have cached the regexp for *this* class, whereas getattr would also - # match the superclass - if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) is not None - - @classmethod - def _match_id(cls, url): - if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - m = cls._VALID_URL_RE.match(url) - assert m - return compat_str(m.group('id')) - - @classmethod - def working(cls): - """Getter method for _WORKING.""" - return cls._WORKING - - def initialize(self): - """Initializes an instance (authentication, etc).""" - self._initialize_geo_bypass({ - 'countries': self._GEO_COUNTRIES, - 'ip_blocks': self._GEO_IP_BLOCKS, - }) - if not self._ready: - self._real_initialize() - self._ready = True - - def _initialize_geo_bypass(self, geo_bypass_context): - """ - Initialize geo restriction bypass mechanism. - - This method is used to initialize geo bypass mechanism based on faking - X-Forwarded-For HTTP header. A random country from provided country list - is selected and a random IP belonging to this country is generated. This - IP will be passed as X-Forwarded-For HTTP header in all subsequent - HTTP requests. - - This method will be used for initial geo bypass mechanism initialization - during the instance initialization with _GEO_COUNTRIES and - _GEO_IP_BLOCKS. - - You may also manually call it from extractor's code if geo bypass - information is not available beforehand (e.g. obtained during - extraction) or due to some other reason. In this case you should pass - this information in geo bypass context passed as first argument. It may - contain following fields: - - countries: List of geo unrestricted countries (similar - to _GEO_COUNTRIES) - ip_blocks: List of geo unrestricted IP blocks in CIDR notation - (similar to _GEO_IP_BLOCKS) - - """ - if not self._x_forwarded_for_ip: - - # Geo bypass mechanism is explicitly disabled by user - if not self._downloader.params.get('geo_bypass', True): - return - - if not geo_bypass_context: - geo_bypass_context = {} - - # Backward compatibility: previously _initialize_geo_bypass - # expected a list of countries, some 3rd party code may still use - # it this way - if isinstance(geo_bypass_context, (list, tuple)): - geo_bypass_context = { - 'countries': geo_bypass_context, - } - - # The whole point of geo bypass mechanism is to fake IP - # as X-Forwarded-For HTTP header based on some IP block or - # country code. - - # Path 1: bypassing based on IP block in CIDR notation - - # Explicit IP block specified by user, use it right away - # regardless of whether extractor is geo bypassable or not - ip_block = self._downloader.params.get('geo_bypass_ip_block', None) - - # Otherwise use random IP block from geo bypass context but only - # if extractor is known as geo bypassable - if not ip_block: - ip_blocks = geo_bypass_context.get('ip_blocks') - if self._GEO_BYPASS and ip_blocks: - ip_block = random.choice(ip_blocks) - - if ip_block: - self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) - if self._downloader.params.get('verbose', False): - self._downloader.to_screen( - '[debug] Using fake IP %s as X-Forwarded-For.' - % self._x_forwarded_for_ip) - return - - # Path 2: bypassing based on country code - - # Explicit country code specified by user, use it right away - # regardless of whether extractor is geo bypassable or not - country = self._downloader.params.get('geo_bypass_country', None) - - # Otherwise use random country code from geo bypass context but - # only if extractor is known as geo bypassable - if not country: - countries = geo_bypass_context.get('countries') - if self._GEO_BYPASS and countries: - country = random.choice(countries) - - if country: - self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) - if self._downloader.params.get('verbose', False): - self._downloader.to_screen( - '[debug] Using fake IP %s (%s) as X-Forwarded-For.' - % (self._x_forwarded_for_ip, country.upper())) - - def extract(self, url): - """Extracts URL information and returns it in list of dicts.""" - try: - for _ in range(2): - try: - self.initialize() - ie_result = self._real_extract(url) - if self._x_forwarded_for_ip: - ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip - return ie_result - except GeoRestrictedError as e: - if self.__maybe_fake_ip_and_retry(e.countries): - continue - raise - except ExtractorError: - raise - except compat_http_client.IncompleteRead as e: - raise ExtractorError('A network error has occurred.', cause=e, expected=True) - except (KeyError, StopIteration) as e: - raise ExtractorError('An extractor error has occurred.', cause=e) - - def __maybe_fake_ip_and_retry(self, countries): - if (not self._downloader.params.get('geo_bypass_country', None) and - self._GEO_BYPASS and - self._downloader.params.get('geo_bypass', True) and - not self._x_forwarded_for_ip and - countries): - country_code = random.choice(countries) - self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) - if self._x_forwarded_for_ip: - self.report_warning( - 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' - % (self._x_forwarded_for_ip, country_code.upper())) - return True - return False - - def set_downloader(self, downloader): - """Sets the downloader for this IE.""" - self._downloader = downloader - - def _real_initialize(self): - """Real initialization process. Redefine in subclasses.""" - pass - - def _real_extract(self, url): - """Real extraction process. Redefine in subclasses.""" - pass - - @classmethod - def ie_key(cls): - """A string for getting the InfoExtractor with get_info_extractor""" - return compat_str(cls.__name__[:-2]) - - @property - def IE_NAME(self): - return compat_str(type(self).__name__[:-2]) - - @staticmethod - def __can_accept_status_code(err, expected_status): - assert isinstance(err, compat_urllib_error.HTTPError) - if expected_status is None: - return False - if isinstance(expected_status, compat_integer_types): - return err.code == expected_status - elif isinstance(expected_status, (list, tuple)): - return err.code in expected_status - elif callable(expected_status): - return expected_status(err.code) is True - else: - assert False - - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): - """ - Return the response handle. - - See _download_webpage docstring for arguments specification. - """ - if note is None: - self.report_download_webpage(video_id) - elif note is not False: - if video_id is None: - self.to_screen('%s' % (note,)) - else: - self.to_screen('%s: %s' % (video_id, note)) - - # Some sites check X-Forwarded-For HTTP header in order to figure out - # the origin of the client behind proxy. This allows bypassing geo - # restriction by faking this header's value to IP that belongs to some - # geo unrestricted country. We will do so once we encounter any - # geo restriction error. - if self._x_forwarded_for_ip: - if 'X-Forwarded-For' not in headers: - headers['X-Forwarded-For'] = self._x_forwarded_for_ip - - if isinstance(url_or_request, compat_urllib_request.Request): - url_or_request = update_Request( - url_or_request, data=data, headers=headers, query=query) - else: - if query: - url_or_request = update_url_query(url_or_request, query) - if data is not None or headers: - url_or_request = sanitized_Request(url_or_request, data, headers) - try: - return self._downloader.urlopen(url_or_request) - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - if isinstance(err, compat_urllib_error.HTTPError): - if self.__can_accept_status_code(err, expected_status): - return err.fp - - if errnote is False: - return False - if errnote is None: - errnote = 'Unable to download webpage' - - errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) - if fatal: - raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) - else: - self._downloader.report_warning(errmsg) - return False - - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): - """ - Return a tuple (page content as string, URL handle). - - See _download_webpage docstring for arguments specification. - """ - # Strip hashes from the URL (#1038) - if isinstance(url_or_request, (compat_str, str)): - url_or_request = url_or_request.partition('#')[0] - - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) - if urlh is False: - assert not fatal - return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) - return (content, urlh) - - @staticmethod - def _guess_encoding_from_content(content_type, webpage_bytes): - m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) - if m: - encoding = m.group(1) - else: - m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', - webpage_bytes[:1024]) - if m: - encoding = m.group(1).decode('ascii') - elif webpage_bytes.startswith(b'\xff\xfe'): - encoding = 'utf-16' - else: - encoding = 'utf-8' - - return encoding - - def __check_blocked(self, content): - first_block = content[:512] - if ('<title>Access to this site is blocked</title>' in content and - 'Websense' in first_block): - msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' - blocked_iframe = self._html_search_regex( - r'<iframe src="([^"]+)"', content, - 'Websense information URL', default=None) - if blocked_iframe: - msg += ' Visit %s for more details' % blocked_iframe - raise ExtractorError(msg, expected=True) - if '<title>The URL you requested has been blocked</title>' in first_block: - msg = ( - 'Access to this webpage has been blocked by Indian censorship. ' - 'Use a VPN or proxy server (with --proxy) to route around it.') - block_msg = self._html_search_regex( - r'</h1><p>(.*?)</p>', - content, 'block message', default=None) - if block_msg: - msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') - raise ExtractorError(msg, expected=True) - if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and - 'blocklist.rkn.gov.ru' in content): - raise ExtractorError( - 'Access to this webpage has been blocked by decision of the Russian government. ' - 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', - expected=True) - - def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): - content_type = urlh.headers.get('Content-Type', '') - webpage_bytes = urlh.read() - if prefix is not None: - webpage_bytes = prefix + webpage_bytes - if not encoding: - encoding = self._guess_encoding_from_content(content_type, webpage_bytes) - if self._downloader.params.get('dump_intermediate_pages', False): - self.to_screen('Dumping request to ' + urlh.geturl()) - dump = base64.b64encode(webpage_bytes).decode('ascii') - self._downloader.to_screen(dump) - if self._downloader.params.get('write_pages', False): - basen = '%s_%s' % (video_id, urlh.geturl()) - if len(basen) > 240: - h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() - basen = basen[:240 - len(h)] + h - raw_filename = basen + '.dump' - filename = sanitize_filename(raw_filename, restricted=True) - self.to_screen('Saving request to ' + filename) - # Working around MAX_PATH limitation on Windows (see - # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if compat_os_name == 'nt': - absfilepath = os.path.abspath(filename) - if len(absfilepath) > 259: - filename = '\\\\?\\' + absfilepath - with open(filename, 'wb') as outf: - outf.write(webpage_bytes) - - try: - content = webpage_bytes.decode(encoding, 'replace') - except LookupError: - content = webpage_bytes.decode('utf-8', 'replace') - - self.__check_blocked(content) - - return content - - def _download_webpage( - self, url_or_request, video_id, note=None, errnote=None, - fatal=True, tries=1, timeout=5, encoding=None, data=None, - headers={}, query={}, expected_status=None): - """ - Return the data of the page as a string. - - Arguments: - url_or_request -- plain text URL as a string or - a compat_urllib_request.Requestobject - video_id -- Video/playlist/item identifier (string) - - Keyword arguments: - note -- note printed before downloading (string) - errnote -- note printed in case of an error (string) - fatal -- flag denoting whether error should be considered fatal, - i.e. whether it should cause ExtractionError to be raised, - otherwise a warning will be reported and extraction continued - tries -- number of tries - timeout -- sleep interval between tries - encoding -- encoding for a page content decoding, guessed automatically - when not explicitly specified - data -- POST data (bytes) - headers -- HTTP headers (dict) - query -- URL query (dict) - expected_status -- allows to accept failed HTTP requests (non 2xx - status code) by explicitly specifying a set of accepted status - codes. Can be any of the following entities: - - an integer type specifying an exact failed status code to - accept - - a list or a tuple of integer types specifying a list of - failed status codes to accept - - a callable accepting an actual failed status code and - returning True if it should be accepted - Note that this argument does not affect success status codes (2xx) - which are always accepted. - """ - - success = False - try_count = 0 - while success is False: - try: - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - success = True - except compat_http_client.IncompleteRead as e: - try_count += 1 - if try_count >= tries: - raise e - self._sleep(timeout, video_id) - if res is False: - return res - else: - content, _ = res - return content - - def _download_xml_handle( - self, url_or_request, video_id, note='Downloading XML', - errnote='Unable to download XML', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle). - - See _download_webpage docstring for arguments specification. - """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - xml_string, urlh = res - return self._parse_xml( - xml_string, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_xml( - self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, - data=None, headers={}, query={}, expected_status=None): - """ - Return the xml as an xml.etree.ElementTree.Element. - - See _download_webpage docstring for arguments specification. - """ - res = self._download_xml_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] - - def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): - if transform_source: - xml_string = transform_source(xml_string) - try: - return compat_etree_fromstring(xml_string.encode('utf-8')) - except compat_xml_parse_error as ve: - errmsg = '%s: Failed to parse XML ' % video_id - if fatal: - raise ExtractorError(errmsg, cause=ve) - else: - self.report_warning(errmsg + str(ve)) - - def _download_json_handle( - self, url_or_request, video_id, note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return a tuple (JSON object, URL handle). - - See _download_webpage docstring for arguments specification. - """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - json_string, urlh = res - return self._parse_json( - json_string, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_json( - self, url_or_request, video_id, note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return the JSON object as a dict. - - See _download_webpage docstring for arguments specification. - """ - res = self._download_json_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] - - def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): - if transform_source: - json_string = transform_source(json_string) - try: - return json.loads(json_string) - except ValueError as ve: - errmsg = '%s: Failed to parse JSON ' % video_id - if fatal: - raise ExtractorError(errmsg, cause=ve) - else: - self.report_warning(errmsg + str(ve)) - - def report_warning(self, msg, video_id=None): - idstr = '' if video_id is None else '%s: ' % video_id - self._downloader.report_warning( - '[%s] %s%s' % (self.IE_NAME, idstr, msg)) - - def to_screen(self, msg): - """Print msg to screen, prefixing it with '[ie_name]'""" - self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg)) - - def report_extraction(self, id_or_name): - """Report information extraction.""" - self.to_screen('%s: Extracting information' % id_or_name) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self.to_screen('%s: Downloading webpage' % video_id) - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self.to_screen('Confirming age') - - def report_login(self): - """Report attempt to log in.""" - self.to_screen('Logging in') - - @staticmethod - def raise_login_required(msg='This video is only available for registered users'): - raise ExtractorError( - '%s. Use --username and --password or --netrc to provide account credentials.' % msg, - expected=True) - - @staticmethod - def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None): - raise GeoRestrictedError(msg, countries=countries) - - # Methods for following #608 - @staticmethod - def url_result(url, ie=None, video_id=None, video_title=None): - """Returns a URL that points to a page that should be processed""" - # TODO: ie should be the class used for getting the info - video_info = {'_type': 'url', - 'url': url, - 'ie_key': ie} - if video_id is not None: - video_info['id'] = video_id - if video_title is not None: - video_info['title'] = video_title - return video_info - - def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): - urls = orderedSet( - self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) - for m in matches) - return self.playlist_result( - urls, playlist_id=playlist_id, playlist_title=playlist_title) - - @staticmethod - def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): - """Returns a playlist""" - video_info = {'_type': 'playlist', - 'entries': entries} - if playlist_id: - video_info['id'] = playlist_id - if playlist_title: - video_info['title'] = playlist_title - if playlist_description: - video_info['description'] = playlist_description - return video_info - - def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): - """ - Perform a regex search on the given string, using a single or a list of - patterns returning the first matching group. - In case of failure return a default value or raise a WARNING or a - RegexNotFoundError, depending on fatal, specifying the field name. - """ - if isinstance(pattern, (str, compat_str, compiled_regex_type)): - mobj = re.search(pattern, string, flags) - else: - for p in pattern: - mobj = re.search(p, string, flags) - if mobj: - break - - if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): - _name = '\033[0;34m%s\033[0m' % name - else: - _name = name - - if mobj: - if group is None: - # return the first matching group - return next(g for g in mobj.groups() if g is not None) - else: - return mobj.group(group) - elif default is not NO_DEFAULT: - return default - elif fatal: - raise RegexNotFoundError('Unable to extract %s' % _name) - else: - self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) - return None - - def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): - """ - Like _search_regex, but strips HTML tags and unescapes entities. - """ - res = self._search_regex(pattern, string, name, default, fatal, flags, group) - if res: - return clean_html(res).strip() - else: - return res - - def _get_netrc_login_info(self, netrc_machine=None): - username = None - password = None - netrc_machine = netrc_machine or self._NETRC_MACHINE - - if self._downloader.params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(netrc_machine) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError( - 'No authenticators for %s' % netrc_machine) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning( - 'parsing .netrc: %s' % error_to_compat_str(err)) - - return username, password - - def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): - """ - Get the login info as (username, password) - First look for the manually specified credentials using username_option - and password_option as keys in params dictionary. If no such credentials - available look in the netrc file using the netrc_machine or _NETRC_MACHINE - value. - If there's no info available, return (None, None) - """ - if self._downloader is None: - return (None, None) - - downloader_params = self._downloader.params - - # Attempt to use provided username and password or .netrc data - if downloader_params.get(username_option) is not None: - username = downloader_params[username_option] - password = downloader_params[password_option] - else: - username, password = self._get_netrc_login_info(netrc_machine) - - return username, password - - def _get_tfa_info(self, note='two-factor verification code'): - """ - Get the two-factor authentication info - TODO - asking the user will be required for sms/phone verify - currently just uses the command line option - If there's no info available, return None - """ - if self._downloader is None: - return None - downloader_params = self._downloader.params - - if downloader_params.get('twofactor') is not None: - return downloader_params['twofactor'] - - return compat_getpass('Type %s and press [Return]: ' % note) - - # Helper functions for extracting OpenGraph info - @staticmethod - def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' - % {'prop': re.escape(prop)}) - template = r'<meta[^>]+?%s[^>]+?%s' - return [ - template % (property_re, content_re), - template % (content_re, property_re), - ] - - @staticmethod - def _meta_regex(prop): - return r'''(?isx)<meta - (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) - [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) - - def _og_search_property(self, prop, html, name=None, **kargs): - if not isinstance(prop, (list, tuple)): - prop = [prop] - if name is None: - name = 'OpenGraph %s' % prop[0] - og_regexes = [] - for p in prop: - og_regexes.extend(self._og_regexes(p)) - escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) - if escaped is None: - return None - return unescapeHTML(escaped) - - def _og_search_thumbnail(self, html, **kargs): - return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs) - - def _og_search_description(self, html, **kargs): - return self._og_search_property('description', html, fatal=False, **kargs) - - def _og_search_title(self, html, **kargs): - return self._og_search_property('title', html, **kargs) - - def _og_search_video_url(self, html, name='video url', secure=True, **kargs): - regexes = self._og_regexes('video') + self._og_regexes('video:url') - if secure: - regexes = self._og_regexes('video:secure_url') + regexes - return self._html_search_regex(regexes, html, name, **kargs) - - def _og_search_url(self, html, **kargs): - return self._og_search_property('url', html, **kargs) - - def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): - if not isinstance(name, (list, tuple)): - name = [name] - if display_name is None: - display_name = name[0] - return self._html_search_regex( - [self._meta_regex(n) for n in name], - html, display_name, fatal=fatal, group='content', **kwargs) - - def _dc_search_uploader(self, html): - return self._html_search_meta('dc.creator', html, 'uploader') - - def _rta_search(self, html): - # See http://www.rtalabel.org/index.php?content=howtofaq#single - if re.search(r'(?ix)<meta\s+name="rating"\s+' - r' content="RTA-5042-1996-1400-1577-RTA"', - html): - return 18 - return 0 - - def _media_rating_search(self, html): - # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ - rating = self._html_search_meta('rating', html) - - if not rating: - return None - - RATING_TABLE = { - 'safe for kids': 0, - 'general': 8, - '14 years': 14, - 'mature': 17, - 'restricted': 19, - } - return RATING_TABLE.get(rating.lower()) - - def _family_friendly_search(self, html): - # See http://schema.org/VideoObject - family_friendly = self._html_search_meta( - 'isFamilyFriendly', html, default=None) - - if not family_friendly: - return None - - RATING_TABLE = { - '1': 0, - 'true': 0, - '0': 18, - 'false': 18, - } - return RATING_TABLE.get(family_friendly.lower()) - - def _twitter_search_player(self, html): - return self._html_search_meta('twitter:player', html, - 'twitter card player') - - def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_ld = self._search_regex( - JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) - default = kwargs.get('default', NO_DEFAULT) - if not json_ld: - return default if default is not NO_DEFAULT else {} - # JSON-LD may be malformed and thus `fatal` should be respected. - # At the same time `default` may be passed that assumes `fatal=False` - # for _search_regex. Let's simulate the same behavior here as well. - fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False - return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) - - def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): - if isinstance(json_ld, compat_str): - json_ld = self._parse_json(json_ld, video_id, fatal=fatal) - if not json_ld: - return {} - info = {} - if not isinstance(json_ld, (list, tuple, dict)): - return info - if isinstance(json_ld, dict): - json_ld = [json_ld] - - INTERACTION_TYPE_MAP = { - 'CommentAction': 'comment', - 'AgreeAction': 'like', - 'DisagreeAction': 'dislike', - 'LikeAction': 'like', - 'DislikeAction': 'dislike', - 'ListenAction': 'view', - 'WatchAction': 'view', - 'ViewAction': 'view', - } - - def extract_interaction_statistic(e): - interaction_statistic = e.get('interactionStatistic') - if not isinstance(interaction_statistic, list): - return - for is_e in interaction_statistic: - if not isinstance(is_e, dict): - continue - if is_e.get('@type') != 'InteractionCounter': - continue - interaction_type = is_e.get('interactionType') - if not isinstance(interaction_type, compat_str): - continue - interaction_count = int_or_none(is_e.get('userInteractionCount')) - if interaction_count is None: - continue - count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1]) - if not count_kind: - continue - count_key = '%s_count' % count_kind - if info.get(count_key) is not None: - continue - info[count_key] = interaction_count - - def extract_video_object(e): - assert e['@type'] == 'VideoObject' - info.update({ - 'url': e.get('contentUrl'), - 'title': unescapeHTML(e.get('name')), - 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), - 'duration': parse_duration(e.get('duration')), - 'timestamp': unified_timestamp(e.get('uploadDate')), - 'filesize': float_or_none(e.get('contentSize')), - 'tbr': int_or_none(e.get('bitrate')), - 'width': int_or_none(e.get('width')), - 'height': int_or_none(e.get('height')), - 'view_count': int_or_none(e.get('interactionCount')), - }) - extract_interaction_statistic(e) - - for e in json_ld: - if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')): - item_type = e.get('@type') - if expected_type is not None and expected_type != item_type: - return info - if item_type in ('TVEpisode', 'Episode'): - info.update({ - 'episode': unescapeHTML(e.get('name')), - 'episode_number': int_or_none(e.get('episodeNumber')), - 'description': unescapeHTML(e.get('description')), - }) - part_of_season = e.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): - info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) - part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): - info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type in ('Article', 'NewsArticle'): - info.update({ - 'timestamp': parse_iso8601(e.get('datePublished')), - 'title': unescapeHTML(e.get('headline')), - 'description': unescapeHTML(e.get('articleBody')), - }) - elif item_type == 'VideoObject': - extract_video_object(e) - continue - video = e.get('video') - if isinstance(video, dict) and video.get('@type') == 'VideoObject': - extract_video_object(video) - break - return dict((k, v) for k, v in info.items() if v is not None) - - @staticmethod - def _hidden_inputs(html): - html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) - hidden_inputs = {} - for input in re.findall(r'(?i)(<input[^>]+>)', html): - attrs = extract_attributes(input) - if not input: - continue - if attrs.get('type') not in ('hidden', 'submit'): - continue - name = attrs.get('name') or attrs.get('id') - value = attrs.get('value') - if name and value is not None: - hidden_inputs[name] = value - return hidden_inputs - - def _form_hidden_inputs(self, form_id, html): - form = self._search_regex( - r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, - html, '%s form' % form_id, group='form') - return self._hidden_inputs(form) - - def _sort_formats(self, formats, field_preference=None): - if not formats: - raise ExtractorError('No video formats found') - - for f in formats: - # Automatically determine tbr when missing based on abr and vbr (improves - # formats sorting in some cases) - if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None: - f['tbr'] = f['abr'] + f['vbr'] - - def _formats_key(f): - # TODO remove the following workaround - from ..utils import determine_ext - if not f.get('ext') and 'url' in f: - f['ext'] = determine_ext(f['url']) - - if isinstance(field_preference, (list, tuple)): - return tuple( - f.get(field) - if f.get(field) is not None - else ('' if field == 'format_id' else -1) - for field in field_preference) - - preference = f.get('preference') - if preference is None: - preference = 0 - if f.get('ext') in ['f4f', 'f4m']: # Not yet supported - preference -= 0.5 - - protocol = f.get('protocol') or determine_protocol(f) - proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1) - - if f.get('vcodec') == 'none': # audio only - preference -= 50 - if self._downloader.params.get('prefer_free_formats'): - ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] - else: - ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a'] - ext_preference = 0 - try: - audio_ext_preference = ORDER.index(f['ext']) - except ValueError: - audio_ext_preference = -1 - else: - if f.get('acodec') == 'none': # video only - preference -= 40 - if self._downloader.params.get('prefer_free_formats'): - ORDER = ['flv', 'mp4', 'webm'] - else: - ORDER = ['webm', 'flv', 'mp4'] - try: - ext_preference = ORDER.index(f['ext']) - except ValueError: - ext_preference = -1 - audio_ext_preference = 0 - - return ( - preference, - f.get('language_preference') if f.get('language_preference') is not None else -1, - f.get('quality') if f.get('quality') is not None else -1, - f.get('tbr') if f.get('tbr') is not None else -1, - f.get('filesize') if f.get('filesize') is not None else -1, - f.get('vbr') if f.get('vbr') is not None else -1, - f.get('height') if f.get('height') is not None else -1, - f.get('width') if f.get('width') is not None else -1, - proto_preference, - ext_preference, - f.get('abr') if f.get('abr') is not None else -1, - audio_ext_preference, - f.get('fps') if f.get('fps') is not None else -1, - f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, - f.get('source_preference') if f.get('source_preference') is not None else -1, - f.get('format_id') if f.get('format_id') is not None else '', - ) - formats.sort(key=_formats_key) - - def _check_formats(self, formats, video_id): - if formats: - formats[:] = filter( - lambda f: self._is_valid_url( - f['url'], video_id, - item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'), - formats) - - @staticmethod - def _remove_duplicate_formats(formats): - format_urls = set() - unique_formats = [] - for f in formats: - if f['url'] not in format_urls: - format_urls.add(f['url']) - unique_formats.append(f) - formats[:] = unique_formats - - def _is_valid_url(self, url, video_id, item='video', headers={}): - url = self._proto_relative_url(url, scheme='http:') - # For now assume non HTTP(S) URLs always valid - if not (url.startswith('http://') or url.startswith('https://')): - return True - try: - self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) - return True - except ExtractorError as e: - if isinstance(e.cause, compat_urllib_error.URLError): - self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, item)) - return False - raise - - def http_scheme(self): - """ Either "http:" or "https:", depending on the user's preferences """ - return ( - 'http:' - if self._downloader.params.get('prefer_insecure', False) - else 'https:') - - def _proto_relative_url(self, url, scheme=None): - if url is None: - return url - if url.startswith('//'): - if scheme is None: - scheme = self.http_scheme() - return scheme + url - else: - return url - - def _sleep(self, timeout, video_id, msg_template=None): - if msg_template is None: - msg_template = '%(video_id)s: Waiting for %(timeout)s seconds' - msg = msg_template % {'video_id': video_id, 'timeout': timeout} - self.to_screen(msg) - time.sleep(timeout) - - def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, - transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True, m3u8_id=None): - manifest = self._download_xml( - manifest_url, video_id, 'Downloading f4m manifest', - 'Unable to download f4m manifest', - # Some manifests may be malformed, e.g. prosiebensat1 generated manifests - # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) - transform_source=transform_source, - fatal=fatal) - - if manifest is False: - return [] - - return self._parse_f4m_formats( - manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, - transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) - - def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, - transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True, m3u8_id=None): - # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy - akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') - if akamai_pv is not None and ';' in akamai_pv.text: - playerVerificationChallenge = akamai_pv.text.split(';')[0] - if playerVerificationChallenge.strip() != '': - return [] - - formats = [] - manifest_version = '1.0' - media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') - if not media_nodes: - manifest_version = '2.0' - media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') - # Remove unsupported DRM protected media from final formats - # rendition (see https://github.com/rg3/youtube-dl/issues/8573). - media_nodes = remove_encrypted_media(media_nodes) - if not media_nodes: - return formats - - manifest_base_url = get_base_url(manifest) - - bootstrap_info = xpath_element( - manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], - 'bootstrap info', default=None) - - vcodec = None - mime_type = xpath_text( - manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'], - 'base URL', default=None) - if mime_type and mime_type.startswith('audio/'): - vcodec = 'none' - - for i, media_el in enumerate(media_nodes): - tbr = int_or_none(media_el.attrib.get('bitrate')) - width = int_or_none(media_el.attrib.get('width')) - height = int_or_none(media_el.attrib.get('height')) - format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) - # If <bootstrapInfo> is present, the specified f4m is a - # stream-level manifest, and only set-level manifests may refer to - # external resources. See section 11.4 and section 4 of F4M spec - if bootstrap_info is None: - media_url = None - # @href is introduced in 2.0, see section 11.6 of F4M spec - if manifest_version == '2.0': - media_url = media_el.attrib.get('href') - if media_url is None: - media_url = media_el.attrib.get('url') - if not media_url: - continue - manifest_url = ( - media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) - # If media_url is itself a f4m manifest do the recursive extraction - # since bitrates in parent manifest (this one) and media_url manifest - # may differ leading to inability to resolve the format by requested - # bitrate in f4m downloader - ext = determine_ext(manifest_url) - if ext == 'f4m': - f4m_formats = self._extract_f4m_formats( - manifest_url, video_id, preference=preference, f4m_id=f4m_id, - transform_source=transform_source, fatal=fatal) - # Sometimes stream-level manifest contains single media entry that - # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). - # At the same time parent's media entry in set-level manifest may - # contain it. We will copy it from parent in such cases. - if len(f4m_formats) == 1: - f = f4m_formats[0] - f.update({ - 'tbr': f.get('tbr') or tbr, - 'width': f.get('width') or width, - 'height': f.get('height') or height, - 'format_id': f.get('format_id') if not tbr else format_id, - 'vcodec': vcodec, - }) - formats.extend(f4m_formats) - continue - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', preference=preference, - m3u8_id=m3u8_id, fatal=fatal)) - continue - formats.append({ - 'format_id': format_id, - 'url': manifest_url, - 'manifest_url': manifest_url, - 'ext': 'flv' if bootstrap_info is not None else None, - 'protocol': 'f4m', - 'tbr': tbr, - 'width': width, - 'height': height, - 'vcodec': vcodec, - 'preference': preference, - }) - return formats - - def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None): - return { - 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), - 'url': m3u8_url, - 'ext': ext, - 'protocol': 'm3u8', - 'preference': preference - 100 if preference else -100, - 'resolution': 'multiple', - 'format_note': 'Quality selection URL', - } - - def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, - entry_protocol='m3u8', preference=None, - m3u8_id=None, note=None, errnote=None, - fatal=True, live=False): - res = self._download_webpage_handle( - m3u8_url, video_id, - note=note or 'Downloading m3u8 information', - errnote=errnote or 'Failed to download m3u8 information', - fatal=fatal) - - if res is False: - return [] - - m3u8_doc, urlh = res - m3u8_url = urlh.geturl() - - return self._parse_m3u8_formats( - m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, - preference=preference, m3u8_id=m3u8_id, live=live) - - def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, - entry_protocol='m3u8', preference=None, - m3u8_id=None, live=False): - if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return [] - - if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay - return [] - - formats = [] - - format_url = lambda u: ( - u - if re.match(r'^https?://', u) - else compat_urlparse.urljoin(m3u8_url, u)) - - # References: - # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 - # 2. https://github.com/rg3/youtube-dl/issues/12211 - - # We should try extracting formats only from master playlists [1, 4.3.4], - # i.e. playlists that describe available qualities. On the other hand - # media playlists [1, 4.3.3] should be returned as is since they contain - # just the media without qualities renditions. - # Fortunately, master playlist can be easily distinguished from media - # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] - # master playlist tags MUST NOT appear in a media playist and vice versa. - # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every - # media playlist and MUST NOT appear in master playlist thus we can - # clearly detect media playlist with this criterion. - - if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is - return [{ - 'url': m3u8_url, - 'format_id': m3u8_id, - 'ext': ext, - 'protocol': entry_protocol, - 'preference': preference, - }] - - groups = {} - last_stream_inf = {} - - def extract_media(x_media_line): - media = parse_m3u8_attributes(x_media_line) - # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED - media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') - if not (media_type and group_id and name): - return - groups.setdefault(group_id, []).append(media) - if media_type not in ('VIDEO', 'AUDIO'): - return - media_url = media.get('URI') - if media_url: - format_id = [] - for v in (m3u8_id, group_id, name): - if v: - format_id.append(v) - f = { - 'format_id': '-'.join(format_id), - 'url': format_url(media_url), - 'manifest_url': m3u8_url, - 'language': media.get('LANGUAGE'), - 'ext': ext, - 'protocol': entry_protocol, - 'preference': preference, - } - if media_type == 'AUDIO': - f['vcodec'] = 'none' - formats.append(f) - - def build_stream_name(): - # Despite specification does not mention NAME attribute for - # EXT-X-STREAM-INF tag it still sometimes may be present (see [1] - # or vidio test in TestInfoExtractor.test_parse_m3u8_formats) - # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 - stream_name = last_stream_inf.get('NAME') - if stream_name: - return stream_name - # If there is no NAME in EXT-X-STREAM-INF it will be obtained - # from corresponding rendition group - stream_group_id = last_stream_inf.get('VIDEO') - if not stream_group_id: - return - stream_group = groups.get(stream_group_id) - if not stream_group: - return stream_group_id - rendition = stream_group[0] - return rendition.get('NAME') or stream_group_id - - for line in m3u8_doc.splitlines(): - if line.startswith('#EXT-X-STREAM-INF:'): - last_stream_inf = parse_m3u8_attributes(line) - elif line.startswith('#EXT-X-MEDIA:'): - extract_media(line) - elif line.startswith('#') or not line.strip(): - continue - else: - tbr = float_or_none( - last_stream_inf.get('AVERAGE-BANDWIDTH') or - last_stream_inf.get('BANDWIDTH'), scale=1000) - format_id = [] - if m3u8_id: - format_id.append(m3u8_id) - stream_name = build_stream_name() - # Bandwidth of live streams may differ over time thus making - # format_id unpredictable. So it's better to keep provided - # format_id intact. - if not live: - format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) - manifest_url = format_url(line.strip()) - f = { - 'format_id': '-'.join(format_id), - 'url': manifest_url, - 'manifest_url': m3u8_url, - 'tbr': tbr, - 'ext': ext, - 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), - 'protocol': entry_protocol, - 'preference': preference, - } - resolution = last_stream_inf.get('RESOLUTION') - if resolution: - mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) - if mobj: - f['width'] = int(mobj.group('width')) - f['height'] = int(mobj.group('height')) - # Unified Streaming Platform - mobj = re.search( - r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) - if mobj: - abr, vbr = mobj.groups() - abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) - f.update({ - 'vbr': vbr, - 'abr': abr, - }) - codecs = parse_codecs(last_stream_inf.get('CODECS')) - f.update(codecs) - audio_group_id = last_stream_inf.get('AUDIO') - # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which - # references a rendition group MUST have a CODECS attribute. - # However, this is not always respected, for example, [2] - # contains EXT-X-STREAM-INF tag which references AUDIO - # rendition group but does not have CODECS and despite - # referencing audio group an audio group, it represents - # a complete (with audio and video) format. So, for such cases - # we will ignore references to rendition groups and treat them - # as complete formats. - if audio_group_id and codecs and f.get('vcodec') != 'none': - audio_group = groups.get(audio_group_id) - if audio_group and audio_group[0].get('URI'): - # TODO: update acodec for audio only formats with - # the same GROUP-ID - f['acodec'] = 'none' - formats.append(f) - last_stream_inf = {} - return formats - - @staticmethod - def _xpath_ns(path, namespace=None): - if not namespace: - return path - out = [] - for c in path.split('/'): - if not c or c == '.': - out.append(c) - else: - out.append('{%s}%s' % (namespace, c)) - return '/'.join(out) - - def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) - - if smil is False: - assert not fatal - return [] - - namespace = self._parse_smil_namespace(smil) - - return self._parse_smil_formats( - smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - - def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal) - if smil is False: - return {} - return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) - - def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): - return self._download_xml( - smil_url, video_id, 'Downloading SMIL file', - 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) - - def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): - namespace = self._parse_smil_namespace(smil) - - formats = self._parse_smil_formats( - smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subtitles = self._parse_smil_subtitles(smil, namespace=namespace) - - video_id = os.path.splitext(url_basename(smil_url))[0] - title = None - description = None - upload_date = None - for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): - name = meta.attrib.get('name') - content = meta.attrib.get('content') - if not name or not content: - continue - if not title and name == 'title': - title = content - elif not description and name in ('description', 'abstract'): - description = content - elif not upload_date and name == 'date': - upload_date = unified_strdate(content) - - thumbnails = [{ - 'id': image.get('type'), - 'url': image.get('src'), - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] - - return { - 'id': video_id, - 'title': title or video_id, - 'description': description, - 'upload_date': upload_date, - 'thumbnails': thumbnails, - 'formats': formats, - 'subtitles': subtitles, - } - - def _parse_smil_namespace(self, smil): - return self._search_regex( - r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - base = smil_url - for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): - b = meta.get('base') or meta.get('httpBase') - if b: - base = b - break - - formats = [] - rtmp_count = 0 - http_count = 0 - m3u8_count = 0 - - srcs = [] - media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) - for medium in media: - src = medium.get('src') - if not src or src in srcs: - continue - srcs.append(src) - - bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) - filesize = int_or_none(medium.get('size') or medium.get('fileSize')) - width = int_or_none(medium.get('width')) - height = int_or_none(medium.get('height')) - proto = medium.get('proto') - ext = medium.get('ext') - src_ext = determine_ext(src) - streamer = medium.get('streamer') or base - - if proto == 'rtmp' or streamer.startswith('rtmp'): - rtmp_count += 1 - formats.append({ - 'url': streamer, - 'play_path': src, - 'ext': 'flv', - 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), - 'tbr': bitrate, - 'filesize': filesize, - 'width': width, - 'height': height, - }) - if transform_rtmp_url: - streamer, src = transform_rtmp_url(streamer, src) - formats[-1].update({ - 'url': streamer, - 'play_path': src, - }) - continue - - src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) - src_url = src_url.strip() - - if proto == 'm3u8' or src_ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) - if len(m3u8_formats) == 1: - m3u8_count += 1 - m3u8_formats[0].update({ - 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate), - 'tbr': bitrate, - 'width': width, - 'height': height, - }) - formats.extend(m3u8_formats) - continue - - if src_ext == 'f4m': - f4m_url = src_url - if not f4m_params: - f4m_params = { - 'hdcore': '3.2.0', - 'plugin': 'flowplayer-3.2.0.1', - } - f4m_url += '&' if '?' in f4m_url else '?' - f4m_url += compat_urllib_parse_urlencode(f4m_params) - formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) - continue - - if src_url.startswith('http') and self._is_valid_url(src, video_id): - http_count += 1 - formats.append({ - 'url': src_url, - 'ext': ext or src_ext or 'flv', - 'format_id': 'http-%d' % (bitrate or http_count), - 'tbr': bitrate, - 'filesize': filesize, - 'width': width, - 'height': height, - }) - continue - - return formats - - def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): - urls = [] - subtitles = {} - for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): - src = textstream.get('src') - if not src or src in urls: - continue - urls.append(src) - ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) - lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang - subtitles.setdefault(lang, []).append({ - 'url': src, - 'ext': ext, - }) - return subtitles - - def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): - xspf = self._download_xml( - xspf_url, playlist_id, 'Downloading xpsf playlist', - 'Unable to download xspf manifest', fatal=fatal) - if xspf is False: - return [] - return self._parse_xspf( - xspf, playlist_id, xspf_url=xspf_url, - xspf_base_url=base_url(xspf_url)) - - def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None): - NS_MAP = { - 'xspf': 'http://xspf.org/ns/0/', - 's1': 'http://static.streamone.nl/player/ns/0', - } - - entries = [] - for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): - title = xpath_text( - track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) - description = xpath_text( - track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') - thumbnail = xpath_text( - track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') - duration = float_or_none( - xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) - - formats = [] - for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)): - format_url = urljoin(xspf_base_url, location.text) - if not format_url: - continue - formats.append({ - 'url': format_url, - 'manifest_url': xspf_url, - 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), - 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), - 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), - }) - self._sort_formats(formats) - - entries.append({ - 'id': playlist_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - }) - return entries - - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): - res = self._download_xml_handle( - mpd_url, video_id, - note=note or 'Downloading MPD manifest', - errnote=errnote or 'Failed to download MPD manifest', - fatal=fatal) - if res is False: - return [] - mpd_doc, urlh = res - mpd_base_url = base_url(urlh.geturl()) - - return self._parse_mpd_formats( - mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url, - formats_dict=formats_dict, mpd_url=mpd_url) - - def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None): - """ - Parse formats from MPD manifest. - References: - 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), - http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip - 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP - """ - if mpd_doc.get('type') == 'dynamic': - return [] - - namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) - - def _add_ns(path): - return self._xpath_ns(path, namespace) - - def is_drm_protected(element): - return element.find(_add_ns('ContentProtection')) is not None - - def extract_multisegment_info(element, ms_parent_info): - ms_info = ms_parent_info.copy() - - # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some - # common attributes and elements. We will only extract relevant - # for us. - def extract_common(source): - segment_timeline = source.find(_add_ns('SegmentTimeline')) - if segment_timeline is not None: - s_e = segment_timeline.findall(_add_ns('S')) - if s_e: - ms_info['total_number'] = 0 - ms_info['s'] = [] - for s in s_e: - r = int(s.get('r', 0)) - ms_info['total_number'] += 1 + r - ms_info['s'].append({ - 't': int(s.get('t', 0)), - # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) - 'd': int(s.attrib['d']), - 'r': r, - }) - start_number = source.get('startNumber') - if start_number: - ms_info['start_number'] = int(start_number) - timescale = source.get('timescale') - if timescale: - ms_info['timescale'] = int(timescale) - segment_duration = source.get('duration') - if segment_duration: - ms_info['segment_duration'] = float(segment_duration) - - def extract_Initialization(source): - initialization = source.find(_add_ns('Initialization')) - if initialization is not None: - ms_info['initialization_url'] = initialization.attrib['sourceURL'] - - segment_list = element.find(_add_ns('SegmentList')) - if segment_list is not None: - extract_common(segment_list) - extract_Initialization(segment_list) - segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) - if segment_urls_e: - ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] - else: - segment_template = element.find(_add_ns('SegmentTemplate')) - if segment_template is not None: - extract_common(segment_template) - media = segment_template.get('media') - if media: - ms_info['media'] = media - initialization = segment_template.get('initialization') - if initialization: - ms_info['initialization'] = initialization - else: - extract_Initialization(segment_template) - return ms_info - - mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) - formats = [] - for period in mpd_doc.findall(_add_ns('Period')): - period_duration = parse_duration(period.get('duration')) or mpd_duration - period_ms_info = extract_multisegment_info(period, { - 'start_number': 1, - 'timescale': 1, - }) - for adaptation_set in period.findall(_add_ns('AdaptationSet')): - if is_drm_protected(adaptation_set): - continue - adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info) - for representation in adaptation_set.findall(_add_ns('Representation')): - if is_drm_protected(representation): - continue - representation_attrib = adaptation_set.attrib.copy() - representation_attrib.update(representation.attrib) - # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory - mime_type = representation_attrib['mimeType'] - content_type = mime_type.split('/')[0] - if content_type == 'text': - # TODO implement WebVTT downloading - pass - elif content_type in ('video', 'audio'): - base_url = '' - for element in (representation, adaptation_set, period, mpd_doc): - base_url_e = element.find(_add_ns('BaseURL')) - if base_url_e is not None: - base_url = base_url_e.text + base_url - if re.match(r'^https?://', base_url): - break - if mpd_base_url and not re.match(r'^https?://', base_url): - if not mpd_base_url.endswith('/') and not base_url.startswith('/'): - mpd_base_url += '/' - base_url = mpd_base_url + base_url - representation_id = representation_attrib.get('id') - lang = representation_attrib.get('lang') - url_el = representation.find(_add_ns('BaseURL')) - filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) - bandwidth = int_or_none(representation_attrib.get('bandwidth')) - f = { - 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, - 'url': base_url, - 'manifest_url': mpd_url, - 'ext': mimetype2ext(mime_type), - 'width': int_or_none(representation_attrib.get('width')), - 'height': int_or_none(representation_attrib.get('height')), - 'tbr': float_or_none(bandwidth, 1000), - 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), - 'fps': int_or_none(representation_attrib.get('frameRate')), - 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, - 'format_note': 'DASH %s' % content_type, - 'filesize': filesize, - 'container': mimetype2ext(mime_type) + '_dash', - } - f.update(parse_codecs(representation_attrib.get('codecs'))) - representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) - - def prepare_template(template_name, identifiers): - tmpl = representation_ms_info[template_name] - # First of, % characters outside $...$ templates - # must be escaped by doubling for proper processing - # by % operator string formatting used further (see - # https://github.com/rg3/youtube-dl/issues/16867). - t = '' - in_template = False - for c in tmpl: - t += c - if c == '$': - in_template = not in_template - elif c == '%' and not in_template: - t += c - # Next, $...$ templates are translated to their - # %(...) counterparts to be used with % operator - t = t.replace('$RepresentationID$', representation_id) - t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) - t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) - t.replace('$$', '$') - return t - - # @initialization is a regular template like @media one - # so it should be handled just the same way (see - # https://github.com/rg3/youtube-dl/issues/11605) - if 'initialization' in representation_ms_info: - initialization_template = prepare_template( - 'initialization', - # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and - # $Time$ shall not be included for @initialization thus - # only $Bandwidth$ remains - ('Bandwidth', )) - representation_ms_info['initialization_url'] = initialization_template % { - 'Bandwidth': bandwidth, - } - - def location_key(location): - return 'url' if re.match(r'^https?://', location) else 'path' - - if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: - - media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) - media_location_key = location_key(media_template) - - # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ - # can't be used at the same time - if '%(Number' in media_template and 's' not in representation_ms_info: - segment_duration = None - if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: - segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) - representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) - representation_ms_info['fragments'] = [{ - media_location_key: media_template % { - 'Number': segment_number, - 'Bandwidth': bandwidth, - }, - 'duration': segment_duration, - } for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] - else: - # $Number*$ or $Time$ in media template with S list available - # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg - # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 - representation_ms_info['fragments'] = [] - segment_time = 0 - segment_d = None - segment_number = representation_ms_info['start_number'] - - def add_segment_url(): - segment_url = media_template % { - 'Time': segment_time, - 'Bandwidth': bandwidth, - 'Number': segment_number, - } - representation_ms_info['fragments'].append({ - media_location_key: segment_url, - 'duration': float_or_none(segment_d, representation_ms_info['timescale']), - }) - - for num, s in enumerate(representation_ms_info['s']): - segment_time = s.get('t') or segment_time - segment_d = s['d'] - add_segment_url() - segment_number += 1 - for r in range(s.get('r', 0)): - segment_time += segment_d - add_segment_url() - segment_number += 1 - segment_time += segment_d - elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: - # No media template - # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI - # or any YouTube dashsegments video - fragments = [] - segment_index = 0 - timescale = representation_ms_info['timescale'] - for s in representation_ms_info['s']: - duration = float_or_none(s['d'], timescale) - for r in range(s.get('r', 0) + 1): - segment_uri = representation_ms_info['segment_urls'][segment_index] - fragments.append({ - location_key(segment_uri): segment_uri, - 'duration': duration, - }) - segment_index += 1 - representation_ms_info['fragments'] = fragments - elif 'segment_urls' in representation_ms_info: - # Segment URLs with no SegmentTimeline - # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 - # https://github.com/rg3/youtube-dl/pull/14844 - fragments = [] - segment_duration = float_or_none( - representation_ms_info['segment_duration'], - representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None - for segment_url in representation_ms_info['segment_urls']: - fragment = { - location_key(segment_url): segment_url, - } - if segment_duration: - fragment['duration'] = segment_duration - fragments.append(fragment) - representation_ms_info['fragments'] = fragments - # NB: MPD manifest may contain direct URLs to unfragmented media. - # No fragments key is present in this case. - if 'fragments' in representation_ms_info: - f.update({ - 'fragment_base_url': base_url, - 'fragments': [], - 'protocol': 'http_dash_segments', - }) - if 'initialization_url' in representation_ms_info: - initialization_url = representation_ms_info['initialization_url'] - if not f.get('url'): - f['url'] = initialization_url - f['fragments'].append({location_key(initialization_url): initialization_url}) - f['fragments'].extend(representation_ms_info['fragments']) - # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation - # is not necessarily unique within a Period thus formats with - # the same `format_id` are quite possible. There are numerous examples - # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111, - # https://github.com/rg3/youtube-dl/issues/13919) - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) - else: - self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) - return formats - - def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): - res = self._download_xml_handle( - ism_url, video_id, - note=note or 'Downloading ISM manifest', - errnote=errnote or 'Failed to download ISM manifest', - fatal=fatal) - if res is False: - return [] - ism_doc, urlh = res - - return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) - - def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): - """ - Parse formats from ISM manifest. - References: - 1. [MS-SSTR]: Smooth Streaming Protocol, - https://msdn.microsoft.com/en-us/library/ff469518.aspx - """ - if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: - return [] - - duration = int(ism_doc.attrib['Duration']) - timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000 - - formats = [] - for stream in ism_doc.findall('StreamIndex'): - stream_type = stream.get('Type') - if stream_type not in ('video', 'audio'): - continue - url_pattern = stream.attrib['Url'] - stream_timescale = int_or_none(stream.get('TimeScale')) or timescale - stream_name = stream.get('Name') - for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) - # TODO: add support for WVC1 and WMAP - if fourcc not in ('H264', 'AVC1', 'AACL'): - self.report_warning('%s is not a supported codec' % fourcc) - continue - tbr = int(track.attrib['Bitrate']) // 1000 - # [1] does not mention Width and Height attributes. However, - # they're often present while MaxWidth and MaxHeight are - # missing, so should be used as fallbacks - width = int_or_none(track.get('MaxWidth') or track.get('Width')) - height = int_or_none(track.get('MaxHeight') or track.get('Height')) - sampling_rate = int_or_none(track.get('SamplingRate')) - - track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) - track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) - - fragments = [] - fragment_ctx = { - 'time': 0, - } - stream_fragments = stream.findall('c') - for stream_fragment_index, stream_fragment in enumerate(stream_fragments): - fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time'] - fragment_repeat = int_or_none(stream_fragment.get('r')) or 1 - fragment_ctx['duration'] = int_or_none(stream_fragment.get('d')) - if not fragment_ctx['duration']: - try: - next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t']) - except IndexError: - next_fragment_time = duration - fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat - for _ in range(fragment_repeat): - fragments.append({ - 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), - 'duration': fragment_ctx['duration'] / stream_timescale, - }) - fragment_ctx['time'] += fragment_ctx['duration'] - - format_id = [] - if ism_id: - format_id.append(ism_id) - if stream_name: - format_id.append(stream_name) - format_id.append(compat_str(tbr)) - - formats.append({ - 'format_id': '-'.join(format_id), - 'url': ism_url, - 'manifest_url': ism_url, - 'ext': 'ismv' if stream_type == 'video' else 'isma', - 'width': width, - 'height': height, - 'tbr': tbr, - 'asr': sampling_rate, - 'vcodec': 'none' if stream_type == 'audio' else fourcc, - 'acodec': 'none' if stream_type == 'video' else fourcc, - 'protocol': 'ism', - 'fragments': fragments, - '_download_params': { - 'duration': duration, - 'timescale': stream_timescale, - 'width': width or 0, - 'height': height or 0, - 'fourcc': fourcc, - 'codec_private_data': track.get('CodecPrivateData'), - 'sampling_rate': sampling_rate, - 'channels': int_or_none(track.get('Channels', 2)), - 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)), - 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)), - }, - }) - return formats - - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None): - def absolute_url(item_url): - return urljoin(base_url, item_url) - - def parse_content_type(content_type): - if not content_type: - return {} - ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type) - if ctr: - mimetype, codecs = ctr.groups() - f = parse_codecs(codecs) - f['ext'] = mimetype2ext(mimetype) - return f - return {} - - def _media_formats(src, cur_media_type, type_info={}): - full_url = absolute_url(src) - ext = type_info.get('ext') or determine_ext(full_url) - if ext == 'm3u8': - is_plain_url = False - formats = self._extract_m3u8_formats( - full_url, video_id, ext='mp4', - entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, - preference=preference, fatal=False) - elif ext == 'mpd': - is_plain_url = False - formats = self._extract_mpd_formats( - full_url, video_id, mpd_id=mpd_id, fatal=False) - else: - is_plain_url = True - formats = [{ - 'url': full_url, - 'vcodec': 'none' if cur_media_type == 'audio' else None, - }] - return is_plain_url, formats - - entries = [] - # amp-video and amp-audio are very similar to their HTML5 counterparts - # so we wll include them right here (see - # https://www.ampproject.org/docs/reference/components/amp-video) - media_tags = [(media_tag, media_type, '') - for media_tag, media_type - in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)] - media_tags.extend(re.findall( - # We only allow video|audio followed by a whitespace or '>'. - # Allowing more characters may end up in significant slow down (see - # https://github.com/rg3/youtube-dl/issues/11979, example URL: - # http://www.porntrex.com/maps/videositemap.xml). - r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) - for media_tag, media_type, media_content in media_tags: - media_info = { - 'formats': [], - 'subtitles': {}, - } - media_attributes = extract_attributes(media_tag) - src = media_attributes.get('src') - if src: - _, formats = _media_formats(src, media_type) - media_info['formats'].extend(formats) - media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) - if media_content: - for source_tag in re.findall(r'<source[^>]+>', media_content): - source_attributes = extract_attributes(source_tag) - src = source_attributes.get('src') - if not src: - continue - f = parse_content_type(source_attributes.get('type')) - is_plain_url, formats = _media_formats(src, media_type, f) - if is_plain_url: - # res attribute is not standard but seen several times - # in the wild - f.update({ - 'height': int_or_none(source_attributes.get('res')), - 'format_id': source_attributes.get('label'), - }) - f.update(formats[0]) - media_info['formats'].append(f) - else: - media_info['formats'].extend(formats) - for track_tag in re.findall(r'<track[^>]+>', media_content): - track_attributes = extract_attributes(track_tag) - kind = track_attributes.get('kind') - if not kind or kind in ('subtitles', 'captions'): - src = track_attributes.get('src') - if not src: - continue - lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') - media_info['subtitles'].setdefault(lang, []).append({ - 'url': absolute_url(src), - }) - for f in media_info['formats']: - f.setdefault('http_headers', {})['Referer'] = base_url - if media_info['formats'] or media_info['subtitles']: - entries.append(media_info) - return entries - - def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): - formats = [] - hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') - hds_host = hosts.get('hds') - if hds_host: - f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) - if 'hdcore=' not in f4m_url: - f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign - f4m_formats = self._extract_f4m_formats( - f4m_url, video_id, f4m_id='hds', fatal=False) - for entry in f4m_formats: - entry.update({'extra_param_to_segment_url': hdcore_sign}) - formats.extend(f4m_formats) - m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') - hls_host = hosts.get('hls') - if hls_host: - m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - return formats - - def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): - query = compat_urlparse.urlparse(url).query - url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - mobj = re.search( - r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url) - url_base = mobj.group('url') - http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base) - formats = [] - - def manifest_url(manifest): - m_url = '%s/%s' % (http_base_url, manifest) - if query: - m_url += '?%s' % query - return m_url - - if 'm3u8' not in skip_protocols: - formats.extend(self._extract_m3u8_formats( - manifest_url('playlist.m3u8'), video_id, 'mp4', - m3u8_entry_protocol, m3u8_id='hls', fatal=False)) - if 'f4m' not in skip_protocols: - formats.extend(self._extract_f4m_formats( - manifest_url('manifest.f4m'), - video_id, f4m_id='hds', fatal=False)) - if 'dash' not in skip_protocols: - formats.extend(self._extract_mpd_formats( - manifest_url('manifest.mpd'), - video_id, mpd_id='dash', fatal=False)) - if re.search(r'(?:/smil:|\.smil)', url_base): - if 'smil' not in skip_protocols: - rtmp_formats = self._extract_smil_formats( - manifest_url('jwplayer.smil'), - video_id, fatal=False) - for rtmp_format in rtmp_formats: - rtsp_format = rtmp_format.copy() - rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) - del rtsp_format['play_path'] - del rtsp_format['ext'] - rtsp_format.update({ - 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), - 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), - 'protocol': 'rtsp', - }) - formats.extend([rtmp_format, rtsp_format]) - else: - for protocol in ('rtmp', 'rtsp'): - if protocol not in skip_protocols: - formats.append({ - 'url': '%s:%s' % (protocol, url_base), - 'format_id': protocol, - 'protocol': protocol, - }) - return formats - - def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): - mobj = re.search( - r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', - webpage) - if mobj: - try: - jwplayer_data = self._parse_json(mobj.group('options'), - video_id=video_id, - transform_source=transform_source) - except ExtractorError: - pass - else: - if isinstance(jwplayer_data, dict): - return jwplayer_data - - def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._find_jwplayer_data( - webpage, video_id, transform_source=js_to_json) - return self._parse_jwplayer_data( - jwplayer_data, video_id, *args, **kwargs) - - def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, - m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - - entries = [] - - # JWPlayer backward compatibility: single playlist item - # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] - - for video_data in jwplayer_data['playlist']: - # JWPlayer backward compatibility: flattened sources - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 - if 'sources' not in video_data: - video_data['sources'] = [video_data] - - this_video_id = video_id or video_data['mediaid'] - - formats = self._parse_jwplayer_formats( - video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, - mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) - - subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if not isinstance(track, dict): - continue - track_kind = track.get('kind') - if not track_kind or not isinstance(track_kind, compat_str): - continue - if track_kind.lower() not in ('captions', 'subtitles'): - continue - track_url = urljoin(base_url, track.get('file')) - if not track_url: - continue - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track_url) - }) - - entry = { - 'id': this_video_id, - 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), - 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), - 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), - 'subtitles': subtitles, - } - # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 - if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): - entry.update({ - '_type': 'url_transparent', - 'url': formats[0]['url'], - }) - else: - self._sort_formats(formats) - entry['formats'] = formats - entries.append(entry) - if len(entries) == 1: - return entries[0] - else: - return self.playlist_result(entries) - - def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, - m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - urls = [] - formats = [] - for source in jwplayer_sources_data: - if not isinstance(source, dict): - continue - source_url = self._proto_relative_url(source.get('file')) - if not source_url: - continue - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - if source_url in urls: - continue - urls.append(source_url) - source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=m3u8_id, fatal=False)) - elif source_type == 'dash' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - source_url, video_id, mpd_id=mpd_id, fatal=False)) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - source_url, video_id, fatal=False)) - # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 - elif source_type.startswith('audio') or ext in ( - 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - 'ext': ext, - }) - else: - height = int_or_none(source.get('height')) - if height is None: - # Often no height is provided but there is a label in - # format like "1080p", "720p SD", or 1080. - height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), - 'height', default=None)) - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': height, - 'tbr': int_or_none(source.get('bitrate')), - 'ext': ext, - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv' - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) - return formats - - def _live_title(self, name): - """ Generate the title for a live video """ - now = datetime.datetime.now() - now_str = now.strftime('%Y-%m-%d %H:%M') - return name + ' ' + now_str - - def _int(self, v, name, fatal=False, **kwargs): - res = int_or_none(v, **kwargs) - if 'get_attr' in kwargs: - print(getattr(v, kwargs['get_attr'])) - if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) - if fatal: - raise ExtractorError(msg) - else: - self._downloader.report_warning(msg) - return res - - def _float(self, v, name, fatal=False, **kwargs): - res = float_or_none(v, **kwargs) - if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) - if fatal: - raise ExtractorError(msg) - else: - self._downloader.report_warning(msg) - return res - - def _set_cookie(self, domain, name, value, expire_time=None, port=None, - path='/', secure=False, discard=False, rest={}, **kwargs): - cookie = compat_cookiejar.Cookie( - 0, name, value, port, port is not None, domain, True, - domain.startswith('.'), path, True, secure, expire_time, - discard, None, None, rest) - self._downloader.cookiejar.set_cookie(cookie) - - def _get_cookies(self, url): - """ Return a compat_cookies.SimpleCookie with the cookies for the url """ - req = sanitized_Request(url) - self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies.SimpleCookie(req.get_header('Cookie')) - - def get_testcases(self, include_onlymatching=False): - t = getattr(self, '_TEST', None) - if t: - assert not hasattr(self, '_TESTS'), \ - '%s has _TEST and _TESTS' % type(self).__name__ - tests = [t] - else: - tests = getattr(self, '_TESTS', []) - for t in tests: - if not include_onlymatching and t.get('only_matching', False): - continue - t['name'] = type(self).__name__[:-len('IE')] - yield t - - def is_suitable(self, age_limit): - """ Test whether the extractor is generally suitable for the given - age limit (i.e. pornographic sites are not, all others usually are) """ - - any_restricted = False - for tc in self.get_testcases(include_onlymatching=False): - if tc.get('playlist', []): - tc = tc['playlist'][0] - is_restricted = age_restricted( - tc.get('info_dict', {}).get('age_limit'), age_limit) - if not is_restricted: - return True - any_restricted = any_restricted or is_restricted - return not any_restricted - - def extract_subtitles(self, *args, **kwargs): - if (self._downloader.params.get('writesubtitles', False) or - self._downloader.params.get('listsubtitles')): - return self._get_subtitles(*args, **kwargs) - return {} - - def _get_subtitles(self, *args, **kwargs): - raise NotImplementedError('This method must be implemented by subclasses') - - @staticmethod - def _merge_subtitle_items(subtitle_list1, subtitle_list2): - """ Merge subtitle items for one language. Items with duplicated URLs - will be dropped. """ - list1_urls = set([item['url'] for item in subtitle_list1]) - ret = list(subtitle_list1) - ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) - return ret - - @classmethod - def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): - """ Merge two subtitle dictionaries, language by language. """ - ret = dict(subtitle_dict1) - for lang in subtitle_dict2: - ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) - return ret - - def extract_automatic_captions(self, *args, **kwargs): - if (self._downloader.params.get('writeautomaticsub', False) or - self._downloader.params.get('listsubtitles')): - return self._get_automatic_captions(*args, **kwargs) - return {} - - def _get_automatic_captions(self, *args, **kwargs): - raise NotImplementedError('This method must be implemented by subclasses') - - def mark_watched(self, *args, **kwargs): - if (self._downloader.params.get('mark_watched', False) and - (self._get_login_info()[0] is not None or - self._downloader.params.get('cookiefile') is not None)): - self._mark_watched(*args, **kwargs) - - def _mark_watched(self, *args, **kwargs): - raise NotImplementedError('This method must be implemented by subclasses') - - def geo_verification_headers(self): - headers = {} - geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') - if geo_verification_proxy: - headers['Ytdl-request-proxy'] = geo_verification_proxy - return headers - - def _generic_id(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) - - def _generic_title(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) - - -class SearchInfoExtractor(InfoExtractor): - """ - Base class for paged search queries extractors. - They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} - Instances should define _SEARCH_KEY and _MAX_RESULTS. - """ - - @classmethod - def _make_valid_url(cls): - return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY - - @classmethod - def suitable(cls, url): - return re.match(cls._make_valid_url(), url) is not None - - def _real_extract(self, query): - mobj = re.match(self._make_valid_url(), query) - if mobj is None: - raise ExtractorError('Invalid search query "%s"' % query) - - prefix = mobj.group('prefix') - query = mobj.group('query') - if prefix == '': - return self._get_n_results(query, 1) - elif prefix == 'all': - return self._get_n_results(query, self._MAX_RESULTS) - else: - n = int(prefix) - if n <= 0: - raise ExtractorError('invalid download number %s for query "%s"' % (n, query)) - elif n > self._MAX_RESULTS: - self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) - n = self._MAX_RESULTS - return self._get_n_results(query, n) - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - raise NotImplementedError('This method must be implemented by subclasses') - - @property - def SEARCH_KEY(self): - return self._SEARCH_KEY diff --git a/youtube_dl/extractor/commonmistakes.py b/youtube_dl/extractor/commonmistakes.py deleted file mode 100644 index 79f7a9c..0000000 --- a/youtube_dl/extractor/commonmistakes.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import unicode_literals - -import sys - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class CommonMistakesIE(InfoExtractor): - IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:url|URL)$ - ''' - - _TESTS = [{ - 'url': 'url', - 'only_matching': True, - }, { - 'url': 'URL', - 'only_matching': True, - }] - - def _real_extract(self, url): - msg = ( - 'You\'ve asked youtube-dl to download the URL "%s". ' - 'That doesn\'t make any sense. ' - 'Simply remove the parameter in your command or configuration.' - ) % url - if not self._downloader.params.get('verbose'): - msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.' - raise ExtractorError(msg, expected=True) - - -class UnicodeBOMIE(InfoExtractor): - IE_DESC = False - _VALID_URL = r'(?P<bom>\ufeff)(?P<id>.*)$' - - # Disable test for python 3.2 since BOM is broken in re in this version - # (see https://github.com/rg3/youtube-dl/issues/9751) - _TESTS = [] if (3, 0) < sys.version_info <= (3, 3) else [{ - 'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc', - 'only_matching': True, - }] - - def _real_extract(self, url): - real_url = self._match_id(url) - self.report_warning( - 'Your URL starts with a Byte Order Mark (BOM). ' - 'Removing the BOM and looking for "%s" ...' % real_url) - return self.url_result(real_url) diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py deleted file mode 100644 index d98331a..0000000 --- a/youtube_dl/extractor/commonprotocols.py +++ /dev/null @@ -1,60 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) - - -class RtmpIE(InfoExtractor): - IE_DESC = False # Do not list - _VALID_URL = r'(?i)rtmp[est]?://.+' - - _TESTS = [{ - 'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4', - 'only_matching': True, - }, { - 'url': 'rtmp://edge.live.hitbox.tv/live/dimak', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._generic_id(url) - title = self._generic_title(url) - return { - 'id': video_id, - 'title': title, - 'formats': [{ - 'url': url, - 'ext': 'flv', - 'format_id': compat_urlparse.urlparse(url).scheme, - }], - } - - -class MmsIE(InfoExtractor): - IE_DESC = False # Do not list - _VALID_URL = r'(?i)mms://.+' - - _TEST = { - # Direct MMS link - 'url': 'mms://kentro.kaist.ac.kr/200907/MilesReid(0709).wmv', - 'info_dict': { - 'id': 'MilesReid(0709)', - 'ext': 'wmv', - 'title': 'MilesReid(0709)', - }, - 'params': { - 'skip_download': True, # rtsp downloads, requiring mplayer or mpv - }, - } - - def _real_extract(self, url): - video_id = self._generic_id(url) - title = self._generic_title(url) - - return { - 'id': video_id, - 'title': title, - 'url': url, - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py deleted file mode 100644 index ceb72da..0000000 --- a/youtube_dl/extractor/extractors.py +++ /dev/null @@ -1,31 +0,0 @@ -# flake8: noqa -from __future__ import unicode_literals - - -from .commonmistakes import CommonMistakesIE, UnicodeBOMIE -from .commonprotocols import ( - MmsIE, - RtmpIE, -) - -from .openload import OpenloadIE - -from .youtube import ( - YoutubeIE, - YoutubeChannelIE, - YoutubeFavouritesIE, - YoutubeHistoryIE, - YoutubeLiveIE, - YoutubePlaylistIE, - YoutubePlaylistsIE, - YoutubeRecommendedIE, - YoutubeSearchDateIE, - YoutubeSearchIE, - YoutubeSearchURLIE, - YoutubeShowIE, - YoutubeSubscriptionsIE, - YoutubeTruncatedIDIE, - YoutubeTruncatedURLIE, - YoutubeUserIE, - YoutubeWatchLaterIE, -) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py deleted file mode 100644 index aa04905..0000000 --- a/youtube_dl/extractor/generic.py +++ /dev/null @@ -1,3335 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -import os -import re -import sys - -from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, - compat_xml_parse_error, -) -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - HEADRequest, - is_html, - js_to_json, - KNOWN_EXTENSIONS, - merge_dicts, - mimetype2ext, - orderedSet, - sanitized_Request, - smuggle_url, - unescapeHTML, - unified_strdate, - unsmuggle_url, - UnsupportedError, - xpath_text, -) -from .commonprotocols import RtmpIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nbc import NBCSportsVPlayerIE -from .ooyala import OoyalaIE -from .rutv import RUTVIE -from .tvc import TVCIE -from .sportbox import SportBoxEmbedIE -from .smotri import SmotriIE -from .myvi import MyviIE -from .condenast import CondeNastIE -from .udn import UDNEmbedIE -from .senateisvp import SenateISVPIE -from .svt import SVTIE -from .pornhub import PornHubIE -from .xhamster import XHamsterEmbedIE -from .tnaflix import TNAFlixNetworkEmbedIE -from .drtuber import DrTuberIE -from .redtube import RedTubeIE -from .tube8 import Tube8IE -from .vimeo import VimeoIE -from .dailymotion import DailymotionIE -from .dailymail import DailyMailIE -from .onionstudios import OnionStudiosIE -from .viewlift import ViewLiftEmbedIE -from .mtv import MTVServicesEmbeddedIE -from .pladform import PladformIE -from .videomore import VideomoreIE -from .webcaster import WebcasterFeedIE -from .googledrive import GoogleDriveIE -from .jwplatform import JWPlatformIE -from .digiteka import DigitekaIE -from .arkena import ArkenaIE -from .instagram import InstagramIE -from .liveleak import LiveLeakIE -from .threeqsdn import ThreeQSDNIE -from .theplatform import ThePlatformIE -from .vessel import VesselIE -from .kaltura import KalturaIE -from .eagleplatform import EaglePlatformIE -from .facebook import FacebookIE -from .soundcloud import SoundcloudIE -from .tunein import TuneInBaseIE -from .vbox7 import Vbox7IE -from .dbtv import DBTVIE -from .piksel import PikselIE -from .videa import VideaIE -from .twentymin import TwentyMinutenIE -from .ustream import UstreamIE -from .openload import OpenloadIE -from .videopress import VideoPressIE -from .rutube import RutubeIE -from .limelight import LimelightBaseIE -from .anvato import AnvatoIE -from .washingtonpost import WashingtonPostIE -from .wistia import WistiaIE -from .mediaset import MediasetIE -from .joj import JojIE -from .megaphone import MegaphoneIE -from .vzaar import VzaarIE -from .channel9 import Channel9IE -from .vshare import VShareIE -from .mediasite import MediasiteIE -from .springboardplatform import SpringboardPlatformIE -from .yapfiles import YapFilesIE -from .vice import ViceIE -from .xfileshare import XFileShareIE -from .cloudflarestream import CloudflareStreamIE -from .peertube import PeerTubeIE -from .indavideo import IndavideoEmbedIE -from .apa import APAIE -from .foxnews import FoxNewsIE - - -class GenericIE(InfoExtractor): - IE_DESC = 'Generic downloader that works on some sites' - _VALID_URL = r'.*' - IE_NAME = 'generic' - _TESTS = [ - # Direct link to a video - { - 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', - 'md5': '67d406c2bcb6af27fa886f31aa934bbe', - 'info_dict': { - 'id': 'trailer', - 'ext': 'mp4', - 'title': 'trailer', - 'upload_date': '20100513', - } - }, - # Direct link to media delivered compressed (until Accept-Encoding is *) - { - 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', - 'md5': '128c42e68b13950268b648275386fc74', - 'info_dict': { - 'id': 'FictionJunction-Parallel_Hearts', - 'ext': 'flac', - 'title': 'FictionJunction-Parallel_Hearts', - 'upload_date': '20140522', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ], - 'skip': 'URL invalid', - }, - # Direct download with broken HEAD - { - 'url': 'http://ai-radio.org:8000/radio.opus', - 'info_dict': { - 'id': 'radio', - 'ext': 'opus', - 'title': 'radio', - }, - 'params': { - 'skip_download': True, # infinite live stream - }, - 'expected_warnings': [ - r'501.*Not Implemented', - r'400.*Bad Request', - ], - }, - # Direct link with incorrect MIME type - { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'md5': '4ccbebe5f36706d85221f204d7eb5913', - 'info_dict': { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'id': '5_Lennart_Poettering_-_Systemd', - 'ext': 'webm', - 'title': '5_Lennart_Poettering_-_Systemd', - 'upload_date': '20141120', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] - }, - # RSS feed - { - 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' - }, - 'playlist_mincount': 11, - }, - # RSS feed with enclosure - { - 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', - 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } - }, - # RSS feed with enclosures and unsupported link URLs - { - 'url': 'http://www.hellointernet.fm/podcast?format=rss', - 'info_dict': { - 'id': 'http://www.hellointernet.fm/podcast?format=rss', - 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', - 'title': 'Hello Internet', - }, - 'playlist_mincount': 100, - }, - # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng - { - 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', - 'info_dict': { - 'id': 'smil', - 'ext': 'mp4', - 'title': 'Automatics, robotics and biocybernetics', - 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', - 'upload_date': '20130627', - 'formats': 'mincount:16', - 'subtitles': 'mincount:1', - }, - 'params': { - 'force_generic_extractor': True, - 'skip_download': True, - }, - }, - # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html - { - 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', - 'info_dict': { - 'id': 'hds', - 'ext': 'flv', - 'title': 'hds', - 'formats': 'mincount:1', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from https://www.restudy.dk/video/play/id/1637 - { - 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', - 'info_dict': { - 'id': 'video_1637', - 'ext': 'flv', - 'title': 'video_1637', - 'formats': 'mincount:3', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm - { - 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', - 'info_dict': { - 'id': 'smil-service', - 'ext': 'flv', - 'title': 'smil-service', - 'formats': 'mincount:1', - }, - 'params': { - 'skip_download': True, - }, - }, - # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 - { - 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', - 'info_dict': { - 'id': '4719370', - 'ext': 'mp4', - 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', - 'formats': 'mincount:3', - }, - 'params': { - 'skip_download': True, - }, - }, - # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html - { - 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', - 'info_dict': { - 'id': 'mZlp2ctYIUEB', - 'ext': 'mp4', - 'title': 'Tikibad ontruimd wegens brand', - 'description': 'md5:05ca046ff47b931f9b04855015e163a4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 33, - }, - 'params': { - 'skip_download': True, - }, - }, - # MPD from http://dash-mse-test.appspot.com/media.html - { - 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', - 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', - 'info_dict': { - 'id': 'car-20120827-manifest', - 'ext': 'mp4', - 'title': 'car-20120827-manifest', - 'formats': 'mincount:9', - 'upload_date': '20130904', - }, - 'params': { - 'format': 'bestvideo', - }, - }, - # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 - { - 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', - 'info_dict': { - 'id': 'content', - 'ext': 'mp4', - 'title': 'content', - 'formats': 'mincount:8', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'skip': 'video gone', - }, - # m3u8 served with Content-Type: text/plain - { - 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', - 'info_dict': { - 'id': 'index', - 'ext': 'mp4', - 'title': 'index', - 'upload_date': '20140720', - 'formats': 'mincount:11', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'skip': 'video gone', - }, - # google redirect - { - 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', - 'info_dict': { - 'id': 'cmQHVoWB5FY', - 'ext': 'mp4', - 'upload_date': '20130224', - 'uploader_id': 'TheVerge', - 'description': r're:^Chris Ziegler takes a look at the\.*', - 'uploader': 'The Verge', - 'title': 'First Firefox OS phones side-by-side', - }, - 'params': { - 'skip_download': False, - } - }, - { - # redirect in Refresh HTTP header - 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', - 'info_dict': { - 'id': 'pO8h3EaFRdo', - 'ext': 'mp4', - 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', - 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', - 'upload_date': '20150917', - 'uploader_id': 'brtvofficial', - 'uploader': 'Boiler Room', - }, - 'params': { - 'skip_download': False, - }, - }, - { - 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', - 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', - 'info_dict': { - 'id': '13601338388002', - 'ext': 'mp4', - 'uploader': 'www.hodiho.fr', - 'title': 'R\u00e9gis plante sa Jeep', - } - }, - # bandcamp page with custom domain - { - 'add_ie': ['Bandcamp'], - 'url': 'http://bronyrock.com/track/the-pony-mash', - 'info_dict': { - 'id': '3235767654', - 'ext': 'mp3', - 'title': 'The Pony Mash', - 'uploader': 'M_Pallante', - }, - 'skip': 'There is a limit of 200 free downloads / month for the test song', - }, - { - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' - # in the http requests - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', - 'info_dict': { - 'id': '2765128793001', - 'ext': 'mp4', - 'title': 'Le cours de bourse : l’analyse technique', - 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', - 'uploader': 'BFM BUSINESS', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # embedded with itemprop embedURL and video id spelled as `idVideo` - 'add_id': ['BrightcoveLegacy'], - 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', - 'info_dict': { - 'id': '5255628253001', - 'ext': 'mp4', - 'title': 'md5:37c519b1128915607601e75a87995fc0', - 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', - 'uploader': 'BFM BUSINESS', - 'uploader_id': '876450612001', - 'timestamp': 1482255315, - 'upload_date': '20161220', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # https://github.com/rg3/youtube-dl/issues/2253 - 'url': 'http://bcove.me/i6nfkrc3', - 'md5': '0ba9446db037002366bab3b3eb30c88c', - 'info_dict': { - 'id': '3101154703001', - 'ext': 'mp4', - 'title': 'Still no power', - 'uploader': 'thestar.com', - 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', - }, - 'add_ie': ['BrightcoveLegacy'], - 'skip': 'video gone', - }, - { - 'url': 'http://www.championat.com/video/football/v/87/87499.html', - 'md5': 'fb973ecf6e4a78a67453647444222983', - 'info_dict': { - 'id': '3414141473001', - 'ext': 'mp4', - 'title': 'Видео. Удаление Дзагоева (ЦСКА)', - 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', - 'uploader': 'Championat', - }, - }, - { - # https://github.com/rg3/youtube-dl/issues/3541 - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', - 'info_dict': { - 'id': '3866516442001', - 'ext': 'mp4', - 'title': 'Leer mij vrouwen kennen: Aflevering 1', - 'description': 'Leer mij vrouwen kennen: Aflevering 1', - 'uploader': 'SBS Broadcasting', - }, - 'skip': 'Restricted to Netherlands', - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - { - # Brightcove video in <iframe> - 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', - 'md5': '36d74ef5e37c8b4a2ce92880d208b968', - 'info_dict': { - 'id': '5360463607001', - 'ext': 'mp4', - 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活', - 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', - 'uploader': 'United Nations', - 'uploader_id': '1362235914001', - 'timestamp': 1489593889, - 'upload_date': '20170315', - }, - 'add_ie': ['BrightcoveLegacy'], - }, - { - # Brightcove with alternative playerID key - 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', - 'info_dict': { - 'id': 'nmeth.2062_SV1', - 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', - }, - 'playlist': [{ - 'info_dict': { - 'id': '2228375078001', - 'ext': 'mp4', - 'title': 'nmeth.2062-sv1', - 'description': 'nmeth.2062-sv1', - 'timestamp': 1363357591, - 'upload_date': '20130315', - 'uploader': 'Nature Publishing Group', - 'uploader_id': '1964492299001', - }, - }], - }, - { - # Brightcove with UUID in videoPlayer - 'url': 'http://www8.hp.com/cn/zh/home.html', - 'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - 'skip': 'video rotates...weekly?', - }, - { - # Brightcove:new type [2]. - 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis', - 'md5': '2b35148fcf48da41c9fb4591650784f3', - 'info_dict': { - 'id': '5348741021001', - 'ext': 'mp4', - 'upload_date': '20170306', - 'uploader_id': '4191638492001', - 'timestamp': 1488769918, - 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis', - - }, - }, - { - # Alternative brightcove <video> attributes - 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/', - 'info_dict': { - 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche', - 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs", - }, - 'playlist': [{ - 'md5': '732d22ba3d33f2f3fc253c39f8f36523', - 'info_dict': { - 'id': '5311302538001', - 'ext': 'mp4', - 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche", - 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)", - 'timestamp': 1486321708, - 'upload_date': '20170205', - 'uploader_id': '800000640001', - }, - 'only_matching': True, - }], - }, - { - # Brightcove with UUID in videoPlayer - 'url': 'http://www8.hp.com/cn/zh/home.html', - 'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - # ooyala video - { - 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', - 'md5': '166dd577b433b4d4ebfee10b0824d8ff', - 'info_dict': { - 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', - 'ext': 'mp4', - 'title': '2cc213299525360.mov', # that's what we get - 'duration': 238.231, - }, - 'add_ie': ['Ooyala'], - }, - { - # ooyala video embedded with http://player.ooyala.com/iframe.js - 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/', - 'info_dict': { - 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB', - 'ext': 'mp4', - 'title': '"Steve Jobs: Man in the Machine" trailer', - 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', - 'duration': 135.427, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'movie expired', - }, - # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js - { - 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', - 'info_dict': { - 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', - 'ext': 'mp4', - 'title': 'Steampunk Fest Comes to Honesdale', - 'duration': 43.276, - }, - 'params': { - 'skip_download': True, - } - }, - # embed.ly video - { - 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', - 'info_dict': { - 'id': '9ODmcdjQcHQ', - 'ext': 'mp4', - 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second', - 'upload_date': '20140225', - 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff', - 'uploader': 'Tested', - 'uploader_id': 'testedcom', - }, - # No need to test YoutubeIE here - 'params': { - 'skip_download': True, - }, - }, - # funnyordie embed - { - 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', - 'info_dict': { - 'id': '18e820ec3f', - 'ext': 'mp4', - 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama', - 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', - }, - # HEAD requests lead to endless 301, while GET is OK - 'expected_warnings': ['301'], - }, - # RUTV embed - { - 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', - 'info_dict': { - 'id': '776940', - 'ext': 'mp4', - 'title': 'Охотское море стало целиком российским', - 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - # TVC embed - { - 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/', - 'info_dict': { - 'id': '55304', - 'ext': 'mp4', - 'title': 'Дошкольное воспитание', - }, - }, - # SportBox embed - { - 'url': 'http://www.vestifinance.ru/articles/25753', - 'info_dict': { - 'id': '25753', - 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', - }, - 'playlist': [{ - 'info_dict': { - 'id': '370908', - 'title': 'Госзаказ. День 3', - 'ext': 'mp4', - } - }, { - 'info_dict': { - 'id': '370905', - 'title': 'Госзаказ. День 2', - 'ext': 'mp4', - } - }, { - 'info_dict': { - 'id': '370902', - 'title': 'Госзаказ. День 1', - 'ext': 'mp4', - } - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - # Myvi.ru embed - { - 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1', - 'info_dict': { - 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e', - 'ext': 'mp4', - 'title': 'Ужастики, русский трейлер (2015)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 153, - } - }, - # XHamster embed - { - 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8', - 'info_dict': { - 'id': 'showthread', - 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', - }, - 'playlist_mincount': 7, - # This forum does not allow <iframe> syntaxes anymore - # Now HTML tags are displayed as-is - 'skip': 'No videos on this page', - }, - # Embedded TED video - { - 'url': 'http://en.support.wordpress.com/videos/ted-talks/', - 'md5': '65fdff94098e4a607385a60c5177c638', - 'info_dict': { - 'id': '1969', - 'ext': 'mp4', - 'title': 'Hidden miracles of the natural world', - 'uploader': 'Louie Schwartzberg', - 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', - } - }, - # nowvideo embed hidden behind percent encoding - { - 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/', - 'md5': '2baf4ddd70f697d94b1c18cf796d5107', - 'info_dict': { - 'id': '06e53103ca9aa', - 'ext': 'flv', - 'title': 'Macross Episode 001 Watch Macross Episode 001 onl', - 'description': 'No description', - }, - }, - # arte embed - { - 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html', - 'md5': '7653032cbb25bf6c80d80f217055fa43', - 'info_dict': { - 'id': '048195-004_PLUS7-F', - 'ext': 'flv', - 'title': 'X:enius', - 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168', - 'upload_date': '20140320', - }, - 'params': { - 'skip_download': 'Requires rtmpdump' - }, - 'skip': 'video gone', - }, - # francetv embed - { - 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero', - 'info_dict': { - 'id': 'EV_30231', - 'ext': 'mp4', - 'title': 'Alcaline, le concert avec Calogero', - 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', - 'upload_date': '20150226', - 'timestamp': 1424989860, - 'duration': 5400, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - 'expected_warnings': [ - 'Forbidden' - ] - }, - # Condé Nast embed - { - 'url': 'http://www.wired.com/2014/04/honda-asimo/', - 'md5': 'ba0dfe966fa007657bd1443ee672db0f', - 'info_dict': { - 'id': '53501be369702d3275860000', - 'ext': 'mp4', - 'title': 'Honda’s New Asimo Robot Is More Human Than Ever', - } - }, - # Dailymotion embed - { - 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/', - 'md5': '441aeeb82eb72c422c7f14ec533999cd', - 'info_dict': { - 'id': 'k2mm4bCdJ6CQ2i7c8o2', - 'ext': 'mp4', - 'title': 'Le Zap de Spi0n n°216 - Zapping du Web', - 'description': 'md5:faf028e48a461b8b7fad38f1e104b119', - 'uploader': 'Spi0n', - 'uploader_id': 'xgditw', - 'upload_date': '20140425', - 'timestamp': 1398441542, - }, - 'add_ie': ['Dailymotion'], - }, - # DailyMail embed - { - 'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot', - 'info_dict': { - 'id': '1495629', - 'ext': 'mp4', - 'title': 'Care worker punches elderly dementia patient in head 11 times', - 'description': 'md5:3a743dee84e57e48ec68bf67113199a5', - }, - 'add_ie': ['DailyMail'], - 'params': { - 'skip_download': True, - }, - }, - # YouTube embed - { - 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', - 'info_dict': { - 'id': 'FXRb4ykk4S0', - 'ext': 'mp4', - 'title': 'The NBL Auction 2014', - 'uploader': 'BADMINTON England', - 'uploader_id': 'BADMINTONEvents', - 'upload_date': '20140603', - 'description': 'md5:9ef128a69f1e262a700ed83edb163a73', - }, - 'add_ie': ['Youtube'], - 'params': { - 'skip_download': True, - } - }, - # MTVSercices embed - { - 'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html', - 'md5': 'ca1aef97695ef2c1d6973256a57e5252', - 'info_dict': { - 'id': '769f7ec0-0692-4d62-9b45-0d88074bffc1', - 'ext': 'mp4', - 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', - 'description': 'Two valets share their love for movie star Liam Neesons.', - 'timestamp': 1349922600, - 'upload_date': '20121011', - }, - }, - # YouTube embed via <data-embed-url=""> - { - 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', - 'info_dict': { - 'id': '4vAffPZIT44', - 'ext': 'mp4', - 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', - 'uploader': 'Gameloft', - 'uploader_id': 'gameloft', - 'upload_date': '20140828', - 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', - }, - 'params': { - 'skip_download': True, - } - }, - # YouTube <object> embed - { - 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/', - 'md5': '516718101ec834f74318df76259fb3cc', - 'info_dict': { - 'id': 'msN87y-iEx0', - 'ext': 'webm', - 'title': 'Feynman: Mirrors FUN TO IMAGINE 6', - 'upload_date': '20080526', - 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d', - 'uploader': 'Christopher Sykes', - 'uploader_id': 'ChristopherJSykes', - }, - 'add_ie': ['Youtube'], - }, - # Camtasia studio - { - 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', - 'playlist': [{ - 'md5': '0c5e352edabf715d762b0ad4e6d9ee67', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', - 'ext': 'flv', - 'duration': 2235.90, - } - }, { - 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip', - 'ext': 'flv', - 'duration': 2235.93, - } - }], - 'info_dict': { - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - } - }, - # Flowplayer - { - 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', - 'md5': '9d65602bf31c6e20014319c7d07fba27', - 'info_dict': { - 'id': '5123ea6d5e5a7', - 'ext': 'mp4', - 'age_limit': 18, - 'uploader': 'www.handjobhub.com', - 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', - } - }, - # Multiple brightcove videos - # https://github.com/rg3/youtube-dl/issues/2283 - { - 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', - 'info_dict': { - 'id': 'always-never', - 'title': 'Always / Never - The New Yorker', - }, - 'playlist_count': 3, - 'params': { - 'extract_flat': False, - 'skip_download': True, - } - }, - # MLB embed - { - 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', - 'md5': '96f09a37e44da40dd083e12d9a683327', - 'info_dict': { - 'id': '33322633', - 'ext': 'mp4', - 'title': 'Ump changes call to ball', - 'description': 'md5:71c11215384298a172a6dcb4c2e20685', - 'duration': 48, - 'timestamp': 1401537900, - 'upload_date': '20140531', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - # Wistia embed - { - 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', - 'info_dict': { - 'id': '6e2wtrbdaf', - 'ext': 'mov', - 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', - 'description': 'a Paywall Videos video from Remilon', - 'duration': 644.072, - 'uploader': 'study.com', - 'timestamp': 1459678540, - 'upload_date': '20160403', - 'filesize': 24687186, - }, - }, - { - 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', - 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', - 'info_dict': { - 'id': 'uxjb0lwrcz', - 'ext': 'mp4', - 'title': 'Conversation about Hexagonal Rails Part 1', - 'description': 'a Martin Fowler video from ThoughtWorks', - 'duration': 1715.0, - 'uploader': 'thoughtworks.wistia.com', - 'timestamp': 1401832161, - 'upload_date': '20140603', - }, - }, - # Wistia standard embed (async) - { - 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', - 'info_dict': { - 'id': '807fafadvk', - 'ext': 'mp4', - 'title': 'Drip Brennan Dunn Workshop', - 'description': 'a JV Webinars video from getdrip-1', - 'duration': 4986.95, - 'timestamp': 1463607249, - 'upload_date': '20160518', - }, - 'params': { - 'skip_download': True, - } - }, - # Soundcloud embed - { - 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', - 'info_dict': { - 'id': '174391317', - 'ext': 'mp3', - 'description': 'md5:ff867d6b555488ad3c52572bb33d432c', - 'uploader': 'Sophos Security', - 'title': 'Chet Chat 171 - Oct 29, 2014', - 'upload_date': '20141029', - } - }, - # Soundcloud multiple embeds - { - 'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809', - 'info_dict': { - 'id': '52809', - 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO', - }, - 'playlist_mincount': 7, - }, - # TuneIn station embed - { - 'url': 'http://radiocnrv.com/promouvoir-radio-cnrv/', - 'info_dict': { - 'id': '204146', - 'ext': 'mp3', - 'title': 'CNRV', - 'location': 'Paris, France', - 'is_live': True, - }, - 'params': { - # Live stream - 'skip_download': True, - }, - }, - # Livestream embed - { - 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', - 'info_dict': { - 'id': '67864563', - 'ext': 'flv', - 'upload_date': '20141112', - 'title': 'Rosetta #CometLanding webcast HL 10', - } - }, - # Another Livestream embed, without 'new.' in URL - { - 'url': 'https://www.freespeech.org/', - 'info_dict': { - 'id': '123537347', - 'ext': 'mp4', - 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - }, - 'params': { - # Live stream - 'skip_download': True, - }, - }, - # LazyYT - { - 'url': 'https://skiplagged.com/', - 'info_dict': { - 'id': 'skiplagged', - 'title': 'Skiplagged: The smart way to find cheap flights', - }, - 'playlist_mincount': 1, - 'add_ie': ['Youtube'], - }, - # Cinchcast embed - { - 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', - 'info_dict': { - 'id': '7141703', - 'ext': 'mp3', - 'upload_date': '20141126', - 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', - } - }, - # Cinerama player - { - 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm', - 'info_dict': { - 'id': '730m_DandD_1901_512k', - 'ext': 'mp4', - 'uploader': 'www.abc.net.au', - 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015', - } - }, - # embedded viddler video - { - 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597', - 'info_dict': { - 'id': '4d03aad9', - 'ext': 'mp4', - 'uploader': 'deadspin', - 'title': 'WALL-TO-GORTAT', - 'timestamp': 1422285291, - 'upload_date': '20150126', - }, - 'add_ie': ['Viddler'], - }, - # Libsyn embed - { - 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve', - 'info_dict': { - 'id': '3377616', - 'ext': 'mp3', - 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", - 'description': 'md5:601cb790edd05908957dae8aaa866465', - 'upload_date': '20150220', - }, - 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/', - }, - # jwplayer YouTube - { - 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', - 'info_dict': { - 'id': 'Mrj4DVp2zeA', - 'ext': 'mp4', - 'upload_date': '20150212', - 'uploader': 'The National Archives UK', - 'description': 'md5:8078af856dca76edc42910b61273dbbf', - 'uploader_id': 'NationalArchives08', - 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue', - }, - }, - # jwplayer rtmp - { - 'url': 'http://www.suffolk.edu/sjc/live.php', - 'info_dict': { - 'id': 'live', - 'ext': 'flv', - 'title': 'Massachusetts Supreme Judicial Court Oral Arguments', - 'uploader': 'www.suffolk.edu', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/', - }, - # Complex jwplayer - { - 'url': 'http://www.indiedb.com/games/king-machine/videos', - 'info_dict': { - 'id': 'videos', - 'ext': 'mp4', - 'title': 'king machine trailer 1', - 'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, - { - # JWPlayer config passed as variable - 'url': 'http://www.txxx.com/videos/3326530/ariele/', - 'info_dict': { - 'id': '3326530_hq', - 'ext': 'mp4', - 'title': 'ARIELE | Tube Cup', - 'uploader': 'www.txxx.com', - 'age_limit': 18, - }, - 'params': { - 'skip_download': True, - } - }, - { - # JWPlatform iframe - 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/', - 'md5': 'ca00a040364b5b439230e7ebfd02c4e9', - 'info_dict': { - 'id': 'O0c5JcKT', - 'ext': 'mp4', - 'upload_date': '20171122', - 'timestamp': 1511366290, - 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone', - }, - 'add_ie': [JWPlatformIE.ie_key()], - }, - { - # Video.js embed, multiple formats - 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', - 'info_dict': { - 'id': 'yygqldloqIk', - 'ext': 'mp4', - 'title': 'SolidWorks. Урок 6 Настройка чертежа', - 'description': 'md5:baf95267792646afdbf030e4d06b2ab3', - 'upload_date': '20130314', - 'uploader': 'PROстое3D', - 'uploader_id': 'PROstoe3D', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Video.js embed, single format - 'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=', - 'info_dict': { - 'id': 'watch', - 'ext': 'mp4', - 'title': 'Step 1 - Good Foundation', - 'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4', - }, - 'params': { - 'skip_download': True, - }, - }, - # rtl.nl embed - { - 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'aanslagen-kopenhagen', - 'title': 'Aanslagen Kopenhagen', - } - }, - # Zapiks embed - { - 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html', - 'info_dict': { - 'id': '118046', - 'ext': 'mp4', - 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', - } - }, - # Kaltura embed (different embed code) - { - 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', - 'info_dict': { - 'id': '1_a52wc67y', - 'ext': 'flv', - 'upload_date': '20150127', - 'uploader_id': 'PremierMedia', - 'timestamp': int, - 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014', - }, - }, - # Kaltura embed with single quotes - { - 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', - 'info_dict': { - 'id': '0_izeg5utt', - 'ext': 'mp4', - 'title': '35871', - 'timestamp': 1355743100, - 'upload_date': '20121217', - 'uploader_id': 'cplapp@learn360.com', - }, - 'add_ie': ['Kaltura'], - }, - { - # Kaltura embedded via quoted entry_id - 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', - 'info_dict': { - 'id': '0_utuok90b', - 'ext': 'mp4', - 'title': '06_matthew_brender_raj_dutt', - 'timestamp': 1466638791, - 'upload_date': '20160622', - }, - 'add_ie': ['Kaltura'], - 'expected_warnings': [ - 'Could not send HEAD request' - ], - 'params': { - 'skip_download': True, - } - }, - { - # Kaltura embedded, some fileExt broken (#11480) - 'url': 'http://www.cornell.edu/video/nima-arkani-hamed-standard-models-of-particle-physics', - 'info_dict': { - 'id': '1_sgtvehim', - 'ext': 'mp4', - 'title': 'Our "Standard Models" of particle physics and cosmology', - 'description': 'md5:67ea74807b8c4fea92a6f38d6d323861', - 'timestamp': 1321158993, - 'upload_date': '20111113', - 'uploader_id': 'kps1', - }, - 'add_ie': ['Kaltura'], - }, - { - # Kaltura iframe embed - 'url': 'http://www.gsd.harvard.edu/event/i-m-pei-a-centennial-celebration/', - 'md5': 'ae5ace8eb09dc1a35d03b579a9c2cc44', - 'info_dict': { - 'id': '0_f2cfbpwy', - 'ext': 'mp4', - 'title': 'I. M. Pei: A Centennial Celebration', - 'description': 'md5:1db8f40c69edc46ca180ba30c567f37c', - 'upload_date': '20170403', - 'uploader_id': 'batchUser', - 'timestamp': 1491232186, - }, - 'add_ie': ['Kaltura'], - }, - { - # Kaltura iframe embed, more sophisticated - 'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html', - 'info_dict': { - 'id': '1_9gzouybz', - 'ext': 'mp4', - 'title': 'lecture-05sep2017', - 'description': 'md5:40f347d91fd4ba047e511c5321064b49', - 'upload_date': '20170913', - 'uploader_id': 'eps2', - 'timestamp': 1505340777, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Kaltura'], - }, - { - # meta twitter:player - 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/', - 'info_dict': { - 'id': '0_01b42zps', - 'ext': 'mp4', - 'title': 'Main Twerk (Video)', - 'upload_date': '20171208', - 'uploader_id': 'sebastian.salinas@thechive.com', - 'timestamp': 1512713057, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Kaltura'], - }, - # referrer protected EaglePlatform embed - { - 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', - 'info_dict': { - 'id': '582306', - 'ext': 'mp4', - 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 3382, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - # ClipYou (EaglePlatform) embed (custom URL) - { - 'url': 'http://muz-tv.ru/play/7129/', - # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used - 'info_dict': { - 'id': '12820', - 'ext': 'mp4', - 'title': "'O Sole Mio", - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 216, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is unavailable.', - }, - # Pladform embed - { - 'url': 'http://muz-tv.ru/kinozal/view/7400/', - 'info_dict': { - 'id': '100183293', - 'ext': 'mp4', - 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', - 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 694, - 'age_limit': 0, - }, - 'skip': 'HTTP Error 404: Not Found', - }, - # Playwire embed - { - 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html', - 'info_dict': { - 'id': '3519514', - 'ext': 'mp4', - 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer', - 'thumbnail': r're:^https?://.*\.png$', - 'duration': 45.115, - }, - }, - # 5min embed - { - 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', - 'md5': '4c6f127a30736b59b3e2c19234ee2bf7', - 'info_dict': { - 'id': '518726732', - 'ext': 'mp4', - 'title': 'Facebook Creates "On This Day" | Crunch Report', - 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild', - 'timestamp': 1427237531, - 'uploader': 'Crunch Report', - 'upload_date': '20150324', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - # Crooks and Liars embed - { - 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', - 'info_dict': { - 'id': '8RUoRhRi', - 'ext': 'mp4', - 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!", - 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f', - 'timestamp': 1428207000, - 'upload_date': '20150405', - 'uploader': 'Heather', - }, - }, - # Crooks and Liars external embed - { - 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/', - 'info_dict': { - 'id': 'MTE3MjUtMzQ2MzA', - 'ext': 'mp4', - 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5', - 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec', - 'timestamp': 1265032391, - 'upload_date': '20100201', - 'uploader': 'Heather', - }, - }, - # NBC Sports vplayer embed - { - 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', - 'info_dict': { - 'id': 'ln7x1qSThw4k', - 'ext': 'flv', - 'title': "PFT Live: New leader in the 'new-look' defense", - 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', - 'uploader': 'NBCU-SPORTS', - 'upload_date': '20140107', - 'timestamp': 1389118457, - }, - 'skip': 'Invalid Page URL', - }, - # NBC News embed - { - 'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html', - 'md5': '1aa589c675898ae6d37a17913cf68d66', - 'info_dict': { - 'id': 'x_dtl_oa_LettermanliftPR_160608', - 'ext': 'mp4', - 'title': 'David Letterman: A Preview', - 'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.', - 'upload_date': '20160609', - 'timestamp': 1465431544, - 'uploader': 'NBCU-NEWS', - }, - }, - # UDN embed - { - 'url': 'https://video.udn.com/news/300346', - 'md5': 'fd2060e988c326991037b9aff9df21a6', - 'info_dict': { - 'id': '300346', - 'ext': 'mp4', - 'title': '中一中男師變性 全校師生力挺', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Failed to parse JSON Expecting value'], - }, - # Brightcove URL in single quotes - { - 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', - 'md5': '4ae374f1f8b91c889c4b9203c8c752af', - 'info_dict': { - 'id': '4255764656001', - 'ext': 'mp4', - 'title': 'SN Presents: Russell Martin, World Citizen', - 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', - 'uploader': 'Rogers Sportsnet', - 'uploader_id': '1704050871', - 'upload_date': '20150525', - 'timestamp': 1432570283, - }, - }, - # OnionStudios embed - { - 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', - 'info_dict': { - 'id': '2855', - 'ext': 'mp4', - 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'uploader': 'ClickHole', - 'uploader_id': 'clickhole', - } - }, - # SnagFilms embed - { - 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html', - 'info_dict': { - 'id': '74849a00-85a9-11e1-9660-123139220831', - 'ext': 'mp4', - 'title': '#whilewewatch', - } - }, - # AdobeTVVideo embed - { - 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners', - 'md5': '43662b577c018ad707a63766462b1e87', - 'info_dict': { - 'id': '2456', - 'ext': 'mp4', - 'title': 'New experience with Acrobat DC', - 'description': 'New experience with Acrobat DC', - 'duration': 248.667, - }, - }, - # BrightcoveInPageEmbed embed - { - 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', - 'info_dict': { - 'id': '4238694884001', - 'ext': 'flv', - 'title': 'Tabletop: Dread, Last Thoughts', - 'description': 'Tabletop: Dread, Last Thoughts', - 'duration': 51690, - }, - }, - # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' - # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm - { - 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', - 'info_dict': { - 'id': '4785848093001', - 'ext': 'mp4', - 'title': 'The Cardinal Pell Interview', - 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', - 'uploader': 'GlobeCast Australia - GlobeStream', - 'uploader_id': '2733773828001', - 'upload_date': '20160304', - 'timestamp': 1457083087, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - }, - { - # Brightcove embed with whitespace around attribute names - 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', - 'info_dict': { - 'id': '3167554373001', - 'ext': 'mp4', - 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", - 'description': 'md5:57bacb0e0f29349de4972bfda3191713', - 'uploader_id': '1079349493', - 'upload_date': '20140207', - 'timestamp': 1391810548, - }, - 'params': { - 'skip_download': True, - }, - }, - # Another form of arte.tv embed - { - 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', - 'md5': '850bfe45417ddf221288c88a0cffe2e2', - 'info_dict': { - 'id': '030273-562_PLUS7-F', - 'ext': 'mp4', - 'title': 'ARTE Reportage - Nulle part, en France', - 'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d', - 'upload_date': '20160409', - }, - }, - # LiveLeak embed - { - 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': '7619da8c820e835bef21a1efa2a0fc71', - 'info_dict': { - 'id': '874_1459135191', - 'ext': 'mp4', - 'title': 'Man shows poor quality of new apartment building', - 'description': 'The wall is like a sand pile.', - 'uploader': 'Lake8737', - }, - 'add_ie': [LiveLeakIE.ie_key()], - }, - # Another LiveLeak embed pattern (#13336) - { - 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', - 'info_dict': { - 'id': '2eb_1496309988', - 'ext': 'mp4', - 'title': 'Thief robs place where everyone was armed', - 'description': 'md5:694d73ee79e535953cf2488562288eee', - 'uploader': 'brazilwtf', - }, - 'add_ie': [LiveLeakIE.ie_key()], - }, - # Duplicated embedded video URLs - { - 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', - 'info_dict': { - 'id': '149298443_480_16c25b74_2', - 'ext': 'mp4', - 'title': 'vs. Blue Orange Spring Game', - 'uploader': 'www.hudl.com', - }, - }, - # twitter:player:stream embed - { - 'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288', - 'info_dict': { - 'id': 'master', - 'ext': 'mp4', - 'title': 'Une nouvelle espèce de dinosaure découverte en Argentine', - 'uploader': 'www.rtl.be', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - }, - # twitter:player embed - { - 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', - 'md5': 'a3e0df96369831de324f0778e126653c', - 'info_dict': { - 'id': '4909620399001', - 'ext': 'mp4', - 'title': 'What Do Black Holes Sound Like?', - 'description': 'what do black holes sound like', - 'upload_date': '20160524', - 'uploader_id': '29913724001', - 'timestamp': 1464107587, - 'uploader': 'TheAtlantic', - }, - 'add_ie': ['BrightcoveLegacy'], - }, - # Facebook <iframe> embed - { - 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', - 'md5': 'fbcde74f534176ecb015849146dd3aee', - 'info_dict': { - 'id': '599637780109885', - 'ext': 'mp4', - 'title': 'Facebook video #599637780109885', - }, - }, - # Facebook <iframe> embed, plugin video - { - 'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/', - 'info_dict': { - 'id': '1754168231264132', - 'ext': 'mp4', - 'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...', - 'uploader': 'Tariq Ramadan (official)', - 'timestamp': 1496758379, - 'upload_date': '20170606', - }, - 'params': { - 'skip_download': True, - }, - }, - # Facebook API embed - { - 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', - 'md5': 'a47372ee61b39a7b90287094d447d94e', - 'info_dict': { - 'id': '10153467542406923', - 'ext': 'mp4', - 'title': 'Facebook video #10153467542406923', - }, - }, - # Wordpress "YouTube Video Importer" plugin - { - 'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/', - 'md5': 'd16797741b560b485194eddda8121b48', - 'info_dict': { - 'id': 'HNTXWDXV9Is', - 'ext': 'mp4', - 'title': 'Blue Devils Drumline Stanford lot 2016', - 'upload_date': '20160627', - 'uploader_id': 'GENOCIDE8GENERAL10', - 'uploader': 'cylus cyrus', - }, - }, - { - # video stored on custom kaltura server - 'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv', - 'md5': '537617d06e64dfed891fa1593c4b30cc', - 'info_dict': { - 'id': '0_1iotm5bh', - 'ext': 'mp4', - 'title': 'Elecciones británicas: 5 lecciones para Rajoy', - 'description': 'md5:435a89d68b9760b92ce67ed227055f16', - 'uploader_id': 'videos.expansion@el-mundo.net', - 'upload_date': '20150429', - 'timestamp': 1430303472, - }, - 'add_ie': ['Kaltura'], - }, - { - # Non-standard Vimeo embed - 'url': 'https://openclassrooms.com/courses/understanding-the-web', - 'md5': '64d86f1c7d369afd9a78b38cbb88d80a', - 'info_dict': { - 'id': '148867247', - 'ext': 'mp4', - 'title': 'Understanding the web - Teaser', - 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.', - 'upload_date': '20151214', - 'uploader': 'OpenClassrooms', - 'uploader_id': 'openclassrooms', - }, - 'add_ie': ['Vimeo'], - }, - { - # generic vimeo embed that requires original URL passed as Referer - 'url': 'http://racing4everyone.eu/2016/07/30/formula-1-2016-round12-germany/', - 'only_matching': True, - }, - { - 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', - 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', - 'info_dict': { - 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', - 'ext': 'mp4', - 'title': 'Big Buck Bunny', - 'description': 'Royalty free test video', - 'timestamp': 1432816365, - 'upload_date': '20150528', - 'is_live': False, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [ArkenaIE.ie_key()], - }, - { - 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', - 'info_dict': { - 'id': '1c7141f46c', - 'ext': 'mp4', - 'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [Vbox7IE.ie_key()], - }, - { - # DBTV embeds - 'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/', - 'info_dict': { - 'id': '43254897', - 'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans', - }, - 'playlist_mincount': 3, - }, - { - # Videa embeds - 'url': 'http://forum.dvdtalk.com/movie-talk/623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style.html', - 'info_dict': { - 'id': '623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style', - 'title': 'Deleted Magic - Star Wars: OT Deleted / Alt. Scenes Docu. Style - DVD Talk Forum', - }, - 'playlist_mincount': 2, - }, - { - # 20 minuten embed - 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', - 'info_dict': { - 'id': '523629', - 'ext': 'mp4', - 'title': 'So kommen Sie bei Eis und Schnee sicher an', - 'description': 'md5:117c212f64b25e3d95747e5276863f7d', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [TwentyMinutenIE.ie_key()], - }, - { - # VideoPress embed - 'url': 'https://en.support.wordpress.com/videopress/', - 'info_dict': { - 'id': 'OcobLTqC', - 'ext': 'm4v', - 'title': 'IMG_5786', - 'timestamp': 1435711927, - 'upload_date': '20150701', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [VideoPressIE.ie_key()], - }, - { - # Rutube embed - 'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2', - 'info_dict': { - 'id': '9b3d5bee0a8740bf70dfd29d3ea43541', - 'ext': 'flv', - 'title': 'Магаззино: Казань 2', - 'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a', - 'uploader': 'Магаззино', - 'upload_date': '20170228', - 'uploader_id': '996642', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [RutubeIE.ie_key()], - }, - { - # ThePlatform embedded with whitespaces in URLs - 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', - 'only_matching': True, - }, - { - # Senate ISVP iframe https - 'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security', - 'md5': 'fb8c70b0b515e5037981a2492099aab8', - 'info_dict': { - 'id': 'govtaff020316', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player', - }, - 'add_ie': [SenateISVPIE.ie_key()], - }, - { - # Limelight embeds (1 channel embed + 4 media embeds) - 'url': 'http://www.sedona.com/FacilitatorTraining2017', - 'info_dict': { - 'id': 'FacilitatorTraining2017', - 'title': 'Facilitator Training 2017', - }, - 'playlist_mincount': 5, - }, - { - # Limelight embed (LimelightPlayerUtil.embed) - 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri', - 'info_dict': { - 'id': '95d035dc5c8a401588e9c0e6bd1e9c92', - 'ext': 'mp4', - 'title': '07448641', - 'timestamp': 1499890639, - 'upload_date': '20170712', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['LimelightMedia'], - }, - { - 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', - 'info_dict': { - 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', - 'title': 'Standoff with Walnut Creek murder suspect ends', - 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788', - }, - 'playlist_mincount': 4, - }, - { - # WashingtonPost embed - 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches', - 'info_dict': { - 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac', - 'ext': 'mp4', - 'title': "No one has seen the drama series based on Trump's life \u2014 until now", - 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.', - 'timestamp': 1455216756, - 'uploader': 'The Washington Post', - 'upload_date': '20160211', - }, - 'add_ie': [WashingtonPostIE.ie_key()], - }, - { - # Mediaset embed - 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', - 'info_dict': { - 'id': '720642', - 'ext': 'mp4', - 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [MediasetIE.ie_key()], - }, - { - # JOJ.sk embeds - 'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok', - 'info_dict': { - 'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok', - 'title': 'Slovenskom sa prehnala vlna silných búrok', - }, - 'playlist_mincount': 5, - 'add_ie': [JojIE.ie_key()], - }, - { - # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) - 'url': 'https://tvrain.ru/amp/418921/', - 'md5': 'cc00413936695987e8de148b67d14f1d', - 'info_dict': { - 'id': '418921', - 'ext': 'mp4', - 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', - }, - }, - { - # vzaar embed - 'url': 'http://help.vzaar.com/article/165-embedding-video', - 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', - 'info_dict': { - 'id': '8707641', - 'ext': 'mp4', - 'title': 'Building A Business Online: Principal Chairs Q & A', - }, - }, - { - # multiple HTML5 videos on one page - 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', - 'info_dict': { - 'id': 'keyscenarios', - 'title': 'Rescue Kit 14 Free Edition - Getting started', - }, - 'playlist_count': 4, - }, - { - # vshare embed - 'url': 'https://youtube-dl-demo.neocities.org/vshare.html', - 'md5': '17b39f55b5497ae8b59f5fbce8e35886', - 'info_dict': { - 'id': '0f64ce6', - 'title': 'vl14062007715967', - 'ext': 'mp4', - } - }, - { - 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', - 'md5': 'aecd089f55b1cb5a59032cb049d3a356', - 'info_dict': { - 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d', - 'ext': 'mp4', - 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare', - 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', - 'timestamp': 1474354800, - 'upload_date': '20160920', - } - }, - { - 'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton', - 'info_dict': { - 'id': '1731611', - 'ext': 'mp4', - 'title': 'Official Trailer | TROLLS: THE BEAT GOES ON!', - 'description': 'md5:eb5f23826a027ba95277d105f248b825', - 'timestamp': 1516100691, - 'upload_date': '20180116', - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': [SpringboardPlatformIE.ie_key()], - }, - { - 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', - 'info_dict': { - 'id': 'uPDB5I9wfp8', - 'ext': 'webm', - 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', - 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', - 'upload_date': '20160219', - 'uploader': 'Pocoyo - Português (BR)', - 'uploader_id': 'PocoyoBrazil', - }, - 'add_ie': [YoutubeIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', - 'info_dict': { - 'id': 'vMDE4NzI1Mjgt690b', - 'ext': 'mp4', - 'title': 'Котята', - }, - 'add_ie': [YapFilesIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - # CloudflareStream embed - 'url': 'https://www.cloudflare.com/products/cloudflare-stream/', - 'info_dict': { - 'id': '31c9291ab41fac05471db4e73aa11717', - 'ext': 'mp4', - 'title': '31c9291ab41fac05471db4e73aa11717', - }, - 'add_ie': [CloudflareStreamIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - # PeerTube embed - 'url': 'https://joinpeertube.org/fr/home/', - 'info_dict': { - 'id': 'home', - 'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube', - }, - 'playlist_count': 2, - }, - { - # Indavideo embed - 'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/', - 'info_dict': { - 'id': '1693903', - 'ext': 'mp4', - 'title': 'Így kell otthon hamburgert sütni', - 'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7', - 'timestamp': 1426330212, - 'upload_date': '20150314', - 'uploader': 'StreetKitchen', - 'uploader_id': '546363', - }, - 'add_ie': [IndavideoEmbedIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, - { - # APA embed via JWPlatform embed - 'url': 'http://www.vol.at/blue-man-group/5593454', - 'info_dict': { - 'id': 'jjv85FdZ', - 'ext': 'mp4', - 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 254, - 'timestamp': 1519211149, - 'upload_date': '20180221', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://share-videos.se/auto/video/83645793?uid=13', - 'md5': 'b68d276de422ab07ee1d49388103f457', - 'info_dict': { - 'id': '83645793', - 'title': 'Lock up and get excited', - 'ext': 'mp4' - }, - 'skip': 'TODO: fix nested playlists processing in tests', - }, - # { - # # TODO: find another test - # # http://schema.org/VideoObject - # 'url': 'https://flipagram.com/f/nyvTSJMKId', - # 'md5': '888dcf08b7ea671381f00fab74692755', - # 'info_dict': { - # 'id': 'nyvTSJMKId', - # 'ext': 'mp4', - # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', - # 'description': '#love for cats.', - # 'timestamp': 1461244995, - # 'upload_date': '20160421', - # }, - # 'params': { - # 'force_generic_extractor': True, - # }, - # } - ] - - def report_following_redirect(self, new_url): - """Report information extraction.""" - self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) - - def _extract_rss(self, url, video_id, doc): - playlist_title = doc.find('./channel/title').text - playlist_desc_el = doc.find('./channel/description') - playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text - - entries = [] - for it in doc.findall('./channel/item'): - next_url = None - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break - - if not next_url: - next_url = xpath_text(it, 'link', fatal=False) - - if not next_url: - continue - - entries.append({ - '_type': 'url_transparent', - 'url': next_url, - 'title': it.find('title').text, - }) - - return { - '_type': 'playlist', - 'id': url, - 'title': playlist_title, - 'description': playlist_desc, - 'entries': entries, - } - - def _extract_camtasia(self, url, video_id, webpage): - """ Returns None if no camtasia video can be found. """ - - camtasia_cfg = self._search_regex( - r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);', - webpage, 'camtasia configuration file', default=None) - if camtasia_cfg is None: - return None - - title = self._html_search_meta('DC.title', webpage, fatal=True) - - camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) - camtasia_cfg = self._download_xml( - camtasia_url, video_id, - note='Downloading camtasia configuration', - errnote='Failed to download camtasia configuration') - fileset_node = camtasia_cfg.find('./playlist/array/fileset') - - entries = [] - for n in fileset_node.getchildren(): - url_n = n.find('./uri') - if url_n is None: - continue - - entries.append({ - 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], - 'title': '%s - %s' % (title, n.tag), - 'url': compat_urlparse.urljoin(url, url_n.text), - 'duration': float_or_none(n.find('./duration').text), - }) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': title, - } - - def _real_extract(self, url): - if url.startswith('//'): - return { - '_type': 'url', - 'url': self.http_scheme() + url, - } - - parsed_url = compat_urlparse.urlparse(url) - if not parsed_url.scheme: - default_search = self._downloader.params.get('default_search') - if default_search is None: - default_search = 'fixup_error' - - if default_search in ('auto', 'auto_warning', 'fixup_error'): - if '/' in url: - self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') - return self.url_result('http://' + url) - elif default_search != 'fixup_error': - if default_search == 'auto_warning': - if re.match(r'^(?:url|URL)$', url): - raise ExtractorError( - 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url, - expected=True) - else: - self._downloader.report_warning( - 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url) - return self.url_result('ytsearch:' + url) - - if default_search in ('error', 'fixup_error'): - raise ExtractorError( - '%r is not a valid URL. ' - 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' - % (url, url), expected=True) - else: - if ':' not in default_search: - default_search += ':' - return self.url_result(default_search + url) - - url, smuggled_data = unsmuggle_url(url) - force_videoid = None - is_intentional = smuggled_data and smuggled_data.get('to_generic') - if smuggled_data and 'force_videoid' in smuggled_data: - force_videoid = smuggled_data['force_videoid'] - video_id = force_videoid - else: - video_id = self._generic_id(url) - - self.to_screen('%s: Requesting header' % video_id) - - head_req = HEADRequest(url) - head_response = self._request_webpage( - head_req, video_id, - note=False, errnote='Could not send HEAD request to %s' % url, - fatal=False) - - if head_response is not False: - # Check for redirect - new_url = compat_str(head_response.geturl()) - if url != new_url: - self.report_following_redirect(new_url) - if force_videoid: - new_url = smuggle_url( - new_url, {'force_videoid': force_videoid}) - return self.url_result(new_url) - - full_response = None - if head_response is False: - request = sanitized_Request(url) - request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) - head_response = full_response - - info_dict = { - 'id': video_id, - 'title': self._generic_title(url), - 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) - } - - # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '').lower() - m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) - if m: - format_id = compat_str(m.group('format_id')) - if format_id.endswith('mpegurl'): - formats = self._extract_m3u8_formats(url, video_id, 'mp4') - elif format_id == 'f4m': - formats = self._extract_f4m_formats(url, video_id) - else: - formats = [{ - 'format_id': format_id, - 'url': url, - 'vcodec': 'none' if m.group('type') == 'audio' else None - }] - info_dict['direct'] = True - self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict - - if not self._downloader.params.get('test', False) and not is_intentional: - force = self._downloader.params.get('force_generic_extractor', False) - self._downloader.report_warning( - '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) - - if not full_response: - request = sanitized_Request(url) - # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) - # making it impossible to download only chunk of the file (yet we need only 512kB to - # test whether it's HTML or not). According to youtube-dl default Accept-Encoding - # that will always result in downloading the whole file that is not desirable. - # Therefore for extraction pass we have to override Accept-Encoding to any in order - # to accept raw bytes and being able to download only a chunk. - # It may probably better to solve this by checking Content-Type for application/octet-stream - # after HEAD request finishes, but not sure if we can rely on this. - request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) - - first_bytes = full_response.read(512) - - # Is it an M3U playlist? - if first_bytes.startswith(b'#EXTM3U'): - info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') - self._sort_formats(info_dict['formats']) - return info_dict - - # Maybe it's a direct link to a video? - # Be careful not to download the whole thing! - if not is_html(first_bytes): - self._downloader.report_warning( - 'URL could be a direct video link, returning it as such.') - info_dict.update({ - 'direct': True, - 'url': url, - }) - return info_dict - - webpage = self._webpage_read_content( - full_response, url, video_id, prefix=first_bytes) - - self.report_extraction(video_id) - - # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? - try: - doc = compat_etree_fromstring(webpage.encode('utf-8')) - if doc.tag == 'rss': - return self._extract_rss(url, video_id, doc) - elif doc.tag == 'SmoothStreamingMedia': - info_dict['formats'] = self._parse_ism_formats(doc, url) - self._sort_formats(info_dict['formats']) - return info_dict - elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): - smil = self._parse_smil(doc, url, video_id) - self._sort_formats(smil['formats']) - return smil - elif doc.tag == '{http://xspf.org/ns/0/}playlist': - return self.playlist_result( - self._parse_xspf( - doc, video_id, xspf_url=url, - xspf_base_url=compat_str(full_response.geturl())), - video_id) - elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): - info_dict['formats'] = self._parse_mpd_formats( - doc, - mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], - mpd_url=url) - self._sort_formats(info_dict['formats']) - return info_dict - elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): - info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) - self._sort_formats(info_dict['formats']) - return info_dict - except compat_xml_parse_error: - pass - - # Is it a Camtasia project? - camtasia_res = self._extract_camtasia(url, video_id, webpage) - if camtasia_res is not None: - return camtasia_res - - # Sometimes embedded video player is hidden behind percent encoding - # (e.g. https://github.com/rg3/youtube-dl/issues/2448) - # Unescaping the whole page allows to handle those cases in a generic way - webpage = compat_urllib_parse_unquote(webpage) - - # it's tempting to parse this further, but you would - # have to take into account all the variations like - # Video Title - Site Name - # Site Name | Video Title - # Video Title - Tagline | Site Name - # and so on and so forth; it's just not practical - video_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(?s)<title>(.*?)</title>', webpage, 'video title', - default='video') - - # Try to detect age limit automatically - age_limit = self._rta_search(webpage) - # And then there are the jokers who advertise that they use RTA, - # but actually don't. - AGE_LIMIT_MARKERS = [ - r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', - ] - if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): - age_limit = 18 - - # video uploader is domain name - video_uploader = self._search_regex( - r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') - - video_description = self._og_search_description(webpage, default=None) - video_thumbnail = self._og_search_thumbnail(webpage, default=None) - - info_dict.update({ - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'age_limit': age_limit, - }) - - # Look for Brightcove Legacy Studio embeds - bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) - if bc_urls: - entries = [{ - '_type': 'url', - 'url': smuggle_url(bc_url, {'Referer': url}), - 'ie_key': 'BrightcoveLegacy' - } for bc_url in bc_urls] - - return { - '_type': 'playlist', - 'title': video_title, - 'id': video_id, - 'entries': entries, - } - - # Look for Brightcove New Studio embeds - bc_urls = BrightcoveNewIE._extract_urls(self, webpage) - if bc_urls: - return self.playlist_from_matches( - bc_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'referrer': url}), - ie='BrightcoveNew') - - # Look for Nexx embeds - nexx_urls = NexxIE._extract_urls(webpage) - if nexx_urls: - return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) - - # Look for Nexx iFrame embeds - nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) - if nexx_embed_urls: - return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) - - # Look for ThePlatform embeds - tp_urls = ThePlatformIE._extract_urls(webpage) - if tp_urls: - return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') - - # Look for Vessel embeds - vessel_urls = VesselIE._extract_urls(webpage) - if vessel_urls: - return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key()) - - # Look for embedded rtl.nl player - matches = re.findall( - r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', - webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') - - vimeo_urls = VimeoIE._extract_urls(url, webpage) - if vimeo_urls: - return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - - vid_me_embed_url = self._search_regex( - r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', - webpage, 'vid.me embed', default=None) - if vid_me_embed_url is not None: - return self.url_result(vid_me_embed_url, 'Vidme') - - # Look for YouTube embeds - youtube_urls = YoutubeIE._extract_urls(webpage) - if youtube_urls: - return self.playlist_from_matches( - youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) - - matches = DailymotionIE._extract_urls(webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title) - - # Look for embedded Dailymotion playlist player (#3822) - m = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) - if m: - playlists = re.findall( - r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) - if playlists: - return self.playlist_from_matches( - playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) - - # Look for DailyMail embeds - dailymail_urls = DailyMailIE._extract_urls(webpage) - if dailymail_urls: - return self.playlist_from_matches( - dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) - - # Look for embedded Wistia player - wistia_url = WistiaIE._extract_url(webpage) - if wistia_url: - return { - '_type': 'url_transparent', - 'url': self._proto_relative_url(wistia_url), - 'ie_key': WistiaIE.ie_key(), - 'uploader': video_uploader, - } - - # Look for SVT player - svt_url = SVTIE._extract_url(webpage) - if svt_url: - return self.url_result(svt_url, 'SVT') - - # Look for Bandcamp pages with custom domain - mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) - if mobj is not None: - burl = unescapeHTML(mobj.group(1)) - # Don't set the extractor because it can be a track url or an album - return self.url_result(burl) - - # Look for embedded Vevo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Viddler player - mobj = re.search( - r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NYTimes player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Libsyn player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or - re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or - re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or - re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or - re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) - if mobj is not None: - embed_token = self._search_regex( - r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', - webpage, 'ooyala embed token', default=None) - return OoyalaIE._build_url_result(smuggle_url( - mobj.group('ec'), { - 'domain': url, - 'embed_token': embed_token, - })) - - # Look for multiple Ooyala embeds on SBN network websites - mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) - if mobj is not None: - embeds = self._parse_json(mobj.group(1), video_id, fatal=False) - if embeds: - return self.playlist_from_matches( - embeds, video_id, video_title, - getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') - - # Look for Aparat videos - mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Aparat') - - # Look for MPORA videos - mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Mpora') - - # Look for embedded NovaMov-based player - mobj = re.search( - r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\']) - (?P<url>http://(?:(?:embed|www)\.)? - (?:novamov\.com| - nowvideo\.(?:ch|sx|eu|at|ag|co)| - videoweed\.(?:es|com)| - movshare\.(?:net|sx|ag)| - divxstage\.(?:eu|net|ch|co|at|ag)) - /embed\.php.+?)\1''', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Facebook player - facebook_urls = FacebookIE._extract_urls(webpage) - if facebook_urls: - return self.playlist_from_matches(facebook_urls, video_id, video_title) - - # Look for embedded VK player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'VK') - - # Look for embedded Odnoklassniki player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Odnoklassniki') - - # Look for embedded ivi player - mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Ivi') - - # Look for embedded Huffington Post player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'HuffPost') - - # Look for embed.ly - mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage) - if mobj is not None: - return self.url_result(compat_urllib_parse_unquote(mobj.group('url'))) - - # Look for funnyordie embed - matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) - if matches: - return self.playlist_from_matches( - matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') - - # Look for BBC iPlayer embed - matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') - - # Look for embedded RUTV player - rutv_url = RUTVIE._extract_url(webpage) - if rutv_url: - return self.url_result(rutv_url, 'RUTV') - - # Look for embedded TVC player - tvc_url = TVCIE._extract_url(webpage) - if tvc_url: - return self.url_result(tvc_url, 'TVC') - - # Look for embedded SportBox player - sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) - if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed') - - # Look for embedded XHamster player - xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) - if xhamster_urls: - return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') - - # Look for embedded TNAFlixNetwork player - tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) - if tnaflix_urls: - return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) - - # Look for embedded PornHub player - pornhub_urls = PornHubIE._extract_urls(webpage) - if pornhub_urls: - return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) - - # Look for embedded DrTuber player - drtuber_urls = DrTuberIE._extract_urls(webpage) - if drtuber_urls: - return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) - - # Look for embedded RedTube player - redtube_urls = RedTubeIE._extract_urls(webpage) - if redtube_urls: - return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) - - # Look for embedded Tube8 player - tube8_urls = Tube8IE._extract_urls(webpage) - if tube8_urls: - return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) - - # Look for embedded Tvigle player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Tvigle') - - # Look for embedded TED player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'TED') - - # Look for embedded Ustream videos - ustream_url = UstreamIE._extract_url(webpage) - if ustream_url: - return self.url_result(ustream_url, UstreamIE.ie_key()) - - # Look for embedded arte.tv player - mobj = re.search( - r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'ArteTVEmbed') - - # Look for embedded francetv player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded smotri.com player - smotri_url = SmotriIE._extract_url(webpage) - if smotri_url: - return self.url_result(smotri_url, 'Smotri') - - # Look for embedded Myvi.ru player - myvi_url = MyviIE._extract_url(webpage) - if myvi_url: - return self.url_result(myvi_url) - - # Look for embedded soundcloud player - soundcloud_urls = SoundcloudIE._extract_urls(webpage) - if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) - - # Look for tunein player - tunein_urls = TuneInBaseIE._extract_urls(webpage) - if tunein_urls: - return self.playlist_from_matches(tunein_urls, video_id, video_title) - - # Look for embedded mtvservices player - mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) - if mtvservices_url: - return self.url_result(mtvservices_url, ie='MTVServicesEmbedded') - - # Look for embedded yahoo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Yahoo') - - # Look for embedded sbs.com.au player - mobj = re.search( - r'''(?x) - (?: - <meta\s+property="og:video"\s+content=| - <iframe[^>]+?src= - ) - (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'SBS') - - # Look for embedded Cinchcast player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Cinchcast') - - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', - webpage) - if not mobj: - mobj = re.search( - r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'MLB') - - mobj = re.search( - r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, - webpage) - if mobj is not None: - return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') - - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Livestream') - - # Look for Zapiks embed - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Zapiks') - - # Look for Kaltura embeds - kaltura_url = KalturaIE._extract_url(webpage) - if kaltura_url: - return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) - - # Look for EaglePlatform embeds - eagleplatform_url = EaglePlatformIE._extract_url(webpage) - if eagleplatform_url: - return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) - - # Look for ClipYou (uses EaglePlatform) embeds - mobj = re.search( - r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) - if mobj is not None: - return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') - - # Look for Pladform embeds - pladform_url = PladformIE._extract_url(webpage) - if pladform_url: - return self.url_result(pladform_url) - - # Look for Videomore embeds - videomore_url = VideomoreIE._extract_url(webpage) - if videomore_url: - return self.url_result(videomore_url) - - # Look for Webcaster embeds - webcaster_url = WebcasterFeedIE._extract_url(self, webpage) - if webcaster_url: - return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key()) - - # Look for Playwire embeds - mobj = re.search( - r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for 5min embeds - mobj = re.search( - r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage) - if mobj is not None: - return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') - - # Look for Crooks and Liars embeds - mobj = re.search( - r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NBC Sports VPlayer embeds - nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) - if nbc_sports_url: - return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') - - # Look for NBC News embeds - nbc_news_embed_url = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage) - if nbc_news_embed_url: - return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews') - - # Look for Google Drive embeds - google_drive_url = GoogleDriveIE._extract_url(webpage) - if google_drive_url: - return self.url_result(google_drive_url, 'GoogleDrive') - - # Look for UDN embeds - mobj = re.search( - r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) - if mobj is not None: - return self.url_result( - compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') - - # Look for Senate ISVP iframe - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - return self.url_result(senate_isvp_url, 'SenateISVP') - - # Look for OnionStudios embeds - onionstudios_url = OnionStudiosIE._extract_url(webpage) - if onionstudios_url: - return self.url_result(onionstudios_url) - - # Look for ViewLift embeds - viewlift_url = ViewLiftEmbedIE._extract_url(webpage) - if viewlift_url: - return self.url_result(viewlift_url) - - # Look for JWPlatform embeds - jwplatform_urls = JWPlatformIE._extract_urls(webpage) - if jwplatform_urls: - return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) - - # Look for Digiteka embeds - digiteka_url = DigitekaIE._extract_url(webpage) - if digiteka_url: - return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) - - # Look for Arkena embeds - arkena_url = ArkenaIE._extract_url(webpage) - if arkena_url: - return self.url_result(arkena_url, ArkenaIE.ie_key()) - - # Look for Piksel embeds - piksel_url = PikselIE._extract_url(webpage) - if piksel_url: - return self.url_result(piksel_url, PikselIE.ie_key()) - - # Look for Limelight embeds - limelight_urls = LimelightBaseIE._extract_urls(webpage, url) - if limelight_urls: - return self.playlist_result( - limelight_urls, video_id, video_title, video_description) - - # Look for Anvato embeds - anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) - if anvato_urls: - return self.playlist_result( - anvato_urls, video_id, video_title, video_description) - - # Look for AdobeTVVideo embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), - 'AdobeTVVideo') - - # Look for Vine embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') - - # Look for VODPlatform embeds - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vod-platform\.net/[eE]mbed/.+?)\1', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') - - # Look for Mangomolo embeds - mobj = re.search( - r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?admin\.mangomolo\.com/analytics/index\.php/customers/embed/ - (?: - video\?.*?\bid=(?P<video_id>\d+)| - index\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) - ).+?)\1''', webpage) - if mobj is not None: - info = { - '_type': 'url_transparent', - 'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - video_id = mobj.group('video_id') - if video_id: - info.update({ - 'ie_key': 'MangomoloVideo', - 'id': video_id, - }) - else: - info.update({ - 'ie_key': 'MangomoloLive', - 'id': mobj.group('channel_id'), - }) - return info - - # Look for Instagram embeds - instagram_embed_url = InstagramIE._extract_embed_url(webpage) - if instagram_embed_url is not None: - return self.url_result( - self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) - - # Look for LiveLeak embeds - liveleak_urls = LiveLeakIE._extract_urls(webpage) - if liveleak_urls: - return self.playlist_from_matches(liveleak_urls, video_id, video_title) - - # Look for 3Q SDN embeds - threeqsdn_url = ThreeQSDNIE._extract_url(webpage) - if threeqsdn_url: - return { - '_type': 'url_transparent', - 'ie_key': ThreeQSDNIE.ie_key(), - 'url': self._proto_relative_url(threeqsdn_url), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - - # Look for VBOX7 embeds - vbox7_url = Vbox7IE._extract_url(webpage) - if vbox7_url: - return self.url_result(vbox7_url, Vbox7IE.ie_key()) - - # Look for DBTV embeds - dbtv_urls = DBTVIE._extract_urls(webpage) - if dbtv_urls: - return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) - - # Look for Videa embeds - videa_urls = VideaIE._extract_urls(webpage) - if videa_urls: - return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) - - # Look for 20 minuten embeds - twentymin_urls = TwentyMinutenIE._extract_urls(webpage) - if twentymin_urls: - return self.playlist_from_matches( - twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) - - # Look for Openload embeds - openload_urls = OpenloadIE._extract_urls(webpage) - if openload_urls: - return self.playlist_from_matches( - openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) - - # Look for VideoPress embeds - videopress_urls = VideoPressIE._extract_urls(webpage) - if videopress_urls: - return self.playlist_from_matches( - videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) - - # Look for Rutube embeds - rutube_urls = RutubeIE._extract_urls(webpage) - if rutube_urls: - return self.playlist_from_matches( - rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) - - # Look for WashingtonPost embeds - wapo_urls = WashingtonPostIE._extract_urls(webpage) - if wapo_urls: - return self.playlist_from_matches( - wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) - - # Look for Mediaset embeds - mediaset_urls = MediasetIE._extract_urls(webpage) - if mediaset_urls: - return self.playlist_from_matches( - mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) - - # Look for JOJ.sk embeds - joj_urls = JojIE._extract_urls(webpage) - if joj_urls: - return self.playlist_from_matches( - joj_urls, video_id, video_title, ie=JojIE.ie_key()) - - # Look for megaphone.fm embeds - mpfn_urls = MegaphoneIE._extract_urls(webpage) - if mpfn_urls: - return self.playlist_from_matches( - mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) - - # Look for vzaar embeds - vzaar_urls = VzaarIE._extract_urls(webpage) - if vzaar_urls: - return self.playlist_from_matches( - vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) - - channel9_urls = Channel9IE._extract_urls(webpage) - if channel9_urls: - return self.playlist_from_matches( - channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) - - vshare_urls = VShareIE._extract_urls(webpage) - if vshare_urls: - return self.playlist_from_matches( - vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) - - # Look for Mediasite embeds - mediasite_urls = MediasiteIE._extract_urls(webpage) - if mediasite_urls: - entries = [ - self.url_result(smuggle_url( - compat_urlparse.urljoin(url, mediasite_url), - {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) - for mediasite_url in mediasite_urls] - return self.playlist_result(entries, video_id, video_title) - - springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) - if springboardplatform_urls: - return self.playlist_from_matches( - springboardplatform_urls, video_id, video_title, - ie=SpringboardPlatformIE.ie_key()) - - yapfiles_urls = YapFilesIE._extract_urls(webpage) - if yapfiles_urls: - return self.playlist_from_matches( - yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) - - vice_urls = ViceIE._extract_urls(webpage) - if vice_urls: - return self.playlist_from_matches( - vice_urls, video_id, video_title, ie=ViceIE.ie_key()) - - xfileshare_urls = XFileShareIE._extract_urls(webpage) - if xfileshare_urls: - return self.playlist_from_matches( - xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) - - cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) - if cloudflarestream_urls: - return self.playlist_from_matches( - cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - - peertube_urls = PeerTubeIE._extract_urls(webpage, url) - if peertube_urls: - return self.playlist_from_matches( - peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) - - indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) - if indavideo_urls: - return self.playlist_from_matches( - indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) - - apa_urls = APAIE._extract_urls(webpage) - if apa_urls: - return self.playlist_from_matches( - apa_urls, video_id, video_title, ie=APAIE.ie_key()) - - foxnews_urls = FoxNewsIE._extract_urls(webpage) - if foxnews_urls: - return self.playlist_from_matches( - foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) - - sharevideos_urls = [mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', - webpage)] - if sharevideos_urls: - return self.playlist_from_matches( - sharevideos_urls, video_id, video_title) - - # Look for HTML5 media - entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') - if entries: - if len(entries) == 1: - entries[0].update({ - 'id': video_id, - 'title': video_title, - }) - else: - for num, entry in enumerate(entries, start=1): - entry.update({ - 'id': '%s-%s' % (video_id, num), - 'title': '%s (%d)' % (video_title, num), - }) - for entry in entries: - self._sort_formats(entry['formats']) - return self.playlist_result(entries, video_id, video_title) - - jwplayer_data = self._find_jwplayer_data( - webpage, video_id, transform_source=js_to_json) - if jwplayer_data: - info = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False, base_url=url) - return merge_dicts(info, info_dict) - - # Video.js embed - mobj = re.search( - r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', - webpage) - if mobj is not None: - sources = self._parse_json( - mobj.group(1), video_id, transform_source=js_to_json, - fatal=False) or [] - if not isinstance(sources, list): - sources = [sources] - formats = [] - for source in sources: - src = source.get('src') - if not src or not isinstance(src, compat_str): - continue - src = compat_urlparse.urljoin(url, src) - src_type = source.get('type') - if isinstance(src_type, compat_str): - src_type = src_type.lower() - ext = determine_ext(src).lower() - if src_type == 'video/youtube': - return self.url_result(src, YoutubeIE.ie_key()) - if src_type == 'application/dash+xml' or ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src, video_id, mpd_id='dash', fatal=False)) - elif src_type == 'application/x-mpegurl' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': src, - 'ext': (mimetype2ext(src_type) or - ext if ext in KNOWN_EXTENSIONS else 'mp4'), - }) - if formats: - self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict - - # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') - if json_ld.get('url'): - return merge_dicts(json_ld, info_dict) - - def check_video(vurl): - if YoutubeIE.suitable(vurl): - return True - if RtmpIE.suitable(vurl): - return True - vpath = compat_urlparse.urlparse(vurl).path - vext = determine_ext(vpath) - return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') - - def filter_video(urls): - return list(filter(check_video, urls)) - - # Start with something easy: JW Player in SWFObject - found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) - if not found: - # Look for gorilla-vid style embedding - found = filter_video(re.findall(r'''(?sx) - (?: - jw_plugins| - JWPlayerOptions| - jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup - ) - .*? - ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) - if not found: - # Broaden the search a little bit - found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) - if not found: - # Broaden the findall a little bit: JWPlayer JS loader - found = filter_video(re.findall( - r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) - if not found: - # Flow player - found = filter_video(re.findall(r'''(?xs) - flowplayer\("[^"]+",\s* - \{[^}]+?\}\s*, - \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s* - ["']?url["']?\s*:\s*["']([^"']+)["'] - ''', webpage)) - if not found: - # Cinerama player - found = re.findall( - r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) - if not found: - # Try to find twitter cards info - # twitter:player:stream should be checked before twitter:player since - # it is expected to contain a raw stream (see - # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) - found = filter_video(re.findall( - r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) - if not found: - # We look for Open Graph info: - # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) - m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) - # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: - if m_video_type is not None: - found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)) - if not found: - REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' - found = re.search( - r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' - r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX, - webpage) - if not found: - # Look also in Refresh HTTP header - refresh_header = head_response.headers.get('Refresh') - if refresh_header: - # In python 2 response HTTP headers are bytestrings - if sys.version_info < (3, 0) and isinstance(refresh_header, str): - refresh_header = refresh_header.decode('iso-8859-1') - found = re.search(REDIRECT_REGEX, refresh_header) - if found: - new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) - if new_url != url: - self.report_following_redirect(new_url) - return { - '_type': 'url', - 'url': new_url, - } - else: - found = None - - if not found: - # twitter:player is a https URL to iframe player that may or may not - # be supported by youtube-dl thus this is checked the very last (see - # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) - embed_url = self._html_search_meta('twitter:player', webpage, default=None) - if embed_url and embed_url != url: - return self.url_result(embed_url) - - if not found: - raise UnsupportedError(url) - - entries = [] - for video_url in orderedSet(found): - video_url = unescapeHTML(video_url) - video_url = video_url.replace('\\/', '/') - video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) - - # Sometimes, jwplayer extraction will result in a YouTube URL - if YoutubeIE.suitable(video_url): - entries.append(self.url_result(video_url, 'Youtube')) - continue - - # here's a fun little line of code for you: - video_id = os.path.splitext(video_id)[0] - - entry_info_dict = { - 'id': video_id, - 'uploader': video_uploader, - 'title': video_title, - 'age_limit': age_limit, - } - - if RtmpIE.suitable(video_url): - entry_info_dict.update({ - '_type': 'url_transparent', - 'ie_key': RtmpIE.ie_key(), - 'url': video_url, - }) - entries.append(entry_info_dict) - continue - - ext = determine_ext(video_url) - if ext == 'smil': - entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id) - elif ext == 'xspf': - return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) - elif ext == 'm3u8': - entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4') - elif ext == 'mpd': - entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) - elif ext == 'f4m': - entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id) - elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: - # Just matching .ism/manifest is not enough to be reliably sure - # whether it's actually an ISM manifest or some other streaming - # manifest since there are various streaming URL formats - # possible (see [1]) as well as some other shenanigans like - # .smil/manifest URLs that actually serve an ISM (see [2]) and - # so on. - # Thus the most reasonable way to solve this is to delegate - # to generic extractor in order to look into the contents of - # the manifest itself. - # 1. https://azure.microsoft.com/en-us/documentation/articles/media-services-deliver-content-overview/#streaming-url-formats - # 2. https://svs.itworkscdn.net/lbcivod/smil:itwfcdn/lbci/170976.smil/Manifest - entry_info_dict = self.url_result( - smuggle_url(video_url, {'to_generic': True}), - GenericIE.ie_key()) - else: - entry_info_dict['url'] = video_url - - if entry_info_dict.get('formats'): - self._sort_formats(entry_info_dict['formats']) - - entries.append(entry_info_dict) - - if len(entries) == 1: - return entries[0] - else: - for num, e in enumerate(entries, start=1): - # 'url' results don't have a title - if e.get('title') is not None: - e['title'] = '%s (%d)' % (e['title'], num) - return { - '_type': 'playlist', - 'entries': entries, - } diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py deleted file mode 100644 index d264fe2..0000000 --- a/youtube_dl/extractor/openload.py +++ /dev/null @@ -1,379 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import os -import re -import subprocess -import tempfile - -from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_kwargs, -) -from ..utils import ( - check_executable, - determine_ext, - encodeArgument, - ExtractorError, - get_element_by_id, - get_exe_version, - is_outdated_version, - std_headers, -) - - -def cookie_to_dict(cookie): - cookie_dict = { - 'name': cookie.name, - 'value': cookie.value, - } - if cookie.port_specified: - cookie_dict['port'] = cookie.port - if cookie.domain_specified: - cookie_dict['domain'] = cookie.domain - if cookie.path_specified: - cookie_dict['path'] = cookie.path - if cookie.expires is not None: - cookie_dict['expires'] = cookie.expires - if cookie.secure is not None: - cookie_dict['secure'] = cookie.secure - if cookie.discard is not None: - cookie_dict['discard'] = cookie.discard - try: - if (cookie.has_nonstandard_attr('httpOnly') or - cookie.has_nonstandard_attr('httponly') or - cookie.has_nonstandard_attr('HttpOnly')): - cookie_dict['httponly'] = True - except TypeError: - pass - return cookie_dict - - -def cookie_jar_to_list(cookie_jar): - return [cookie_to_dict(cookie) for cookie in cookie_jar] - - -class PhantomJSwrapper(object): - """PhantomJS wrapper class - - This class is experimental. - """ - - _TEMPLATE = r''' - phantom.onError = function(msg, trace) {{ - var msgStack = ['PHANTOM ERROR: ' + msg]; - if(trace && trace.length) {{ - msgStack.push('TRACE:'); - trace.forEach(function(t) {{ - msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line - + (t.function ? ' (in function ' + t.function +')' : '')); - }}); - }} - console.error(msgStack.join('\n')); - phantom.exit(1); - }}; - var page = require('webpage').create(); - var fs = require('fs'); - var read = {{ mode: 'r', charset: 'utf-8' }}; - var write = {{ mode: 'w', charset: 'utf-8' }}; - JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{ - phantom.addCookie(x); - }}); - page.settings.resourceTimeout = {timeout}; - page.settings.userAgent = "{ua}"; - page.onLoadStarted = function() {{ - page.evaluate(function() {{ - delete window._phantom; - delete window.callPhantom; - }}); - }}; - var saveAndExit = function() {{ - fs.write("{html}", page.content, write); - fs.write("{cookies}", JSON.stringify(phantom.cookies), write); - phantom.exit(); - }}; - page.onLoadFinished = function(status) {{ - if(page.url === "") {{ - page.setContent(fs.read("{html}", read), "{url}"); - }} - else {{ - {jscode} - }} - }}; - page.open(""); - ''' - - _TMP_FILE_NAMES = ['script', 'html', 'cookies'] - - @staticmethod - def _version(): - return get_exe_version('phantomjs', version_re=r'([0-9.]+)') - - def __init__(self, extractor, required_version=None, timeout=10000): - self._TMP_FILES = {} - - self.exe = check_executable('phantomjs', ['-v']) - if not self.exe: - raise ExtractorError('PhantomJS executable not found in PATH, ' - 'download it from http://phantomjs.org', - expected=True) - - self.extractor = extractor - - if required_version: - version = self._version() - if is_outdated_version(version, required_version): - self.extractor._downloader.report_warning( - 'Your copy of PhantomJS is outdated, update it to version ' - '%s or newer if you encounter any errors.' % required_version) - - self.options = { - 'timeout': timeout, - } - for name in self._TMP_FILE_NAMES: - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.close() - self._TMP_FILES[name] = tmp - - def __del__(self): - for name in self._TMP_FILE_NAMES: - try: - os.remove(self._TMP_FILES[name].name) - except (IOError, OSError, KeyError): - pass - - def _save_cookies(self, url): - cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar) - for cookie in cookies: - if 'path' not in cookie: - cookie['path'] = '/' - if 'domain' not in cookie: - cookie['domain'] = compat_urlparse.urlparse(url).netloc - with open(self._TMP_FILES['cookies'].name, 'wb') as f: - f.write(json.dumps(cookies).encode('utf-8')) - - def _load_cookies(self): - with open(self._TMP_FILES['cookies'].name, 'rb') as f: - cookies = json.loads(f.read().decode('utf-8')) - for cookie in cookies: - if cookie['httponly'] is True: - cookie['rest'] = {'httpOnly': None} - if 'expiry' in cookie: - cookie['expire_time'] = cookie['expiry'] - self.extractor._set_cookie(**compat_kwargs(cookie)) - - def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): - """ - Downloads webpage (if needed) and executes JS - - Params: - url: website url - html: optional, html code of website - video_id: video id - note: optional, displayed when downloading webpage - note2: optional, displayed when executing JS - headers: custom http headers - jscode: code to be executed when page is loaded - - Returns tuple with: - * downloaded website (after JS execution) - * anything you print with `console.log` (but not inside `page.execute`!) - - In most cases you don't need to add any `jscode`. - It is executed in `page.onLoadFinished`. - `saveAndExit();` is mandatory, use it instead of `phantom.exit()` - It is possible to wait for some element on the webpage, for example: - var check = function() { - var elementFound = page.evaluate(function() { - return document.querySelector('#b.done') !== null; - }); - if(elementFound) - saveAndExit(); - else - window.setTimeout(check, 500); - } - - page.evaluate(function(){ - document.querySelector('#a').click(); - }); - check(); - """ - if 'saveAndExit();' not in jscode: - raise ExtractorError('`saveAndExit();` not found in `jscode`') - if not html: - html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) - with open(self._TMP_FILES['html'].name, 'wb') as f: - f.write(html.encode('utf-8')) - - self._save_cookies(url) - - replaces = self.options - replaces['url'] = url - user_agent = headers.get('User-Agent') or std_headers['User-Agent'] - replaces['ua'] = user_agent.replace('"', '\\"') - replaces['jscode'] = jscode - - for x in self._TMP_FILE_NAMES: - replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') - - with open(self._TMP_FILES['script'].name, 'wb') as f: - f.write(self._TEMPLATE.format(**replaces).encode('utf-8')) - - if video_id is None: - self.extractor.to_screen('%s' % (note2,)) - else: - self.extractor.to_screen('%s: %s' % (video_id, note2)) - - p = subprocess.Popen([ - self.exe, '--ssl-protocol=any', - self._TMP_FILES['script'].name - ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = p.communicate() - if p.returncode != 0: - raise ExtractorError( - 'Executing JS failed\n:' + encodeArgument(err)) - with open(self._TMP_FILES['html'].name, 'rb') as f: - html = f.read().decode('utf-8') - - self._load_cookies() - - return (html, encodeArgument(out)) - - -class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)' - - _TESTS = [{ - 'url': 'https://openload.co/f/kUEfGclsU9o', - 'md5': 'bf1c059b004ebc7a256f89408e65c36e', - 'info_dict': { - 'id': 'kUEfGclsU9o', - 'ext': 'mp4', - 'title': 'skyrim_no-audio_1080.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - }, { - 'url': 'https://openload.co/embed/rjC09fkPLYs', - 'info_dict': { - 'id': 'rjC09fkPLYs', - 'ext': 'mp4', - 'title': 'movie.mp4', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': { - 'en': [{ - 'ext': 'vtt', - }], - }, - }, - 'params': { - 'skip_download': True, # test subtitles only - }, - }, { - 'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4', - 'only_matching': True, - }, { - 'url': 'https://openload.io/f/ZAn6oz-VZGE/', - 'only_matching': True, - }, { - 'url': 'https://openload.co/f/_-ztPaZtMhM/', - 'only_matching': True, - }, { - # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout - # for title and ext - 'url': 'https://openload.co/embed/Sxz5sADo82g/', - 'only_matching': True, - }, { - # unavailable via https://openload.co/embed/e-Ixz9ZR5L0/ but available - # via https://openload.co/f/e-Ixz9ZR5L0/ - 'url': 'https://openload.co/f/e-Ixz9ZR5L0/', - 'only_matching': True, - }, { - 'url': 'https://oload.tv/embed/KnG-kKZdcfY/', - 'only_matching': True, - }, { - 'url': 'http://www.openload.link/f/KnG-kKZdcfY', - 'only_matching': True, - }, { - 'url': 'https://oload.stream/f/KnG-kKZdcfY', - 'only_matching': True, - }, { - 'url': 'https://oload.xyz/f/WwRBpzW8Wtk', - 'only_matching': True, - }, { - 'url': 'https://oload.win/f/kUEfGclsU9o', - 'only_matching': True, - }, { - 'url': 'https://oload.download/f/kUEfGclsU9o', - 'only_matching': True, - }, { - # Its title has not got its extension but url has it - 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', - 'only_matching': True, - }] - - _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\']((?:https?://)?(?:openload\.(?:co|io)|oload\.tv)/embed/[a-zA-Z0-9-_]+)', - webpage) - - def _real_extract(self, url): - video_id = self._match_id(url) - url_pattern = 'https://openload.co/%%s/%s/' % video_id - headers = { - 'User-Agent': self._USER_AGENT, - } - - for path in ('embed', 'f'): - page_url = url_pattern % path - last = path == 'f' - webpage = self._download_webpage( - page_url, video_id, 'Downloading %s webpage' % path, - headers=headers, fatal=last) - if not webpage: - continue - if 'File not found' in webpage or 'deleted by the owner' in webpage: - if not last: - continue - raise ExtractorError('File not found', expected=True, video_id=video_id) - break - - phantom = PhantomJSwrapper(self, required_version='2.0') - webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers) - - decoded_id = (get_element_by_id('streamurl', webpage) or - get_element_by_id('streamuri', webpage) or - get_element_by_id('streamurj', webpage) or - self._search_regex( - (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', - r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)', - r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<', - r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<', - r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, - 'stream URL')) - - video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id - - title = self._og_search_title(webpage, default=None) or self._search_regex( - r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, - 'title', default=None) or self._html_search_meta( - 'description', webpage, 'title', fatal=True) - - entries = self._parse_html5_media_entries(page_url, webpage, video_id) - entry = entries[0] if entries else {} - subtitles = entry.get('subtitles') - - info_dict = { - 'id': video_id, - 'title': title, - 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), - 'url': video_url, - 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'), - 'subtitles': subtitles, - 'http_headers': headers, - } - return info_dict diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py deleted file mode 100644 index df8ec25..0000000 --- a/youtube_dl/extractor/youtube.py +++ /dev/null @@ -1,3394 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - - -import itertools -import json -import os.path -import random -import re -import time -import traceback -import html - -from .common import InfoExtractor, SearchInfoExtractor -from ..jsinterp import JSInterpreter -from ..swfinterp import SWFInterpreter -from ..compat import ( - compat_chr, - compat_HTTPError, - compat_kwargs, - compat_parse_qs, - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, - compat_str, -) -from ..utils import ( - bool_or_none, - clean_html, - dict_get, - error_to_compat_str, - extract_attributes, - ExtractorError, - float_or_none, - get_element_by_attribute, - get_element_by_id, - int_or_none, - mimetype2ext, - orderedSet, - parse_codecs, - parse_duration, - remove_quotes, - remove_start, - smuggle_url, - str_or_none, - str_to_int, - try_get, - unescapeHTML, - unified_strdate, - unsmuggle_url, - uppercase_escape, - url_or_none, - urlencode_postdata, -) -class YoutubeError(Exception): - pass - -class YoutubeBaseInfoExtractor(InfoExtractor): - """Provide base functions for Youtube extractors""" - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - - _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' - _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' - _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' - - _NETRC_MACHINE = 'youtube' - # If True it will raise an error if no login info is provided - _LOGIN_REQUIRED = False - - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' - - def _set_language(self): - self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&hl=en', - # YouTube sets the expire time to about two months - expire_time=time.time() + 2 * 30 * 24 * 3600) - - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - - def _login(self): - """ - Attempt to log in to YouTube. - True is returned if successful or skipped. - False is returned if login failed. - - If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. - """ - username, password = self._get_login_info() - # No authentication to be performed - if username is None: - if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: - raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) - return True - - login_page = self._download_webpage( - self._LOGIN_URL, None, - note='Downloading login page', - errnote='unable to fetch login page', fatal=False) - if login_page is False: - return - - login_form = self._hidden_inputs(login_page) - - def req(url, f_req, note, errnote): - data = login_form.copy() - data.update({ - 'pstMsg': 1, - 'checkConnection': 'youtube', - 'checkedDomains': 'youtube', - 'hl': 'en', - 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', - 'f.req': json.dumps(f_req), - 'flowName': 'GlifWebSignIn', - 'flowEntry': 'ServiceLogin', - # TODO: reverse actual botguard identifier generation algo - 'bgRequest': '["identifier",""]', - }) - return self._download_json( - url, None, note=note, errnote=errnote, - transform_source=lambda s: re.sub(r'^[^[]*', '', s), - fatal=False, - data=urlencode_postdata(data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', - 'Google-Accounts-XSRF': 1, - }) - - def warn(message): - self._downloader.report_warning(message) - - lookup_req = [ - username, - None, [], None, 'US', None, None, 2, False, True, - [ - None, None, - [2, 1, None, 1, - 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', - None, [], 4], - 1, [None, None, []], None, None, None, True - ], - username, - ] - - lookup_results = req( - self._LOOKUP_URL, lookup_req, - 'Looking up account info', 'Unable to look up account info') - - if lookup_results is False: - return False - - user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) - if not user_hash: - warn('Unable to extract user hash') - return False - - challenge_req = [ - user_hash, - None, 1, None, [1, None, None, None, [password, None, True]], - [ - None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], - 1, [None, None, []], None, None, None, True - ]] - - challenge_results = req( - self._CHALLENGE_URL, challenge_req, - 'Logging in', 'Unable to log in') - - if challenge_results is False: - return - - login_res = try_get(challenge_results, lambda x: x[0][5], list) - if login_res: - login_msg = try_get(login_res, lambda x: x[5], compat_str) - warn( - 'Unable to login: %s' % 'Invalid password' - if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) - return False - - res = try_get(challenge_results, lambda x: x[0][-1], list) - if not res: - warn('Unable to extract result entry') - return False - - login_challenge = try_get(res, lambda x: x[0][0], list) - if login_challenge: - challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) - if challenge_str == 'TWO_STEP_VERIFICATION': - # SEND_SUCCESS - TFA code has been successfully sent to phone - # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(login_challenge, lambda x: x[5], compat_str) - if status == 'QUOTA_EXCEEDED': - warn('Exceeded the limit of TFA codes, try later') - return False - - tl = try_get(challenge_results, lambda x: x[1][2], compat_str) - if not tl: - warn('Unable to extract TL') - return False - - tfa_code = self._get_tfa_info('2-step verification code') - - if not tfa_code: - warn( - 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False - - tfa_code = remove_start(tfa_code, 'G-') - - tfa_req = [ - user_hash, None, 2, None, - [ - 9, None, None, None, None, None, None, None, - [None, tfa_code, True, 2] - ]] - - tfa_results = req( - self._TFA_URL.format(tl), tfa_req, - 'Submitting TFA code', 'Unable to submit TFA code') - - if tfa_results is False: - return False - - tfa_res = try_get(tfa_results, lambda x: x[0][5], list) - if tfa_res: - tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) - warn( - 'Unable to finish TFA: %s' % 'Invalid TFA code' - if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) - return False - - check_cookie_url = try_get( - tfa_results, lambda x: x[0][-1][2], compat_str) - else: - CHALLENGES = { - 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", - 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', - 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", - } - challenge = CHALLENGES.get( - challenge_str, - '%s returned error %s.' % (self.IE_NAME, challenge_str)) - warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) - return False - else: - check_cookie_url = try_get(res, lambda x: x[2], compat_str) - - if not check_cookie_url: - warn('Unable to extract CheckCookie URL') - return False - - check_cookie_results = self._download_webpage( - check_cookie_url, None, 'Checking cookie', fatal=False) - - if check_cookie_results is False: - return False - - if 'https://myaccount.google.com/' not in check_cookie_results: - warn('Unable to log in') - return False - - return True - - def _download_webpage_handle(self, *args, **kwargs): - query = kwargs.get('query', {}).copy() - query['disable_polymer'] = 'true' - kwargs['query'] = query - return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) - - def _real_initialize(self): - if self._downloader is None: - return - self._set_language() - if not self._login(): - return - - -class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): - # Extract entries from page with "Load more" button - def _entries(self, page, playlist_id): - more_widget_html = content_html = page - for page_num in itertools.count(1): - for entry in self._process_page(content_html): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - count = 0 - retries = 3 - while count <= retries: - try: - # Downloading page may result in intermittent 5xx HTTP error - # that is usually worked around with a retry - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s%s' - % (page_num, ' (retry #%d)' % count if count else ''), - transform_source=uppercase_escape) - break - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): - count += 1 - if count <= retries: - continue - raise - - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] - - -class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for video_id, video_title in self.extract_videos_from_page(content): - yield self.url_result(video_id, 'Youtube', video_id, video_title) - - def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): - for mobj in re.finditer(video_re, page): - # The link with index 0 is not the first video of the playlist (not sure if still actual) - if 'index' in mobj.groupdict() and mobj.group('id') == '0': - continue - video_id = mobj.group('id') - video_title = unescapeHTML( - mobj.group('title')) if 'title' in mobj.groupdict() else None - if video_title: - video_title = video_title.strip() - if video_title == '► Play all': - video_title = None - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) - - -class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for playlist_id in orderedSet(re.findall( - r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', - content)): - yield self.url_result( - 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._og_search_title(webpage, fatal=False) - return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) - - -class YoutubeIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com' - _VALID_URL = r"""(?x)^ - ( - (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| - (?:www\.)?deturl\.com/www\.youtube\.com/| - (?:www\.)?pwnyoutube\.com/| - (?:www\.)?hooktube\.com/| - (?:www\.)?yourepeat\.com/| - tube\.majestyc\.net/| - # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances - (?:(?:www|dev)\.)?invidio\.us/| - (?:(?:www|no)\.)?invidiou\.sh/| - (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| - (?:www\.)?invidious\.kabi\.tk/| - (?:www\.)?invidious\.enkirton\.net/| - (?:www\.)?invidious\.13ad\.de/| - (?:www\.)?invidious\.mastodon\.host/| - (?:www\.)?invidious\.nixnet\.xyz/| - (?:www\.)?tube\.poal\.co/| - (?:www\.)?vid\.wxzm\.sx/| - (?:www\.)?yt\.elukerio\.org/| - (?:www\.)?kgg2m7yk5aybusll\.onion/| - (?:www\.)?qklhadlycap4cnod\.onion/| - (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| - (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| - (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| - (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| - youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains - (?:.*?\#/)? # handle anchor (#/) redirect urls - (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ - |(?: # or the v= param in all its forms - (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) - (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) - v= - ) - )) - |(?: - youtu\.be| # just youtu.be/xxxx - vid\.plus| # or vid.plus/xxxx - zwearz\.com/watch| # or zwearz.com/watch/xxxx - )/ - |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= - ) - )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist= - (?: - %(playlist_id)s| # combined list/video URLs are handled by the playlist IE - WL # WL are handled by the watch later IE - ) - ) - (?(1).+)? # if we found the ID, everything can follow - $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - - - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - - # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, - - # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, - - # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, - - # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - - # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, - - # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, - - # RTMP (unnamed) - '_rtmp': {'protocol': 'rtmp'}, - - # av01 video only formats sometimes served with "unknown" codecs - '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - } - _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') - - _GEO_BYPASS = False - - IE_NAME = 'youtube' - _TESTS = [ - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', - 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'start_time': 1, - 'end_time': 9, - } - }, - { - 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY', - 'note': 'Test generic use_cipher_signature video (#897)', - 'info_dict': { - 'id': 'UxxajLWwzqY', - 'ext': 'mp4', - 'upload_date': '20120506', - 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', - 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', - 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', - 'iconic ep', 'iconic', 'love', 'it'], - 'duration': 180, - 'uploader': 'Icona Pop', - 'uploader_id': 'IconaPop', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', - 'creator': 'Icona Pop', - 'track': 'I Love It (feat. Charli XCX)', - 'artist': 'Icona Pop', - } - }, - { - 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ', - 'note': 'Test VEVO video with age protection (#956)', - 'info_dict': { - 'id': '07FYdnEawAQ', - 'ext': 'mp4', - 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', - 'alt_title': 'Tunnel Vision', - 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', - 'duration': 419, - 'uploader': 'justintimberlakeVEVO', - 'uploader_id': 'justintimberlakeVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', - 'creator': 'Justin Timberlake', - 'track': 'Tunnel Vision', - 'artist': 'Justin Timberlake', - 'age_limit': 18, - } - }, - { - 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', - 'note': 'Embed-only video (#1746)', - 'info_dict': { - 'id': 'yZIXLfi8CZQ', - 'ext': 'mp4', - 'upload_date': '20120608', - 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', - 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', - 'uploader': 'SET India', - 'uploader_id': 'setindia', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', - 'age_limit': 18, - } - }, - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', - 'note': 'Use the first video ID in the URL', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', - 'note': '256k DASH audio (format 141) via DASH manifest', - 'info_dict': { - 'id': 'a9LDPn-MO4I', - 'ext': 'm4a', - 'upload_date': '20121002', - 'uploader_id': '8KVIDEO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', - 'description': '', - 'uploader': '8KVIDEO', - 'title': 'UHDTV TEST 8K VIDEO.mp4' - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141', - }, - 'skip': 'format 141 not served anymore', - }, - # DASH manifest with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', - 'info_dict': { - 'id': 'IB3lcPjvWLA', - 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', - 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', - 'duration': 244, - 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', - 'upload_date': '20131011', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # JS player signature function name containing $ - { - 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', - 'info_dict': { - 'id': 'nfWlot6h_JM', - 'ext': 'm4a', - 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:bec2185232c05479482cb5a9b82719bf', - 'duration': 242, - 'uploader': 'TaylorSwiftVEVO', - 'uploader_id': 'TaylorSwiftVEVO', - 'upload_date': '20140818', - 'creator': 'Taylor Swift', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # Controversy video - { - 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', - 'info_dict': { - 'id': 'T4XJQO3qol8', - 'ext': 'mp4', - 'duration': 219, - 'upload_date': '20100909', - 'uploader': 'Amazing Atheist', - 'uploader_id': 'TheAmazingAtheist', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', - 'title': 'Burning Everyone\'s Koran', - 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', - } - }, - # Normal age-gate video (No vevo, embed allowed) - { - 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', - 'info_dict': { - 'id': 'HtVdAasjOgU', - 'ext': 'mp4', - 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', - 'duration': 142, - 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', - 'upload_date': '20140605', - 'age_limit': 18, - }, - }, - # Age-gate video with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', - 'info_dict': { - 'id': '6kLq3WMV1nU', - 'ext': 'mp4', - 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', - 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 246, - 'uploader': 'LloydVEVO', - 'uploader_id': 'LloydVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', - 'upload_date': '20110629', - 'age_limit': 18, - }, - }, - # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) - # YouTube Red ad is not captured for creator - { - 'url': '__2ABJjxzNo', - 'info_dict': { - 'id': '__2ABJjxzNo', - 'ext': 'mp4', - 'duration': 266, - 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'deadmau5', - 'description': 'md5:12c56784b8032162bb936a5f76d55360', - 'uploader': 'deadmau5', - 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'Some Chords', - }, - 'expected_warnings': [ - 'DASH manifest missing', - ] - }, - # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) - { - 'url': 'lqQg6PlCWgI', - 'info_dict': { - 'id': 'lqQg6PlCWgI', - 'ext': 'mp4', - 'duration': 6085, - 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympic', - 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', - }, - 'params': { - 'skip_download': 'requires avconv', - } - }, - # Non-square pixels - { - 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', - 'info_dict': { - 'id': '_b-2C3KPAM0', - 'ext': 'mp4', - 'stretched_ratio': 16 / 9., - 'duration': 85, - 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', - 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫ᄋᄅ', - 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', - }, - }, - # url_encoded_fmt_stream_map is empty string - { - 'url': 'qEJwOuvDf7I', - 'info_dict': { - 'id': 'qEJwOuvDf7I', - 'ext': 'webm', - 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', - 'description': '', - 'upload_date': '20150404', - 'uploader_id': 'spbelect', - 'uploader': 'Наблюдатели Петербурга', - }, - 'params': { - 'skip_download': 'requires avconv', - }, - 'skip': 'This live event has ended.', - }, - # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) - { - 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', - 'info_dict': { - 'id': 'FIl7x6_3R5Y', - 'ext': 'webm', - 'title': 'md5:7b81415841e02ecd4313668cde88737a', - 'description': 'md5:116377fd2963b81ec4ce64b542173306', - 'duration': 220, - 'upload_date': '20150625', - 'uploader_id': 'dorappi2000', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', - 'uploader': 'dorappi2000', - 'formats': 'mincount:31', - }, - 'skip': 'not actual anymore', - }, - # DASH manifest with segment_list - { - 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', - 'md5': '8ce563a1d667b599d21064e982ab9e31', - 'info_dict': { - 'id': 'CsmdDsKjzN8', - 'ext': 'mp4', - 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510 - 'uploader': 'Airtek', - 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', - 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', - 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '135', # bestvideo - }, - 'skip': 'This live event has ended.', - }, - { - # Multifeed videos (multiple cameras), URL is for Main Camera - 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs', - 'info_dict': { - 'id': 'jqWvoWXjCVs', - 'title': 'teamPGP: Rocket League Noob Stream', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - }, - 'playlist': [{ - 'info_dict': { - 'id': 'jqWvoWXjCVs', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7335, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': '6h8e8xoXJzg', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': 'PUOgX5z9xZw', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': 'teuwxikvS5k', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (zim)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7334, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }], - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - }, - { - # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) - 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', - 'info_dict': { - 'id': 'gVfLd0zydlo', - 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', - }, - 'playlist_count': 2, - 'skip': 'Not multifeed anymore', - }, - { - 'url': 'https://vid.plus/FlRa-iH7PGw', - 'only_matching': True, - }, - { - 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', - 'only_matching': True, - }, - { - # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468) - # Also tests cut-off URL expansion in video description (see - # https://github.com/ytdl-org/youtube-dl/issues/1892, - # https://github.com/ytdl-org/youtube-dl/issues/8164) - 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', - 'info_dict': { - 'id': 'lsguqyKfVQg', - 'ext': 'mp4', - 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk - Position Music', - 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', - 'duration': 133, - 'upload_date': '20151119', - 'uploader_id': 'IronSoulElf', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', - 'uploader': 'IronSoulElf', - 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'track': 'Dark Walk - Position Music', - 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468) - 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', - 'only_matching': True, - }, - { - # Video with yt:stretch=17:0 - 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM', - 'info_dict': { - 'id': 'Q39EVAstoRM', - 'ext': 'mp4', - 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4', - 'description': 'md5:ee18a25c350637c8faff806845bddee9', - 'upload_date': '20151107', - 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA', - 'uploader': 'CH GAMER DROID', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video does not exist.', - }, - { - # Video licensed under Creative Commons - 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', - 'info_dict': { - 'id': 'M4gD1WSo5mA', - 'ext': 'mp4', - 'title': 'md5:e41008789470fc2533a3252216f1c1d1', - 'description': 'md5:a677553cf0840649b731a3024aeff4cc', - 'duration': 721, - 'upload_date': '20150127', - 'uploader_id': 'BerkmanCenter', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', - 'uploader': 'The Berkman Klein Center for Internet & Society', - 'license': 'Creative Commons Attribution license (reuse allowed)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Channel-like uploader_url - 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', - 'info_dict': { - 'id': 'eQcmzGIKrzg', - 'ext': 'mp4', - 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', - 'description': 'md5:dda0d780d5a6e120758d1711d062a867', - 'duration': 4060, - 'upload_date': '20151119', - 'uploader': 'Bernie Sanders', - 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', - 'license': 'Creative Commons Attribution license (reuse allowed)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', - 'only_matching': True, - }, - { - # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059) - 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', - 'only_matching': True, - }, - { - # Rental video preview - 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', - 'info_dict': { - 'id': 'uGpuVWrhIzE', - 'ext': 'mp4', - 'title': 'Piku - Trailer', - 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', - 'upload_date': '20150811', - 'uploader': 'FlixMatrix', - 'uploader_id': 'FlixMatrixKaravan', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', - 'license': 'Standard YouTube License', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - }, - { - # YouTube Red video with episode data - 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4', - 'info_dict': { - 'id': 'iqKdEhx-dD4', - 'ext': 'mp4', - 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', - 'duration': 2085, - 'upload_date': '20170118', - 'uploader': 'Vsauce', - 'uploader_id': 'Vsauce', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', - 'series': 'Mind Field', - 'season_number': 1, - 'episode_number': 1, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': [ - 'Skipping DASH manifest', - ], - }, - { - # The following content has been identified by the YouTube community - # as inappropriate or offensive to some audiences. - 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', - 'info_dict': { - 'id': '6SJNVb0GnPI', - 'ext': 'mp4', - 'title': 'Race Differences in Intelligence', - 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', - 'duration': 965, - 'upload_date': '20140124', - 'uploader': 'New Century Foundation', - 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # itag 212 - 'url': '1t24XAntNCY', - 'only_matching': True, - }, - { - # geo restricted to JP - 'url': 'sJL6WA-aGkQ', - 'only_matching': True, - }, - { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, - { - 'url': 'https://invidio.us/watch?v=BaW_jenozKc', - 'only_matching': True, - }, - { - # DRM protected - 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', - 'only_matching': True, - }, - { - # Video with unsupported adaptive stream type formats - 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', - 'info_dict': { - 'id': 'Z4Vy8R84T1U', - 'ext': 'mp4', - 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'duration': 433, - 'upload_date': '20130923', - 'uploader': 'Amelia Putri Harwita', - 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', - 'formats': 'maxcount:10', - }, - 'params': { - 'skip_download': True, - 'youtube_include_dash_manifest': False, - }, - }, - { - # Youtube Music Auto-generated description - 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', - 'info_dict': { - 'id': 'MgNrAu2pzNs', - 'ext': 'mp4', - 'title': 'Voyeur Girl', - 'description': 'md5:7ae382a65843d6df2685993e90a8628f', - 'upload_date': '20190312', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw', - 'artist': 'Stephen', - 'track': 'Voyeur Girl', - 'album': 'it\'s too much love to know my dear', - 'release_date': '20190313', - 'release_year': 2019, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # Retrieve 'artist' field from 'Artist:' in video description - # when it is present on youtube music video - 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', - 'info_dict': { - 'id': 'k0jLE7tTwjY', - 'ext': 'mp4', - 'title': 'Latch Feat. Sam Smith', - 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', - 'upload_date': '20150110', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', - 'artist': 'Disclosure', - 'track': 'Latch Feat. Sam Smith', - 'album': 'Latch Featuring Sam Smith', - 'release_date': '20121008', - 'release_year': 2012, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle multiple artists on youtube music video - 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', - 'info_dict': { - 'id': '74qn0eJSjpA', - 'ext': 'mp4', - 'title': 'Eastside', - 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', - 'upload_date': '20180710', - 'uploader': 'Benny Blanco - Topic', - 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', - 'artist': 'benny blanco, Halsey, Khalid', - 'track': 'Eastside', - 'album': 'Eastside', - 'release_date': '20180713', - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle youtube music video with release_year and no release_date - 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', - 'info_dict': { - 'id': '-hcAI0g-f5M', - 'ext': 'mp4', - 'title': 'Put It On Me', - 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e', - 'upload_date': '20180426', - 'uploader': 'Matt Maeson - Topic', - 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', - 'artist': 'Matt Maeson', - 'track': 'Put It On Me', - 'album': 'The Hearse', - 'release_date': None, - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - ] - - def __init__(self, *args, **kwargs): - super(YoutubeIE, self).__init__(*args, **kwargs) - self._player_cache = {} - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen('%s: Downloading video info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen('%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen('%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen('RTMP download detected') - - def _signature_cache_id(self, example_sig): - """ Return a string representation of a signature """ - return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) - - def _extract_signature_function(self, video_id, player_url, example_sig): - id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', - player_url) - if not id_m: - raise ExtractorError('Cannot identify player %r' % player_url) - player_type = id_m.group('ext') - player_id = id_m.group('id') - - # Read from filesystem cache - func_id = '%s_%s_%s' % ( - player_type, player_id, self._signature_cache_id(example_sig)) - assert os.path.basename(func_id) == func_id - - cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) - if cache_spec is not None: - return lambda s: ''.join(s[i] for i in cache_spec) - - download_note = ( - 'Downloading player %s' % player_url - if self._downloader.params.get('verbose') else - 'Downloading %s player %s' % (player_type, player_id) - ) - if player_type == 'js': - code = self._download_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - res = self._parse_sig_js(code) - elif player_type == 'swf': - urlh = self._request_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - code = urlh.read() - res = self._parse_sig_swf(code) - else: - assert False, 'Invalid player type %r' % player_type - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] - - self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) - return res - - def _print_sig_code(self, func, example_sig): - def gen_sig_code(idxs): - def _genslice(start, end, step): - starts = '' if start == 0 else str(start) - ends = (':%d' % (end + step)) if end + step >= 0 else ':' - steps = '' if step == 1 else (':%d' % step) - return 's[%s%s%s]' % (starts, ends, steps) - - step = None - # Quelch pyflakes warnings - start will be set when step is set - start = '(Never used)' - for i, prev in zip(idxs[1:], idxs[:-1]): - if step is not None: - if i - prev == step: - continue - yield _genslice(start, prev, step) - step = None - continue - if i - prev in [-1, 1]: - step = i - prev - start = prev - continue - else: - yield 's[%d]' % prev - if step is None: - yield 's[%d]' % i - else: - yield _genslice(start, i, step) - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = func(test_string) - cache_spec = [ord(c) for c in cache_res] - expr_code = ' + '.join(gen_sig_code(cache_spec)) - signature_id_tuple = '(%s)' % ( - ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) - code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' - ' return %s\n') % (signature_id_tuple, expr_code) - self.to_screen('Extracted signature function:\n' + code) - - def _parse_sig_js(self, jscode): - funcname = self._search_regex( - (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', - # Obsolete patterns - r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', - r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), - jscode, 'Initial JS player signature function name', group='sig') - - jsi = JSInterpreter(jscode) - initial_function = jsi.extract_function(funcname) - return lambda s: initial_function([s]) - - def _parse_sig_swf(self, file_contents): - swfi = SWFInterpreter(file_contents) - TARGET_CLASSNAME = 'SignatureDecipher' - searched_class = swfi.extract_class(TARGET_CLASSNAME) - initial_function = swfi.extract_function(searched_class, 'decipher') - return lambda s: initial_function([s]) - - def _decrypt_signature(self, s, video_id, player_url, age_gate=False): - """Turn the encrypted s field into a working signature""" - - if player_url is None: - raise ExtractorError('Cannot decrypt signature without player_url') - - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) - try: - player_id = (player_url, self._signature_cache_id(s)) - if player_id not in self._player_cache: - func = self._extract_signature_function( - video_id, player_url, s - ) - self._player_cache[player_id] = func - func = self._player_cache[player_id] - if self._downloader.params.get('youtube_print_sig_code'): - self._print_sig_code(func, s) - return func(s) - except Exception as e: - tb = traceback.format_exc() - raise ExtractorError( - 'Signature extraction failed: ' + tb, cause=e) - - def _get_subtitles(self, video_id, webpage): - try: - subs_doc = self._download_xml( - 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) - return {} - - sub_lang_list = {} - for track in subs_doc.findall('track'): - lang = track.attrib['lang_code'] - if lang in sub_lang_list: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': ext, - 'name': track.attrib['name'].encode('utf-8'), - }) - sub_formats.append({ - 'url': 'https://www.youtube.com/api/timedtext?' + params, - 'ext': ext, - }) - sub_lang_list[lang] = sub_formats - if not sub_lang_list: - self._downloader.report_warning('video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _get_ytplayer_config(self, video_id, webpage): - patterns = ( - # User data may contain arbitrary character sequences that may affect - # JSON extraction with regex, e.g. when '};' is contained the second - # regex won't capture the whole JSON. Yet working around by trying more - # concrete regex first keeping in mind proper quoted string handling - # to be implemented in future that will replace this workaround (see - # https://github.com/ytdl-org/youtube-dl/issues/7468, - # https://github.com/ytdl-org/youtube-dl/pull/7599) - r';ytplayer\.config\s*=\s*({.+?});ytplayer', - r';ytplayer\.config\s*=\s*({.+?});', - ) - config = self._search_regex( - patterns, webpage, 'ytplayer.config', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - - def _get_automatic_captions(self, video_id, webpage): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - self.to_screen('%s: Looking for automatic captions' % video_id) - player_config = self._get_ytplayer_config(video_id, webpage) - err_msg = 'Couldn\'t find automatic captions for %s' % video_id - if not player_config: - self._downloader.report_warning(err_msg) - return {} - try: - args = player_config['args'] - caption_url = args.get('ttsurl') - if caption_url: - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse_urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') - - sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_formats.append({ - 'url': caption_url + '&' + params, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list - - def make_captions(sub_url, sub_langs): - parsed_sub_url = compat_urllib_parse_urlparse(sub_url) - caption_qs = compat_parse_qs(parsed_sub_url.query) - captions = {} - for sub_lang in sub_langs: - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - captions[sub_lang] = sub_formats - return captions - - # New captions format as of 22.06.2017 - player_response = args.get('player_response') - if player_response and isinstance(player_response, compat_str): - player_response = self._parse_json( - player_response, video_id, fatal=False) - if player_response: - renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - base_url = renderer['captionTracks'][0]['baseUrl'] - sub_lang_list = [] - for lang in renderer['translationLanguages']: - lang_code = lang.get('languageCode') - if lang_code: - sub_lang_list.append(lang_code) - return make_captions(base_url, sub_lang_list) - - # Some videos don't provide ttsurl but rather caption_tracks and - # caption_translation_languages (e.g. 20LmZk1hakA) - # Does not used anymore as of 22.06.2017 - caption_tracks = args['caption_tracks'] - caption_translation_languages = args['caption_translation_languages'] - caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - sub_lang_list = [] - for lang in caption_translation_languages.split(','): - lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) - sub_lang = lang_qs.get('lc', [None])[0] - if sub_lang: - sub_lang_list.append(sub_lang) - return make_captions(caption_url, sub_lang_list) - # An extractor error can be raise by the download process if there are - # no automatic captions but there are subtitles - except (KeyError, IndexError, ExtractorError): - self._downloader.report_warning(err_msg) - return {} - - def _mark_watched(self, video_id, video_info, player_response): - playback_url = url_or_none(try_get( - player_response, - lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( - video_info, lambda x: x['videostats_playback_base_url'][0])) - if not playback_url: - return - parsed_playback_url = compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) - - # cpn generation algorithm is reverse engineered from base.js. - # In fact it works even with dummy cpn. - CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' - cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) - - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) - - self._download_webpage( - playback_url, video_id, 'Marking watched', - 'Unable to mark watched', fatal=False) - - @staticmethod - def _extract_urls(webpage): - # Embedded YouTube player - entries = [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) - \1''', webpage)] - - # lazyYT YouTube embed - entries.extend(list(map( - unescapeHTML, - re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) - - # Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - entries.extend(m[-1] for m in matches) - - return entries - - @staticmethod - def _extract_url(webpage): - urls = YoutubeIE._extract_urls(webpage) - return urls[0] if urls else None - - @classmethod - def extract_id(cls, url): - mobj = re.match(cls._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - video_id = mobj.group(2) - return video_id - - @staticmethod - def _extract_chapters(description, duration): - if not description: - return None - chapter_lines = re.findall( - r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', - description) - if not chapter_lines: - return None - chapters = [] - for next_num, (chapter_line, time_point) in enumerate( - chapter_lines, start=1): - start_time = parse_duration(time_point) - if start_time is None: - continue - if start_time > duration: - break - end_time = (duration if next_num == len(chapter_lines) - else parse_duration(chapter_lines[next_num][1])) - if end_time is None: - continue - if end_time > duration: - end_time = duration - if start_time > end_time: - break - chapter_title = re.sub( - r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') - chapter_title = re.sub(r'\s+', ' ', chapter_title) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': chapter_title, - }) - return chapters - - ul_tag_pattern = re.compile(r'(</?ul)') - music_info_pattern = re.compile(r'<h4 class="title">\s*(Song|Music|Artist|Album)\s*</h4>\s*<ul class="content watch-info-tag-list">\s*<li>(?:<a[^>]*>)?([^<]*)(?:</a>)?</li>') - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - proto = ( - 'http' if self._downloader.params.get('prefer_insecure', False) - else 'https') - - start_time = None - end_time = None - parsed_url = compat_urllib_parse_urlparse(url) - for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) - if start_time is None and 't' in query: - start_time = parse_duration(query['t'][0]) - if start_time is None and 'start' in query: - start_time = parse_duration(query['start'][0]) - if end_time is None and 'end' in query: - end_time = parse_duration(query['end'][0]) - - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') - video_id = self.extract_id(url) - - # Get video webpage - url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage = self._download_webpage(url, video_id) - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - dash_mpds = [] - - def add_dash_mpd(video_info): - dash_mpd = video_info.get('dashmpd') - if dash_mpd and dash_mpd[0] not in dash_mpds: - dash_mpds.append(dash_mpd[0]) - - def add_dash_mpd_pr(pl_response): - dash_mpd = url_or_none(try_get( - pl_response, lambda x: x['streamingData']['dashManifestUrl'], - compat_str)) - if dash_mpd and dash_mpd not in dash_mpds: - dash_mpds.append(dash_mpd) - - is_live = None - view_count = None - - def extract_view_count(v_info): - return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - - def extract_token(v_info): - return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) - - def extract_player_response(player_response, video_id): - pl_response = str_or_none(player_response) - if not pl_response: - return - pl_response = self._parse_json(pl_response, video_id, fatal=False) - if isinstance(pl_response, dict): - add_dash_mpd_pr(pl_response) - return pl_response - - player_response = {} - - - - # Is it unlisted? - unlisted = ('<span id="watch-privacy-icon"' in video_webpage) - - - # Related videos - related_vids = [] - try: - rvs_match = re.search(r'"rvs":"(.*?)[^\\]"', video_webpage) - if rvs_match is not None: - rvs = json.loads('"' + rvs_match.group(1) + '"') # unescape json string (\u0026 for example) - related_vid_parts = (compat_parse_qs(related_item) for related_item in rvs.split(",")) - related_vids = [{key : value[0] for key,value in vid.items()} for vid in related_vid_parts] - else: - print('Failed to extract related videos: no rvs') - - except Exception: - print('Error while extracting related videos:') - traceback.print_exc() - - - # Music list - # Test case: https://www.youtube.com/watch?v=jbkZdRglnKY - music_list = [] - metadata_start = video_webpage.find('<ul class="watch-extras-section">') - if metadata_start != -1: - metadata_start += 33 - tag_index = metadata_start - open_tags = 1 - while open_tags > 0: - match = self.ul_tag_pattern.search(video_webpage, tag_index) - if match is None: - print("Couldn't match ul tag") - break - tag_index = match.end() - tag = match.group(1) - if tag == "<ul": - open_tags += 1 - else: - open_tags -= 1 - else: - last_index = 0 - metadata = video_webpage[metadata_start:tag_index] - current_song = None - while True: - match = self.music_info_pattern.search(metadata, last_index) - if match is None: - if current_song is not None: - music_list.append(current_song) - break - title, value = match.group(1), html.unescape(match.group(2)) - if title in ("Song", "Music"): - if current_song is not None: - music_list.append(current_song) - current_song = {"title": value} - else: - current_song[title.lower()] = value - last_index = match.end() - - - - # Get video info - embed_webpage = None - if re.search(r'player-age-gate-content">', video_webpage) is not None: - age_gate = True - # We simulate the access to the video from www.youtube.com/v/{video_id} - # this can be viewed without login into Youtube - url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse_urlencode({ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), - }) - video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) - else: - age_gate = False - video_info = None - sts = None - # Try looking directly into the video webpage - ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. - # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - sts = ytplayer_config.get('sts') - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) - if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - add_dash_mpd_pr(player_response) - # We also try looking in get_video_info since it may contain different dashmpd - # URL that points to a DASH manifest with possibly different itag set (some itags - # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH - # manifest pointed by get_video_info's dashmpd). - # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) - self.report_video_info_webpage_download(video_id) - for el in ('embedded', 'detailpage', 'vevo', ''): - query = { - 'video_id': video_id, - 'ps': 'default', - 'eurl': '', - 'gl': 'US', - 'hl': 'en', - } - if el: - query['el'] = el - if sts: - query['sts'] = sts - video_info_webpage = self._download_webpage( - '%s://www.youtube.com/get_video_info' % proto, - video_id, note=False, - errnote='unable to download video info webpage', - fatal=False, query=query) - if not video_info_webpage: - continue - get_video_info = compat_parse_qs(video_info_webpage) - if not player_response: - pl_response = get_video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(get_video_info) - if view_count is None: - view_count = extract_view_count(get_video_info) - if not video_info: - video_info = get_video_info - get_token = extract_token(get_video_info) - if get_token: - # Different get_video_info requests may report different results, e.g. - # some may report video unavailability, but some may serve it without - # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, - # the original webpage as well as el=info and el=embedded get_video_info - # requests report video unavailability due to geo restriction while - # el=detailpage succeeds and returns valid data). This is probably - # due to YouTube measures against IP ranges of hosting providers. - # Working around by preferring the first succeeded video_info containing - # the token if no such video_info yet was found. - token = extract_token(video_info) - if not token: - video_info = get_video_info - break - - def extract_unavailable_message(): - messages = [] - for tag, kind in (('h1', 'message'), ('div', 'submessage')): - msg = self._html_search_regex( - r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind), - video_webpage, 'unavailable %s' % kind, default=None) - if msg: - messages.append(msg) - if messages: - return '\n'.join(messages) - - if not video_info: - unavailable_message = extract_unavailable_message() - if not unavailable_message: - unavailable_message = 'Unable to extract video data' - raise ExtractorError( - 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - - video_details = try_get( - player_response, lambda x: x['videoDetails'], dict) or {} - - video_title = video_info.get('title', [None])[0] or video_details.get('title') - if not video_title: - self._downloader.report_warning('Unable to extract video title') - video_title = '_' - - description_original = video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - - def replace_url(m): - redir_url = compat_urlparse.urljoin(url, m.group(1)) - parsed_redir_url = compat_urllib_parse_urlparse(redir_url) - if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': - qs = compat_parse_qs(parsed_redir_url.query) - q = qs.get('q') - if q and q[0]: - return q[0] - return redir_url - - description_original = video_description = re.sub(r'''(?x) - <a\s+ - (?:[a-zA-Z-]+="[^"]*"\s+)*? - (?:title|href)="([^"]+)"\s+ - (?:[a-zA-Z-]+="[^"]*"\s+)*? - class="[^"]*"[^>]*> - [^<]+\.{3}\s* - </a> - ''', replace_url, video_description) - video_description = clean_html(video_description) - else: - video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription') - - if not smuggled_data.get('force_singlefeed', False): - if not self._downloader.params.get('noplaylist'): - multifeed_metadata_list = try_get( - player_response, - lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], - compat_str) or try_get( - video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) - if multifeed_metadata_list: - entries = [] - feed_ids = [] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), - }) - feed_ids.append(feed_data['id'][0]) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) - else: - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - - if view_count is None: - view_count = extract_view_count(video_info) - if view_count is None and video_details: - view_count = int_or_none(video_details.get('viewCount')) - - if is_live is None: - is_live = bool_or_none(video_details.get('isLive')) - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - - def _extract_filesize(media_url): - return int_or_none(self._search_regex( - r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) - - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or [] - streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or []) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - formats = [{ - 'format_id': '_rtmp', - 'protocol': 'rtmp', - 'url': video_info['conn'][0], - 'player_url': player_url, - }] - elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] - if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) - formats = [] - formats_spec = {} - fmt_list = video_info.get('fmt_list', [''])[0] - if fmt_list: - for fmt in fmt_list.split(','): - spec = fmt.split('/') - if len(spec) > 1: - width_height = spec[1].split('x') - if len(width_height) == 2: - formats_spec[spec[0]] = { - 'resolution': spec[1], - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - } - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: - continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - - for fmt in streaming_formats: - if fmt.get('drm_families'): - continue - url = url_or_none(fmt.get('url')) - - if not url: - cipher = fmt.get('cipher') - if not cipher: - continue - url_data = compat_parse_qs(cipher) - url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) - if not url: - continue - else: - cipher = None - url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - - stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) - # Unsupported FORMAT_STREAM_TYPE_OTF - if stream_type == 3: - continue - - format_id = fmt.get('itag') or url_data['itag'][0] - if not format_id: - continue - format_id = compat_str(format_id) - - if cipher: - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): - if player_url is None: - player_version = 'unknown' - player_desc = 'unknown' - else: - if player_url.endswith('swf'): - player_version = self._search_regex( - r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - 'flash player', fatal=False) - player_desc = 'flash player %s' % player_version - else: - player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], - player_url, - 'html5 player', fatal=False) - player_desc = 'html5 player %s' % player_version - - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) - if 'ratebypass' not in url: - url += '&ratebypass=yes' - - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, - } - if format_id in self._formats: - dct.update(self._formats[format_id]) - if format_id in formats_spec: - dct.update(formats_spec[format_id]) - - # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). - # Trying to extract metadata from url_encoded_fmt_stream_map entry. - mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) - width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - - if width is None: - width = int_or_none(fmt.get('width')) - if height is None: - height = int_or_none(fmt.get('height')) - - filesize = int_or_none(url_data.get( - 'clen', [None])[0]) or _extract_filesize(url) - - quality = url_data.get('quality', [None])[0] or fmt.get('quality') - quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') - - tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) - or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None - fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) - - more_fields = { - 'filesize': filesize, - 'tbr': tbr, - 'width': width, - 'height': height, - 'fps': fps, - 'format_note': quality_label or quality, - } - for key, value in more_fields.items(): - if value: - dct[key] = value - type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') - if type_: - type_split = type_.split(';') - kind_ext = type_split[0].split('/') - if len(kind_ext) == 2: - kind, _ = kind_ext - dct['ext'] = mimetype2ext(type_split[0]) - if kind in ('audio', 'video'): - codecs = None - for mobj in re.finditer( - r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_): - if mobj.group('key') == 'codecs': - codecs = mobj.group('val') - break - if codecs: - dct.update(parse_codecs(codecs)) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - formats.append(dct) - else: - manifest_url = ( - url_or_none(try_get( - player_response, - lambda x: x['streamingData']['hlsManifestUrl'], - compat_str)) - or url_or_none(try_get( - video_info, lambda x: x['hlsvp'][0], compat_str))) - if manifest_url: - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) - else: - error_message = extract_unavailable_message() - alt_error_message = clean_html(video_info.get('reason', [None])[0]) - print(alt_error_message) - if not error_message: - error_message = alt_error_message - if not error_message: - error_message = clean_html( - try_get(video_info, lambda x: x['reason'][0], compat_str)) - if error_message: - raise YoutubeError(error_message) - raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') - - # uploader - video_uploader = try_get( - video_info, lambda x: x['author'][0], - compat_str) or str_or_none(video_details.get('author')) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - self._downloader.report_warning('unable to extract uploader nickname') - - channel_id = ( - str_or_none(video_details.get('channelId')) - or self._html_search_meta( - 'channelId', video_webpage, 'channel id', default=None) - or self._search_regex( - r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - video_webpage, 'channel id', default=None, group='id')) - channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None - - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') - video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)</span>', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', - video_webpage, 'license', default=None) - - m_music = re.search( - r'''(?x) - <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* - <ul[^>]*>\s* - <li>(?P<title>.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) - else: - video_alt_title = video_creator = None - - def extract_meta(field): - return self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, - video_webpage, field, default=None) - - track = extract_meta('Song') - artist = extract_meta('Artist') - album = extract_meta('Album') - - # Youtube Music Auto-generated description - release_date = release_year = None - if video_description: - mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) - if mobj: - if not track: - track = mobj.group('track').strip() - if not artist: - artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) - if not album: - album = mobj.group('album'.strip()) - release_year = mobj.group('release_year') - release_date = mobj.group('release_date') - if release_date: - release_date = release_date.replace('-', '') - if not release_year: - release_year = int(release_date[:4]) - if release_year: - release_year = int(release_year) - - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = unescapeHTML(m_episode.group('series')) - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None - - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - video_categories = None if category is None else [category] - else: - video_categories = None - - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - - def _extract_count(count_name): - return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), - video_webpage, count_name, default=None)) - - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') - - if view_count is None: - view_count = str_to_int(self._search_regex( - r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, - 'view count', default=None)) - - average_rating = ( - float_or_none(video_details.get('averageRating')) - or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) - - # subtitles - video_subtitles = self._get_subtitles(video_id, video_webpage) - automatic_captions = self._get_automatic_captions(video_id, video_webpage) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = int_or_none(video_details.get('lengthSeconds')) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - xsrf_token = self._search_regex( - r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2', - video_webpage, 'xsrf token', group='xsrf_token', fatal=False) - invideo_url = try_get( - player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) - if xsrf_token and invideo_url: - xsrf_field_name = self._search_regex( - r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2', - video_webpage, 'xsrf field name', - group='xsrf_field_name', default='session_token') - video_annotations = self._download_webpage( - self._proto_relative_url(invideo_url), - video_id, note='Downloading annotations', - errnote='Unable to download video annotations', fatal=False, - data=urlencode_postdata({xsrf_field_name: xsrf_token})) - - chapters = self._extract_chapters(description_original, video_duration) - - # Look for the DASH manifest - if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd_fatal = True - for mpd_url in dash_mpds: - dash_formats = {} - try: - def decrypt_sig(mobj): - s = mobj.group(1) - dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) - return '/signature/%s' % dec_s - - mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) - - for df in self._extract_mpd_formats( - mpd_url, video_id, fatal=dash_mpd_fatal, - formats_dict=self._formats): - if not df.get('filesize'): - df['filesize'] = _extract_filesize(df['url']) - # Do not overwrite DASH format found in some previous DASH manifest - if df['format_id'] not in dash_formats: - dash_formats[df['format_id']] = df - # Additional DASH manifests may end up in HTTP Error 403 therefore - # allow them to fail without bug report message if we already have - # some DASH manifest succeeded. This is temporary workaround to reduce - # burst of bug reports until we figure out the reason and whether it - # can be fixed at all. - dash_mpd_fatal = False - except (ExtractorError, KeyError) as e: - self.report_warning( - 'Skipping DASH manifest: %r' % e, video_id) - if dash_formats: - # Remove the formats we found through non-DASH, they - # contain less info and it can be wrong, because we use - # fixed values (for example the resolution). See - # https://github.com/ytdl-org/youtube-dl/issues/5774 for an - # example. - formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] - formats.extend(dash_formats.values()) - - # Check for malformed aspect ratio - stretched_m = re.search( - r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', - video_webpage) - if stretched_m: - w = float(stretched_m.group('w')) - h = float(stretched_m.group('h')) - # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). - # We will only process correct ratios. - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio - - if not formats: - token = extract_token(video_info) - if not token: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise YoutubeError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - else: - raise ExtractorError( - '"token" parameter not in video info for unknown reason', - video_id=video_id) - - if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): - raise ExtractorError('This video is DRM protected.', expected=True) - - self._sort_formats(formats) - - self.mark_watched(video_id, video_info, player_response) - - return { - 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'uploader_url': video_uploader_url, - 'channel_id': channel_id, - 'channel_url': channel_url, - 'upload_date': upload_date, - 'license': video_license, - 'creator': video_creator or artist, - 'title': video_title, - 'alt_title': video_alt_title or track, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'categories': video_categories, - 'tags': video_tags, - 'subtitles': video_subtitles, - 'automatic_captions': automatic_captions, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, - 'chapters': chapters, - 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'average_rating': average_rating, - 'formats': formats, - 'is_live': is_live, - 'start_time': start_time, - 'end_time': end_time, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'track': track, - 'artist': artist, - 'album': album, - 'release_date': release_date, - 'release_year': release_year, - 'related_vids': related_vids, - 'music_list': music_list, - 'unlisted': unlisted, - } - - -class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com playlists' - _VALID_URL = r"""(?x)(?: - (?:https?://)? - (?:\w+\.)? - (?: - (?: - youtube\.com| - invidio\.us - ) - / - (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ - )| - youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= - ) - ( - (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} - # Top tracks, they can also include dots - |(?:MC)[\w\.]* - ) - .* - | - (%(playlist_id)s) - )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?' - _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' - IE_NAME = 'youtube:playlist' - _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', - 'info_dict': { - 'title': 'ytdl test PL', - 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', - }, - 'playlist_count': 3, - }, { - 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'info_dict': { - 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'title': 'YDL_Empty_List', - }, - 'playlist_count': 0, - 'skip': 'This playlist is private', - }, { - 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', - 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'info_dict': { - 'title': '29C3: Not my department', - 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'uploader': 'Christiaan008', - 'uploader_id': 'ChRiStIaAn008', - }, - 'playlist_count': 95, - }, { - 'note': 'issue #673', - 'url': 'PLBB231211A4F62143', - 'info_dict': { - 'title': '[OLD]Team Fortress 2 (Class-based LP)', - 'id': 'PLBB231211A4F62143', - 'uploader': 'Wickydoo', - 'uploader_id': 'Wickydoo', - }, - 'playlist_mincount': 26, - }, { - 'note': 'Large playlist', - 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', - 'info_dict': { - 'title': 'Uploads from Cauchemar', - 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - 'uploader': 'Cauchemar', - 'uploader_id': 'Cauchemar89', - }, - 'playlist_mincount': 799, - }, { - 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - 'info_dict': { - 'title': 'YDL_safe_search', - 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - }, - 'playlist_count': 2, - 'skip': 'This playlist is private', - }, { - 'note': 'embedded', - 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA15', - 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'uploader': 'milan', - 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', - } - }, { - 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 485, - 'info_dict': { - 'title': '2018 Chinese New Singles (11/6 updated)', - 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'uploader': 'LBK', - 'uploader_id': 'sdragonfang', - } - }, { - 'note': 'Embedded SWF player', - 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA7', - 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', - }, - 'skip': 'This playlist does not exist', - }, { - 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', - 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', - 'info_dict': { - 'title': 'Uploads from Interstellar Movie', - 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - 'uploader': 'Interstellar Movie', - 'uploader_id': 'InterstellarMovie1', - }, - 'playlist_mincount': 21, - }, { - # Playlist URL that does not actually serve a playlist - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - 'add_ie': [YoutubeIE.ie_key()], - }, { - 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', - 'info_dict': { - 'id': 'yeWKywCrFtk', - 'ext': 'mp4', - 'title': 'Small Scale Baler and Braiding Rugs', - 'uploader': 'Backus-Page House Museum', - 'uploader_id': 'backuspagemuseum', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', - 'upload_date': '20161008', - 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', - 'categories': ['Nonprofits & Activism'], - 'tags': list, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - }, - }, { - # https://github.com/ytdl-org/youtube-dl/issues/21844 - 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'info_dict': { - 'title': 'Data Analysis with Dr Mike Pound', - 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'Computerphile', - 'uploader': 'Computerphile', - }, - 'playlist_mincount': 11, - }, { - 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', - 'only_matching': True, - }, { - 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', - 'only_matching': True, - }, { - # music album playlist - 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', - 'only_matching': True, - }, { - 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', - 'only_matching': True, - }] - - def _real_initialize(self): - self._login() - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - - for item in re.findall( - r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): - attrs = extract_attributes(item) - video_id = attrs['data-video-id'] - video_title = unescapeHTML(attrs.get('data-title')) - if video_title: - video_title = video_title.strip() - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - # Fallback with old _VIDEO_RE - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - - # Relaxed fallbacks - self.extract_videos_from_page_impl( - r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) - self.extract_videos_from_page_impl( - r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) - - return zip(ids_in_page, titles_in_page) - - def _extract_mix(self, playlist_id): - # The mixes are generated from a single video - # the id of the playlist is just 'RD' + video_id - ids = [] - last_id = playlist_id[-11:] - for n in itertools.count(1): - url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) - new_ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - # Fetch new pages until all the videos are repeated, it seems that - # there are always 51 unique videos. - new_ids = [_id for _id in new_ids if _id not in ids] - if not new_ids: - break - ids.extend(new_ids) - last_id = ids[-1] - - url_results = self._ids_to_results(ids) - - search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) - title_span = ( - search_title('playlist-title') - or search_title('title long-title') - or search_title('title')) - title = clean_html(title_span) - - return self.playlist_result(url_results, playlist_id, title) - - def _extract_playlist(self, playlist_id): - url = self._TEMPLATE_URL % playlist_id - page = self._download_webpage(url, playlist_id) - - # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) - for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): - match = match.strip() - # Check if the playlist exists or is private - mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) - if mobj: - reason = mobj.group('reason') - message = 'This playlist %s' % reason - if 'private' in reason: - message += ', use --username or --netrc to access it' - message += '.' - raise ExtractorError(message, expected=True) - elif re.match(r'[^<]*Invalid parameters[^<]*', match): - raise ExtractorError( - 'Invalid parameters. Maybe URL is incorrect.', - expected=True) - elif re.match(r'[^<]*Choose your language[^<]*', match): - continue - else: - self.report_warning('Youtube gives an alert message: ' + match) - - playlist_title = self._html_search_regex( - r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', - page, 'title', default=None) - - _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' - uploader = self._search_regex( - r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, - page, 'uploader', default=None) - mobj = re.search( - r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, - page) - if mobj: - uploader_id = mobj.group('uploader_id') - uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) - else: - uploader_id = uploader_url = None - - has_videos = True - - if not playlist_title: - try: - # Some playlist URLs don't actually serve a playlist (e.g. - # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) - next(self._entries(page, playlist_id)) - except StopIteration: - has_videos = False - - playlist = self.playlist_result( - self._entries(page, playlist_id), playlist_id, playlist_title) - playlist.update({ - 'uploader': uploader, - 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - }) - - return has_videos, playlist - - def _check_download_just_video(self, url, playlist_id): - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = query_dict.get('v', [None])[0] or self._search_regex( - r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, - 'video id', default=None) - if video_id: - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - return video_id, None - return None, None - - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - video_id, video = self._check_download_just_video(url, playlist_id) - if video: - return video - - if playlist_id.startswith(('RD', 'UL', 'PU')): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - - has_videos, playlist = self._extract_playlist(playlist_id) - if has_videos or not video_id: - return playlist - - # Some playlist URLs don't actually serve a playlist (see - # https://github.com/ytdl-org/youtube-dl/issues/10537). - # Fallback to plain video extraction if there is a video id - # along with playlist id. - return self.url_result(video_id, 'Youtube', video_id=video_id) - - -class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' - _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' - IE_NAME = 'youtube:channel' - _TESTS = [{ - 'note': 'paginated channel', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'playlist_mincount': 91, - 'info_dict': { - 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'Uploads from lex will', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - } - }, { - 'note': 'Age restricted channel', - # from https://www.youtube.com/user/DeusExOfficial - 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', - 'playlist_mincount': 64, - 'info_dict': { - 'id': 'UUs0ifCMCm1icqRbqhUINa0w', - 'title': 'Uploads from Deus Ex', - 'uploader': 'Deus Ex', - 'uploader_id': 'DeusExOfficial', - }, - }, { - 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) - else super(YoutubeChannelIE, cls).suitable(url)) - - def _build_template_url(self, url, channel_id): - return self._TEMPLATE_URL % channel_id - - def _real_extract(self, url): - channel_id = self._match_id(url) - - url = self._build_template_url(url, channel_id) - - # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) - # Workaround by extracting as a playlist if managed to obtain channel playlist URL - # otherwise fallback on channel by page extraction - channel_page = self._download_webpage( - url + '?view=57', channel_id, - 'Downloading channel page', fatal=False) - if channel_page is False: - channel_playlist_id = False - else: - channel_playlist_id = self._html_search_meta( - 'channelId', channel_page, 'channel id', default=None) - if not channel_playlist_id: - channel_url = self._html_search_meta( - ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), - channel_page, 'channel url', default=None) - if channel_url: - channel_playlist_id = self._search_regex( - r'vnd\.youtube://user/([0-9A-Za-z_-]+)', - channel_url, 'channel id', default=None) - if channel_playlist_id and channel_playlist_id.startswith('UC'): - playlist_id = 'UU' + channel_playlist_id[2:] - return self.url_result( - compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') - - channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') - autogenerated = re.search(r'''(?x) - class="[^"]*?(?: - channel-header-autogenerated-label| - yt-channel-title-autogenerated - )[^"]*"''', channel_page) is not None - - if autogenerated: - # The videos are contained in a single page - # the ajax pages can't be used, they are empty - entries = [ - self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - for video_id, video_title in self.extract_videos_from_page(channel_page)] - return self.playlist_result(entries, channel_id) - - try: - next(self._entries(channel_page, channel_id)) - except StopIteration: - alert_message = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', - channel_page, 'alert', default=None, group='alert') - if alert_message: - raise ExtractorError('Youtube said: %s' % alert_message, expected=True) - - return self.playlist_result(self._entries(channel_page, channel_id), channel_id) - - -class YoutubeUserIE(YoutubeChannelIE): - IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' - IE_NAME = 'youtube:user' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheLinuxFoundation', - 'playlist_mincount': 320, - 'info_dict': { - 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', - 'title': 'Uploads from The Linux Foundation', - 'uploader': 'The Linux Foundation', - 'uploader_id': 'TheLinuxFoundation', - } - }, { - # Only available via https://www.youtube.com/c/12minuteathlete/videos - # but not https://www.youtube.com/user/12minuteathlete/videos - 'url': 'https://www.youtube.com/c/12minuteathlete/videos', - 'playlist_mincount': 249, - 'info_dict': { - 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', - 'title': 'Uploads from 12 Minute Athlete', - 'uploader': '12 Minute Athlete', - 'uploader_id': 'the12minuteathlete', - } - }, { - 'url': 'ytuser:phihag', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/gametrailers', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/gametrailers', - 'only_matching': True, - }, { - # This channel is not available, geo restricted to JP - 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - # Don't return True if the url can be extracted with other youtube - # extractor, the regex would is too permissive and it would match. - other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_yt_ies): - return False - else: - return super(YoutubeUserIE, cls).suitable(url) - - def _build_template_url(self, url, channel_id): - mobj = re.match(self._VALID_URL, url) - return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) - - -class YoutubeLiveIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' - IE_NAME = 'youtube:live' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - base_url = mobj.group('base_url') - webpage = self._download_webpage(url, channel_id, fatal=False) - if webpage: - page_type = self._og_search_property( - 'type', webpage, 'page type', default='') - video_id = self._html_search_meta( - 'videoId', webpage, 'video id', default=None) - if page_type.startswith('video') and video_id and re.match( - r'^[0-9A-Za-z_-]{11}$', video_id): - return self.url_result(video_id, YoutubeIE.ie_key()) - return self.url_result(base_url) - - -class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' - IE_NAME = 'youtube:playlists' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'ThirstForScience', - 'title': 'ThirstForScience', - }, - }, { - # with "Load more" button - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 70, - 'info_dict': { - 'id': 'igorkle1', - 'title': 'Игорь Клейнер', - }, - }, { - 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', - 'playlist_mincount': 17, - 'info_dict': { - 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', - 'title': 'Chem Player', - }, - 'skip': 'Blocked', - }] - - -class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' - - -class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): - IE_DESC = 'YouTube.com searches' - # there doesn't appear to be a real limit, for example if you search for - # 'python' you get more than 8.000.000 results - _MAX_RESULTS = float('inf') - IE_NAME = 'youtube:search' - _SEARCH_KEY = 'ytsearch' - _EXTRA_QUERY_ARGS = {} - _TESTS = [] - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - - videos = [] - limit = n - - url_query = { - 'search_query': query.encode('utf-8'), - } - url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) - - for pagenum in itertools.count(1): - data = self._download_json( - result_url, video_id='query "%s"' % query, - note='Downloading page %s' % pagenum, - errnote='Unable to download API page', - query={'spf': 'navigate'}) - html_content = data[1]['body']['content'] - - if 'class="search-message' in html_content: - raise ExtractorError( - '[youtube] No video results', expected=True) - - new_videos = list(self._process_page(html_content)) - videos += new_videos - if not new_videos or len(videos) > limit: - break - next_link = self._html_search_regex( - r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', - html_content, 'next link', default=None) - if next_link is None: - break - result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) - - if len(videos) > n: - videos = videos[:n] - return self.playlist_result(videos, query) - - -class YoutubeSearchDateIE(YoutubeSearchIE): - IE_NAME = YoutubeSearchIE.IE_NAME + ':date' - _SEARCH_KEY = 'ytsearchdate' - IE_DESC = 'YouTube.com searches, newest videos first' - _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} - - -class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _TESTS = [{ - 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', - 'playlist_mincount': 5, - 'info_dict': { - 'title': 'youtube-dl test video', - } - }, { - 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) - - -class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' - IE_NAME = 'youtube:show' - _TESTS = [{ - 'url': 'https://www.youtube.com/show/airdisasters', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'airdisasters', - 'title': 'Air Disasters', - } - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - return super(YoutubeShowIE, self)._real_extract( - 'https://www.youtube.com/show/%s/playlists' % playlist_id) - - -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): - """ - Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. - """ - _LOGIN_REQUIRED = True - - @property - def IE_NAME(self): - return 'youtube:%s' % self._FEED_NAME - - def _real_initialize(self): - self._login() - - def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: - break - - ids.extend(new_ids) - - for entry in self._ids_to_results(new_ids): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - - def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) - - -class YoutubeWatchLaterIE(YoutubePlaylistIE): - IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' - - _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=WL', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', - 'only_matching': True, - }] - - def _real_extract(self, url): - _, video = self._check_download_just_video(url, 'WL') - if video: - return video - _, playlist = self._extract_playlist('WL') - return playlist - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' - _LOGIN_REQUIRED = True - - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') - playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') - return self.url_result(playlist_id, 'YoutubePlaylist') - - -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' - _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' - - -class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' - _FEED_NAME = 'subscriptions' - _PLAYLIST_TITLE = 'Youtube Subscriptions' - - -class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' - _FEED_NAME = 'history' - _PLAYLIST_TITLE = 'Youtube History' - - -class YoutubeTruncatedURLIE(InfoExtractor): - IE_NAME = 'youtube:truncated_url' - IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:https?://)? - (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ - (?:watch\?(?: - feature=[a-z_]+| - annotation_id=annotation_[^&]+| - x-yt-cl=[0-9]+| - hl=[^&]*| - t=[0-9]+ - )? - | - attribution_link\?a=[^&]+ - ) - $ - ''' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?feature=foo', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?hl=en-GB', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?t=2372', - 'only_matching': True, - }] - - def _real_extract(self, url): - raise ExtractorError( - 'Did you forget to quote the URL? Remember that & is a meta ' - 'character in most shells, so you want to put the URL in quotes, ' - 'like youtube-dl ' - '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' - ' or simply youtube-dl BaW_jenozKc .', - expected=True) - - -class YoutubeTruncatedIDIE(InfoExtractor): - IE_NAME = 'youtube:truncated_id' - IE_DESC = False # Do not list - _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - raise ExtractorError( - 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), - expected=True) diff --git a/youtube_dl/extractor/youtube_unmodified_reference.py b/youtube_dl/extractor/youtube_unmodified_reference.py deleted file mode 100644 index f002d87..0000000 --- a/youtube_dl/extractor/youtube_unmodified_reference.py +++ /dev/null @@ -1,3325 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - - -import itertools -import json -import os.path -import random -import re -import time -import traceback - -from .common import InfoExtractor, SearchInfoExtractor -from ..jsinterp import JSInterpreter -from ..swfinterp import SWFInterpreter -from ..compat import ( - compat_chr, - compat_HTTPError, - compat_kwargs, - compat_parse_qs, - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, - compat_str, -) -from ..utils import ( - bool_or_none, - clean_html, - dict_get, - error_to_compat_str, - extract_attributes, - ExtractorError, - float_or_none, - get_element_by_attribute, - get_element_by_id, - int_or_none, - mimetype2ext, - orderedSet, - parse_codecs, - parse_duration, - remove_quotes, - remove_start, - smuggle_url, - str_or_none, - str_to_int, - try_get, - unescapeHTML, - unified_strdate, - unsmuggle_url, - uppercase_escape, - url_or_none, - urlencode_postdata, -) - - -class YoutubeBaseInfoExtractor(InfoExtractor): - """Provide base functions for Youtube extractors""" - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - - _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' - _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' - _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' - - _NETRC_MACHINE = 'youtube' - # If True it will raise an error if no login info is provided - _LOGIN_REQUIRED = False - - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' - - def _set_language(self): - self._set_cookie( - '.youtube.com', 'PREF', 'f1=50000000&hl=en', - # YouTube sets the expire time to about two months - expire_time=time.time() + 2 * 30 * 24 * 3600) - - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - - def _login(self): - """ - Attempt to log in to YouTube. - True is returned if successful or skipped. - False is returned if login failed. - - If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. - """ - username, password = self._get_login_info() - # No authentication to be performed - if username is None: - if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: - raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) - return True - - login_page = self._download_webpage( - self._LOGIN_URL, None, - note='Downloading login page', - errnote='unable to fetch login page', fatal=False) - if login_page is False: - return - - login_form = self._hidden_inputs(login_page) - - def req(url, f_req, note, errnote): - data = login_form.copy() - data.update({ - 'pstMsg': 1, - 'checkConnection': 'youtube', - 'checkedDomains': 'youtube', - 'hl': 'en', - 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', - 'f.req': json.dumps(f_req), - 'flowName': 'GlifWebSignIn', - 'flowEntry': 'ServiceLogin', - # TODO: reverse actual botguard identifier generation algo - 'bgRequest': '["identifier",""]', - }) - return self._download_json( - url, None, note=note, errnote=errnote, - transform_source=lambda s: re.sub(r'^[^[]*', '', s), - fatal=False, - data=urlencode_postdata(data), headers={ - 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', - 'Google-Accounts-XSRF': 1, - }) - - def warn(message): - self._downloader.report_warning(message) - - lookup_req = [ - username, - None, [], None, 'US', None, None, 2, False, True, - [ - None, None, - [2, 1, None, 1, - 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', - None, [], 4], - 1, [None, None, []], None, None, None, True - ], - username, - ] - - lookup_results = req( - self._LOOKUP_URL, lookup_req, - 'Looking up account info', 'Unable to look up account info') - - if lookup_results is False: - return False - - user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) - if not user_hash: - warn('Unable to extract user hash') - return False - - challenge_req = [ - user_hash, - None, 1, None, [1, None, None, None, [password, None, True]], - [ - None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], - 1, [None, None, []], None, None, None, True - ]] - - challenge_results = req( - self._CHALLENGE_URL, challenge_req, - 'Logging in', 'Unable to log in') - - if challenge_results is False: - return - - login_res = try_get(challenge_results, lambda x: x[0][5], list) - if login_res: - login_msg = try_get(login_res, lambda x: x[5], compat_str) - warn( - 'Unable to login: %s' % 'Invalid password' - if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) - return False - - res = try_get(challenge_results, lambda x: x[0][-1], list) - if not res: - warn('Unable to extract result entry') - return False - - login_challenge = try_get(res, lambda x: x[0][0], list) - if login_challenge: - challenge_str = try_get(login_challenge, lambda x: x[2], compat_str) - if challenge_str == 'TWO_STEP_VERIFICATION': - # SEND_SUCCESS - TFA code has been successfully sent to phone - # QUOTA_EXCEEDED - reached the limit of TFA codes - status = try_get(login_challenge, lambda x: x[5], compat_str) - if status == 'QUOTA_EXCEEDED': - warn('Exceeded the limit of TFA codes, try later') - return False - - tl = try_get(challenge_results, lambda x: x[1][2], compat_str) - if not tl: - warn('Unable to extract TL') - return False - - tfa_code = self._get_tfa_info('2-step verification code') - - if not tfa_code: - warn( - 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False - - tfa_code = remove_start(tfa_code, 'G-') - - tfa_req = [ - user_hash, None, 2, None, - [ - 9, None, None, None, None, None, None, None, - [None, tfa_code, True, 2] - ]] - - tfa_results = req( - self._TFA_URL.format(tl), tfa_req, - 'Submitting TFA code', 'Unable to submit TFA code') - - if tfa_results is False: - return False - - tfa_res = try_get(tfa_results, lambda x: x[0][5], list) - if tfa_res: - tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) - warn( - 'Unable to finish TFA: %s' % 'Invalid TFA code' - if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) - return False - - check_cookie_url = try_get( - tfa_results, lambda x: x[0][-1][2], compat_str) - else: - CHALLENGES = { - 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.", - 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.', - 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.", - } - challenge = CHALLENGES.get( - challenge_str, - '%s returned error %s.' % (self.IE_NAME, challenge_str)) - warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge) - return False - else: - check_cookie_url = try_get(res, lambda x: x[2], compat_str) - - if not check_cookie_url: - warn('Unable to extract CheckCookie URL') - return False - - check_cookie_results = self._download_webpage( - check_cookie_url, None, 'Checking cookie', fatal=False) - - if check_cookie_results is False: - return False - - if 'https://myaccount.google.com/' not in check_cookie_results: - warn('Unable to log in') - return False - - return True - - def _download_webpage_handle(self, *args, **kwargs): - query = kwargs.get('query', {}).copy() - query['disable_polymer'] = 'true' - kwargs['query'] = query - return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) - - def _real_initialize(self): - if self._downloader is None: - return - self._set_language() - if not self._login(): - return - - -class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): - # Extract entries from page with "Load more" button - def _entries(self, page, playlist_id): - more_widget_html = content_html = page - for page_num in itertools.count(1): - for entry in self._process_page(content_html): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - count = 0 - retries = 3 - while count <= retries: - try: - # Downloading page may result in intermittent 5xx HTTP error - # that is usually worked around with a retry - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s%s' - % (page_num, ' (retry #%d)' % count if count else ''), - transform_source=uppercase_escape) - break - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): - count += 1 - if count <= retries: - continue - raise - - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] - - -class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for video_id, video_title in self.extract_videos_from_page(content): - yield self.url_result(video_id, 'Youtube', video_id, video_title) - - def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): - for mobj in re.finditer(video_re, page): - # The link with index 0 is not the first video of the playlist (not sure if still actual) - if 'index' in mobj.groupdict() and mobj.group('id') == '0': - continue - video_id = mobj.group('id') - video_title = unescapeHTML( - mobj.group('title')) if 'title' in mobj.groupdict() else None - if video_title: - video_title = video_title.strip() - if video_title == '► Play all': - video_title = None - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) - - -class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for playlist_id in orderedSet(re.findall( - r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', - content)): - yield self.url_result( - 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._og_search_title(webpage, fatal=False) - return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) - - -class YoutubeIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com' - _VALID_URL = r"""(?x)^ - ( - (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| - (?:www\.)?deturl\.com/www\.youtube\.com/| - (?:www\.)?pwnyoutube\.com/| - (?:www\.)?hooktube\.com/| - (?:www\.)?yourepeat\.com/| - tube\.majestyc\.net/| - # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances - (?:(?:www|dev)\.)?invidio\.us/| - (?:(?:www|no)\.)?invidiou\.sh/| - (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| - (?:www\.)?invidious\.kabi\.tk/| - (?:www\.)?invidious\.enkirton\.net/| - (?:www\.)?invidious\.13ad\.de/| - (?:www\.)?invidious\.mastodon\.host/| - (?:www\.)?invidious\.nixnet\.xyz/| - (?:www\.)?tube\.poal\.co/| - (?:www\.)?vid\.wxzm\.sx/| - (?:www\.)?yt\.elukerio\.org/| - (?:www\.)?kgg2m7yk5aybusll\.onion/| - (?:www\.)?qklhadlycap4cnod\.onion/| - (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| - (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| - (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| - (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| - youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains - (?:.*?\#/)? # handle anchor (#/) redirect urls - (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ - |(?: # or the v= param in all its forms - (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) - (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) - v= - ) - )) - |(?: - youtu\.be| # just youtu.be/xxxx - vid\.plus| # or vid.plus/xxxx - zwearz\.com/watch| # or zwearz.com/watch/xxxx - )/ - |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= - ) - )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist= - (?: - %(playlist_id)s| # combined list/video URLs are handled by the playlist IE - WL # WL are handled by the watch later IE - ) - ) - (?(1).+)? # if we found the ID, everything can follow - $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _formats = { - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - - - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - - # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, - - # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, - - # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, - - # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - - # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, - - # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, - - # RTMP (unnamed) - '_rtmp': {'protocol': 'rtmp'}, - - # av01 video only formats sometimes served with "unknown" codecs - '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, - } - _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') - - _GEO_BYPASS = False - - IE_NAME = 'youtube' - _TESTS = [ - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', - 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - 'start_time': 1, - 'end_time': 9, - } - }, - { - 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY', - 'note': 'Test generic use_cipher_signature video (#897)', - 'info_dict': { - 'id': 'UxxajLWwzqY', - 'ext': 'mp4', - 'upload_date': '20120506', - 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', - 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', - 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', - 'iconic ep', 'iconic', 'love', 'it'], - 'duration': 180, - 'uploader': 'Icona Pop', - 'uploader_id': 'IconaPop', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', - 'creator': 'Icona Pop', - 'track': 'I Love It (feat. Charli XCX)', - 'artist': 'Icona Pop', - } - }, - { - 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ', - 'note': 'Test VEVO video with age protection (#956)', - 'info_dict': { - 'id': '07FYdnEawAQ', - 'ext': 'mp4', - 'upload_date': '20130703', - 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', - 'alt_title': 'Tunnel Vision', - 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', - 'duration': 419, - 'uploader': 'justintimberlakeVEVO', - 'uploader_id': 'justintimberlakeVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', - 'creator': 'Justin Timberlake', - 'track': 'Tunnel Vision', - 'artist': 'Justin Timberlake', - 'age_limit': 18, - } - }, - { - 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', - 'note': 'Embed-only video (#1746)', - 'info_dict': { - 'id': 'yZIXLfi8CZQ', - 'ext': 'mp4', - 'upload_date': '20120608', - 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', - 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', - 'uploader': 'SET India', - 'uploader_id': 'setindia', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', - 'age_limit': 18, - } - }, - { - 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', - 'note': 'Use the first video ID in the URL', - 'info_dict': { - 'id': 'BaW_jenozKc', - 'ext': 'mp4', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐', - 'uploader': 'Philipp Hagemeister', - 'uploader_id': 'phihag', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', - 'upload_date': '20121002', - 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', - 'categories': ['Science & Technology'], - 'tags': ['youtube-dl'], - 'duration': 10, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', - 'note': '256k DASH audio (format 141) via DASH manifest', - 'info_dict': { - 'id': 'a9LDPn-MO4I', - 'ext': 'm4a', - 'upload_date': '20121002', - 'uploader_id': '8KVIDEO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', - 'description': '', - 'uploader': '8KVIDEO', - 'title': 'UHDTV TEST 8K VIDEO.mp4' - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141', - }, - 'skip': 'format 141 not served anymore', - }, - # DASH manifest with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', - 'info_dict': { - 'id': 'IB3lcPjvWLA', - 'ext': 'm4a', - 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', - 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', - 'duration': 244, - 'uploader': 'AfrojackVEVO', - 'uploader_id': 'AfrojackVEVO', - 'upload_date': '20131011', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # JS player signature function name containing $ - { - 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', - 'info_dict': { - 'id': 'nfWlot6h_JM', - 'ext': 'm4a', - 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:bec2185232c05479482cb5a9b82719bf', - 'duration': 242, - 'uploader': 'TaylorSwiftVEVO', - 'uploader_id': 'TaylorSwiftVEVO', - 'upload_date': '20140818', - 'creator': 'Taylor Swift', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '141/bestaudio[ext=m4a]', - }, - }, - # Controversy video - { - 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', - 'info_dict': { - 'id': 'T4XJQO3qol8', - 'ext': 'mp4', - 'duration': 219, - 'upload_date': '20100909', - 'uploader': 'Amazing Atheist', - 'uploader_id': 'TheAmazingAtheist', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', - 'title': 'Burning Everyone\'s Koran', - 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', - } - }, - # Normal age-gate video (No vevo, embed allowed) - { - 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', - 'info_dict': { - 'id': 'HtVdAasjOgU', - 'ext': 'mp4', - 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', - 'duration': 142, - 'uploader': 'The Witcher', - 'uploader_id': 'WitcherGame', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', - 'upload_date': '20140605', - 'age_limit': 18, - }, - }, - # Age-gate video with encrypted signature - { - 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', - 'info_dict': { - 'id': '6kLq3WMV1nU', - 'ext': 'mp4', - 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', - 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', - 'duration': 246, - 'uploader': 'LloydVEVO', - 'uploader_id': 'LloydVEVO', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', - 'upload_date': '20110629', - 'age_limit': 18, - }, - }, - # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) - # YouTube Red ad is not captured for creator - { - 'url': '__2ABJjxzNo', - 'info_dict': { - 'id': '__2ABJjxzNo', - 'ext': 'mp4', - 'duration': 266, - 'upload_date': '20100430', - 'uploader_id': 'deadmau5', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'deadmau5', - 'description': 'md5:12c56784b8032162bb936a5f76d55360', - 'uploader': 'deadmau5', - 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'Some Chords', - }, - 'expected_warnings': [ - 'DASH manifest missing', - ] - }, - # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) - { - 'url': 'lqQg6PlCWgI', - 'info_dict': { - 'id': 'lqQg6PlCWgI', - 'ext': 'mp4', - 'duration': 6085, - 'upload_date': '20150827', - 'uploader_id': 'olympic', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', - 'uploader': 'Olympic', - 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', - }, - 'params': { - 'skip_download': 'requires avconv', - } - }, - # Non-square pixels - { - 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', - 'info_dict': { - 'id': '_b-2C3KPAM0', - 'ext': 'mp4', - 'stretched_ratio': 16 / 9., - 'duration': 85, - 'upload_date': '20110310', - 'uploader_id': 'AllenMeow', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', - 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', - 'uploader': '孫ᄋᄅ', - 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', - }, - }, - # url_encoded_fmt_stream_map is empty string - { - 'url': 'qEJwOuvDf7I', - 'info_dict': { - 'id': 'qEJwOuvDf7I', - 'ext': 'webm', - 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', - 'description': '', - 'upload_date': '20150404', - 'uploader_id': 'spbelect', - 'uploader': 'Наблюдатели Петербурга', - }, - 'params': { - 'skip_download': 'requires avconv', - }, - 'skip': 'This live event has ended.', - }, - # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) - { - 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', - 'info_dict': { - 'id': 'FIl7x6_3R5Y', - 'ext': 'webm', - 'title': 'md5:7b81415841e02ecd4313668cde88737a', - 'description': 'md5:116377fd2963b81ec4ce64b542173306', - 'duration': 220, - 'upload_date': '20150625', - 'uploader_id': 'dorappi2000', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', - 'uploader': 'dorappi2000', - 'formats': 'mincount:31', - }, - 'skip': 'not actual anymore', - }, - # DASH manifest with segment_list - { - 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', - 'md5': '8ce563a1d667b599d21064e982ab9e31', - 'info_dict': { - 'id': 'CsmdDsKjzN8', - 'ext': 'mp4', - 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510 - 'uploader': 'Airtek', - 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', - 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', - 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', - }, - 'params': { - 'youtube_include_dash_manifest': True, - 'format': '135', # bestvideo - }, - 'skip': 'This live event has ended.', - }, - { - # Multifeed videos (multiple cameras), URL is for Main Camera - 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs', - 'info_dict': { - 'id': 'jqWvoWXjCVs', - 'title': 'teamPGP: Rocket League Noob Stream', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - }, - 'playlist': [{ - 'info_dict': { - 'id': 'jqWvoWXjCVs', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7335, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': '6h8e8xoXJzg', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': 'PUOgX5z9xZw', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7337, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }, { - 'info_dict': { - 'id': 'teuwxikvS5k', - 'ext': 'mp4', - 'title': 'teamPGP: Rocket League Noob Stream (zim)', - 'description': 'md5:dc7872fb300e143831327f1bae3af010', - 'duration': 7334, - 'upload_date': '20150721', - 'uploader': 'Beer Games Beer', - 'uploader_id': 'beergamesbeer', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', - 'license': 'Standard YouTube License', - }, - }], - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - }, - { - # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) - 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', - 'info_dict': { - 'id': 'gVfLd0zydlo', - 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', - }, - 'playlist_count': 2, - 'skip': 'Not multifeed anymore', - }, - { - 'url': 'https://vid.plus/FlRa-iH7PGw', - 'only_matching': True, - }, - { - 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', - 'only_matching': True, - }, - { - # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468) - # Also tests cut-off URL expansion in video description (see - # https://github.com/ytdl-org/youtube-dl/issues/1892, - # https://github.com/ytdl-org/youtube-dl/issues/8164) - 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', - 'info_dict': { - 'id': 'lsguqyKfVQg', - 'ext': 'mp4', - 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk - Position Music', - 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', - 'duration': 133, - 'upload_date': '20151119', - 'uploader_id': 'IronSoulElf', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', - 'uploader': 'IronSoulElf', - 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'track': 'Dark Walk - Position Music', - 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468) - 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', - 'only_matching': True, - }, - { - # Video with yt:stretch=17:0 - 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM', - 'info_dict': { - 'id': 'Q39EVAstoRM', - 'ext': 'mp4', - 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4', - 'description': 'md5:ee18a25c350637c8faff806845bddee9', - 'upload_date': '20151107', - 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA', - 'uploader': 'CH GAMER DROID', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video does not exist.', - }, - { - # Video licensed under Creative Commons - 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', - 'info_dict': { - 'id': 'M4gD1WSo5mA', - 'ext': 'mp4', - 'title': 'md5:e41008789470fc2533a3252216f1c1d1', - 'description': 'md5:a677553cf0840649b731a3024aeff4cc', - 'duration': 721, - 'upload_date': '20150127', - 'uploader_id': 'BerkmanCenter', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', - 'uploader': 'The Berkman Klein Center for Internet & Society', - 'license': 'Creative Commons Attribution license (reuse allowed)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Channel-like uploader_url - 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', - 'info_dict': { - 'id': 'eQcmzGIKrzg', - 'ext': 'mp4', - 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', - 'description': 'md5:dda0d780d5a6e120758d1711d062a867', - 'duration': 4060, - 'upload_date': '20151119', - 'uploader': 'Bernie Sanders', - 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', - 'license': 'Creative Commons Attribution license (reuse allowed)', - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', - 'only_matching': True, - }, - { - # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059) - 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', - 'only_matching': True, - }, - { - # Rental video preview - 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', - 'info_dict': { - 'id': 'uGpuVWrhIzE', - 'ext': 'mp4', - 'title': 'Piku - Trailer', - 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', - 'upload_date': '20150811', - 'uploader': 'FlixMatrix', - 'uploader_id': 'FlixMatrixKaravan', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', - 'license': 'Standard YouTube License', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - }, - { - # YouTube Red video with episode data - 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4', - 'info_dict': { - 'id': 'iqKdEhx-dD4', - 'ext': 'mp4', - 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', - 'duration': 2085, - 'upload_date': '20170118', - 'uploader': 'Vsauce', - 'uploader_id': 'Vsauce', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', - 'series': 'Mind Field', - 'season_number': 1, - 'episode_number': 1, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': [ - 'Skipping DASH manifest', - ], - }, - { - # The following content has been identified by the YouTube community - # as inappropriate or offensive to some audiences. - 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', - 'info_dict': { - 'id': '6SJNVb0GnPI', - 'ext': 'mp4', - 'title': 'Race Differences in Intelligence', - 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', - 'duration': 965, - 'upload_date': '20140124', - 'uploader': 'New Century Foundation', - 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # itag 212 - 'url': '1t24XAntNCY', - 'only_matching': True, - }, - { - # geo restricted to JP - 'url': 'sJL6WA-aGkQ', - 'only_matching': True, - }, - { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, - { - 'url': 'https://invidio.us/watch?v=BaW_jenozKc', - 'only_matching': True, - }, - { - # DRM protected - 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', - 'only_matching': True, - }, - { - # Video with unsupported adaptive stream type formats - 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', - 'info_dict': { - 'id': 'Z4Vy8R84T1U', - 'ext': 'mp4', - 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'duration': 433, - 'upload_date': '20130923', - 'uploader': 'Amelia Putri Harwita', - 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', - 'formats': 'maxcount:10', - }, - 'params': { - 'skip_download': True, - 'youtube_include_dash_manifest': False, - }, - }, - { - # Youtube Music Auto-generated description - 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', - 'info_dict': { - 'id': 'MgNrAu2pzNs', - 'ext': 'mp4', - 'title': 'Voyeur Girl', - 'description': 'md5:7ae382a65843d6df2685993e90a8628f', - 'upload_date': '20190312', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw', - 'artist': 'Stephen', - 'track': 'Voyeur Girl', - 'album': 'it\'s too much love to know my dear', - 'release_date': '20190313', - 'release_year': 2019, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # Retrieve 'artist' field from 'Artist:' in video description - # when it is present on youtube music video - 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', - 'info_dict': { - 'id': 'k0jLE7tTwjY', - 'ext': 'mp4', - 'title': 'Latch Feat. Sam Smith', - 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', - 'upload_date': '20150110', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', - 'artist': 'Disclosure', - 'track': 'Latch Feat. Sam Smith', - 'album': 'Latch Featuring Sam Smith', - 'release_date': '20121008', - 'release_year': 2012, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle multiple artists on youtube music video - 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', - 'info_dict': { - 'id': '74qn0eJSjpA', - 'ext': 'mp4', - 'title': 'Eastside', - 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', - 'upload_date': '20180710', - 'uploader': 'Benny Blanco - Topic', - 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', - 'artist': 'benny blanco, Halsey, Khalid', - 'track': 'Eastside', - 'album': 'Eastside', - 'release_date': '20180713', - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - { - # Youtube Music Auto-generated description - # handle youtube music video with release_year and no release_date - 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', - 'info_dict': { - 'id': '-hcAI0g-f5M', - 'ext': 'mp4', - 'title': 'Put It On Me', - 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e', - 'upload_date': '20180426', - 'uploader': 'Matt Maeson - Topic', - 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', - 'artist': 'Matt Maeson', - 'track': 'Put It On Me', - 'album': 'The Hearse', - 'release_date': None, - 'release_year': 2018, - }, - 'params': { - 'skip_download': True, - }, - }, - ] - - def __init__(self, *args, **kwargs): - super(YoutubeIE, self).__init__(*args, **kwargs) - self._player_cache = {} - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen('%s: Downloading video info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen('%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen('%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen('RTMP download detected') - - def _signature_cache_id(self, example_sig): - """ Return a string representation of a signature """ - return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) - - def _extract_signature_function(self, video_id, player_url, example_sig): - id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', - player_url) - if not id_m: - raise ExtractorError('Cannot identify player %r' % player_url) - player_type = id_m.group('ext') - player_id = id_m.group('id') - - # Read from filesystem cache - func_id = '%s_%s_%s' % ( - player_type, player_id, self._signature_cache_id(example_sig)) - assert os.path.basename(func_id) == func_id - - cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) - if cache_spec is not None: - return lambda s: ''.join(s[i] for i in cache_spec) - - download_note = ( - 'Downloading player %s' % player_url - if self._downloader.params.get('verbose') else - 'Downloading %s player %s' % (player_type, player_id) - ) - if player_type == 'js': - code = self._download_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - res = self._parse_sig_js(code) - elif player_type == 'swf': - urlh = self._request_webpage( - player_url, video_id, - note=download_note, - errnote='Download of %s failed' % player_url) - code = urlh.read() - res = self._parse_sig_swf(code) - else: - assert False, 'Invalid player type %r' % player_type - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] - - self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) - return res - - def _print_sig_code(self, func, example_sig): - def gen_sig_code(idxs): - def _genslice(start, end, step): - starts = '' if start == 0 else str(start) - ends = (':%d' % (end + step)) if end + step >= 0 else ':' - steps = '' if step == 1 else (':%d' % step) - return 's[%s%s%s]' % (starts, ends, steps) - - step = None - # Quelch pyflakes warnings - start will be set when step is set - start = '(Never used)' - for i, prev in zip(idxs[1:], idxs[:-1]): - if step is not None: - if i - prev == step: - continue - yield _genslice(start, prev, step) - step = None - continue - if i - prev in [-1, 1]: - step = i - prev - start = prev - continue - else: - yield 's[%d]' % prev - if step is None: - yield 's[%d]' % i - else: - yield _genslice(start, i, step) - - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = func(test_string) - cache_spec = [ord(c) for c in cache_res] - expr_code = ' + '.join(gen_sig_code(cache_spec)) - signature_id_tuple = '(%s)' % ( - ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) - code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' - ' return %s\n') % (signature_id_tuple, expr_code) - self.to_screen('Extracted signature function:\n' + code) - - def _parse_sig_js(self, jscode): - funcname = self._search_regex( - (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', - # Obsolete patterns - r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', - r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), - jscode, 'Initial JS player signature function name', group='sig') - - jsi = JSInterpreter(jscode) - initial_function = jsi.extract_function(funcname) - return lambda s: initial_function([s]) - - def _parse_sig_swf(self, file_contents): - swfi = SWFInterpreter(file_contents) - TARGET_CLASSNAME = 'SignatureDecipher' - searched_class = swfi.extract_class(TARGET_CLASSNAME) - initial_function = swfi.extract_function(searched_class, 'decipher') - return lambda s: initial_function([s]) - - def _decrypt_signature(self, s, video_id, player_url, age_gate=False): - """Turn the encrypted s field into a working signature""" - - if player_url is None: - raise ExtractorError('Cannot decrypt signature without player_url') - - if player_url.startswith('//'): - player_url = 'https:' + player_url - elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( - 'https://www.youtube.com', player_url) - try: - player_id = (player_url, self._signature_cache_id(s)) - if player_id not in self._player_cache: - func = self._extract_signature_function( - video_id, player_url, s - ) - self._player_cache[player_id] = func - func = self._player_cache[player_id] - if self._downloader.params.get('youtube_print_sig_code'): - self._print_sig_code(func, s) - return func(s) - except Exception as e: - tb = traceback.format_exc() - raise ExtractorError( - 'Signature extraction failed: ' + tb, cause=e) - - def _get_subtitles(self, video_id, webpage): - try: - subs_doc = self._download_xml( - 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, - video_id, note=False) - except ExtractorError as err: - self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) - return {} - - sub_lang_list = {} - for track in subs_doc.findall('track'): - lang = track.attrib['lang_code'] - if lang in sub_lang_list: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': ext, - 'name': track.attrib['name'].encode('utf-8'), - }) - sub_formats.append({ - 'url': 'https://www.youtube.com/api/timedtext?' + params, - 'ext': ext, - }) - sub_lang_list[lang] = sub_formats - if not sub_lang_list: - self._downloader.report_warning('video doesn\'t have subtitles') - return {} - return sub_lang_list - - def _get_ytplayer_config(self, video_id, webpage): - patterns = ( - # User data may contain arbitrary character sequences that may affect - # JSON extraction with regex, e.g. when '};' is contained the second - # regex won't capture the whole JSON. Yet working around by trying more - # concrete regex first keeping in mind proper quoted string handling - # to be implemented in future that will replace this workaround (see - # https://github.com/ytdl-org/youtube-dl/issues/7468, - # https://github.com/ytdl-org/youtube-dl/pull/7599) - r';ytplayer\.config\s*=\s*({.+?});ytplayer', - r';ytplayer\.config\s*=\s*({.+?});', - ) - config = self._search_regex( - patterns, webpage, 'ytplayer.config', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - - def _get_automatic_captions(self, video_id, webpage): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - self.to_screen('%s: Looking for automatic captions' % video_id) - player_config = self._get_ytplayer_config(video_id, webpage) - err_msg = 'Couldn\'t find automatic captions for %s' % video_id - if not player_config: - self._downloader.report_warning(err_msg) - return {} - try: - args = player_config['args'] - caption_url = args.get('ttsurl') - if caption_url: - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse_urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') - - sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse_urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_formats.append({ - 'url': caption_url + '&' + params, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list - - def make_captions(sub_url, sub_langs): - parsed_sub_url = compat_urllib_parse_urlparse(sub_url) - caption_qs = compat_parse_qs(parsed_sub_url.query) - captions = {} - for sub_lang in sub_langs: - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - captions[sub_lang] = sub_formats - return captions - - # New captions format as of 22.06.2017 - player_response = args.get('player_response') - if player_response and isinstance(player_response, compat_str): - player_response = self._parse_json( - player_response, video_id, fatal=False) - if player_response: - renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - base_url = renderer['captionTracks'][0]['baseUrl'] - sub_lang_list = [] - for lang in renderer['translationLanguages']: - lang_code = lang.get('languageCode') - if lang_code: - sub_lang_list.append(lang_code) - return make_captions(base_url, sub_lang_list) - - # Some videos don't provide ttsurl but rather caption_tracks and - # caption_translation_languages (e.g. 20LmZk1hakA) - # Does not used anymore as of 22.06.2017 - caption_tracks = args['caption_tracks'] - caption_translation_languages = args['caption_translation_languages'] - caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - sub_lang_list = [] - for lang in caption_translation_languages.split(','): - lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) - sub_lang = lang_qs.get('lc', [None])[0] - if sub_lang: - sub_lang_list.append(sub_lang) - return make_captions(caption_url, sub_lang_list) - # An extractor error can be raise by the download process if there are - # no automatic captions but there are subtitles - except (KeyError, IndexError, ExtractorError): - self._downloader.report_warning(err_msg) - return {} - - def _mark_watched(self, video_id, video_info, player_response): - playback_url = url_or_none(try_get( - player_response, - lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( - video_info, lambda x: x['videostats_playback_base_url'][0])) - if not playback_url: - return - parsed_playback_url = compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) - - # cpn generation algorithm is reverse engineered from base.js. - # In fact it works even with dummy cpn. - CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' - cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) - - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) - - self._download_webpage( - playback_url, video_id, 'Marking watched', - 'Unable to mark watched', fatal=False) - - @staticmethod - def _extract_urls(webpage): - # Embedded YouTube player - entries = [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) - \1''', webpage)] - - # lazyYT YouTube embed - entries.extend(list(map( - unescapeHTML, - re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) - - # Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - entries.extend(m[-1] for m in matches) - - return entries - - @staticmethod - def _extract_url(webpage): - urls = YoutubeIE._extract_urls(webpage) - return urls[0] if urls else None - - @classmethod - def extract_id(cls, url): - mobj = re.match(cls._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - video_id = mobj.group(2) - return video_id - - @staticmethod - def _extract_chapters(description, duration): - if not description: - return None - chapter_lines = re.findall( - r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', - description) - if not chapter_lines: - return None - chapters = [] - for next_num, (chapter_line, time_point) in enumerate( - chapter_lines, start=1): - start_time = parse_duration(time_point) - if start_time is None: - continue - if start_time > duration: - break - end_time = (duration if next_num == len(chapter_lines) - else parse_duration(chapter_lines[next_num][1])) - if end_time is None: - continue - if end_time > duration: - end_time = duration - if start_time > end_time: - break - chapter_title = re.sub( - r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') - chapter_title = re.sub(r'\s+', ' ', chapter_title) - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': chapter_title, - }) - return chapters - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - proto = ( - 'http' if self._downloader.params.get('prefer_insecure', False) - else 'https') - - start_time = None - end_time = None - parsed_url = compat_urllib_parse_urlparse(url) - for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) - if start_time is None and 't' in query: - start_time = parse_duration(query['t'][0]) - if start_time is None and 'start' in query: - start_time = parse_duration(query['start'][0]) - if end_time is None and 'end' in query: - end_time = parse_duration(query['end'][0]) - - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') - video_id = self.extract_id(url) - - # Get video webpage - url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id - video_webpage = self._download_webpage(url, video_id) - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - dash_mpds = [] - - def add_dash_mpd(video_info): - dash_mpd = video_info.get('dashmpd') - if dash_mpd and dash_mpd[0] not in dash_mpds: - dash_mpds.append(dash_mpd[0]) - - def add_dash_mpd_pr(pl_response): - dash_mpd = url_or_none(try_get( - pl_response, lambda x: x['streamingData']['dashManifestUrl'], - compat_str)) - if dash_mpd and dash_mpd not in dash_mpds: - dash_mpds.append(dash_mpd) - - is_live = None - view_count = None - - def extract_view_count(v_info): - return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) - - def extract_token(v_info): - return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) - - def extract_player_response(player_response, video_id): - pl_response = str_or_none(player_response) - if not pl_response: - return - pl_response = self._parse_json(pl_response, video_id, fatal=False) - if isinstance(pl_response, dict): - add_dash_mpd_pr(pl_response) - return pl_response - - player_response = {} - - # Get video info - embed_webpage = None - if re.search(r'player-age-gate-content">', video_webpage) is not None: - age_gate = True - # We simulate the access to the video from www.youtube.com/v/{video_id} - # this can be viewed without login into Youtube - url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') - data = compat_urllib_parse_urlencode({ - 'video_id': video_id, - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), - }) - video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) - else: - age_gate = False - video_info = None - sts = None - # Try looking directly into the video webpage - ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. - # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - sts = ytplayer_config.get('sts') - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) - if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): - add_dash_mpd_pr(player_response) - # We also try looking in get_video_info since it may contain different dashmpd - # URL that points to a DASH manifest with possibly different itag set (some itags - # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH - # manifest pointed by get_video_info's dashmpd). - # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) - self.report_video_info_webpage_download(video_id) - for el in ('embedded', 'detailpage', 'vevo', ''): - query = { - 'video_id': video_id, - 'ps': 'default', - 'eurl': '', - 'gl': 'US', - 'hl': 'en', - } - if el: - query['el'] = el - if sts: - query['sts'] = sts - video_info_webpage = self._download_webpage( - '%s://www.youtube.com/get_video_info' % proto, - video_id, note=False, - errnote='unable to download video info webpage', - fatal=False, query=query) - if not video_info_webpage: - continue - get_video_info = compat_parse_qs(video_info_webpage) - if not player_response: - pl_response = get_video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(get_video_info) - if view_count is None: - view_count = extract_view_count(get_video_info) - if not video_info: - video_info = get_video_info - get_token = extract_token(get_video_info) - if get_token: - # Different get_video_info requests may report different results, e.g. - # some may report video unavailability, but some may serve it without - # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, - # the original webpage as well as el=info and el=embedded get_video_info - # requests report video unavailability due to geo restriction while - # el=detailpage succeeds and returns valid data). This is probably - # due to YouTube measures against IP ranges of hosting providers. - # Working around by preferring the first succeeded video_info containing - # the token if no such video_info yet was found. - token = extract_token(video_info) - if not token: - video_info = get_video_info - break - - def extract_unavailable_message(): - messages = [] - for tag, kind in (('h1', 'message'), ('div', 'submessage')): - msg = self._html_search_regex( - r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind), - video_webpage, 'unavailable %s' % kind, default=None) - if msg: - messages.append(msg) - if messages: - return '\n'.join(messages) - - if not video_info: - unavailable_message = extract_unavailable_message() - if not unavailable_message: - unavailable_message = 'Unable to extract video data' - raise ExtractorError( - 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - - video_details = try_get( - player_response, lambda x: x['videoDetails'], dict) or {} - - video_title = video_info.get('title', [None])[0] or video_details.get('title') - if not video_title: - self._downloader.report_warning('Unable to extract video title') - video_title = '_' - - description_original = video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - - def replace_url(m): - redir_url = compat_urlparse.urljoin(url, m.group(1)) - parsed_redir_url = compat_urllib_parse_urlparse(redir_url) - if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': - qs = compat_parse_qs(parsed_redir_url.query) - q = qs.get('q') - if q and q[0]: - return q[0] - return redir_url - - description_original = video_description = re.sub(r'''(?x) - <a\s+ - (?:[a-zA-Z-]+="[^"]*"\s+)*? - (?:title|href)="([^"]+)"\s+ - (?:[a-zA-Z-]+="[^"]*"\s+)*? - class="[^"]*"[^>]*> - [^<]+\.{3}\s* - </a> - ''', replace_url, video_description) - video_description = clean_html(video_description) - else: - video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription') - - if not smuggled_data.get('force_singlefeed', False): - if not self._downloader.params.get('noplaylist'): - multifeed_metadata_list = try_get( - player_response, - lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], - compat_str) or try_get( - video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) - if multifeed_metadata_list: - entries = [] - feed_ids = [] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), - }) - feed_ids.append(feed_data['id'][0]) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) - else: - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - - if view_count is None: - view_count = extract_view_count(video_info) - if view_count is None and video_details: - view_count = int_or_none(video_details.get('viewCount')) - - if is_live is None: - is_live = bool_or_none(video_details.get('isLive')) - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) - - def _extract_filesize(media_url): - return int_or_none(self._search_regex( - r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) - - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or [] - streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or []) - - if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): - self.report_rtmp_download() - formats = [{ - 'format_id': '_rtmp', - 'protocol': 'rtmp', - 'url': video_info['conn'][0], - 'player_url': player_url, - }] - elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): - encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] - if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) - formats = [] - formats_spec = {} - fmt_list = video_info.get('fmt_list', [''])[0] - if fmt_list: - for fmt in fmt_list.split(','): - spec = fmt.split('/') - if len(spec) > 1: - width_height = spec[1].split('x') - if len(width_height) == 2: - formats_spec[spec[0]] = { - 'resolution': spec[1], - 'width': int_or_none(width_height[0]), - 'height': int_or_none(width_height[1]), - } - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: - continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - - for fmt in streaming_formats: - if fmt.get('drm_families'): - continue - url = url_or_none(fmt.get('url')) - - if not url: - cipher = fmt.get('cipher') - if not cipher: - continue - url_data = compat_parse_qs(cipher) - url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) - if not url: - continue - else: - cipher = None - url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - - stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) - # Unsupported FORMAT_STREAM_TYPE_OTF - if stream_type == 3: - continue - - format_id = fmt.get('itag') or url_data['itag'][0] - if not format_id: - continue - format_id = compat_str(format_id) - - if cipher: - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] - - if self._downloader.params.get('verbose'): - if player_url is None: - player_version = 'unknown' - player_desc = 'unknown' - else: - if player_url.endswith('swf'): - player_version = self._search_regex( - r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - 'flash player', fatal=False) - player_desc = 'flash player %s' % player_version - else: - player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], - player_url, - 'html5 player', fatal=False) - player_desc = 'html5 player %s' % player_version - - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' - url += '&%s=%s' % (sp, signature) - if 'ratebypass' not in url: - url += '&ratebypass=yes' - - dct = { - 'format_id': format_id, - 'url': url, - 'player_url': player_url, - } - if format_id in self._formats: - dct.update(self._formats[format_id]) - if format_id in formats_spec: - dct.update(formats_spec[format_id]) - - # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). - # Trying to extract metadata from url_encoded_fmt_stream_map entry. - mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) - width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) - - if width is None: - width = int_or_none(fmt.get('width')) - if height is None: - height = int_or_none(fmt.get('height')) - - filesize = int_or_none(url_data.get( - 'clen', [None])[0]) or _extract_filesize(url) - - quality = url_data.get('quality', [None])[0] or fmt.get('quality') - quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') - - tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) - or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None - fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) - - more_fields = { - 'filesize': filesize, - 'tbr': tbr, - 'width': width, - 'height': height, - 'fps': fps, - 'format_note': quality_label or quality, - } - for key, value in more_fields.items(): - if value: - dct[key] = value - type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') - if type_: - type_split = type_.split(';') - kind_ext = type_split[0].split('/') - if len(kind_ext) == 2: - kind, _ = kind_ext - dct['ext'] = mimetype2ext(type_split[0]) - if kind in ('audio', 'video'): - codecs = None - for mobj in re.finditer( - r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_): - if mobj.group('key') == 'codecs': - codecs = mobj.group('val') - break - if codecs: - dct.update(parse_codecs(codecs)) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': - dct['downloader_options'] = { - # Youtube throttles chunks >~10M - 'http_chunk_size': 10485760, - } - formats.append(dct) - else: - manifest_url = ( - url_or_none(try_get( - player_response, - lambda x: x['streamingData']['hlsManifestUrl'], - compat_str)) - or url_or_none(try_get( - video_info, lambda x: x['hlsvp'][0], compat_str))) - if manifest_url: - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) - else: - error_message = extract_unavailable_message() - if not error_message: - error_message = clean_html(try_get( - player_response, lambda x: x['playabilityStatus']['reason'], - compat_str)) - if not error_message: - error_message = clean_html( - try_get(video_info, lambda x: x['reason'][0], compat_str)) - if error_message: - raise ExtractorError(error_message, expected=True) - raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') - - # uploader - video_uploader = try_get( - video_info, lambda x: x['author'][0], - compat_str) or str_or_none(video_details.get('author')) - if video_uploader: - video_uploader = compat_urllib_parse_unquote_plus(video_uploader) - else: - self._downloader.report_warning('unable to extract uploader name') - - # uploader_id - video_uploader_id = None - video_uploader_url = None - mobj = re.search( - r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', - video_webpage) - if mobj is not None: - video_uploader_id = mobj.group('uploader_id') - video_uploader_url = mobj.group('uploader_url') - else: - self._downloader.report_warning('unable to extract uploader nickname') - - channel_id = ( - str_or_none(video_details.get('channelId')) - or self._html_search_meta( - 'channelId', video_webpage, 'channel id', default=None) - or self._search_regex( - r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', - video_webpage, 'channel id', default=None, group='id')) - channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None - - # thumbnail image - # We try first to get a high quality image: - m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', - video_webpage, re.DOTALL) - if m_thumb is not None: - video_thumbnail = m_thumb.group(1) - elif 'thumbnail_url' not in video_info: - self._downloader.report_warning('unable to extract video thumbnail') - video_thumbnail = None - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = self._html_search_meta( - 'datePublished', video_webpage, 'upload date', default=None) - if not upload_date: - upload_date = self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)</span>', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], - video_webpage, 'upload date', default=None) - upload_date = unified_strdate(upload_date) - - video_license = self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', - video_webpage, 'license', default=None) - - m_music = re.search( - r'''(?x) - <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* - <ul[^>]*>\s* - <li>(?P<title>.+?) - by (?P<creator>.+?) - (?: - \(.+?\)| - <a[^>]* - (?: - \bhref=["\']/red[^>]*>| # drop possible - >\s*Listen ad-free with YouTube Red # YouTube Red ad - ) - .*? - )?</li - ''', - video_webpage) - if m_music: - video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) - video_creator = clean_html(m_music.group('creator')) - else: - video_alt_title = video_creator = None - - def extract_meta(field): - return self._html_search_regex( - r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, - video_webpage, field, default=None) - - track = extract_meta('Song') - artist = extract_meta('Artist') - album = extract_meta('Album') - - # Youtube Music Auto-generated description - release_date = release_year = None - if video_description: - mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) - if mobj: - if not track: - track = mobj.group('track').strip() - if not artist: - artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) - if not album: - album = mobj.group('album'.strip()) - release_year = mobj.group('release_year') - release_date = mobj.group('release_date') - if release_date: - release_date = release_date.replace('-', '') - if not release_year: - release_year = int(release_date[:4]) - if release_year: - release_year = int(release_year) - - m_episode = re.search( - r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', - video_webpage) - if m_episode: - series = unescapeHTML(m_episode.group('series')) - season_number = int(m_episode.group('season')) - episode_number = int(m_episode.group('episode')) - else: - series = season_number = episode_number = None - - m_cat_container = self._search_regex( - r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', default=None) - if m_cat_container: - category = self._html_search_regex( - r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', - default=None) - video_categories = None if category is None else [category] - else: - video_categories = None - - video_tags = [ - unescapeHTML(m.group('content')) - for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - - def _extract_count(count_name): - return str_to_int(self._search_regex( - r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' - % re.escape(count_name), - video_webpage, count_name, default=None)) - - like_count = _extract_count('like') - dislike_count = _extract_count('dislike') - - if view_count is None: - view_count = str_to_int(self._search_regex( - r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, - 'view count', default=None)) - - average_rating = ( - float_or_none(video_details.get('averageRating')) - or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) - - # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) - automatic_captions = self.extract_automatic_captions(video_id, video_webpage) - - video_duration = try_get( - video_info, lambda x: int_or_none(x['length_seconds'][0])) - if not video_duration: - video_duration = int_or_none(video_details.get('lengthSeconds')) - if not video_duration: - video_duration = parse_duration(self._html_search_meta( - 'duration', video_webpage, 'video duration')) - - # annotations - video_annotations = None - if self._downloader.params.get('writeannotations', False): - xsrf_token = self._search_regex( - r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2', - video_webpage, 'xsrf token', group='xsrf_token', fatal=False) - invideo_url = try_get( - player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) - if xsrf_token and invideo_url: - xsrf_field_name = self._search_regex( - r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2', - video_webpage, 'xsrf field name', - group='xsrf_field_name', default='session_token') - video_annotations = self._download_webpage( - self._proto_relative_url(invideo_url), - video_id, note='Downloading annotations', - errnote='Unable to download video annotations', fatal=False, - data=urlencode_postdata({xsrf_field_name: xsrf_token})) - - chapters = self._extract_chapters(description_original, video_duration) - - # Look for the DASH manifest - if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd_fatal = True - for mpd_url in dash_mpds: - dash_formats = {} - try: - def decrypt_sig(mobj): - s = mobj.group(1) - dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) - return '/signature/%s' % dec_s - - mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) - - for df in self._extract_mpd_formats( - mpd_url, video_id, fatal=dash_mpd_fatal, - formats_dict=self._formats): - if not df.get('filesize'): - df['filesize'] = _extract_filesize(df['url']) - # Do not overwrite DASH format found in some previous DASH manifest - if df['format_id'] not in dash_formats: - dash_formats[df['format_id']] = df - # Additional DASH manifests may end up in HTTP Error 403 therefore - # allow them to fail without bug report message if we already have - # some DASH manifest succeeded. This is temporary workaround to reduce - # burst of bug reports until we figure out the reason and whether it - # can be fixed at all. - dash_mpd_fatal = False - except (ExtractorError, KeyError) as e: - self.report_warning( - 'Skipping DASH manifest: %r' % e, video_id) - if dash_formats: - # Remove the formats we found through non-DASH, they - # contain less info and it can be wrong, because we use - # fixed values (for example the resolution). See - # https://github.com/ytdl-org/youtube-dl/issues/5774 for an - # example. - formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] - formats.extend(dash_formats.values()) - - # Check for malformed aspect ratio - stretched_m = re.search( - r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', - video_webpage) - if stretched_m: - w = float(stretched_m.group('w')) - h = float(stretched_m.group('h')) - # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). - # We will only process correct ratios. - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio - - if not formats: - token = extract_token(video_info) - if not token: - if 'reason' in video_info: - if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta( - 'regionsAllowed', video_webpage, default=None) - countries = regions_allowed.split(',') if regions_allowed else None - self.raise_geo_restricted( - msg=video_info['reason'][0], countries=countries) - reason = video_info['reason'][0] - if 'Invalid parameters' in reason: - unavailable_message = extract_unavailable_message() - if unavailable_message: - reason = unavailable_message - raise ExtractorError( - 'YouTube said: %s' % reason, - expected=True, video_id=video_id) - else: - raise ExtractorError( - '"token" parameter not in video info for unknown reason', - video_id=video_id) - - if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): - raise ExtractorError('This video is DRM protected.', expected=True) - - self._sort_formats(formats) - - self.mark_watched(video_id, video_info, player_response) - - return { - 'id': video_id, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'uploader_url': video_uploader_url, - 'channel_id': channel_id, - 'channel_url': channel_url, - 'upload_date': upload_date, - 'license': video_license, - 'creator': video_creator or artist, - 'title': video_title, - 'alt_title': video_alt_title or track, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'categories': video_categories, - 'tags': video_tags, - 'subtitles': video_subtitles, - 'automatic_captions': automatic_captions, - 'duration': video_duration, - 'age_limit': 18 if age_gate else 0, - 'annotations': video_annotations, - 'chapters': chapters, - 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'average_rating': average_rating, - 'formats': formats, - 'is_live': is_live, - 'start_time': start_time, - 'end_time': end_time, - 'series': series, - 'season_number': season_number, - 'episode_number': episode_number, - 'track': track, - 'artist': artist, - 'album': album, - 'release_date': release_date, - 'release_year': release_year, - } - - -class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com playlists' - _VALID_URL = r"""(?x)(?: - (?:https?://)? - (?:\w+\.)? - (?: - (?: - youtube\.com| - invidio\.us - ) - / - (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ - )| - youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= - ) - ( - (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} - # Top tracks, they can also include dots - |(?:MC)[\w\.]* - ) - .* - | - (%(playlist_id)s) - )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?' - _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' - IE_NAME = 'youtube:playlist' - _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', - 'info_dict': { - 'title': 'ytdl test PL', - 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', - }, - 'playlist_count': 3, - }, { - 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'info_dict': { - 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'title': 'YDL_Empty_List', - }, - 'playlist_count': 0, - 'skip': 'This playlist is private', - }, { - 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', - 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'info_dict': { - 'title': '29C3: Not my department', - 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'uploader': 'Christiaan008', - 'uploader_id': 'ChRiStIaAn008', - }, - 'playlist_count': 95, - }, { - 'note': 'issue #673', - 'url': 'PLBB231211A4F62143', - 'info_dict': { - 'title': '[OLD]Team Fortress 2 (Class-based LP)', - 'id': 'PLBB231211A4F62143', - 'uploader': 'Wickydoo', - 'uploader_id': 'Wickydoo', - }, - 'playlist_mincount': 26, - }, { - 'note': 'Large playlist', - 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', - 'info_dict': { - 'title': 'Uploads from Cauchemar', - 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - 'uploader': 'Cauchemar', - 'uploader_id': 'Cauchemar89', - }, - 'playlist_mincount': 799, - }, { - 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - 'info_dict': { - 'title': 'YDL_safe_search', - 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', - }, - 'playlist_count': 2, - 'skip': 'This playlist is private', - }, { - 'note': 'embedded', - 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA15', - 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', - 'uploader': 'milan', - 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', - } - }, { - 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 485, - 'info_dict': { - 'title': '2018 Chinese New Singles (11/6 updated)', - 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'uploader': 'LBK', - 'uploader_id': 'sdragonfang', - } - }, { - 'note': 'Embedded SWF player', - 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', - 'playlist_count': 4, - 'info_dict': { - 'title': 'JODA7', - 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', - }, - 'skip': 'This playlist does not exist', - }, { - 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', - 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', - 'info_dict': { - 'title': 'Uploads from Interstellar Movie', - 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - 'uploader': 'Interstellar Movie', - 'uploader_id': 'InterstellarMovie1', - }, - 'playlist_mincount': 21, - }, { - # Playlist URL that does not actually serve a playlist - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - 'add_ie': [YoutubeIE.ie_key()], - }, { - 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', - 'info_dict': { - 'id': 'yeWKywCrFtk', - 'ext': 'mp4', - 'title': 'Small Scale Baler and Braiding Rugs', - 'uploader': 'Backus-Page House Museum', - 'uploader_id': 'backuspagemuseum', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', - 'upload_date': '20161008', - 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', - 'categories': ['Nonprofits & Activism'], - 'tags': list, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - }, - }, { - # https://github.com/ytdl-org/youtube-dl/issues/21844 - 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'info_dict': { - 'title': 'Data Analysis with Dr Mike Pound', - 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'Computerphile', - 'uploader': 'Computerphile', - }, - 'playlist_mincount': 11, - }, { - 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', - 'only_matching': True, - }, { - 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', - 'only_matching': True, - }, { - # music album playlist - 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', - 'only_matching': True, - }, { - 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', - 'only_matching': True, - }] - - def _real_initialize(self): - self._login() - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - - for item in re.findall( - r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): - attrs = extract_attributes(item) - video_id = attrs['data-video-id'] - video_title = unescapeHTML(attrs.get('data-title')) - if video_title: - video_title = video_title.strip() - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - # Fallback with old _VIDEO_RE - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - - # Relaxed fallbacks - self.extract_videos_from_page_impl( - r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) - self.extract_videos_from_page_impl( - r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, - ids_in_page, titles_in_page) - - return zip(ids_in_page, titles_in_page) - - def _extract_mix(self, playlist_id): - # The mixes are generated from a single video - # the id of the playlist is just 'RD' + video_id - ids = [] - last_id = playlist_id[-11:] - for n in itertools.count(1): - url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) - webpage = self._download_webpage( - url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) - new_ids = orderedSet(re.findall( - r'''(?xs)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), - webpage)) - # Fetch new pages until all the videos are repeated, it seems that - # there are always 51 unique videos. - new_ids = [_id for _id in new_ids if _id not in ids] - if not new_ids: - break - ids.extend(new_ids) - last_id = ids[-1] - - url_results = self._ids_to_results(ids) - - search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) - title_span = ( - search_title('playlist-title') - or search_title('title long-title') - or search_title('title')) - title = clean_html(title_span) - - return self.playlist_result(url_results, playlist_id, title) - - def _extract_playlist(self, playlist_id): - url = self._TEMPLATE_URL % playlist_id - page = self._download_webpage(url, playlist_id) - - # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) - for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): - match = match.strip() - # Check if the playlist exists or is private - mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) - if mobj: - reason = mobj.group('reason') - message = 'This playlist %s' % reason - if 'private' in reason: - message += ', use --username or --netrc to access it' - message += '.' - raise ExtractorError(message, expected=True) - elif re.match(r'[^<]*Invalid parameters[^<]*', match): - raise ExtractorError( - 'Invalid parameters. Maybe URL is incorrect.', - expected=True) - elif re.match(r'[^<]*Choose your language[^<]*', match): - continue - else: - self.report_warning('Youtube gives an alert message: ' + match) - - playlist_title = self._html_search_regex( - r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', - page, 'title', default=None) - - _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' - uploader = self._search_regex( - r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, - page, 'uploader', default=None) - mobj = re.search( - r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, - page) - if mobj: - uploader_id = mobj.group('uploader_id') - uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) - else: - uploader_id = uploader_url = None - - has_videos = True - - if not playlist_title: - try: - # Some playlist URLs don't actually serve a playlist (e.g. - # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) - next(self._entries(page, playlist_id)) - except StopIteration: - has_videos = False - - playlist = self.playlist_result( - self._entries(page, playlist_id), playlist_id, playlist_title) - playlist.update({ - 'uploader': uploader, - 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - }) - - return has_videos, playlist - - def _check_download_just_video(self, url, playlist_id): - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - video_id = query_dict.get('v', [None])[0] or self._search_regex( - r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, - 'video id', default=None) - if video_id: - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - return video_id, None - return None, None - - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - video_id, video = self._check_download_just_video(url, playlist_id) - if video: - return video - - if playlist_id.startswith(('RD', 'UL', 'PU')): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - - has_videos, playlist = self._extract_playlist(playlist_id) - if has_videos or not video_id: - return playlist - - # Some playlist URLs don't actually serve a playlist (see - # https://github.com/ytdl-org/youtube-dl/issues/10537). - # Fallback to plain video extraction if there is a video id - # along with playlist id. - return self.url_result(video_id, 'Youtube', video_id=video_id) - - -class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): - IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' - _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' - IE_NAME = 'youtube:channel' - _TESTS = [{ - 'note': 'paginated channel', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', - 'playlist_mincount': 91, - 'info_dict': { - 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'Uploads from lex will', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - } - }, { - 'note': 'Age restricted channel', - # from https://www.youtube.com/user/DeusExOfficial - 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', - 'playlist_mincount': 64, - 'info_dict': { - 'id': 'UUs0ifCMCm1icqRbqhUINa0w', - 'title': 'Uploads from Deus Ex', - 'uploader': 'Deus Ex', - 'uploader_id': 'DeusExOfficial', - }, - }, { - 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) - else super(YoutubeChannelIE, cls).suitable(url)) - - def _build_template_url(self, url, channel_id): - return self._TEMPLATE_URL % channel_id - - def _real_extract(self, url): - channel_id = self._match_id(url) - - url = self._build_template_url(url, channel_id) - - # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) - # Workaround by extracting as a playlist if managed to obtain channel playlist URL - # otherwise fallback on channel by page extraction - channel_page = self._download_webpage( - url + '?view=57', channel_id, - 'Downloading channel page', fatal=False) - if channel_page is False: - channel_playlist_id = False - else: - channel_playlist_id = self._html_search_meta( - 'channelId', channel_page, 'channel id', default=None) - if not channel_playlist_id: - channel_url = self._html_search_meta( - ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), - channel_page, 'channel url', default=None) - if channel_url: - channel_playlist_id = self._search_regex( - r'vnd\.youtube://user/([0-9A-Za-z_-]+)', - channel_url, 'channel id', default=None) - if channel_playlist_id and channel_playlist_id.startswith('UC'): - playlist_id = 'UU' + channel_playlist_id[2:] - return self.url_result( - compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') - - channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') - autogenerated = re.search(r'''(?x) - class="[^"]*?(?: - channel-header-autogenerated-label| - yt-channel-title-autogenerated - )[^"]*"''', channel_page) is not None - - if autogenerated: - # The videos are contained in a single page - # the ajax pages can't be used, they are empty - entries = [ - self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - for video_id, video_title in self.extract_videos_from_page(channel_page)] - return self.playlist_result(entries, channel_id) - - try: - next(self._entries(channel_page, channel_id)) - except StopIteration: - alert_message = self._html_search_regex( - r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', - channel_page, 'alert', default=None, group='alert') - if alert_message: - raise ExtractorError('Youtube said: %s' % alert_message, expected=True) - - return self.playlist_result(self._entries(channel_page, channel_id), channel_id) - - -class YoutubeUserIE(YoutubeChannelIE): - IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' - IE_NAME = 'youtube:user' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheLinuxFoundation', - 'playlist_mincount': 320, - 'info_dict': { - 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', - 'title': 'Uploads from The Linux Foundation', - 'uploader': 'The Linux Foundation', - 'uploader_id': 'TheLinuxFoundation', - } - }, { - # Only available via https://www.youtube.com/c/12minuteathlete/videos - # but not https://www.youtube.com/user/12minuteathlete/videos - 'url': 'https://www.youtube.com/c/12minuteathlete/videos', - 'playlist_mincount': 249, - 'info_dict': { - 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', - 'title': 'Uploads from 12 Minute Athlete', - 'uploader': '12 Minute Athlete', - 'uploader_id': 'the12minuteathlete', - } - }, { - 'url': 'ytuser:phihag', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/gametrailers', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/gametrailers', - 'only_matching': True, - }, { - # This channel is not available, geo restricted to JP - 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - # Don't return True if the url can be extracted with other youtube - # extractor, the regex would is too permissive and it would match. - other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_yt_ies): - return False - else: - return super(YoutubeUserIE, cls).suitable(url) - - def _build_template_url(self, url, channel_id): - mobj = re.match(self._VALID_URL, url) - return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) - - -class YoutubeLiveIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube.com live streams' - _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' - IE_NAME = 'youtube:live' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - base_url = mobj.group('base_url') - webpage = self._download_webpage(url, channel_id, fatal=False) - if webpage: - page_type = self._og_search_property( - 'type', webpage, 'page type', default='') - video_id = self._html_search_meta( - 'videoId', webpage, 'video id', default=None) - if page_type.startswith('video') and video_id and re.match( - r'^[0-9A-Za-z_-]{11}$', video_id): - return self.url_result(video_id, YoutubeIE.ie_key()) - return self.url_result(base_url) - - -class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' - IE_NAME = 'youtube:playlists' - - _TESTS = [{ - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'ThirstForScience', - 'title': 'ThirstForScience', - }, - }, { - # with "Load more" button - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 70, - 'info_dict': { - 'id': 'igorkle1', - 'title': 'Игорь Клейнер', - }, - }, { - 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', - 'playlist_mincount': 17, - 'info_dict': { - 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', - 'title': 'Chem Player', - }, - 'skip': 'Blocked', - }] - - -class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' - - -class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): - IE_DESC = 'YouTube.com searches' - # there doesn't appear to be a real limit, for example if you search for - # 'python' you get more than 8.000.000 results - _MAX_RESULTS = float('inf') - IE_NAME = 'youtube:search' - _SEARCH_KEY = 'ytsearch' - _EXTRA_QUERY_ARGS = {} - _TESTS = [] - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - - videos = [] - limit = n - - url_query = { - 'search_query': query.encode('utf-8'), - } - url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) - - for pagenum in itertools.count(1): - data = self._download_json( - result_url, video_id='query "%s"' % query, - note='Downloading page %s' % pagenum, - errnote='Unable to download API page', - query={'spf': 'navigate'}) - html_content = data[1]['body']['content'] - - if 'class="search-message' in html_content: - raise ExtractorError( - '[youtube] No video results', expected=True) - - new_videos = list(self._process_page(html_content)) - videos += new_videos - if not new_videos or len(videos) > limit: - break - next_link = self._html_search_regex( - r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', - html_content, 'next link', default=None) - if next_link is None: - break - result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) - - if len(videos) > n: - videos = videos[:n] - return self.playlist_result(videos, query) - - -class YoutubeSearchDateIE(YoutubeSearchIE): - IE_NAME = YoutubeSearchIE.IE_NAME + ':date' - _SEARCH_KEY = 'ytsearchdate' - IE_DESC = 'YouTube.com searches, newest videos first' - _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} - - -class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): - IE_DESC = 'YouTube.com search URLs' - IE_NAME = 'youtube:search_url' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _TESTS = [{ - 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', - 'playlist_mincount': 5, - 'info_dict': { - 'title': 'youtube-dl test video', - } - }, { - 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) - - -class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): - IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' - IE_NAME = 'youtube:show' - _TESTS = [{ - 'url': 'https://www.youtube.com/show/airdisasters', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'airdisasters', - 'title': 'Air Disasters', - } - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - return super(YoutubeShowIE, self)._real_extract( - 'https://www.youtube.com/show/%s/playlists' % playlist_id) - - -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): - """ - Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. - """ - _LOGIN_REQUIRED = True - - @property - def IE_NAME(self): - return 'youtube:%s' % self._FEED_NAME - - def _real_initialize(self): - self._login() - - def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: - break - - ids.extend(new_ids) - - for entry in self._ids_to_results(new_ids): - yield entry - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - - def _real_extract(self, url): - page = self._download_webpage( - 'https://www.youtube.com/feed/%s' % self._FEED_NAME, - self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) - - -class YoutubeWatchLaterIE(YoutubePlaylistIE): - IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' - - _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=WL', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', - 'only_matching': True, - }] - - def _real_extract(self, url): - _, video = self._check_download_just_video(url, 'WL') - if video: - return video - _, playlist = self._extract_playlist('WL') - return playlist - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): - IE_NAME = 'youtube:favorites' - IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' - _LOGIN_REQUIRED = True - - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') - playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') - return self.url_result(playlist_id, 'YoutubePlaylist') - - -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' - _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' - - -class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' - _FEED_NAME = 'subscriptions' - _PLAYLIST_TITLE = 'Youtube Subscriptions' - - -class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): - IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' - _FEED_NAME = 'history' - _PLAYLIST_TITLE = 'Youtube History' - - -class YoutubeTruncatedURLIE(InfoExtractor): - IE_NAME = 'youtube:truncated_url' - IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:https?://)? - (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ - (?:watch\?(?: - feature=[a-z_]+| - annotation_id=annotation_[^&]+| - x-yt-cl=[0-9]+| - hl=[^&]*| - t=[0-9]+ - )? - | - attribution_link\?a=[^&]+ - ) - $ - ''' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?feature=foo', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?hl=en-GB', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?t=2372', - 'only_matching': True, - }] - - def _real_extract(self, url): - raise ExtractorError( - 'Did you forget to quote the URL? Remember that & is a meta ' - 'character in most shells, so you want to put the URL in quotes, ' - 'like youtube-dl ' - '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' - ' or simply youtube-dl BaW_jenozKc .', - expected=True) - - -class YoutubeTruncatedIDIE(InfoExtractor): - IE_NAME = 'youtube:truncated_id' - IE_DESC = False # Do not list - _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' - - _TESTS = [{ - 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - raise ExtractorError( - 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), - expected=True) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py deleted file mode 100644 index 7bda596..0000000 --- a/youtube_dl/jsinterp.py +++ /dev/null @@ -1,262 +0,0 @@ -from __future__ import unicode_literals - -import json -import operator -import re - -from .utils import ( - ExtractorError, - remove_quotes, -) - -_OPERATORS = [ - ('|', operator.or_), - ('^', operator.xor), - ('&', operator.and_), - ('>>', operator.rshift), - ('<<', operator.lshift), - ('-', operator.sub), - ('+', operator.add), - ('%', operator.mod), - ('/', operator.truediv), - ('*', operator.mul), -] -_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] -_ASSIGN_OPERATORS.append(('=', lambda cur, right: right)) - -_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' - - -class JSInterpreter(object): - def __init__(self, code, objects=None): - if objects is None: - objects = {} - self.code = code - self._functions = {} - self._objects = objects - - def interpret_statement(self, stmt, local_vars, allow_recursion=100): - if allow_recursion < 0: - raise ExtractorError('Recursion limit reached') - - should_abort = False - stmt = stmt.lstrip() - stmt_m = re.match(r'var\s', stmt) - if stmt_m: - expr = stmt[len(stmt_m.group(0)):] - else: - return_m = re.match(r'return(?:\s+|$)', stmt) - if return_m: - expr = stmt[len(return_m.group(0)):] - should_abort = True - else: - # Try interpreting it as an expression - expr = stmt - - v = self.interpret_expression(expr, local_vars, allow_recursion) - return v, should_abort - - def interpret_expression(self, expr, local_vars, allow_recursion): - expr = expr.strip() - if expr == '': # Empty expression - return None - - if expr.startswith('('): - parens_count = 0 - for m in re.finditer(r'[()]', expr): - if m.group(0) == '(': - parens_count += 1 - else: - parens_count -= 1 - if parens_count == 0: - sub_expr = expr[1:m.start()] - sub_result = self.interpret_expression( - sub_expr, local_vars, allow_recursion) - remaining_expr = expr[m.end():].strip() - if not remaining_expr: - return sub_result - else: - expr = json.dumps(sub_result) + remaining_expr - break - else: - raise ExtractorError('Premature end of parens in %r' % expr) - - for op, opfunc in _ASSIGN_OPERATORS: - m = re.match(r'''(?x) - (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])? - \s*%s - (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr) - if not m: - continue - right_val = self.interpret_expression( - m.group('expr'), local_vars, allow_recursion - 1) - - if m.groupdict().get('index'): - lvar = local_vars[m.group('out')] - idx = self.interpret_expression( - m.group('index'), local_vars, allow_recursion) - assert isinstance(idx, int) - cur = lvar[idx] - val = opfunc(cur, right_val) - lvar[idx] = val - return val - else: - cur = local_vars.get(m.group('out')) - val = opfunc(cur, right_val) - local_vars[m.group('out')] = val - return val - - if expr.isdigit(): - return int(expr) - - var_m = re.match( - r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE, - expr) - if var_m: - return local_vars[var_m.group('name')] - - try: - return json.loads(expr) - except ValueError: - pass - - m = re.match( - r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) - if m: - val = local_vars[m.group('in')] - idx = self.interpret_expression( - m.group('idx'), local_vars, allow_recursion - 1) - return val[idx] - - m = re.match( - r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, - expr) - if m: - variable = m.group('var') - member = remove_quotes(m.group('member') or m.group('member2')) - arg_str = m.group('args') - - if variable in local_vars: - obj = local_vars[variable] - else: - if variable not in self._objects: - self._objects[variable] = self.extract_object(variable) - obj = self._objects[variable] - - if arg_str is None: - # Member access - if member == 'length': - return len(obj) - return obj[member] - - assert expr.endswith(')') - # Function call - if arg_str == '': - argvals = tuple() - else: - argvals = tuple([ - self.interpret_expression(v, local_vars, allow_recursion) - for v in arg_str.split(',')]) - - if member == 'split': - assert argvals == ('',) - return list(obj) - if member == 'join': - assert len(argvals) == 1 - return argvals[0].join(obj) - if member == 'reverse': - assert len(argvals) == 0 - obj.reverse() - return obj - if member == 'slice': - assert len(argvals) == 1 - return obj[argvals[0]:] - if member == 'splice': - assert isinstance(obj, list) - index, howMany = argvals - res = [] - for i in range(index, min(index + howMany, len(obj))): - res.append(obj.pop(index)) - return res - - return obj[member](argvals) - - for op, opfunc in _OPERATORS: - m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr) - if not m: - continue - x, abort = self.interpret_statement( - m.group('x'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature left-side return of %s in %r' % (op, expr)) - y, abort = self.interpret_statement( - m.group('y'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature right-side return of %s in %r' % (op, expr)) - return opfunc(x, y) - - m = re.match( - r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) - if m: - fname = m.group('func') - argvals = tuple([ - int(v) if v.isdigit() else local_vars[v] - for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple() - if fname not in self._functions: - self._functions[fname] = self.extract_function(fname) - return self._functions[fname](argvals) - - raise ExtractorError('Unsupported JS expression %r' % expr) - - def extract_object(self, objname): - _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' - obj = {} - obj_m = re.search( - r'''(?x) - (?<!this\.)%s\s*=\s*{\s* - (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*) - }\s*; - ''' % (re.escape(objname), _FUNC_NAME_RE), - self.code) - fields = obj_m.group('fields') - # Currently, it only supports function definitions - fields_m = re.finditer( - r'''(?x) - (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)} - ''' % _FUNC_NAME_RE, - fields) - for f in fields_m: - argnames = f.group('args').split(',') - obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code')) - - return obj - - def extract_function(self, funcname): - func_m = re.search( - r'''(?x) - (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* - \((?P<args>[^)]*)\)\s* - \{(?P<code>[^}]+)\}''' % ( - re.escape(funcname), re.escape(funcname), re.escape(funcname)), - self.code) - if func_m is None: - raise ExtractorError('Could not find JS function %r' % funcname) - argnames = func_m.group('args').split(',') - - return self.build_function(argnames, func_m.group('code')) - - def call_function(self, funcname, *args): - f = self.extract_function(funcname) - return f(args) - - def build_function(self, argnames, code): - def resf(args): - local_vars = dict(zip(argnames, args)) - for stmt in code.split(';'): - res, abort = self.interpret_statement(stmt, local_vars) - if abort: - break - return res - return resf diff --git a/youtube_dl/options.py b/youtube_dl/options.py deleted file mode 100644 index e7d8e89..0000000 --- a/youtube_dl/options.py +++ /dev/null @@ -1,916 +0,0 @@ -from __future__ import unicode_literals - -import os.path -import optparse -import re -import sys - -from .downloader.external import list_external_downloaders -from .compat import ( - compat_expanduser, - compat_get_terminal_size, - compat_getenv, - compat_kwargs, - compat_shlex_split, -) -from .utils import ( - preferredencoding, - write_string, -) -from .version import __version__ - - -def _hide_login_info(opts): - PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username']) - eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') - - def _scrub_eq(o): - m = eqre.match(o) - if m: - return m.group('key') + '=PRIVATE' - else: - return o - - opts = list(map(_scrub_eq, opts)) - for idx, opt in enumerate(opts): - if opt in PRIVATE_OPTS and idx + 1 < len(opts): - opts[idx + 1] = 'PRIVATE' - return opts - - -def parseOpts(overrideArguments=None): - def _readOptions(filename_bytes, default=[]): - try: - optionf = open(filename_bytes) - except IOError: - return default # silently skip if file is not present - try: - # FIXME: https://github.com/rg3/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 - contents = optionf.read() - if sys.version_info < (3,): - contents = contents.decode(preferredencoding()) - res = compat_shlex_split(contents, comments=True) - finally: - optionf.close() - return res - - def _readUserConf(): - xdg_config_home = compat_getenv('XDG_CONFIG_HOME') - if xdg_config_home: - userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config') - if not os.path.isfile(userConfFile): - userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf') - else: - userConfFile = os.path.join(compat_expanduser('~'), '.config', 'youtube-dl', 'config') - if not os.path.isfile(userConfFile): - userConfFile = os.path.join(compat_expanduser('~'), '.config', 'youtube-dl.conf') - userConf = _readOptions(userConfFile, None) - - if userConf is None: - appdata_dir = compat_getenv('appdata') - if appdata_dir: - userConf = _readOptions( - os.path.join(appdata_dir, 'youtube-dl', 'config'), - default=None) - if userConf is None: - userConf = _readOptions( - os.path.join(appdata_dir, 'youtube-dl', 'config.txt'), - default=None) - - if userConf is None: - userConf = _readOptions( - os.path.join(compat_expanduser('~'), 'youtube-dl.conf'), - default=None) - if userConf is None: - userConf = _readOptions( - os.path.join(compat_expanduser('~'), 'youtube-dl.conf.txt'), - default=None) - - if userConf is None: - userConf = [] - - return userConf - - def _format_option_string(option): - ''' ('-o', '--option') -> -o, --format METAVAR''' - - opts = [] - - if option._short_opts: - opts.append(option._short_opts[0]) - if option._long_opts: - opts.append(option._long_opts[0]) - if len(opts) > 1: - opts.insert(1, ', ') - - if option.takes_value(): - opts.append(' %s' % option.metavar) - - return ''.join(opts) - - def _comma_separated_values_options_callback(option, opt_str, value, parser): - setattr(parser.values, option.dest, value.split(',')) - - # No need to wrap help messages if we're on a wide console - columns = compat_get_terminal_size().columns - max_width = columns if columns else 80 - max_help_position = 80 - - fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position) - fmt.format_option_strings = _format_option_string - - kw = { - 'version': __version__, - 'formatter': fmt, - 'usage': '%prog [OPTIONS] URL [URL...]', - 'conflict_handler': 'resolve', - } - - parser = optparse.OptionParser(**compat_kwargs(kw)) - - general = optparse.OptionGroup(parser, 'General Options') - general.add_option( - '-h', '--help', - action='help', - help='Print this help text and exit') - general.add_option( - '-v', '--version', - action='version', - help='Print program version and exit') - general.add_option( - '-U', '--update', - action='store_true', dest='update_self', - help='Update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)') - general.add_option( - '-i', '--ignore-errors', - action='store_true', dest='ignoreerrors', default=False, - help='Continue on download errors, for example to skip unavailable videos in a playlist') - general.add_option( - '--abort-on-error', - action='store_false', dest='ignoreerrors', - help='Abort downloading of further videos (in the playlist or the command line) if an error occurs') - general.add_option( - '--dump-user-agent', - action='store_true', dest='dump_user_agent', default=False, - help='Display the current browser identification') - general.add_option( - '--list-extractors', - action='store_true', dest='list_extractors', default=False, - help='List all supported extractors') - general.add_option( - '--extractor-descriptions', - action='store_true', dest='list_extractor_descriptions', default=False, - help='Output descriptions of all supported extractors') - general.add_option( - '--force-generic-extractor', - action='store_true', dest='force_generic_extractor', default=False, - help='Force extraction to use the generic extractor') - general.add_option( - '--default-search', - dest='default_search', metavar='PREFIX', - help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.') - general.add_option( - '--ignore-config', - action='store_true', - help='Do not read configuration files. ' - 'When given in the global configuration file /etc/youtube-dl.conf: ' - 'Do not read the user configuration in ~/.config/youtube-dl/config ' - '(%APPDATA%/youtube-dl/config.txt on Windows)') - general.add_option( - '--config-location', - dest='config_location', metavar='PATH', - help='Location of the configuration file; either the path to the config or its containing directory.') - general.add_option( - '--flat-playlist', - action='store_const', dest='extract_flat', const='in_playlist', - default=False, - help='Do not extract the videos of a playlist, only list them.') - general.add_option( - '--mark-watched', - action='store_true', dest='mark_watched', default=False, - help='Mark videos watched (YouTube only)') - general.add_option( - '--no-mark-watched', - action='store_false', dest='mark_watched', default=False, - help='Do not mark videos watched (YouTube only)') - general.add_option( - '--no-color', '--no-colors', - action='store_true', dest='no_color', - default=False, - help='Do not emit color codes in output') - - network = optparse.OptionGroup(parser, 'Network Options') - network.add_option( - '--proxy', dest='proxy', - default=None, metavar='URL', - help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable ' - 'SOCKS proxy, specify a proper scheme. For example ' - 'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") ' - 'for direct connection') - network.add_option( - '--socket-timeout', - dest='socket_timeout', type=float, default=None, metavar='SECONDS', - help='Time to wait before giving up, in seconds') - network.add_option( - '--source-address', - metavar='IP', dest='source_address', default=None, - help='Client-side IP address to bind to', - ) - network.add_option( - '-4', '--force-ipv4', - action='store_const', const='0.0.0.0', dest='source_address', - help='Make all connections via IPv4', - ) - network.add_option( - '-6', '--force-ipv6', - action='store_const', const='::', dest='source_address', - help='Make all connections via IPv6', - ) - - geo = optparse.OptionGroup(parser, 'Geo Restriction') - geo.add_option( - '--geo-verification-proxy', - dest='geo_verification_proxy', default=None, metavar='URL', - help='Use this proxy to verify the IP address for some geo-restricted sites. ' - 'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading.') - geo.add_option( - '--cn-verification-proxy', - dest='cn_verification_proxy', default=None, metavar='URL', - help=optparse.SUPPRESS_HELP) - geo.add_option( - '--geo-bypass', - action='store_true', dest='geo_bypass', default=True, - help='Bypass geographic restriction via faking X-Forwarded-For HTTP header') - geo.add_option( - '--no-geo-bypass', - action='store_false', dest='geo_bypass', default=True, - help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header') - geo.add_option( - '--geo-bypass-country', metavar='CODE', - dest='geo_bypass_country', default=None, - help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code') - geo.add_option( - '--geo-bypass-ip-block', metavar='IP_BLOCK', - dest='geo_bypass_ip_block', default=None, - help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation') - - selection = optparse.OptionGroup(parser, 'Video Selection') - selection.add_option( - '--playlist-start', - dest='playliststart', metavar='NUMBER', default=1, type=int, - help='Playlist video to start at (default is %default)') - selection.add_option( - '--playlist-end', - dest='playlistend', metavar='NUMBER', default=None, type=int, - help='Playlist video to end at (default is last)') - selection.add_option( - '--playlist-items', - dest='playlist_items', metavar='ITEM_SPEC', default=None, - help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.') - selection.add_option( - '--match-title', - dest='matchtitle', metavar='REGEX', - help='Download only matching titles (regex or caseless sub-string)') - selection.add_option( - '--reject-title', - dest='rejecttitle', metavar='REGEX', - help='Skip download for matching titles (regex or caseless sub-string)') - selection.add_option( - '--max-downloads', - dest='max_downloads', metavar='NUMBER', type=int, default=None, - help='Abort after downloading NUMBER files') - selection.add_option( - '--min-filesize', - metavar='SIZE', dest='min_filesize', default=None, - help='Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)') - selection.add_option( - '--max-filesize', - metavar='SIZE', dest='max_filesize', default=None, - help='Do not download any videos larger than SIZE (e.g. 50k or 44.6m)') - selection.add_option( - '--date', - metavar='DATE', dest='date', default=None, - help='Download only videos uploaded in this date') - selection.add_option( - '--datebefore', - metavar='DATE', dest='datebefore', default=None, - help='Download only videos uploaded on or before this date (i.e. inclusive)') - selection.add_option( - '--dateafter', - metavar='DATE', dest='dateafter', default=None, - help='Download only videos uploaded on or after this date (i.e. inclusive)') - selection.add_option( - '--min-views', - metavar='COUNT', dest='min_views', default=None, type=int, - help='Do not download any videos with less than COUNT views') - selection.add_option( - '--max-views', - metavar='COUNT', dest='max_views', default=None, type=int, - help='Do not download any videos with more than COUNT views') - selection.add_option( - '--match-filter', - metavar='FILTER', dest='match_filter', default=None, - help=( - 'Generic video filter. ' - 'Specify any key (see the "OUTPUT TEMPLATE" for a list of available keys) to ' - 'match if the key is present, ' - '!key to check if the key is not present, ' - 'key > NUMBER (like "comment_count > 12", also works with ' - '>=, <, <=, !=, =) to compare against a number, ' - 'key = \'LITERAL\' (like "uploader = \'Mike Smith\'", also works with !=) ' - 'to match against a string literal ' - 'and & to require multiple matches. ' - 'Values which are not known are excluded unless you ' - 'put a question mark (?) after the operator. ' - 'For example, to only match videos that have been liked more than ' - '100 times and disliked less than 50 times (or the dislike ' - 'functionality is not available at the given service), but who ' - 'also have a description, use --match-filter ' - '"like_count > 100 & dislike_count <? 50 & description" .' - )) - selection.add_option( - '--no-playlist', - action='store_true', dest='noplaylist', default=False, - help='Download only the video, if the URL refers to a video and a playlist.') - selection.add_option( - '--yes-playlist', - action='store_false', dest='noplaylist', default=False, - help='Download the playlist, if the URL refers to a video and a playlist.') - selection.add_option( - '--age-limit', - metavar='YEARS', dest='age_limit', default=None, type=int, - help='Download only videos suitable for the given age') - selection.add_option( - '--download-archive', metavar='FILE', - dest='download_archive', - help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.') - selection.add_option( - '--include-ads', - dest='include_ads', action='store_true', - help='Download advertisements as well (experimental)') - - authentication = optparse.OptionGroup(parser, 'Authentication Options') - authentication.add_option( - '-u', '--username', - dest='username', metavar='USERNAME', - help='Login with this account ID') - authentication.add_option( - '-p', '--password', - dest='password', metavar='PASSWORD', - help='Account password. If this option is left out, youtube-dl will ask interactively.') - authentication.add_option( - '-2', '--twofactor', - dest='twofactor', metavar='TWOFACTOR', - help='Two-factor authentication code') - authentication.add_option( - '-n', '--netrc', - action='store_true', dest='usenetrc', default=False, - help='Use .netrc authentication data') - authentication.add_option( - '--video-password', - dest='videopassword', metavar='PASSWORD', - help='Video password (vimeo, smotri, youku)') - - adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options') - adobe_pass.add_option( - '--ap-mso', - dest='ap_mso', metavar='MSO', - help='Adobe Pass multiple-system operator (TV provider) identifier, use --ap-list-mso for a list of available MSOs') - adobe_pass.add_option( - '--ap-username', - dest='ap_username', metavar='USERNAME', - help='Multiple-system operator account login') - adobe_pass.add_option( - '--ap-password', - dest='ap_password', metavar='PASSWORD', - help='Multiple-system operator account password. If this option is left out, youtube-dl will ask interactively.') - adobe_pass.add_option( - '--ap-list-mso', - action='store_true', dest='ap_list_mso', default=False, - help='List all supported multiple-system operators') - - video_format = optparse.OptionGroup(parser, 'Video Format Options') - video_format.add_option( - '-f', '--format', - action='store', dest='format', metavar='FORMAT', default=None, - help='Video format code, see the "FORMAT SELECTION" for all the info') - video_format.add_option( - '--all-formats', - action='store_const', dest='format', const='all', - help='Download all available video formats') - video_format.add_option( - '--prefer-free-formats', - action='store_true', dest='prefer_free_formats', default=False, - help='Prefer free video formats unless a specific one is requested') - video_format.add_option( - '-F', '--list-formats', - action='store_true', dest='listformats', - help='List all available formats of requested videos') - video_format.add_option( - '--youtube-include-dash-manifest', - action='store_true', dest='youtube_include_dash_manifest', default=True, - help=optparse.SUPPRESS_HELP) - video_format.add_option( - '--youtube-skip-dash-manifest', - action='store_false', dest='youtube_include_dash_manifest', - help='Do not download the DASH manifests and related data on YouTube videos') - video_format.add_option( - '--merge-output-format', - action='store', dest='merge_output_format', metavar='FORMAT', default=None, - help=( - 'If a merge is required (e.g. bestvideo+bestaudio), ' - 'output to given container format. One of mkv, mp4, ogg, webm, flv. ' - 'Ignored if no merge is required')) - - subtitles = optparse.OptionGroup(parser, 'Subtitle Options') - subtitles.add_option( - '--write-sub', '--write-srt', - action='store_true', dest='writesubtitles', default=False, - help='Write subtitle file') - subtitles.add_option( - '--write-auto-sub', '--write-automatic-sub', - action='store_true', dest='writeautomaticsub', default=False, - help='Write automatically generated subtitle file (YouTube only)') - subtitles.add_option( - '--all-subs', - action='store_true', dest='allsubtitles', default=False, - help='Download all the available subtitles of the video') - subtitles.add_option( - '--list-subs', - action='store_true', dest='listsubtitles', default=False, - help='List all available subtitles for the video') - subtitles.add_option( - '--sub-format', - action='store', dest='subtitlesformat', metavar='FORMAT', default='best', - help='Subtitle format, accepts formats preference, for example: "srt" or "ass/srt/best"') - subtitles.add_option( - '--sub-lang', '--sub-langs', '--srt-lang', - action='callback', dest='subtitleslangs', metavar='LANGS', type='str', - default=[], callback=_comma_separated_values_options_callback, - help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags') - - downloader = optparse.OptionGroup(parser, 'Download Options') - downloader.add_option( - '-r', '--limit-rate', '--rate-limit', - dest='ratelimit', metavar='RATE', - help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)') - downloader.add_option( - '-R', '--retries', - dest='retries', metavar='RETRIES', default=10, - help='Number of retries (default is %default), or "infinite".') - downloader.add_option( - '--fragment-retries', - dest='fragment_retries', metavar='RETRIES', default=10, - help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)') - downloader.add_option( - '--skip-unavailable-fragments', - action='store_true', dest='skip_unavailable_fragments', default=True, - help='Skip unavailable fragments (DASH, hlsnative and ISM)') - downloader.add_option( - '--abort-on-unavailable-fragment', - action='store_false', dest='skip_unavailable_fragments', - help='Abort downloading when some fragment is not available') - downloader.add_option( - '--keep-fragments', - action='store_true', dest='keep_fragments', default=False, - help='Keep downloaded fragments on disk after downloading is finished; fragments are erased by default') - downloader.add_option( - '--buffer-size', - dest='buffersize', metavar='SIZE', default='1024', - help='Size of download buffer (e.g. 1024 or 16K) (default is %default)') - downloader.add_option( - '--no-resize-buffer', - action='store_true', dest='noresizebuffer', default=False, - help='Do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.') - downloader.add_option( - '--http-chunk-size', - dest='http_chunk_size', metavar='SIZE', default=None, - help='Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). ' - 'May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)') - downloader.add_option( - '--test', - action='store_true', dest='test', default=False, - help=optparse.SUPPRESS_HELP) - downloader.add_option( - '--playlist-reverse', - action='store_true', - help='Download playlist videos in reverse order') - downloader.add_option( - '--playlist-random', - action='store_true', - help='Download playlist videos in random order') - downloader.add_option( - '--xattr-set-filesize', - dest='xattr_set_filesize', action='store_true', - help='Set file xattribute ytdl.filesize with expected file size') - downloader.add_option( - '--hls-prefer-native', - dest='hls_prefer_native', action='store_true', default=None, - help='Use the native HLS downloader instead of ffmpeg') - downloader.add_option( - '--hls-prefer-ffmpeg', - dest='hls_prefer_native', action='store_false', default=None, - help='Use ffmpeg instead of the native HLS downloader') - downloader.add_option( - '--hls-use-mpegts', - dest='hls_use_mpegts', action='store_true', - help='Use the mpegts container for HLS videos, allowing to play the ' - 'video while downloading (some players may not be able to play it)') - downloader.add_option( - '--external-downloader', - dest='external_downloader', metavar='COMMAND', - help='Use the specified external downloader. ' - 'Currently supports %s' % ','.join(list_external_downloaders())) - downloader.add_option( - '--external-downloader-args', - dest='external_downloader_args', metavar='ARGS', - help='Give these arguments to the external downloader') - - workarounds = optparse.OptionGroup(parser, 'Workarounds') - workarounds.add_option( - '--encoding', - dest='encoding', metavar='ENCODING', - help='Force the specified encoding (experimental)') - workarounds.add_option( - '--no-check-certificate', - action='store_true', dest='no_check_certificate', default=False, - help='Suppress HTTPS certificate validation') - workarounds.add_option( - '--prefer-insecure', - '--prefer-unsecure', action='store_true', dest='prefer_insecure', - help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)') - workarounds.add_option( - '--user-agent', - metavar='UA', dest='user_agent', - help='Specify a custom user agent') - workarounds.add_option( - '--referer', - metavar='URL', dest='referer', default=None, - help='Specify a custom referer, use if the video access is restricted to one domain', - ) - workarounds.add_option( - '--add-header', - metavar='FIELD:VALUE', dest='headers', action='append', - help='Specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times', - ) - workarounds.add_option( - '--bidi-workaround', - dest='bidi_workaround', action='store_true', - help='Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') - workarounds.add_option( - '--sleep-interval', '--min-sleep-interval', metavar='SECONDS', - dest='sleep_interval', type=float, - help=( - 'Number of seconds to sleep before each download when used alone ' - 'or a lower bound of a range for randomized sleep before each download ' - '(minimum possible number of seconds to sleep) when used along with ' - '--max-sleep-interval.')) - workarounds.add_option( - '--max-sleep-interval', metavar='SECONDS', - dest='max_sleep_interval', type=float, - help=( - 'Upper bound of a range for randomized sleep before each download ' - '(maximum possible number of seconds to sleep). Must only be used ' - 'along with --min-sleep-interval.')) - - verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') - verbosity.add_option( - '-q', '--quiet', - action='store_true', dest='quiet', default=False, - help='Activate quiet mode') - verbosity.add_option( - '--no-warnings', - dest='no_warnings', action='store_true', default=False, - help='Ignore warnings') - verbosity.add_option( - '-s', '--simulate', - action='store_true', dest='simulate', default=False, - help='Do not download the video and do not write anything to disk') - verbosity.add_option( - '--skip-download', - action='store_true', dest='skip_download', default=False, - help='Do not download the video') - verbosity.add_option( - '-g', '--get-url', - action='store_true', dest='geturl', default=False, - help='Simulate, quiet but print URL') - verbosity.add_option( - '-e', '--get-title', - action='store_true', dest='gettitle', default=False, - help='Simulate, quiet but print title') - verbosity.add_option( - '--get-id', - action='store_true', dest='getid', default=False, - help='Simulate, quiet but print id') - verbosity.add_option( - '--get-thumbnail', - action='store_true', dest='getthumbnail', default=False, - help='Simulate, quiet but print thumbnail URL') - verbosity.add_option( - '--get-description', - action='store_true', dest='getdescription', default=False, - help='Simulate, quiet but print video description') - verbosity.add_option( - '--get-duration', - action='store_true', dest='getduration', default=False, - help='Simulate, quiet but print video length') - verbosity.add_option( - '--get-filename', - action='store_true', dest='getfilename', default=False, - help='Simulate, quiet but print output filename') - verbosity.add_option( - '--get-format', - action='store_true', dest='getformat', default=False, - help='Simulate, quiet but print output format') - verbosity.add_option( - '-j', '--dump-json', - action='store_true', dest='dumpjson', default=False, - help='Simulate, quiet but print JSON information. See the "OUTPUT TEMPLATE" for a description of available keys.') - verbosity.add_option( - '-J', '--dump-single-json', - action='store_true', dest='dump_single_json', default=False, - help='Simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist information in a single line.') - verbosity.add_option( - '--print-json', - action='store_true', dest='print_json', default=False, - help='Be quiet and print the video information as JSON (video is still being downloaded).', - ) - verbosity.add_option( - '--newline', - action='store_true', dest='progress_with_newline', default=False, - help='Output progress bar as new lines') - verbosity.add_option( - '--no-progress', - action='store_true', dest='noprogress', default=False, - help='Do not print progress bar') - verbosity.add_option( - '--console-title', - action='store_true', dest='consoletitle', default=False, - help='Display progress in console titlebar') - verbosity.add_option( - '-v', '--verbose', - action='store_true', dest='verbose', default=False, - help='Print various debugging information') - verbosity.add_option( - '--dump-pages', '--dump-intermediate-pages', - action='store_true', dest='dump_intermediate_pages', default=False, - help='Print downloaded pages encoded using base64 to debug problems (very verbose)') - verbosity.add_option( - '--write-pages', - action='store_true', dest='write_pages', default=False, - help='Write downloaded intermediary pages to files in the current directory to debug problems') - verbosity.add_option( - '--youtube-print-sig-code', - action='store_true', dest='youtube_print_sig_code', default=False, - help=optparse.SUPPRESS_HELP) - verbosity.add_option( - '--print-traffic', '--dump-headers', - dest='debug_printtraffic', action='store_true', default=False, - help='Display sent and read HTTP traffic') - verbosity.add_option( - '-C', '--call-home', - dest='call_home', action='store_true', default=False, - help='Contact the youtube-dl server for debugging') - verbosity.add_option( - '--no-call-home', - dest='call_home', action='store_false', default=False, - help='Do NOT contact the youtube-dl server for debugging') - - filesystem = optparse.OptionGroup(parser, 'Filesystem Options') - filesystem.add_option( - '-a', '--batch-file', - dest='batchfile', metavar='FILE', - help="File containing URLs to download ('-' for stdin), one URL per line. " - "Lines starting with '#', ';' or ']' are considered as comments and ignored.") - filesystem.add_option( - '--id', default=False, - action='store_true', dest='useid', help='Use only video ID in file name') - filesystem.add_option( - '-o', '--output', - dest='outtmpl', metavar='TEMPLATE', - help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) - filesystem.add_option( - '--autonumber-size', - dest='autonumber_size', metavar='NUMBER', type=int, - help=optparse.SUPPRESS_HELP) - filesystem.add_option( - '--autonumber-start', - dest='autonumber_start', metavar='NUMBER', default=1, type=int, - help='Specify the start value for %(autonumber)s (default is %default)') - filesystem.add_option( - '--restrict-filenames', - action='store_true', dest='restrictfilenames', default=False, - help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames') - filesystem.add_option( - '-A', '--auto-number', - action='store_true', dest='autonumber', default=False, - help=optparse.SUPPRESS_HELP) - filesystem.add_option( - '-t', '--title', - action='store_true', dest='usetitle', default=False, - help=optparse.SUPPRESS_HELP) - filesystem.add_option( - '-l', '--literal', default=False, - action='store_true', dest='usetitle', - help=optparse.SUPPRESS_HELP) - filesystem.add_option( - '-w', '--no-overwrites', - action='store_true', dest='nooverwrites', default=False, - help='Do not overwrite files') - filesystem.add_option( - '-c', '--continue', - action='store_true', dest='continue_dl', default=True, - help='Force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible.') - filesystem.add_option( - '--no-continue', - action='store_false', dest='continue_dl', - help='Do not resume partially downloaded files (restart from beginning)') - filesystem.add_option( - '--no-part', - action='store_true', dest='nopart', default=False, - help='Do not use .part files - write directly into output file') - filesystem.add_option( - '--no-mtime', - action='store_false', dest='updatetime', default=True, - help='Do not use the Last-modified header to set the file modification time') - filesystem.add_option( - '--write-description', - action='store_true', dest='writedescription', default=False, - help='Write video description to a .description file') - filesystem.add_option( - '--write-info-json', - action='store_true', dest='writeinfojson', default=False, - help='Write video metadata to a .info.json file') - filesystem.add_option( - '--write-annotations', - action='store_true', dest='writeannotations', default=False, - help='Write video annotations to a .annotations.xml file') - filesystem.add_option( - '--load-info-json', '--load-info', - dest='load_info_filename', metavar='FILE', - help='JSON file containing the video information (created with the "--write-info-json" option)') - filesystem.add_option( - '--cookies', - dest='cookiefile', metavar='FILE', - help='File to read cookies from and dump cookie jar in') - filesystem.add_option( - '--cache-dir', dest='cachedir', default=None, metavar='DIR', - help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.') - filesystem.add_option( - '--no-cache-dir', action='store_const', const=False, dest='cachedir', - help='Disable filesystem caching') - filesystem.add_option( - '--rm-cache-dir', - action='store_true', dest='rm_cachedir', - help='Delete all filesystem cache files') - - thumbnail = optparse.OptionGroup(parser, 'Thumbnail images') - thumbnail.add_option( - '--write-thumbnail', - action='store_true', dest='writethumbnail', default=False, - help='Write thumbnail image to disk') - thumbnail.add_option( - '--write-all-thumbnails', - action='store_true', dest='write_all_thumbnails', default=False, - help='Write all thumbnail image formats to disk') - thumbnail.add_option( - '--list-thumbnails', - action='store_true', dest='list_thumbnails', default=False, - help='Simulate and list all available thumbnail formats') - - postproc = optparse.OptionGroup(parser, 'Post-processing Options') - postproc.add_option( - '-x', '--extract-audio', - action='store_true', dest='extractaudio', default=False, - help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') - postproc.add_option( - '--audio-format', metavar='FORMAT', dest='audioformat', default='best', - help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x') - postproc.add_option( - '--audio-quality', metavar='QUALITY', - dest='audioquality', default='5', - help='Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default %default)') - postproc.add_option( - '--recode-video', - metavar='FORMAT', dest='recodevideo', default=None, - help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi)') - postproc.add_option( - '--postprocessor-args', - dest='postprocessor_args', metavar='ARGS', - help='Give these arguments to the postprocessor') - postproc.add_option( - '-k', '--keep-video', - action='store_true', dest='keepvideo', default=False, - help='Keep the video file on disk after the post-processing; the video is erased by default') - postproc.add_option( - '--no-post-overwrites', - action='store_true', dest='nopostoverwrites', default=False, - help='Do not overwrite post-processed files; the post-processed files are overwritten by default') - postproc.add_option( - '--embed-subs', - action='store_true', dest='embedsubtitles', default=False, - help='Embed subtitles in the video (only for mp4, webm and mkv videos)') - postproc.add_option( - '--embed-thumbnail', - action='store_true', dest='embedthumbnail', default=False, - help='Embed thumbnail in the audio as cover art') - postproc.add_option( - '--add-metadata', - action='store_true', dest='addmetadata', default=False, - help='Write metadata to the video file') - postproc.add_option( - '--metadata-from-title', - metavar='FORMAT', dest='metafromtitle', - help='Parse additional metadata like song title / artist from the video title. ' - 'The format syntax is the same as --output. Regular expression with ' - 'named capture groups may also be used. ' - 'The parsed parameters replace existing values. ' - 'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' - '"Coldplay - Paradise". ' - 'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"') - postproc.add_option( - '--xattrs', - action='store_true', dest='xattrs', default=False, - help='Write metadata to the video file\'s xattrs (using dublin core and xdg standards)') - postproc.add_option( - '--fixup', - metavar='POLICY', dest='fixup', default='detect_or_warn', - help='Automatically correct known faults of the file. ' - 'One of never (do nothing), warn (only emit a warning), ' - 'detect_or_warn (the default; fix file if we can, warn otherwise)') - postproc.add_option( - '--prefer-avconv', - action='store_false', dest='prefer_ffmpeg', - help='Prefer avconv over ffmpeg for running the postprocessors') - postproc.add_option( - '--prefer-ffmpeg', - action='store_true', dest='prefer_ffmpeg', - help='Prefer ffmpeg over avconv for running the postprocessors (default)') - postproc.add_option( - '--ffmpeg-location', '--avconv-location', metavar='PATH', - dest='ffmpeg_location', - help='Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory.') - postproc.add_option( - '--exec', - metavar='CMD', dest='exec_cmd', - help='Execute a command on the file after downloading, similar to find\'s -exec syntax. Example: --exec \'adb push {} /sdcard/Music/ && rm {}\'') - postproc.add_option( - '--convert-subs', '--convert-subtitles', - metavar='FORMAT', dest='convertsubtitles', default=None, - help='Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc)') - - parser.add_option_group(general) - parser.add_option_group(network) - parser.add_option_group(geo) - parser.add_option_group(selection) - parser.add_option_group(downloader) - parser.add_option_group(filesystem) - parser.add_option_group(thumbnail) - parser.add_option_group(verbosity) - parser.add_option_group(workarounds) - parser.add_option_group(video_format) - parser.add_option_group(subtitles) - parser.add_option_group(authentication) - parser.add_option_group(adobe_pass) - parser.add_option_group(postproc) - - if overrideArguments is not None: - opts, args = parser.parse_args(overrideArguments) - if opts.verbose: - write_string('[debug] Override config: ' + repr(overrideArguments) + '\n') - else: - def compat_conf(conf): - if sys.version_info < (3,): - return [a.decode(preferredencoding(), 'replace') for a in conf] - return conf - - command_line_conf = compat_conf(sys.argv[1:]) - opts, args = parser.parse_args(command_line_conf) - - system_conf = user_conf = custom_conf = [] - - if '--config-location' in command_line_conf: - location = compat_expanduser(opts.config_location) - if os.path.isdir(location): - location = os.path.join(location, 'youtube-dl.conf') - if not os.path.exists(location): - parser.error('config-location %s does not exist.' % location) - custom_conf = _readOptions(location) - elif '--ignore-config' in command_line_conf: - pass - else: - system_conf = _readOptions('/etc/youtube-dl.conf') - if '--ignore-config' not in system_conf: - user_conf = _readUserConf() - - argv = system_conf + user_conf + custom_conf + command_line_conf - opts, args = parser.parse_args(argv) - if opts.verbose: - for conf_label, conf in ( - ('System config', system_conf), - ('User config', user_conf), - ('Custom config', custom_conf), - ('Command-line args', command_line_conf)): - write_string('[debug] %s: %s\n' % (conf_label, repr(_hide_login_info(conf)))) - - return parser, opts, args diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py deleted file mode 100644 index 3ea5183..0000000 --- a/youtube_dl/postprocessor/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -from __future__ import unicode_literals - -from .embedthumbnail import EmbedThumbnailPP -from .ffmpeg import ( - FFmpegPostProcessor, - FFmpegEmbedSubtitlePP, - FFmpegExtractAudioPP, - FFmpegFixupStretchedPP, - FFmpegFixupM3u8PP, - FFmpegFixupM4aPP, - FFmpegMergerPP, - FFmpegMetadataPP, - FFmpegVideoConvertorPP, - FFmpegSubtitlesConvertorPP, -) -from .xattrpp import XAttrMetadataPP -from .execafterdownload import ExecAfterDownloadPP -from .metadatafromtitle import MetadataFromTitlePP - - -def get_postprocessor(key): - return globals()[key + 'PP'] - - -__all__ = [ - 'EmbedThumbnailPP', - 'ExecAfterDownloadPP', - 'FFmpegEmbedSubtitlePP', - 'FFmpegExtractAudioPP', - 'FFmpegFixupM3u8PP', - 'FFmpegFixupM4aPP', - 'FFmpegFixupStretchedPP', - 'FFmpegMergerPP', - 'FFmpegMetadataPP', - 'FFmpegPostProcessor', - 'FFmpegSubtitlesConvertorPP', - 'FFmpegVideoConvertorPP', - 'MetadataFromTitlePP', - 'XAttrMetadataPP', -] diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py deleted file mode 100644 index 599dd1d..0000000 --- a/youtube_dl/postprocessor/common.py +++ /dev/null @@ -1,69 +0,0 @@ -from __future__ import unicode_literals - -import os - -from ..utils import ( - PostProcessingError, - cli_configuration_args, - encodeFilename, -) - - -class PostProcessor(object): - """Post Processor class. - - PostProcessor objects can be added to downloaders with their - add_post_processor() method. When the downloader has finished a - successful download, it will take its internal chain of PostProcessors - and start calling the run() method on each one of them, first with - an initial argument and then with the returned value of the previous - PostProcessor. - - The chain will be stopped if one of them ever returns None or the end - of the chain is reached. - - PostProcessor objects follow a "mutual registration" process similar - to InfoExtractor objects. - - Optionally PostProcessor can use a list of additional command-line arguments - with self._configuration_args. - """ - - _downloader = None - - def __init__(self, downloader=None): - self._downloader = downloader - - def set_downloader(self, downloader): - """Sets the downloader for this PP.""" - self._downloader = downloader - - def run(self, information): - """Run the PostProcessor. - - The "information" argument is a dictionary like the ones - composed by InfoExtractors. The only difference is that this - one has an extra field called "filepath" that points to the - downloaded file. - - This method returns a tuple, the first element is a list of the files - that can be deleted, and the second of which is the updated - information. - - In addition, this method may raise a PostProcessingError - exception if post processing fails. - """ - return [], information # by default, keep file and do nothing - - def try_utime(self, path, atime, mtime, errnote='Cannot update utime of file'): - try: - os.utime(encodeFilename(path), (atime, mtime)) - except Exception: - self._downloader.report_warning(errnote) - - def _configuration_args(self, default=[]): - return cli_configuration_args(self._downloader.params, 'postprocessor_args', default) - - -class AudioConversionError(PostProcessingError): - pass diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py deleted file mode 100644 index 56be914..0000000 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ /dev/null @@ -1,93 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -import os -import subprocess - -from .ffmpeg import FFmpegPostProcessor - -from ..utils import ( - check_executable, - encodeArgument, - encodeFilename, - PostProcessingError, - prepend_extension, - shell_quote -) - - -class EmbedThumbnailPPError(PostProcessingError): - pass - - -class EmbedThumbnailPP(FFmpegPostProcessor): - def __init__(self, downloader=None, already_have_thumbnail=False): - super(EmbedThumbnailPP, self).__init__(downloader) - self._already_have_thumbnail = already_have_thumbnail - - def run(self, info): - filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') - - if not info.get('thumbnails'): - self._downloader.to_screen('[embedthumbnail] There aren\'t any thumbnails to embed') - return [], info - - thumbnail_filename = info['thumbnails'][-1]['filename'] - - if not os.path.exists(encodeFilename(thumbnail_filename)): - self._downloader.report_warning( - 'Skipping embedding the thumbnail because the file is missing.') - return [], info - - if info['ext'] == 'mp3': - options = [ - '-c', 'copy', '-map', '0', '-map', '1', - '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"'] - - self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename) - - self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options) - - if not self._already_have_thumbnail: - os.remove(encodeFilename(thumbnail_filename)) - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - - elif info['ext'] in ['m4a', 'mp4']: - if not check_executable('AtomicParsley', ['-v']): - raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.') - - cmd = [encodeFilename('AtomicParsley', True), - encodeFilename(filename, True), - encodeArgument('--artwork'), - encodeFilename(thumbnail_filename, True), - encodeArgument('-o'), - encodeFilename(temp_filename, True)] - - self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename) - - if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] AtomicParsley command line: %s' % shell_quote(cmd)) - - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - - if p.returncode != 0: - msg = stderr.decode('utf-8', 'replace').strip() - raise EmbedThumbnailPPError(msg) - - if not self._already_have_thumbnail: - os.remove(encodeFilename(thumbnail_filename)) - # for formats that don't support thumbnails (like 3gp) AtomicParsley - # won't create to the temporary file - if b'No changes' in stdout: - self._downloader.report_warning('The file format doesn\'t support embedding a thumbnail') - else: - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - else: - raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.') - - return [], info diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py deleted file mode 100644 index 64dabe7..0000000 --- a/youtube_dl/postprocessor/execafterdownload.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import unicode_literals - -import subprocess - -from .common import PostProcessor -from ..compat import compat_shlex_quote -from ..utils import ( - encodeArgument, - PostProcessingError, -) - - -class ExecAfterDownloadPP(PostProcessor): - def __init__(self, downloader, exec_cmd): - super(ExecAfterDownloadPP, self).__init__(downloader) - self.exec_cmd = exec_cmd - - def run(self, information): - cmd = self.exec_cmd - if '{}' not in cmd: - cmd += ' {}' - - cmd = cmd.replace('{}', compat_shlex_quote(information['filepath'])) - - self._downloader.to_screen('[exec] Executing command: %s' % cmd) - retCode = subprocess.call(encodeArgument(cmd), shell=True) - if retCode != 0: - raise PostProcessingError( - 'Command returned error code %d' % retCode) - - return [], information diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py deleted file mode 100644 index 757b496..0000000 --- a/youtube_dl/postprocessor/ffmpeg.py +++ /dev/null @@ -1,613 +0,0 @@ -from __future__ import unicode_literals - -import io -import os -import subprocess -import time -import re - - -from .common import AudioConversionError, PostProcessor - -from ..compat import ( - compat_subprocess_get_DEVNULL, -) -from ..utils import ( - encodeArgument, - encodeFilename, - get_exe_version, - is_outdated_version, - PostProcessingError, - prepend_extension, - shell_quote, - subtitles_filename, - dfxp2srt, - ISO639Utils, - replace_extension, -) - - -EXT_TO_OUT_FORMATS = { - 'aac': 'adts', - 'flac': 'flac', - 'm4a': 'ipod', - 'mka': 'matroska', - 'mkv': 'matroska', - 'mpg': 'mpeg', - 'ogv': 'ogg', - 'ts': 'mpegts', - 'wma': 'asf', - 'wmv': 'asf', -} -ACODECS = { - 'mp3': 'libmp3lame', - 'aac': 'aac', - 'flac': 'flac', - 'm4a': 'aac', - 'opus': 'libopus', - 'vorbis': 'libvorbis', - 'wav': None, -} - - -class FFmpegPostProcessorError(PostProcessingError): - pass - - -class FFmpegPostProcessor(PostProcessor): - def __init__(self, downloader=None): - PostProcessor.__init__(self, downloader) - self._determine_executables() - - def check_version(self): - if not self.available: - raise FFmpegPostProcessorError('ffmpeg or avconv not found. Please install one.') - - required_version = '10-0' if self.basename == 'avconv' else '1.0' - if is_outdated_version( - self._versions[self.basename], required_version): - warning = 'Your copy of %s is outdated, update %s to version %s or newer if you encounter any errors.' % ( - self.basename, self.basename, required_version) - if self._downloader: - self._downloader.report_warning(warning) - - @staticmethod - def get_versions(downloader=None): - return FFmpegPostProcessor(downloader)._versions - - def _determine_executables(self): - programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - prefer_ffmpeg = True - - self.basename = None - self.probe_basename = None - - self._paths = None - self._versions = None - if self._downloader: - prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True) - location = self._downloader.params.get('ffmpeg_location') - if location is not None: - if not os.path.exists(location): - self._downloader.report_warning( - 'ffmpeg-location %s does not exist! ' - 'Continuing without avconv/ffmpeg.' % (location)) - self._versions = {} - return - elif not os.path.isdir(location): - basename = os.path.splitext(os.path.basename(location))[0] - if basename not in programs: - self._downloader.report_warning( - 'Cannot identify executable %s, its basename should be one of %s. ' - 'Continuing without avconv/ffmpeg.' % - (location, ', '.join(programs))) - self._versions = {} - return None - location = os.path.dirname(os.path.abspath(location)) - if basename in ('ffmpeg', 'ffprobe'): - prefer_ffmpeg = True - - self._paths = dict( - (p, os.path.join(location, p)) for p in programs) - self._versions = dict( - (p, get_exe_version(self._paths[p], args=['-version'])) - for p in programs) - if self._versions is None: - self._versions = dict( - (p, get_exe_version(p, args=['-version'])) for p in programs) - self._paths = dict((p, p) for p in programs) - - if prefer_ffmpeg is False: - prefs = ('avconv', 'ffmpeg') - else: - prefs = ('ffmpeg', 'avconv') - for p in prefs: - if self._versions[p]: - self.basename = p - break - - if prefer_ffmpeg is False: - prefs = ('avprobe', 'ffprobe') - else: - prefs = ('ffprobe', 'avprobe') - for p in prefs: - if self._versions[p]: - self.probe_basename = p - break - - @property - def available(self): - return self.basename is not None - - @property - def executable(self): - return self._paths[self.basename] - - @property - def probe_available(self): - return self.probe_basename is not None - - @property - def probe_executable(self): - return self._paths[self.probe_basename] - - def get_audio_codec(self, path): - if not self.probe_available: - raise PostProcessingError('ffprobe or avprobe not found. Please install one.') - try: - cmd = [ - encodeFilename(self.probe_executable, True), - encodeArgument('-show_streams'), - encodeFilename(self._ffmpeg_filename_argument(path), True)] - if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) - handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE, stdin=subprocess.PIPE) - output = handle.communicate()[0] - if handle.wait() != 0: - return None - except (IOError, OSError): - return None - audio_codec = None - for line in output.decode('ascii', 'ignore').split('\n'): - if line.startswith('codec_name='): - audio_codec = line.split('=')[1].strip() - elif line.strip() == 'codec_type=audio' and audio_codec is not None: - return audio_codec - return None - - def run_ffmpeg_multiple_files(self, input_paths, out_path, opts): - self.check_version() - - oldest_mtime = min( - os.stat(encodeFilename(path)).st_mtime for path in input_paths) - - opts += self._configuration_args() - - files_cmd = [] - for path in input_paths: - files_cmd.extend([ - encodeArgument('-i'), - encodeFilename(self._ffmpeg_filename_argument(path), True) - ]) - cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] + - files_cmd + - [encodeArgument(o) for o in opts] + - [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) - - if self._downloader.params.get('verbose', False): - self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd)) - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - stdout, stderr = p.communicate() - if p.returncode != 0: - stderr = stderr.decode('utf-8', 'replace') - msg = stderr.strip().split('\n')[-1] - raise FFmpegPostProcessorError(msg) - self.try_utime(out_path, oldest_mtime, oldest_mtime) - - def run_ffmpeg(self, path, out_path, opts): - self.run_ffmpeg_multiple_files([path], out_path, opts) - - def _ffmpeg_filename_argument(self, fn): - # Always use 'file:' because the filename may contain ':' (ffmpeg - # interprets that as a protocol) or can start with '-' (-- is broken in - # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details) - # Also leave '-' intact in order not to break streaming to stdout. - return 'file:' + fn if fn != '-' else fn - - -class FFmpegExtractAudioPP(FFmpegPostProcessor): - def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, nopostoverwrites=False): - FFmpegPostProcessor.__init__(self, downloader) - if preferredcodec is None: - preferredcodec = 'best' - self._preferredcodec = preferredcodec - self._preferredquality = preferredquality - self._nopostoverwrites = nopostoverwrites - - def run_ffmpeg(self, path, out_path, codec, more_opts): - if codec is None: - acodec_opts = [] - else: - acodec_opts = ['-acodec', codec] - opts = ['-vn'] + acodec_opts + more_opts - try: - FFmpegPostProcessor.run_ffmpeg(self, path, out_path, opts) - except FFmpegPostProcessorError as err: - raise AudioConversionError(err.msg) - - def run(self, information): - path = information['filepath'] - - filecodec = self.get_audio_codec(path) - if filecodec is None: - raise PostProcessingError('WARNING: unable to obtain file audio codec with ffprobe') - - more_opts = [] - if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'): - if filecodec == 'aac' and self._preferredcodec in ['m4a', 'best']: - # Lossless, but in another container - acodec = 'copy' - extension = 'm4a' - more_opts = ['-bsf:a', 'aac_adtstoasc'] - elif filecodec in ['aac', 'flac', 'mp3', 'vorbis', 'opus']: - # Lossless if possible - acodec = 'copy' - extension = filecodec - if filecodec == 'aac': - more_opts = ['-f', 'adts'] - if filecodec == 'vorbis': - extension = 'ogg' - else: - # MP3 otherwise. - acodec = 'libmp3lame' - extension = 'mp3' - more_opts = [] - if self._preferredquality is not None: - if int(self._preferredquality) < 10: - more_opts += ['-q:a', self._preferredquality] - else: - more_opts += ['-b:a', self._preferredquality + 'k'] - else: - # We convert the audio (lossy if codec is lossy) - acodec = ACODECS[self._preferredcodec] - extension = self._preferredcodec - more_opts = [] - if self._preferredquality is not None: - # The opus codec doesn't support the -aq option - if int(self._preferredquality) < 10 and extension != 'opus': - more_opts += ['-q:a', self._preferredquality] - else: - more_opts += ['-b:a', self._preferredquality + 'k'] - if self._preferredcodec == 'aac': - more_opts += ['-f', 'adts'] - if self._preferredcodec == 'm4a': - more_opts += ['-bsf:a', 'aac_adtstoasc'] - if self._preferredcodec == 'vorbis': - extension = 'ogg' - if self._preferredcodec == 'wav': - extension = 'wav' - more_opts += ['-f', 'wav'] - - prefix, sep, ext = path.rpartition('.') # not os.path.splitext, since the latter does not work on unicode in all setups - new_path = prefix + sep + extension - - information['filepath'] = new_path - information['ext'] = extension - - # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly. - if (new_path == path or - (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))): - self._downloader.to_screen('[ffmpeg] Post-process file %s exists, skipping' % new_path) - return [], information - - try: - self._downloader.to_screen('[ffmpeg] Destination: ' + new_path) - self.run_ffmpeg(path, new_path, acodec, more_opts) - except AudioConversionError as e: - raise PostProcessingError( - 'audio conversion failed: ' + e.msg) - except Exception: - raise PostProcessingError('error running ' + self.basename) - - # Try to update the date time for extracted audio file. - if information.get('filetime') is not None: - self.try_utime( - new_path, time.time(), information['filetime'], - errnote='Cannot update utime of audio file') - - return [path], information - - -class FFmpegVideoConvertorPP(FFmpegPostProcessor): - def __init__(self, downloader=None, preferedformat=None): - super(FFmpegVideoConvertorPP, self).__init__(downloader) - self._preferedformat = preferedformat - - def run(self, information): - path = information['filepath'] - if information['ext'] == self._preferedformat: - self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat)) - return [], information - options = [] - if self._preferedformat == 'avi': - options.extend(['-c:v', 'libxvid', '-vtag', 'XVID']) - prefix, sep, ext = path.rpartition('.') - outpath = prefix + sep + self._preferedformat - self._downloader.to_screen('[' + 'ffmpeg' + '] Converting video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath) - self.run_ffmpeg(path, outpath, options) - information['filepath'] = outpath - information['format'] = self._preferedformat - information['ext'] = self._preferedformat - return [path], information - - -class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): - def run(self, information): - if information['ext'] not in ('mp4', 'webm', 'mkv'): - self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4, webm or mkv files') - return [], information - subtitles = information.get('requested_subtitles') - if not subtitles: - self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') - return [], information - - filename = information['filepath'] - - ext = information['ext'] - sub_langs = [] - sub_filenames = [] - webm_vtt_warn = False - - for lang, sub_info in subtitles.items(): - sub_ext = sub_info['ext'] - if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': - sub_langs.append(lang) - sub_filenames.append(subtitles_filename(filename, lang, sub_ext)) - else: - if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': - webm_vtt_warn = True - self._downloader.to_screen('[ffmpeg] Only WebVTT subtitles can be embedded in webm files') - - if not sub_langs: - return [], information - - input_files = [filename] + sub_filenames - - opts = [ - '-map', '0', - '-c', 'copy', - # Don't copy the existing subtitles, we may be running the - # postprocessor a second time - '-map', '-0:s', - ] - if information['ext'] == 'mp4': - opts += ['-c:s', 'mov_text'] - for (i, lang) in enumerate(sub_langs): - opts.extend(['-map', '%d:0' % (i + 1)]) - lang_code = ISO639Utils.short2long(lang) - if lang_code is not None: - opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) - - temp_filename = prepend_extension(filename, 'temp') - self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename) - self.run_ffmpeg_multiple_files(input_files, temp_filename, opts) - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - - return sub_filenames, information - - -class FFmpegMetadataPP(FFmpegPostProcessor): - def run(self, info): - metadata = {} - - def add(meta_list, info_list=None): - if not info_list: - info_list = meta_list - if not isinstance(meta_list, (list, tuple)): - meta_list = (meta_list,) - if not isinstance(info_list, (list, tuple)): - info_list = (info_list,) - for info_f in info_list: - if info.get(info_f) is not None: - for meta_f in meta_list: - metadata[meta_f] = info[info_f] - break - - add('title', ('track', 'title')) - add('date', 'upload_date') - add(('description', 'comment'), 'description') - add('purl', 'webpage_url') - add('track', 'track_number') - add('artist', ('artist', 'creator', 'uploader', 'uploader_id')) - add('genre') - add('album') - add('album_artist') - add('disc', 'disc_number') - - if not metadata: - self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add') - return [], info - - filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') - in_filenames = [filename] - options = [] - - if info['ext'] == 'm4a': - options.extend(['-vn', '-acodec', 'copy']) - else: - options.extend(['-c', 'copy']) - - for (name, value) in metadata.items(): - options.extend(['-metadata', '%s=%s' % (name, value)]) - - chapters = info.get('chapters', []) - if chapters: - metadata_filename = replace_extension(filename, 'meta') - with io.open(metadata_filename, 'wt', encoding='utf-8') as f: - def ffmpeg_escape(text): - return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text) - - metadata_file_content = ';FFMETADATA1\n' - for chapter in chapters: - metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n' - metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000) - metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000) - chapter_title = chapter.get('title') - if chapter_title: - metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title) - f.write(metadata_file_content) - in_filenames.append(metadata_filename) - options.extend(['-map_metadata', '1']) - - self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename) - self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options) - if chapters: - os.remove(metadata_filename) - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - return [], info - - -class FFmpegMergerPP(FFmpegPostProcessor): - def run(self, info): - filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') - args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0'] - self._downloader.to_screen('[ffmpeg] Merging formats into "%s"' % filename) - self.run_ffmpeg_multiple_files(info['__files_to_merge'], temp_filename, args) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - return info['__files_to_merge'], info - - def can_merge(self): - # TODO: figure out merge-capable ffmpeg version - if self.basename != 'avconv': - return True - - required_version = '10-0' - if is_outdated_version( - self._versions[self.basename], required_version): - warning = ('Your copy of %s is outdated and unable to properly mux separate video and audio files, ' - 'youtube-dl will download single file media. ' - 'Update %s to version %s or newer to fix this.') % ( - self.basename, self.basename, required_version) - if self._downloader: - self._downloader.report_warning(warning) - return False - return True - - -class FFmpegFixupStretchedPP(FFmpegPostProcessor): - def run(self, info): - stretched_ratio = info.get('stretched_ratio') - if stretched_ratio is None or stretched_ratio == 1: - return [], info - - filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') - - options = ['-c', 'copy', '-aspect', '%f' % stretched_ratio] - self._downloader.to_screen('[ffmpeg] Fixing aspect ratio in "%s"' % filename) - self.run_ffmpeg(filename, temp_filename, options) - - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - - return [], info - - -class FFmpegFixupM4aPP(FFmpegPostProcessor): - def run(self, info): - if info.get('container') != 'm4a_dash': - return [], info - - filename = info['filepath'] - temp_filename = prepend_extension(filename, 'temp') - - options = ['-c', 'copy', '-f', 'mp4'] - self._downloader.to_screen('[ffmpeg] Correcting container in "%s"' % filename) - self.run_ffmpeg(filename, temp_filename, options) - - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - - return [], info - - -class FFmpegFixupM3u8PP(FFmpegPostProcessor): - def run(self, info): - filename = info['filepath'] - if self.get_audio_codec(filename) == 'aac': - temp_filename = prepend_extension(filename, 'temp') - - options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] - self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename) - self.run_ffmpeg(filename, temp_filename, options) - - os.remove(encodeFilename(filename)) - os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - return [], info - - -class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): - def __init__(self, downloader=None, format=None): - super(FFmpegSubtitlesConvertorPP, self).__init__(downloader) - self.format = format - - def run(self, info): - subs = info.get('requested_subtitles') - filename = info['filepath'] - new_ext = self.format - new_format = new_ext - if new_format == 'vtt': - new_format = 'webvtt' - if subs is None: - self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to convert') - return [], info - self._downloader.to_screen('[ffmpeg] Converting subtitles') - sub_filenames = [] - for lang, sub in subs.items(): - ext = sub['ext'] - if ext == new_ext: - self._downloader.to_screen( - '[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext) - continue - old_file = subtitles_filename(filename, lang, ext) - sub_filenames.append(old_file) - new_file = subtitles_filename(filename, lang, new_ext) - - if ext in ('dfxp', 'ttml', 'tt'): - self._downloader.report_warning( - 'You have requested to convert dfxp (TTML) subtitles into another format, ' - 'which results in style information loss') - - dfxp_file = old_file - srt_file = subtitles_filename(filename, lang, 'srt') - - with open(dfxp_file, 'rb') as f: - srt_data = dfxp2srt(f.read()) - - with io.open(srt_file, 'wt', encoding='utf-8') as f: - f.write(srt_data) - old_file = srt_file - - subs[lang] = { - 'ext': 'srt', - 'data': srt_data - } - - if new_ext == 'srt': - continue - else: - sub_filenames.append(srt_file) - - self.run_ffmpeg(old_file, new_file, ['-f', new_format]) - - with io.open(new_file, 'rt', encoding='utf-8') as f: - subs[lang] = { - 'ext': new_ext, - 'data': f.read(), - } - - return sub_filenames, info diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py deleted file mode 100644 index f5c14d9..0000000 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import PostProcessor - - -class MetadataFromTitlePP(PostProcessor): - def __init__(self, downloader, titleformat): - super(MetadataFromTitlePP, self).__init__(downloader) - self._titleformat = titleformat - self._titleregex = (self.format_to_regex(titleformat) - if re.search(r'%\(\w+\)s', titleformat) - else titleformat) - - def format_to_regex(self, fmt): - r""" - Converts a string like - '%(title)s - %(artist)s' - to a regex like - '(?P<title>.+)\ \-\ (?P<artist>.+)' - """ - lastpos = 0 - regex = '' - # replace %(..)s with regex group and escape other string parts - for match in re.finditer(r'%\((\w+)\)s', fmt): - regex += re.escape(fmt[lastpos:match.start()]) - regex += r'(?P<' + match.group(1) + '>.+)' - lastpos = match.end() - if lastpos < len(fmt): - regex += re.escape(fmt[lastpos:]) - return regex - - def run(self, info): - title = info['title'] - match = re.match(self._titleregex, title) - if match is None: - self._downloader.to_screen( - '[fromtitle] Could not interpret title of video as "%s"' - % self._titleformat) - return [], info - for attribute, value in match.groupdict().items(): - info[attribute] = value - self._downloader.to_screen( - '[fromtitle] parsed %s: %s' - % (attribute, value if value is not None else 'NA')) - - return [], info diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py deleted file mode 100644 index b0aed9c..0000000 --- a/youtube_dl/postprocessor/xattrpp.py +++ /dev/null @@ -1,79 +0,0 @@ -from __future__ import unicode_literals - -from .common import PostProcessor -from ..compat import compat_os_name -from ..utils import ( - hyphenate_date, - write_xattr, - XAttrMetadataError, - XAttrUnavailableError, -) - - -class XAttrMetadataPP(PostProcessor): - - # - # More info about extended attributes for media: - # http://freedesktop.org/wiki/CommonExtendedAttributes/ - # http://www.freedesktop.org/wiki/PhreedomDraft/ - # http://dublincore.org/documents/usageguide/elements.shtml - # - # TODO: - # * capture youtube keywords and put them in 'user.dublincore.subject' (comma-separated) - # * figure out which xattrs can be used for 'duration', 'thumbnail', 'resolution' - # - - def run(self, info): - """ Set extended attributes on downloaded file (if xattr support is found). """ - - # Write the metadata to the file's xattrs - self._downloader.to_screen('[metadata] Writing metadata to file\'s xattrs') - - filename = info['filepath'] - - try: - xattr_mapping = { - 'user.xdg.referrer.url': 'webpage_url', - # 'user.xdg.comment': 'description', - 'user.dublincore.title': 'title', - 'user.dublincore.date': 'upload_date', - 'user.dublincore.description': 'description', - 'user.dublincore.contributor': 'uploader', - 'user.dublincore.format': 'format', - } - - num_written = 0 - for xattrname, infoname in xattr_mapping.items(): - - value = info.get(infoname) - - if value: - if infoname == 'upload_date': - value = hyphenate_date(value) - - byte_value = value.encode('utf-8') - write_xattr(filename, xattrname, byte_value) - num_written += 1 - - return [], info - - except XAttrUnavailableError as e: - self._downloader.report_error(str(e)) - return [], info - - except XAttrMetadataError as e: - if e.reason == 'NO_SPACE': - self._downloader.report_warning( - 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. ' + - (('Some ' if num_written else '') + 'extended attributes are not written.').capitalize()) - elif e.reason == 'VALUE_TOO_LONG': - self._downloader.report_warning( - 'Unable to write extended attributes due to too long values.') - else: - msg = 'This filesystem doesn\'t support extended attributes. ' - if compat_os_name == 'nt': - msg += 'You need to use NTFS.' - else: - msg += '(You may have to enable them in your /etc/fstab)' - self._downloader.report_error(msg) - return [], info diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py deleted file mode 100644 index 5d4adbe..0000000 --- a/youtube_dl/socks.py +++ /dev/null @@ -1,273 +0,0 @@ -# Public Domain SOCKS proxy protocol implementation -# Adapted from https://gist.github.com/bluec0re/cafd3764412967417fd3 - -from __future__ import unicode_literals - -# References: -# SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol -# SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol -# SOCKS5 protocol https://tools.ietf.org/html/rfc1928 -# SOCKS5 username/password authentication https://tools.ietf.org/html/rfc1929 - -import collections -import socket - -from .compat import ( - compat_ord, - compat_struct_pack, - compat_struct_unpack, -) - -__author__ = 'Timo Schmid <coding@timoschmid.de>' - -SOCKS4_VERSION = 4 -SOCKS4_REPLY_VERSION = 0x00 -# Excerpt from SOCKS4A protocol: -# if the client cannot resolve the destination host's domain name to find its -# IP address, it should set the first three bytes of DSTIP to NULL and the last -# byte to a non-zero value. -SOCKS4_DEFAULT_DSTIP = compat_struct_pack('!BBBB', 0, 0, 0, 0xFF) - -SOCKS5_VERSION = 5 -SOCKS5_USER_AUTH_VERSION = 0x01 -SOCKS5_USER_AUTH_SUCCESS = 0x00 - - -class Socks4Command(object): - CMD_CONNECT = 0x01 - CMD_BIND = 0x02 - - -class Socks5Command(Socks4Command): - CMD_UDP_ASSOCIATE = 0x03 - - -class Socks5Auth(object): - AUTH_NONE = 0x00 - AUTH_GSSAPI = 0x01 - AUTH_USER_PASS = 0x02 - AUTH_NO_ACCEPTABLE = 0xFF # For server response - - -class Socks5AddressType(object): - ATYP_IPV4 = 0x01 - ATYP_DOMAINNAME = 0x03 - ATYP_IPV6 = 0x04 - - -class ProxyError(socket.error): - ERR_SUCCESS = 0x00 - - def __init__(self, code=None, msg=None): - if code is not None and msg is None: - msg = self.CODES.get(code) or 'unknown error' - super(ProxyError, self).__init__(code, msg) - - -class InvalidVersionError(ProxyError): - def __init__(self, expected_version, got_version): - msg = ('Invalid response version from server. Expected {0:02x} got ' - '{1:02x}'.format(expected_version, got_version)) - super(InvalidVersionError, self).__init__(0, msg) - - -class Socks4Error(ProxyError): - ERR_SUCCESS = 90 - - CODES = { - 91: 'request rejected or failed', - 92: 'request rejected because SOCKS server cannot connect to identd on the client', - 93: 'request rejected because the client program and identd report different user-ids' - } - - -class Socks5Error(ProxyError): - ERR_GENERAL_FAILURE = 0x01 - - CODES = { - 0x01: 'general SOCKS server failure', - 0x02: 'connection not allowed by ruleset', - 0x03: 'Network unreachable', - 0x04: 'Host unreachable', - 0x05: 'Connection refused', - 0x06: 'TTL expired', - 0x07: 'Command not supported', - 0x08: 'Address type not supported', - 0xFE: 'unknown username or invalid password', - 0xFF: 'all offered authentication methods were rejected' - } - - -class ProxyType(object): - SOCKS4 = 0 - SOCKS4A = 1 - SOCKS5 = 2 - - -Proxy = collections.namedtuple('Proxy', ( - 'type', 'host', 'port', 'username', 'password', 'remote_dns')) - - -class sockssocket(socket.socket): - def __init__(self, *args, **kwargs): - self._proxy = None - super(sockssocket, self).__init__(*args, **kwargs) - - def setproxy(self, proxytype, addr, port, rdns=True, username=None, password=None): - assert proxytype in (ProxyType.SOCKS4, ProxyType.SOCKS4A, ProxyType.SOCKS5) - - self._proxy = Proxy(proxytype, addr, port, username, password, rdns) - - def recvall(self, cnt): - data = b'' - while len(data) < cnt: - cur = self.recv(cnt - len(data)) - if not cur: - raise EOFError('{0} bytes missing'.format(cnt - len(data))) - data += cur - return data - - def _recv_bytes(self, cnt): - data = self.recvall(cnt) - return compat_struct_unpack('!{0}B'.format(cnt), data) - - @staticmethod - def _len_and_data(data): - return compat_struct_pack('!B', len(data)) + data - - def _check_response_version(self, expected_version, got_version): - if got_version != expected_version: - self.close() - raise InvalidVersionError(expected_version, got_version) - - def _resolve_address(self, destaddr, default, use_remote_dns): - try: - return socket.inet_aton(destaddr) - except socket.error: - if use_remote_dns and self._proxy.remote_dns: - return default - else: - return socket.inet_aton(socket.gethostbyname(destaddr)) - - def _setup_socks4(self, address, is_4a=False): - destaddr, port = address - - ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a) - - packet = compat_struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr - - username = (self._proxy.username or '').encode('utf-8') - packet += username + b'\x00' - - if is_4a and self._proxy.remote_dns: - packet += destaddr.encode('utf-8') + b'\x00' - - self.sendall(packet) - - version, resp_code, dstport, dsthost = compat_struct_unpack('!BBHI', self.recvall(8)) - - self._check_response_version(SOCKS4_REPLY_VERSION, version) - - if resp_code != Socks4Error.ERR_SUCCESS: - self.close() - raise Socks4Error(resp_code) - - return (dsthost, dstport) - - def _setup_socks4a(self, address): - self._setup_socks4(address, is_4a=True) - - def _socks5_auth(self): - packet = compat_struct_pack('!B', SOCKS5_VERSION) - - auth_methods = [Socks5Auth.AUTH_NONE] - if self._proxy.username and self._proxy.password: - auth_methods.append(Socks5Auth.AUTH_USER_PASS) - - packet += compat_struct_pack('!B', len(auth_methods)) - packet += compat_struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods) - - self.sendall(packet) - - version, method = self._recv_bytes(2) - - self._check_response_version(SOCKS5_VERSION, version) - - if method == Socks5Auth.AUTH_NO_ACCEPTABLE or ( - method == Socks5Auth.AUTH_USER_PASS and (not self._proxy.username or not self._proxy.password)): - self.close() - raise Socks5Error(Socks5Auth.AUTH_NO_ACCEPTABLE) - - if method == Socks5Auth.AUTH_USER_PASS: - username = self._proxy.username.encode('utf-8') - password = self._proxy.password.encode('utf-8') - packet = compat_struct_pack('!B', SOCKS5_USER_AUTH_VERSION) - packet += self._len_and_data(username) + self._len_and_data(password) - self.sendall(packet) - - version, status = self._recv_bytes(2) - - self._check_response_version(SOCKS5_USER_AUTH_VERSION, version) - - if status != SOCKS5_USER_AUTH_SUCCESS: - self.close() - raise Socks5Error(Socks5Error.ERR_GENERAL_FAILURE) - - def _setup_socks5(self, address): - destaddr, port = address - - ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True) - - self._socks5_auth() - - reserved = 0 - packet = compat_struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved) - if ipaddr is None: - destaddr = destaddr.encode('utf-8') - packet += compat_struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME) - packet += self._len_and_data(destaddr) - else: - packet += compat_struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr - packet += compat_struct_pack('!H', port) - - self.sendall(packet) - - version, status, reserved, atype = self._recv_bytes(4) - - self._check_response_version(SOCKS5_VERSION, version) - - if status != Socks5Error.ERR_SUCCESS: - self.close() - raise Socks5Error(status) - - if atype == Socks5AddressType.ATYP_IPV4: - destaddr = self.recvall(4) - elif atype == Socks5AddressType.ATYP_DOMAINNAME: - alen = compat_ord(self.recv(1)) - destaddr = self.recvall(alen) - elif atype == Socks5AddressType.ATYP_IPV6: - destaddr = self.recvall(16) - destport = compat_struct_unpack('!H', self.recvall(2))[0] - - return (destaddr, destport) - - def _make_proxy(self, connect_func, address): - if not self._proxy: - return connect_func(self, address) - - result = connect_func(self, (self._proxy.host, self._proxy.port)) - if result != 0 and result is not None: - return result - setup_funcs = { - ProxyType.SOCKS4: self._setup_socks4, - ProxyType.SOCKS4A: self._setup_socks4a, - ProxyType.SOCKS5: self._setup_socks5, - } - setup_funcs[self._proxy.type](address) - return result - - def connect(self, address): - self._make_proxy(socket.socket.connect, address) - - def connect_ex(self, address): - return self._make_proxy(socket.socket.connect_ex, address) diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py deleted file mode 100644 index 0c71585..0000000 --- a/youtube_dl/swfinterp.py +++ /dev/null @@ -1,834 +0,0 @@ -from __future__ import unicode_literals - -import collections -import io -import zlib - -from .compat import ( - compat_str, - compat_struct_unpack, -) -from .utils import ( - ExtractorError, -) - - -def _extract_tags(file_contents): - if file_contents[1:3] != b'WS': - raise ExtractorError( - 'Not an SWF file; header is %r' % file_contents[:3]) - if file_contents[:1] == b'C': - content = zlib.decompress(file_contents[8:]) - else: - raise NotImplementedError( - 'Unsupported compression format %r' % - file_contents[:1]) - - # Determine number of bits in framesize rectangle - framesize_nbits = compat_struct_unpack('!B', content[:1])[0] >> 3 - framesize_len = (5 + 4 * framesize_nbits + 7) // 8 - - pos = framesize_len + 2 + 2 - while pos < len(content): - header16 = compat_struct_unpack('<H', content[pos:pos + 2])[0] - pos += 2 - tag_code = header16 >> 6 - tag_len = header16 & 0x3f - if tag_len == 0x3f: - tag_len = compat_struct_unpack('<I', content[pos:pos + 4])[0] - pos += 4 - assert pos + tag_len <= len(content), \ - ('Tag %d ends at %d+%d - that\'s longer than the file (%d)' - % (tag_code, pos, tag_len, len(content))) - yield (tag_code, content[pos:pos + tag_len]) - pos += tag_len - - -class _AVMClass_Object(object): - def __init__(self, avm_class): - self.avm_class = avm_class - - def __repr__(self): - return '%s#%x' % (self.avm_class.name, id(self)) - - -class _ScopeDict(dict): - def __init__(self, avm_class): - super(_ScopeDict, self).__init__() - self.avm_class = avm_class - - def __repr__(self): - return '%s__Scope(%s)' % ( - self.avm_class.name, - super(_ScopeDict, self).__repr__()) - - -class _AVMClass(object): - def __init__(self, name_idx, name, static_properties=None): - self.name_idx = name_idx - self.name = name - self.method_names = {} - self.method_idxs = {} - self.methods = {} - self.method_pyfunctions = {} - self.static_properties = static_properties if static_properties else {} - - self.variables = _ScopeDict(self) - self.constants = {} - - def make_object(self): - return _AVMClass_Object(self) - - def __repr__(self): - return '_AVMClass(%s)' % (self.name) - - def register_methods(self, methods): - self.method_names.update(methods.items()) - self.method_idxs.update(dict( - (idx, name) - for name, idx in methods.items())) - - -class _Multiname(object): - def __init__(self, kind): - self.kind = kind - - def __repr__(self): - return '[MULTINAME kind: 0x%x]' % self.kind - - -def _read_int(reader): - res = 0 - shift = 0 - for _ in range(5): - buf = reader.read(1) - assert len(buf) == 1 - b = compat_struct_unpack('<B', buf)[0] - res = res | ((b & 0x7f) << shift) - if b & 0x80 == 0: - break - shift += 7 - return res - - -def _u30(reader): - res = _read_int(reader) - assert res & 0xf0000000 == 0 - return res - - -_u32 = _read_int - - -def _s32(reader): - v = _read_int(reader) - if v & 0x80000000 != 0: - v = - ((v ^ 0xffffffff) + 1) - return v - - -def _s24(reader): - bs = reader.read(3) - assert len(bs) == 3 - last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00' - return compat_struct_unpack('<i', bs + last_byte)[0] - - -def _read_string(reader): - slen = _u30(reader) - resb = reader.read(slen) - assert len(resb) == slen - return resb.decode('utf-8') - - -def _read_bytes(count, reader): - assert count >= 0 - resb = reader.read(count) - assert len(resb) == count - return resb - - -def _read_byte(reader): - resb = _read_bytes(1, reader=reader) - res = compat_struct_unpack('<B', resb)[0] - return res - - -StringClass = _AVMClass('(no name idx)', 'String') -ByteArrayClass = _AVMClass('(no name idx)', 'ByteArray') -TimerClass = _AVMClass('(no name idx)', 'Timer') -TimerEventClass = _AVMClass('(no name idx)', 'TimerEvent', {'TIMER': 'timer'}) -_builtin_classes = { - StringClass.name: StringClass, - ByteArrayClass.name: ByteArrayClass, - TimerClass.name: TimerClass, - TimerEventClass.name: TimerEventClass, -} - - -class _Undefined(object): - def __bool__(self): - return False - __nonzero__ = __bool__ - - def __hash__(self): - return 0 - - def __str__(self): - return 'undefined' - __repr__ = __str__ - - -undefined = _Undefined() - - -class SWFInterpreter(object): - def __init__(self, file_contents): - self._patched_functions = { - (TimerClass, 'addEventListener'): lambda params: undefined, - } - code_tag = next(tag - for tag_code, tag in _extract_tags(file_contents) - if tag_code == 82) - p = code_tag.index(b'\0', 4) + 1 - code_reader = io.BytesIO(code_tag[p:]) - - # Parse ABC (AVM2 ByteCode) - - # Define a couple convenience methods - u30 = lambda *args: _u30(*args, reader=code_reader) - s32 = lambda *args: _s32(*args, reader=code_reader) - u32 = lambda *args: _u32(*args, reader=code_reader) - read_bytes = lambda *args: _read_bytes(*args, reader=code_reader) - read_byte = lambda *args: _read_byte(*args, reader=code_reader) - - # minor_version + major_version - read_bytes(2 + 2) - - # Constant pool - int_count = u30() - self.constant_ints = [0] - for _c in range(1, int_count): - self.constant_ints.append(s32()) - self.constant_uints = [0] - uint_count = u30() - for _c in range(1, uint_count): - self.constant_uints.append(u32()) - double_count = u30() - read_bytes(max(0, (double_count - 1)) * 8) - string_count = u30() - self.constant_strings = [''] - for _c in range(1, string_count): - s = _read_string(code_reader) - self.constant_strings.append(s) - namespace_count = u30() - for _c in range(1, namespace_count): - read_bytes(1) # kind - u30() # name - ns_set_count = u30() - for _c in range(1, ns_set_count): - count = u30() - for _c2 in range(count): - u30() - multiname_count = u30() - MULTINAME_SIZES = { - 0x07: 2, # QName - 0x0d: 2, # QNameA - 0x0f: 1, # RTQName - 0x10: 1, # RTQNameA - 0x11: 0, # RTQNameL - 0x12: 0, # RTQNameLA - 0x09: 2, # Multiname - 0x0e: 2, # MultinameA - 0x1b: 1, # MultinameL - 0x1c: 1, # MultinameLA - } - self.multinames = [''] - for _c in range(1, multiname_count): - kind = u30() - assert kind in MULTINAME_SIZES, 'Invalid multiname kind %r' % kind - if kind == 0x07: - u30() # namespace_idx - name_idx = u30() - self.multinames.append(self.constant_strings[name_idx]) - elif kind == 0x09: - name_idx = u30() - u30() - self.multinames.append(self.constant_strings[name_idx]) - else: - self.multinames.append(_Multiname(kind)) - for _c2 in range(MULTINAME_SIZES[kind]): - u30() - - # Methods - method_count = u30() - MethodInfo = collections.namedtuple( - 'MethodInfo', - ['NEED_ARGUMENTS', 'NEED_REST']) - method_infos = [] - for method_id in range(method_count): - param_count = u30() - u30() # return type - for _ in range(param_count): - u30() # param type - u30() # name index (always 0 for youtube) - flags = read_byte() - if flags & 0x08 != 0: - # Options present - option_count = u30() - for c in range(option_count): - u30() # val - read_bytes(1) # kind - if flags & 0x80 != 0: - # Param names present - for _ in range(param_count): - u30() # param name - mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) - method_infos.append(mi) - - # Metadata - metadata_count = u30() - for _c in range(metadata_count): - u30() # name - item_count = u30() - for _c2 in range(item_count): - u30() # key - u30() # value - - def parse_traits_info(): - trait_name_idx = u30() - kind_full = read_byte() - kind = kind_full & 0x0f - attrs = kind_full >> 4 - methods = {} - constants = None - if kind == 0x00: # Slot - u30() # Slot id - u30() # type_name_idx - vindex = u30() - if vindex != 0: - read_byte() # vkind - elif kind == 0x06: # Const - u30() # Slot id - u30() # type_name_idx - vindex = u30() - vkind = 'any' - if vindex != 0: - vkind = read_byte() - if vkind == 0x03: # Constant_Int - value = self.constant_ints[vindex] - elif vkind == 0x04: # Constant_UInt - value = self.constant_uints[vindex] - else: - return {}, None # Ignore silently for now - constants = {self.multinames[trait_name_idx]: value} - elif kind in (0x01, 0x02, 0x03): # Method / Getter / Setter - u30() # disp_id - method_idx = u30() - methods[self.multinames[trait_name_idx]] = method_idx - elif kind == 0x04: # Class - u30() # slot_id - u30() # classi - elif kind == 0x05: # Function - u30() # slot_id - function_idx = u30() - methods[function_idx] = self.multinames[trait_name_idx] - else: - raise ExtractorError('Unsupported trait kind %d' % kind) - - if attrs & 0x4 != 0: # Metadata present - metadata_count = u30() - for _c3 in range(metadata_count): - u30() # metadata index - - return methods, constants - - # Classes - class_count = u30() - classes = [] - for class_id in range(class_count): - name_idx = u30() - - cname = self.multinames[name_idx] - avm_class = _AVMClass(name_idx, cname) - classes.append(avm_class) - - u30() # super_name idx - flags = read_byte() - if flags & 0x08 != 0: # Protected namespace is present - u30() # protected_ns_idx - intrf_count = u30() - for _c2 in range(intrf_count): - u30() - u30() # iinit - trait_count = u30() - for _c2 in range(trait_count): - trait_methods, trait_constants = parse_traits_info() - avm_class.register_methods(trait_methods) - if trait_constants: - avm_class.constants.update(trait_constants) - - assert len(classes) == class_count - self._classes_by_name = dict((c.name, c) for c in classes) - - for avm_class in classes: - avm_class.cinit_idx = u30() - trait_count = u30() - for _c2 in range(trait_count): - trait_methods, trait_constants = parse_traits_info() - avm_class.register_methods(trait_methods) - if trait_constants: - avm_class.constants.update(trait_constants) - - # Scripts - script_count = u30() - for _c in range(script_count): - u30() # init - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - # Method bodies - method_body_count = u30() - Method = collections.namedtuple('Method', ['code', 'local_count']) - self._all_methods = [] - for _c in range(method_body_count): - method_idx = u30() - u30() # max_stack - local_count = u30() - u30() # init_scope_depth - u30() # max_scope_depth - code_length = u30() - code = read_bytes(code_length) - m = Method(code, local_count) - self._all_methods.append(m) - for avm_class in classes: - if method_idx in avm_class.method_idxs: - avm_class.methods[avm_class.method_idxs[method_idx]] = m - exception_count = u30() - for _c2 in range(exception_count): - u30() # from - u30() # to - u30() # target - u30() # exc_type - u30() # var_name - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - assert p + code_reader.tell() == len(code_tag) - - def patch_function(self, avm_class, func_name, f): - self._patched_functions[(avm_class, func_name)] = f - - def extract_class(self, class_name, call_cinit=True): - try: - res = self._classes_by_name[class_name] - except KeyError: - raise ExtractorError('Class %r not found' % class_name) - - if call_cinit and hasattr(res, 'cinit_idx'): - res.register_methods({'$cinit': res.cinit_idx}) - res.methods['$cinit'] = self._all_methods[res.cinit_idx] - cinit = self.extract_function(res, '$cinit') - cinit([]) - - return res - - def extract_function(self, avm_class, func_name): - p = self._patched_functions.get((avm_class, func_name)) - if p: - return p - if func_name in avm_class.method_pyfunctions: - return avm_class.method_pyfunctions[func_name] - if func_name in self._classes_by_name: - return self._classes_by_name[func_name].make_object() - if func_name not in avm_class.methods: - raise ExtractorError('Cannot find function %s.%s' % ( - avm_class.name, func_name)) - m = avm_class.methods[func_name] - - def resfunc(args): - # Helper functions - coder = io.BytesIO(m.code) - s24 = lambda: _s24(coder) - u30 = lambda: _u30(coder) - - registers = [avm_class.variables] + list(args) + [None] * m.local_count - stack = [] - scopes = collections.deque([ - self._classes_by_name, avm_class.constants, avm_class.variables]) - while True: - opcode = _read_byte(coder) - if opcode == 9: # label - pass # Spec says: "Do nothing." - elif opcode == 16: # jump - offset = s24() - coder.seek(coder.tell() + offset) - elif opcode == 17: # iftrue - offset = s24() - value = stack.pop() - if value: - coder.seek(coder.tell() + offset) - elif opcode == 18: # iffalse - offset = s24() - value = stack.pop() - if not value: - coder.seek(coder.tell() + offset) - elif opcode == 19: # ifeq - offset = s24() - value2 = stack.pop() - value1 = stack.pop() - if value2 == value1: - coder.seek(coder.tell() + offset) - elif opcode == 20: # ifne - offset = s24() - value2 = stack.pop() - value1 = stack.pop() - if value2 != value1: - coder.seek(coder.tell() + offset) - elif opcode == 21: # iflt - offset = s24() - value2 = stack.pop() - value1 = stack.pop() - if value1 < value2: - coder.seek(coder.tell() + offset) - elif opcode == 32: # pushnull - stack.append(None) - elif opcode == 33: # pushundefined - stack.append(undefined) - elif opcode == 36: # pushbyte - v = _read_byte(coder) - stack.append(v) - elif opcode == 37: # pushshort - v = u30() - stack.append(v) - elif opcode == 38: # pushtrue - stack.append(True) - elif opcode == 39: # pushfalse - stack.append(False) - elif opcode == 40: # pushnan - stack.append(float('NaN')) - elif opcode == 42: # dup - value = stack[-1] - stack.append(value) - elif opcode == 44: # pushstring - idx = u30() - stack.append(self.constant_strings[idx]) - elif opcode == 48: # pushscope - new_scope = stack.pop() - scopes.append(new_scope) - elif opcode == 66: # construct - arg_count = u30() - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - res = obj.avm_class.make_object() - stack.append(res) - elif opcode == 70: # callproperty - index = u30() - mname = self.multinames[index] - arg_count = u30() - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - - if obj == StringClass: - if mname == 'String': - assert len(args) == 1 - assert isinstance(args[0], ( - int, compat_str, _Undefined)) - if args[0] == undefined: - res = 'undefined' - else: - res = compat_str(args[0]) - stack.append(res) - continue - else: - raise NotImplementedError( - 'Function String.%s is not yet implemented' - % mname) - elif isinstance(obj, _AVMClass_Object): - func = self.extract_function(obj.avm_class, mname) - res = func(args) - stack.append(res) - continue - elif isinstance(obj, _AVMClass): - func = self.extract_function(obj, mname) - res = func(args) - stack.append(res) - continue - elif isinstance(obj, _ScopeDict): - if mname in obj.avm_class.method_names: - func = self.extract_function(obj.avm_class, mname) - res = func(args) - else: - res = obj[mname] - stack.append(res) - continue - elif isinstance(obj, compat_str): - if mname == 'split': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - if args[0] == '': - res = list(obj) - else: - res = obj.split(args[0]) - stack.append(res) - continue - elif mname == 'charCodeAt': - assert len(args) <= 1 - idx = 0 if len(args) == 0 else args[0] - assert isinstance(idx, int) - res = ord(obj[idx]) - stack.append(res) - continue - elif isinstance(obj, list): - if mname == 'slice': - assert len(args) == 1 - assert isinstance(args[0], int) - res = obj[args[0]:] - stack.append(res) - continue - elif mname == 'join': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - res = args[0].join(obj) - stack.append(res) - continue - raise NotImplementedError( - 'Unsupported property %r on %r' - % (mname, obj)) - elif opcode == 71: # returnvoid - res = undefined - return res - elif opcode == 72: # returnvalue - res = stack.pop() - return res - elif opcode == 73: # constructsuper - # Not yet implemented, just hope it works without it - arg_count = u30() - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - elif opcode == 74: # constructproperty - index = u30() - arg_count = u30() - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - - mname = self.multinames[index] - assert isinstance(obj, _AVMClass) - - # We do not actually call the constructor for now; - # we just pretend it does nothing - stack.append(obj.make_object()) - elif opcode == 79: # callpropvoid - index = u30() - mname = self.multinames[index] - arg_count = u30() - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - if isinstance(obj, _AVMClass_Object): - func = self.extract_function(obj.avm_class, mname) - res = func(args) - assert res is undefined - continue - if isinstance(obj, _ScopeDict): - assert mname in obj.avm_class.method_names - func = self.extract_function(obj.avm_class, mname) - res = func(args) - assert res is undefined - continue - if mname == 'reverse': - assert isinstance(obj, list) - obj.reverse() - else: - raise NotImplementedError( - 'Unsupported (void) property %r on %r' - % (mname, obj)) - elif opcode == 86: # newarray - arg_count = u30() - arr = [] - for i in range(arg_count): - arr.append(stack.pop()) - arr = arr[::-1] - stack.append(arr) - elif opcode == 93: # findpropstrict - index = u30() - mname = self.multinames[index] - for s in reversed(scopes): - if mname in s: - res = s - break - else: - res = scopes[0] - if mname not in res and mname in _builtin_classes: - stack.append(_builtin_classes[mname]) - else: - stack.append(res[mname]) - elif opcode == 94: # findproperty - index = u30() - mname = self.multinames[index] - for s in reversed(scopes): - if mname in s: - res = s - break - else: - res = avm_class.variables - stack.append(res) - elif opcode == 96: # getlex - index = u30() - mname = self.multinames[index] - for s in reversed(scopes): - if mname in s: - scope = s - break - else: - scope = avm_class.variables - - if mname in scope: - res = scope[mname] - elif mname in _builtin_classes: - res = _builtin_classes[mname] - else: - # Assume uninitialized - # TODO warn here - res = undefined - stack.append(res) - elif opcode == 97: # setproperty - index = u30() - value = stack.pop() - idx = self.multinames[index] - if isinstance(idx, _Multiname): - idx = stack.pop() - obj = stack.pop() - obj[idx] = value - elif opcode == 98: # getlocal - index = u30() - stack.append(registers[index]) - elif opcode == 99: # setlocal - index = u30() - value = stack.pop() - registers[index] = value - elif opcode == 102: # getproperty - index = u30() - pname = self.multinames[index] - if pname == 'length': - obj = stack.pop() - assert isinstance(obj, (compat_str, list)) - stack.append(len(obj)) - elif isinstance(pname, compat_str): # Member access - obj = stack.pop() - if isinstance(obj, _AVMClass): - res = obj.static_properties[pname] - stack.append(res) - continue - - assert isinstance(obj, (dict, _ScopeDict)),\ - 'Accessing member %r on %r' % (pname, obj) - res = obj.get(pname, undefined) - stack.append(res) - else: # Assume attribute access - idx = stack.pop() - assert isinstance(idx, int) - obj = stack.pop() - assert isinstance(obj, list) - stack.append(obj[idx]) - elif opcode == 104: # initproperty - index = u30() - value = stack.pop() - idx = self.multinames[index] - if isinstance(idx, _Multiname): - idx = stack.pop() - obj = stack.pop() - obj[idx] = value - elif opcode == 115: # convert_ - value = stack.pop() - intvalue = int(value) - stack.append(intvalue) - elif opcode == 128: # coerce - u30() - elif opcode == 130: # coerce_a - value = stack.pop() - # um, yes, it's any value - stack.append(value) - elif opcode == 133: # coerce_s - assert isinstance(stack[-1], (type(None), compat_str)) - elif opcode == 147: # decrement - value = stack.pop() - assert isinstance(value, int) - stack.append(value - 1) - elif opcode == 149: # typeof - value = stack.pop() - return { - _Undefined: 'undefined', - compat_str: 'String', - int: 'Number', - float: 'Number', - }[type(value)] - elif opcode == 160: # add - value2 = stack.pop() - value1 = stack.pop() - res = value1 + value2 - stack.append(res) - elif opcode == 161: # subtract - value2 = stack.pop() - value1 = stack.pop() - res = value1 - value2 - stack.append(res) - elif opcode == 162: # multiply - value2 = stack.pop() - value1 = stack.pop() - res = value1 * value2 - stack.append(res) - elif opcode == 164: # modulo - value2 = stack.pop() - value1 = stack.pop() - res = value1 % value2 - stack.append(res) - elif opcode == 168: # bitand - value2 = stack.pop() - value1 = stack.pop() - assert isinstance(value1, int) - assert isinstance(value2, int) - res = value1 & value2 - stack.append(res) - elif opcode == 171: # equals - value2 = stack.pop() - value1 = stack.pop() - result = value1 == value2 - stack.append(result) - elif opcode == 175: # greaterequals - value2 = stack.pop() - value1 = stack.pop() - result = value1 >= value2 - stack.append(result) - elif opcode == 192: # increment_i - value = stack.pop() - assert isinstance(value, int) - stack.append(value + 1) - elif opcode == 208: # getlocal_0 - stack.append(registers[0]) - elif opcode == 209: # getlocal_1 - stack.append(registers[1]) - elif opcode == 210: # getlocal_2 - stack.append(registers[2]) - elif opcode == 211: # getlocal_3 - stack.append(registers[3]) - elif opcode == 212: # setlocal_0 - registers[0] = stack.pop() - elif opcode == 213: # setlocal_1 - registers[1] = stack.pop() - elif opcode == 214: # setlocal_2 - registers[2] = stack.pop() - elif opcode == 215: # setlocal_3 - registers[3] = stack.pop() - else: - raise NotImplementedError( - 'Unsupported opcode %d' % opcode) - - avm_class.method_pyfunctions[func_name] = resfunc - return resfunc diff --git a/youtube_dl/update.py b/youtube_dl/update.py deleted file mode 100644 index ebce966..0000000 --- a/youtube_dl/update.py +++ /dev/null @@ -1,187 +0,0 @@ -from __future__ import unicode_literals - -import io -import json -import traceback -import hashlib -import os -import subprocess -import sys -from zipimport import zipimporter - -from .utils import encode_compat_str - -from .version import __version__ - - -def rsa_verify(message, signature, key): - from hashlib import sha256 - assert isinstance(message, bytes) - byte_size = (len(bin(key[0])) - 2 + 8 - 1) // 8 - signature = ('%x' % pow(int(signature, 16), key[1], key[0])).encode() - signature = (byte_size * 2 - len(signature)) * b'0' + signature - asn1 = b'3031300d060960864801650304020105000420' - asn1 += sha256(message).hexdigest().encode() - if byte_size < len(asn1) // 2 + 11: - return False - expected = b'0001' + (byte_size - len(asn1) // 2 - 3) * b'ff' + b'00' + asn1 - return expected == signature - - -def update_self(to_screen, verbose, opener): - """Update the program file with the latest version from the repository""" - - UPDATE_URL = 'https://rg3.github.io/youtube-dl/update/' - VERSION_URL = UPDATE_URL + 'LATEST_VERSION' - JSON_URL = UPDATE_URL + 'versions.json' - UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537) - - if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, 'frozen'): - to_screen('It looks like you installed youtube-dl with a package manager, pip, setup.py or a tarball. Please use that to update.') - return - - # Check if there is a new version - try: - newversion = opener.open(VERSION_URL).read().decode('utf-8').strip() - except Exception: - if verbose: - to_screen(encode_compat_str(traceback.format_exc())) - to_screen('ERROR: can\'t find the current version. Please try again later.') - return - if newversion == __version__: - to_screen('youtube-dl is up-to-date (' + __version__ + ')') - return - - # Download and check versions info - try: - versions_info = opener.open(JSON_URL).read().decode('utf-8') - versions_info = json.loads(versions_info) - except Exception: - if verbose: - to_screen(encode_compat_str(traceback.format_exc())) - to_screen('ERROR: can\'t obtain versions info. Please try again later.') - return - if 'signature' not in versions_info: - to_screen('ERROR: the versions file is not signed or corrupted. Aborting.') - return - signature = versions_info['signature'] - del versions_info['signature'] - if not rsa_verify(json.dumps(versions_info, sort_keys=True).encode('utf-8'), signature, UPDATES_RSA_KEY): - to_screen('ERROR: the versions file signature is invalid. Aborting.') - return - - version_id = versions_info['latest'] - - def version_tuple(version_str): - return tuple(map(int, version_str.split('.'))) - if version_tuple(__version__) >= version_tuple(version_id): - to_screen('youtube-dl is up to date (%s)' % __version__) - return - - to_screen('Updating to version ' + version_id + ' ...') - version = versions_info['versions'][version_id] - - print_notes(to_screen, versions_info['versions']) - - # sys.executable is set to the full pathname of the exe-file for py2exe - filename = sys.executable if hasattr(sys, 'frozen') else sys.argv[0] - - if not os.access(filename, os.W_OK): - to_screen('ERROR: no write permissions on %s' % filename) - return - - # Py2EXE - if hasattr(sys, 'frozen'): - exe = filename - directory = os.path.dirname(exe) - if not os.access(directory, os.W_OK): - to_screen('ERROR: no write permissions on %s' % directory) - return - - try: - urlh = opener.open(version['exe'][0]) - newcontent = urlh.read() - urlh.close() - except (IOError, OSError): - if verbose: - to_screen(encode_compat_str(traceback.format_exc())) - to_screen('ERROR: unable to download latest version') - return - - newcontent_hash = hashlib.sha256(newcontent).hexdigest() - if newcontent_hash != version['exe'][1]: - to_screen('ERROR: the downloaded file hash does not match. Aborting.') - return - - try: - with open(exe + '.new', 'wb') as outf: - outf.write(newcontent) - except (IOError, OSError): - if verbose: - to_screen(encode_compat_str(traceback.format_exc())) - to_screen('ERROR: unable to write the new version') - return - - try: - bat = os.path.join(directory, 'youtube-dl-updater.bat') - with io.open(bat, 'w') as batfile: - batfile.write(''' -@echo off -echo Waiting for file handle to be closed ... -ping 127.0.0.1 -n 5 -w 1000 > NUL -move /Y "%s.new" "%s" > NUL -echo Updated youtube-dl to version %s. -start /b "" cmd /c del "%%~f0"&exit /b" - \n''' % (exe, exe, version_id)) - - subprocess.Popen([bat]) # Continues to run in the background - return # Do not show premature success messages - except (IOError, OSError): - if verbose: - to_screen(encode_compat_str(traceback.format_exc())) - to_screen('ERROR: unable to overwrite current version') - return - - # Zip unix package - elif isinstance(globals().get('__loader__'), zipimporter): - try: - urlh = opener.open(version['bin'][0]) - newcontent = urlh.read() - urlh.close() - except (IOError, OSError): - if verbose: - to_screen(encode_compat_str(traceback.format_exc())) - to_screen('ERROR: unable to download latest version') - return - - newcontent_hash = hashlib.sha256(newcontent).hexdigest() - if newcontent_hash != version['bin'][1]: - to_screen('ERROR: the downloaded file hash does not match. Aborting.') - return - - try: - with open(filename, 'wb') as outf: - outf.write(newcontent) - except (IOError, OSError): - if verbose: - to_screen(encode_compat_str(traceback.format_exc())) - to_screen('ERROR: unable to overwrite current version') - return - - to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.') - - -def get_notes(versions, fromVersion): - notes = [] - for v, vdata in sorted(versions.items()): - if v > fromVersion: - notes.extend(vdata.get('notes', [])) - return notes - - -def print_notes(to_screen, versions, fromVersion=__version__): - notes = get_notes(versions, fromVersion) - if notes: - to_screen('PLEASE NOTE:') - for note in notes: - to_screen(note) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py deleted file mode 100644 index d2d3c1a..0000000 --- a/youtube_dl/utils.py +++ /dev/null @@ -1,3990 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from __future__ import unicode_literals - -import base64 -import binascii -import calendar -import codecs -import contextlib -import ctypes -import datetime -import email.utils -import email.header -import errno -import functools -import gzip -import io -import itertools -import json -import locale -import math -import operator -import os -import platform -import random -import re -import socket -import ssl -import subprocess -import sys -import tempfile -import traceback -import xml.etree.ElementTree -import zlib - -from .compat import ( - compat_HTMLParseError, - compat_HTMLParser, - compat_basestring, - compat_chr, - compat_cookiejar, - compat_ctypes_WINFUNCTYPE, - compat_etree_fromstring, - compat_expanduser, - compat_html_entities, - compat_html_entities_html5, - compat_http_client, - compat_kwargs, - compat_os_name, - compat_parse_qs, - compat_shlex_quote, - compat_str, - compat_struct_pack, - compat_struct_unpack, - compat_urllib_error, - compat_urllib_parse, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urllib_parse_unquote_plus, - compat_urllib_request, - compat_urlparse, - compat_xpath, -) - -from .socks import ( - ProxyType, - sockssocket, -) - - -def register_socks_protocols(): - # "Register" SOCKS protocols - # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 - # URLs with protocols not in urlparse.uses_netloc are not handled correctly - for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): - if scheme not in compat_urlparse.uses_netloc: - compat_urlparse.uses_netloc.append(scheme) - - -# This is not clearly defined otherwise -compiled_regex_type = type(re.compile('')) - -std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0', - 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate', - 'Accept-Language': 'en-us,en;q=0.5', -} - - -USER_AGENTS = { - 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', -} - - -NO_DEFAULT = object() - -ENGLISH_MONTH_NAMES = [ - 'January', 'February', 'March', 'April', 'May', 'June', - 'July', 'August', 'September', 'October', 'November', 'December'] - -MONTH_NAMES = { - 'en': ENGLISH_MONTH_NAMES, - 'fr': [ - 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', - 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], -} - -KNOWN_EXTENSIONS = ( - 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', - 'flv', 'f4v', 'f4a', 'f4b', - 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', - 'mkv', 'mka', 'mk3d', - 'avi', 'divx', - 'mov', - 'asf', 'wmv', 'wma', - '3gp', '3g2', - 'mp3', - 'flac', - 'ape', - 'wav', - 'f4f', 'f4m', 'm3u8', 'smil') - -# needed for sanitizing filenames in restricted mode -ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', - itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'], - 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy'))) - -DATE_FORMATS = ( - '%d %B %Y', - '%d %b %Y', - '%B %d %Y', - '%B %dst %Y', - '%B %dnd %Y', - '%B %dth %Y', - '%b %d %Y', - '%b %dst %Y', - '%b %dnd %Y', - '%b %dth %Y', - '%b %dst %Y %I:%M', - '%b %dnd %Y %I:%M', - '%b %dth %Y %I:%M', - '%Y %m %d', - '%Y-%m-%d', - '%Y/%m/%d', - '%Y/%m/%d %H:%M', - '%Y/%m/%d %H:%M:%S', - '%Y-%m-%d %H:%M', - '%Y-%m-%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S.%f', - '%d.%m.%Y %H:%M', - '%d.%m.%Y %H.%M', - '%Y-%m-%dT%H:%M:%SZ', - '%Y-%m-%dT%H:%M:%S.%fZ', - '%Y-%m-%dT%H:%M:%S.%f0Z', - '%Y-%m-%dT%H:%M:%S', - '%Y-%m-%dT%H:%M:%S.%f', - '%Y-%m-%dT%H:%M', - '%b %d %Y at %H:%M', - '%b %d %Y at %H:%M:%S', - '%B %d %Y at %H:%M', - '%B %d %Y at %H:%M:%S', -) - -DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) -DATE_FORMATS_DAY_FIRST.extend([ - '%d-%m-%Y', - '%d.%m.%Y', - '%d.%m.%y', - '%d/%m/%Y', - '%d/%m/%y', - '%d/%m/%Y %H:%M:%S', -]) - -DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) -DATE_FORMATS_MONTH_FIRST.extend([ - '%m-%d-%Y', - '%m.%d.%Y', - '%m/%d/%Y', - '%m/%d/%y', - '%m/%d/%Y %H:%M:%S', -]) - -PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" -JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>' - - -def preferredencoding(): - """Get preferred encoding. - - Returns the best encoding scheme for the system, based on - locale.getpreferredencoding() and some further tweaks. - """ - try: - pref = locale.getpreferredencoding() - 'TEST'.encode(pref) - except Exception: - pref = 'UTF-8' - - return pref - - -def write_json_file(obj, fn): - """ Encode obj as JSON and write it to fn, atomically if possible """ - - fn = encodeFilename(fn) - if sys.version_info < (3, 0) and sys.platform != 'win32': - encoding = get_filesystem_encoding() - # os.path.basename returns a bytes object, but NamedTemporaryFile - # will fail if the filename contains non ascii characters unless we - # use a unicode object - path_basename = lambda f: os.path.basename(fn).decode(encoding) - # the same for os.path.dirname - path_dirname = lambda f: os.path.dirname(fn).decode(encoding) - else: - path_basename = os.path.basename - path_dirname = os.path.dirname - - args = { - 'suffix': '.tmp', - 'prefix': path_basename(fn) + '.', - 'dir': path_dirname(fn), - 'delete': False, - } - - # In Python 2.x, json.dump expects a bytestream. - # In Python 3.x, it writes to a character stream - if sys.version_info < (3, 0): - args['mode'] = 'wb' - else: - args.update({ - 'mode': 'w', - 'encoding': 'utf-8', - }) - - tf = tempfile.NamedTemporaryFile(**compat_kwargs(args)) - - try: - with tf: - json.dump(obj, tf) - if sys.platform == 'win32': - # Need to remove existing file on Windows, else os.rename raises - # WindowsError or FileExistsError. - try: - os.unlink(fn) - except OSError: - pass - os.rename(tf.name, fn) - except Exception: - try: - os.remove(tf.name) - except OSError: - pass - raise - - -if sys.version_info >= (2, 7): - def find_xpath_attr(node, xpath, key, val=None): - """ Find the xpath xpath[@key=val] """ - assert re.match(r'^[a-zA-Z_-]+$', key) - expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) - return node.find(expr) -else: - def find_xpath_attr(node, xpath, key, val=None): - for f in node.findall(compat_xpath(xpath)): - if key not in f.attrib: - continue - if val is None or f.attrib.get(key) == val: - return f - return None - -# On python2.6 the xml.etree.ElementTree.Element methods don't support -# the namespace parameter - - -def xpath_with_ns(path, ns_map): - components = [c.split(':') for c in path.split('/')] - replaced = [] - for c in components: - if len(c) == 1: - replaced.append(c[0]) - else: - ns, tag = c - replaced.append('{%s}%s' % (ns_map[ns], tag)) - return '/'.join(replaced) - - -def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - def _find_xpath(xpath): - return node.find(compat_xpath(xpath)) - - if isinstance(xpath, (str, compat_str)): - n = _find_xpath(xpath) - else: - for xp in xpath: - n = _find_xpath(xp) - if n is not None: - break - - if n is None: - if default is not NO_DEFAULT: - return default - elif fatal: - name = xpath if name is None else name - raise ExtractorError('Could not find XML element %s' % name) - else: - return None - return n - - -def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - n = xpath_element(node, xpath, name, fatal=fatal, default=default) - if n is None or n == default: - return n - if n.text is None: - if default is not NO_DEFAULT: - return default - elif fatal: - name = xpath if name is None else name - raise ExtractorError('Could not find XML element\'s text %s' % name) - else: - return None - return n.text - - -def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): - n = find_xpath_attr(node, xpath, key) - if n is None: - if default is not NO_DEFAULT: - return default - elif fatal: - name = '%s[@%s]' % (xpath, key) if name is None else name - raise ExtractorError('Could not find XML attribute %s' % name) - else: - return None - return n.attrib[key] - - -def get_element_by_id(id, html): - """Return the content of the tag with the specified ID in the passed HTML document""" - return get_element_by_attribute('id', id, html) - - -def get_element_by_class(class_name, html): - """Return the content of the first tag with the specified class in the passed HTML document""" - retval = get_elements_by_class(class_name, html) - return retval[0] if retval else None - - -def get_element_by_attribute(attribute, value, html, escape_value=True): - retval = get_elements_by_attribute(attribute, value, html, escape_value) - return retval[0] if retval else None - - -def get_elements_by_class(class_name, html): - """Return the content of all tags with the specified class in the passed HTML document as a list""" - return get_elements_by_attribute( - 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), - html, escape_value=False) - - -def get_elements_by_attribute(attribute, value, html, escape_value=True): - """Return the content of the tag with the specified attribute in the passed HTML document""" - - value = re.escape(value) if escape_value else value - - retlist = [] - for m in re.finditer(r'''(?xs) - <([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s*> - (?P<content>.*?) - </\1> - ''' % (re.escape(attribute), value), html): - res = m.group('content') - - if res.startswith('"') or res.startswith("'"): - res = res[1:-1] - - retlist.append(unescapeHTML(res)) - - return retlist - - -class HTMLAttributeParser(compat_HTMLParser): - """Trivial HTML parser to gather the attributes for a single element""" - def __init__(self): - self.attrs = {} - compat_HTMLParser.__init__(self) - - def handle_starttag(self, tag, attrs): - self.attrs = dict(attrs) - - -def extract_attributes(html_element): - """Given a string for an HTML element such as - <el - a="foo" B="bar" c="&98;az" d=boz - empty= noval entity="&" - sq='"' dq="'" - > - Decode and return a dictionary of attributes. - { - 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', - 'empty': '', 'noval': None, 'entity': '&', - 'sq': '"', 'dq': '\'' - }. - NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, - but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. - """ - parser = HTMLAttributeParser() - try: - parser.feed(html_element) - parser.close() - # Older Python may throw HTMLParseError in case of malformed HTML - except compat_HTMLParseError: - pass - return parser.attrs - - -def clean_html(html): - """Clean an HTML snippet into a readable string""" - - if html is None: # Convenience for sanitizing descriptions etc. - return html - - # Newline vs <br /> - html = html.replace('\n', ' ') - html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html) - html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) - # Strip html tags - html = re.sub('<.*?>', '', html) - # Replace html entities - html = unescapeHTML(html) - return html.strip() - - -def sanitize_open(filename, open_mode): - """Try to open the given filename, and slightly tweak it if this fails. - - Attempts to open the given filename. If this fails, it tries to change - the filename slightly, step by step, until it's either able to open it - or it fails and raises a final exception, like the standard open() - function. - - It returns the tuple (stream, definitive_file_name). - """ - try: - if filename == '-': - if sys.platform == 'win32': - import msvcrt - msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) - return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) - stream = open(encodeFilename(filename), open_mode) - return (stream, filename) - except (IOError, OSError) as err: - if err.errno in (errno.EACCES,): - raise - - # In case of error, try to remove win32 forbidden chars - alt_filename = sanitize_path(filename) - if alt_filename == filename: - raise - else: - # An exception here should be caught in the caller - stream = open(encodeFilename(alt_filename), open_mode) - return (stream, alt_filename) - - -def timeconvert(timestr): - """Convert RFC 2822 defined time string into system timestamp""" - timestamp = None - timetuple = email.utils.parsedate_tz(timestr) - if timetuple is not None: - timestamp = email.utils.mktime_tz(timetuple) - return timestamp - - -def sanitize_filename(s, restricted=False, is_id=False): - """Sanitizes a string so it could be used as part of a filename. - If restricted is set, use a stricter subset of allowed characters. - Set is_id if this is not an arbitrary string, but an ID that should be kept - if possible. - """ - def replace_insane(char): - if restricted and char in ACCENT_CHARS: - return ACCENT_CHARS[char] - if char == '?' or ord(char) < 32 or ord(char) == 127: - return '' - elif char == '"': - return '' if restricted else '\'' - elif char == ':': - return '_-' if restricted else ' -' - elif char in '\\/|*<>': - return '_' - if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()): - return '_' - if restricted and ord(char) > 127: - return '_' - return char - - # Handle timestamps - s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) - result = ''.join(map(replace_insane, s)) - if not is_id: - while '__' in result: - result = result.replace('__', '_') - result = result.strip('_') - # Common case of "Foreign band name - English song title" - if restricted and result.startswith('-_'): - result = result[2:] - if result.startswith('-'): - result = '_' + result[len('-'):] - result = result.lstrip('.') - if not result: - result = '_' - return result - - -def sanitize_path(s): - """Sanitizes and normalizes path on Windows""" - if sys.platform != 'win32': - return s - drive_or_unc, _ = os.path.splitdrive(s) - if sys.version_info < (2, 7) and not drive_or_unc: - drive_or_unc, _ = os.path.splitunc(s) - norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) - if drive_or_unc: - norm_path.pop(0) - sanitized_path = [ - path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part) - for path_part in norm_path] - if drive_or_unc: - sanitized_path.insert(0, drive_or_unc + os.path.sep) - return os.path.join(*sanitized_path) - - -def sanitize_url(url): - # Prepend protocol-less URLs with `http:` scheme in order to mitigate - # the number of unwanted failures due to missing protocol - if url.startswith('//'): - return 'http:%s' % url - # Fix some common typos seen so far - COMMON_TYPOS = ( - # https://github.com/rg3/youtube-dl/issues/15649 - (r'^httpss://', r'https://'), - # https://bx1.be/lives/direct-tv/ - (r'^rmtp([es]?)://', r'rtmp\1://'), - ) - for mistake, fixup in COMMON_TYPOS: - if re.match(mistake, url): - return re.sub(mistake, fixup, url) - return url - - -def sanitized_Request(url, *args, **kwargs): - return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) - - -def expand_path(s): - """Expand shell variables and ~""" - return os.path.expandvars(compat_expanduser(s)) - - -def orderedSet(iterable): - """ Remove all duplicates from the input iterable """ - res = [] - for el in iterable: - if el not in res: - res.append(el) - return res - - -def _htmlentity_transform(entity_with_semicolon): - """Transforms an HTML entity to a character.""" - entity = entity_with_semicolon[:-1] - - # Known non-numeric HTML entity - if entity in compat_html_entities.name2codepoint: - return compat_chr(compat_html_entities.name2codepoint[entity]) - - # TODO: HTML5 allows entities without a semicolon. For example, - # 'Éric' should be decoded as 'Éric'. - if entity_with_semicolon in compat_html_entities_html5: - return compat_html_entities_html5[entity_with_semicolon] - - mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) - if mobj is not None: - numstr = mobj.group(1) - if numstr.startswith('x'): - base = 16 - numstr = '0%s' % numstr - else: - base = 10 - # See https://github.com/rg3/youtube-dl/issues/7518 - try: - return compat_chr(int(numstr, base)) - except ValueError: - pass - - # Unknown entity in name, return its literal representation - return '&%s;' % entity - - -def unescapeHTML(s): - if s is None: - return None - assert type(s) == compat_str - - return re.sub( - r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) - - -def get_subprocess_encoding(): - if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - # For subprocess calls, encode with locale encoding - # Refer to http://stackoverflow.com/a/9951851/35070 - encoding = preferredencoding() - else: - encoding = sys.getfilesystemencoding() - if encoding is None: - encoding = 'utf-8' - return encoding - - -def encodeFilename(s, for_subprocess=False): - """ - @param s The name of the file - """ - - assert type(s) == compat_str - - # Python 3 has a Unicode API - if sys.version_info >= (3, 0): - return s - - # Pass '' directly to use Unicode APIs on Windows 2000 and up - # (Detecting Windows NT 4 is tricky because 'major >= 4' would - # match Windows 9x series as well. Besides, NT 4 is obsolete.) - if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - return s - - # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible - if sys.platform.startswith('java'): - return s - - return s.encode(get_subprocess_encoding(), 'ignore') - - -def decodeFilename(b, for_subprocess=False): - - if sys.version_info >= (3, 0): - return b - - if not isinstance(b, bytes): - return b - - return b.decode(get_subprocess_encoding(), 'ignore') - - -def encodeArgument(s): - if not isinstance(s, compat_str): - # Legacy code that uses byte strings - # Uncomment the following line after fixing all post processors - # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) - s = s.decode('ascii') - return encodeFilename(s, True) - - -def decodeArgument(b): - return decodeFilename(b, True) - - -def decodeOption(optval): - if optval is None: - return optval - if isinstance(optval, bytes): - optval = optval.decode(preferredencoding()) - - assert isinstance(optval, compat_str) - return optval - - -def formatSeconds(secs): - if secs > 3600: - return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60) - elif secs > 60: - return '%d:%02d' % (secs // 60, secs % 60) - else: - return '%d' % secs - - -def make_HTTPS_handler(params, **kwargs): - opts_no_check_certificate = params.get('nocheckcertificate', False) - if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9 - context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) - if opts_no_check_certificate: - context.check_hostname = False - context.verify_mode = ssl.CERT_NONE - try: - return YoutubeDLHTTPSHandler(params, context=context, **kwargs) - except TypeError: - # Python 2.7.8 - # (create_default_context present but HTTPSHandler has no context=) - pass - - if sys.version_info < (3, 2): - return YoutubeDLHTTPSHandler(params, **kwargs) - else: # Python < 3.4 - context = ssl.SSLContext(ssl.PROTOCOL_TLSv1) - context.verify_mode = (ssl.CERT_NONE - if opts_no_check_certificate - else ssl.CERT_REQUIRED) - context.set_default_verify_paths() - return YoutubeDLHTTPSHandler(params, context=context, **kwargs) - - -def bug_reports_message(): - if ytdl_is_updateable(): - update_cmd = 'type youtube-dl -U to update' - else: - update_cmd = 'see https://yt-dl.org/update on how to update' - msg = '; please report this issue on https://yt-dl.org/bug .' - msg += ' Make sure you are using the latest version; %s.' % update_cmd - msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.' - return msg - - -class YoutubeDLError(Exception): - """Base exception for YoutubeDL errors.""" - pass - - -class ExtractorError(YoutubeDLError): - """Error during info extraction.""" - - def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): - """ tb, if given, is the original traceback (so that it can be printed out). - If expected is set, this is a normal error message and most likely not a bug in youtube-dl. - """ - - if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): - expected = True - if video_id is not None: - msg = video_id + ': ' + msg - if cause: - msg += ' (caused by %r)' % cause - if not expected: - msg += bug_reports_message() - super(ExtractorError, self).__init__(msg) - - self.traceback = tb - self.exc_info = sys.exc_info() # preserve original exception - self.cause = cause - self.video_id = video_id - - def format_traceback(self): - if self.traceback is None: - return None - return ''.join(traceback.format_tb(self.traceback)) - - -class UnsupportedError(ExtractorError): - def __init__(self, url): - super(UnsupportedError, self).__init__( - 'Unsupported URL: %s' % url, expected=True) - self.url = url - - -class RegexNotFoundError(ExtractorError): - """Error when a regex didn't match""" - pass - - -class GeoRestrictedError(ExtractorError): - """Geographic restriction Error exception. - - This exception may be thrown when a video is not available from your - geographic location due to geographic restrictions imposed by a website. - """ - def __init__(self, msg, countries=None): - super(GeoRestrictedError, self).__init__(msg, expected=True) - self.msg = msg - self.countries = countries - - -class DownloadError(YoutubeDLError): - """Download Error exception. - - This exception may be thrown by FileDownloader objects if they are not - configured to continue on errors. They will contain the appropriate - error message. - """ - - def __init__(self, msg, exc_info=None): - """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ - super(DownloadError, self).__init__(msg) - self.exc_info = exc_info - - -class SameFileError(YoutubeDLError): - """Same File exception. - - This exception will be thrown by FileDownloader objects if they detect - multiple files would have to be downloaded to the same file on disk. - """ - pass - - -class PostProcessingError(YoutubeDLError): - """Post Processing exception. - - This exception may be raised by PostProcessor's .run() method to - indicate an error in the postprocessing task. - """ - - def __init__(self, msg): - super(PostProcessingError, self).__init__(msg) - self.msg = msg - - -class MaxDownloadsReached(YoutubeDLError): - """ --max-downloads limit has been reached. """ - pass - - -class UnavailableVideoError(YoutubeDLError): - """Unavailable Format exception. - - This exception will be thrown when a video is requested - in a format that is not available for that video. - """ - pass - - -class ContentTooShortError(YoutubeDLError): - """Content Too Short exception. - - This exception may be raised by FileDownloader objects when a file they - download is too small for what the server announced first, indicating - the connection was probably interrupted. - """ - - def __init__(self, downloaded, expected): - super(ContentTooShortError, self).__init__( - 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected) - ) - # Both in bytes - self.downloaded = downloaded - self.expected = expected - - -class XAttrMetadataError(YoutubeDLError): - def __init__(self, code=None, msg='Unknown error'): - super(XAttrMetadataError, self).__init__(msg) - self.code = code - self.msg = msg - - # Parsing code and msg - if (self.code in (errno.ENOSPC, errno.EDQUOT) or - 'No space left' in self.msg or 'Disk quota excedded' in self.msg): - self.reason = 'NO_SPACE' - elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: - self.reason = 'VALUE_TOO_LONG' - else: - self.reason = 'NOT_SUPPORTED' - - -class XAttrUnavailableError(YoutubeDLError): - pass - - -def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): - # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting - # expected HTTP responses to meet HTTP/1.0 or later (see also - # https://github.com/rg3/youtube-dl/issues/6727) - if sys.version_info < (3, 0): - kwargs['strict'] = True - hc = http_class(*args, **compat_kwargs(kwargs)) - source_address = ydl_handler._params.get('source_address') - - if source_address is not None: - # This is to workaround _create_connection() from socket where it will try all - # address data from getaddrinfo() including IPv6. This filters the result from - # getaddrinfo() based on the source_address value. - # This is based on the cpython socket.create_connection() function. - # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 - def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): - host, port = address - err = None - addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) - af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6 - ip_addrs = [addr for addr in addrs if addr[0] == af] - if addrs and not ip_addrs: - ip_version = 'v4' if af == socket.AF_INET else 'v6' - raise socket.error( - "No remote IP%s addresses available for connect, can't use '%s' as source address" - % (ip_version, source_address[0])) - for res in ip_addrs: - af, socktype, proto, canonname, sa = res - sock = None - try: - sock = socket.socket(af, socktype, proto) - if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: - sock.settimeout(timeout) - sock.bind(source_address) - sock.connect(sa) - err = None # Explicitly break reference cycle - return sock - except socket.error as _: - err = _ - if sock is not None: - sock.close() - if err is not None: - raise err - else: - raise socket.error('getaddrinfo returns an empty list') - if hasattr(hc, '_create_connection'): - hc._create_connection = _create_connection - sa = (source_address, 0) - if hasattr(hc, 'source_address'): # Python 2.7+ - hc.source_address = sa - else: # Python 2.6 - def _hc_connect(self, *args, **kwargs): - sock = _create_connection( - (self.host, self.port), self.timeout, sa) - if is_https: - self.sock = ssl.wrap_socket( - sock, self.key_file, self.cert_file, - ssl_version=ssl.PROTOCOL_TLSv1) - else: - self.sock = sock - hc.connect = functools.partial(_hc_connect, hc) - - return hc - - -def handle_youtubedl_headers(headers): - filtered_headers = headers - - if 'Youtubedl-no-compression' in filtered_headers: - filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') - del filtered_headers['Youtubedl-no-compression'] - - return filtered_headers - - -class YoutubeDLHandler(compat_urllib_request.HTTPHandler): - """Handler for HTTP requests and responses. - - This class, when installed with an OpenerDirector, automatically adds - the standard headers to every HTTP request and handles gzipped and - deflated responses from web servers. If compression is to be avoided in - a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-no-compression", which will be - removed before making the real request. - - Part of this code was copied from: - - http://techknack.net/python-urllib2-handlers/ - - Andrew Rowls, the author of that code, agreed to release it to the - public domain. - """ - - def __init__(self, params, *args, **kwargs): - compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs) - self._params = params - - def http_open(self, req): - conn_class = compat_http_client.HTTPConnection - - socks_proxy = req.headers.get('Ytdl-socks-proxy') - if socks_proxy: - conn_class = make_socks_conn_class(conn_class, socks_proxy) - del req.headers['Ytdl-socks-proxy'] - - return self.do_open(functools.partial( - _create_http_connection, self, conn_class, False), - req) - - @staticmethod - def deflate(data): - try: - return zlib.decompress(data, -zlib.MAX_WBITS) - except zlib.error: - return zlib.decompress(data) - - def http_request(self, req): - # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not - # always respected by websites, some tend to give out URLs with non percent-encoded - # non-ASCII characters (see telemb.py, ard.py [#3412]) - # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) - # To work around aforementioned issue we will replace request's original URL with - # percent-encoded one - # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) - # the code of this workaround has been moved here from YoutubeDL.urlopen() - url = req.get_full_url() - url_escaped = escape_url(url) - - # Substitute URL if any change after escaping - if url != url_escaped: - req = update_Request(req, url=url_escaped) - - for h, v in std_headers.items(): - # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 - # The dict keys are capitalized because of this bug by urllib - if h.capitalize() not in req.headers: - req.add_header(h, v) - - req.headers = handle_youtubedl_headers(req.headers) - - if sys.version_info < (2, 7) and '#' in req.get_full_url(): - # Python 2.6 is brain-dead when it comes to fragments - req._Request__original = req._Request__original.partition('#')[0] - req._Request__r_type = req._Request__r_type.partition('#')[0] - - return req - - def http_response(self, req, resp): - old_resp = resp - # gzip - if resp.headers.get('Content-encoding', '') == 'gzip': - content = resp.read() - gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') - try: - uncompressed = io.BytesIO(gz.read()) - except IOError as original_ioerror: - # There may be junk add the end of the file - # See http://stackoverflow.com/q/4928560/35070 for details - for i in range(1, 1024): - try: - gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') - uncompressed = io.BytesIO(gz.read()) - except IOError: - continue - break - else: - raise original_ioerror - resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - del resp.headers['Content-encoding'] - # deflate - if resp.headers.get('Content-encoding', '') == 'deflate': - gz = io.BytesIO(self.deflate(resp.read())) - resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) - resp.msg = old_resp.msg - del resp.headers['Content-encoding'] - # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see - # https://github.com/rg3/youtube-dl/issues/6457). - if 300 <= resp.code < 400: - location = resp.headers.get('Location') - if location: - # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 - if sys.version_info >= (3, 0): - location = location.encode('iso-8859-1').decode('utf-8') - else: - location = location.decode('utf-8') - location_escaped = escape_url(location) - if location != location_escaped: - del resp.headers['Location'] - if sys.version_info < (3, 0): - location_escaped = location_escaped.encode('utf-8') - resp.headers['Location'] = location_escaped - return resp - - https_request = http_request - https_response = http_response - - -def make_socks_conn_class(base_class, socks_proxy): - assert issubclass(base_class, ( - compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) - - url_components = compat_urlparse.urlparse(socks_proxy) - if url_components.scheme.lower() == 'socks5': - socks_type = ProxyType.SOCKS5 - elif url_components.scheme.lower() in ('socks', 'socks4'): - socks_type = ProxyType.SOCKS4 - elif url_components.scheme.lower() == 'socks4a': - socks_type = ProxyType.SOCKS4A - - def unquote_if_non_empty(s): - if not s: - return s - return compat_urllib_parse_unquote_plus(s) - - proxy_args = ( - socks_type, - url_components.hostname, url_components.port or 1080, - True, # Remote DNS - unquote_if_non_empty(url_components.username), - unquote_if_non_empty(url_components.password), - ) - - class SocksConnection(base_class): - def connect(self): - self.sock = sockssocket() - self.sock.setproxy(*proxy_args) - if type(self.timeout) in (int, float): - self.sock.settimeout(self.timeout) - self.sock.connect((self.host, self.port)) - - if isinstance(self, compat_http_client.HTTPSConnection): - if hasattr(self, '_context'): # Python > 2.6 - self.sock = self._context.wrap_socket( - self.sock, server_hostname=self.host) - else: - self.sock = ssl.wrap_socket(self.sock) - - return SocksConnection - - -class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): - def __init__(self, params, https_conn_class=None, *args, **kwargs): - compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs) - self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection - self._params = params - - def https_open(self, req): - kwargs = {} - conn_class = self._https_conn_class - - if hasattr(self, '_context'): # python > 2.6 - kwargs['context'] = self._context - if hasattr(self, '_check_hostname'): # python 3.x - kwargs['check_hostname'] = self._check_hostname - - socks_proxy = req.headers.get('Ytdl-socks-proxy') - if socks_proxy: - conn_class = make_socks_conn_class(conn_class, socks_proxy) - del req.headers['Ytdl-socks-proxy'] - - return self.do_open(functools.partial( - _create_http_connection, self, conn_class, True), - req, **kwargs) - - -class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): - def save(self, filename=None, ignore_discard=False, ignore_expires=False): - # Store session cookies with `expires` set to 0 instead of an empty - # string - for cookie in self: - if cookie.expires is None: - cookie.expires = 0 - compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires) - - def load(self, filename=None, ignore_discard=False, ignore_expires=False): - compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires) - # Session cookies are denoted by either `expires` field set to - # an empty string or 0. MozillaCookieJar only recognizes the former - # (see [1]). So we need force the latter to be recognized as session - # cookies on our own. - # Session cookies may be important for cookies-based authentication, - # e.g. usually, when user does not check 'Remember me' check box while - # logging in on a site, some important cookies are stored as session - # cookies so that not recognizing them will result in failed login. - # 1. https://bugs.python.org/issue17164 - for cookie in self: - # Treat `expires=0` cookies as session cookies - if cookie.expires == 0: - cookie.expires = None - cookie.discard = True - - -class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): - def __init__(self, cookiejar=None): - compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) - - def http_response(self, request, response): - # Python 2 will choke on next HTTP request in row if there are non-ASCII - # characters in Set-Cookie HTTP header of last response (see - # https://github.com/rg3/youtube-dl/issues/6769). - # In order to at least prevent crashing we will percent encode Set-Cookie - # header before HTTPCookieProcessor starts processing it. - # if sys.version_info < (3, 0) and response.headers: - # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'): - # set_cookie = response.headers.get(set_cookie_header) - # if set_cookie: - # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ") - # if set_cookie != set_cookie_escaped: - # del response.headers[set_cookie_header] - # response.headers[set_cookie_header] = set_cookie_escaped - return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response) - - https_request = compat_urllib_request.HTTPCookieProcessor.http_request - https_response = http_response - - -def extract_timezone(date_str): - m = re.search( - r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', - date_str) - if not m: - timezone = datetime.timedelta() - else: - date_str = date_str[:-len(m.group('tz'))] - if not m.group('sign'): - timezone = datetime.timedelta() - else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) - return timezone, date_str - - -def parse_iso8601(date_str, delimiter='T', timezone=None): - """ Return a UNIX timestamp from the given date """ - - if date_str is None: - return None - - date_str = re.sub(r'\.[0-9]+', '', date_str) - - if timezone is None: - timezone, date_str = extract_timezone(date_str) - - try: - date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) - dt = datetime.datetime.strptime(date_str, date_format) - timezone - return calendar.timegm(dt.timetuple()) - except ValueError: - pass - - -def date_formats(day_first=True): - return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST - - -def unified_strdate(date_str, day_first=True): - """Return a string with the date in the format YYYYMMDD""" - - if date_str is None: - return None - upload_date = None - # Replace commas - date_str = date_str.replace(',', ' ') - # Remove AM/PM + timezone - date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) - _, date_str = extract_timezone(date_str) - - for expression in date_formats(day_first): - try: - upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') - except ValueError: - pass - if upload_date is None: - timetuple = email.utils.parsedate_tz(date_str) - if timetuple: - try: - upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - except ValueError: - pass - if upload_date is not None: - return compat_str(upload_date) - - -def unified_timestamp(date_str, day_first=True): - if date_str is None: - return None - - date_str = re.sub(r'[,|]', '', date_str) - - pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 - timezone, date_str = extract_timezone(date_str) - - # Remove AM/PM + timezone - date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) - - # Remove unrecognized timezones from ISO 8601 alike timestamps - m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) - if m: - date_str = date_str[:-len(m.group('tz'))] - - # Python only supports microseconds, so remove nanoseconds - m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str) - if m: - date_str = m.group(1) - - for expression in date_formats(day_first): - try: - dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) - return calendar.timegm(dt.timetuple()) - except ValueError: - pass - timetuple = email.utils.parsedate_tz(date_str) - if timetuple: - return calendar.timegm(timetuple) + pm_delta * 3600 - - -def determine_ext(url, default_ext='unknown_video'): - if url is None or '.' not in url: - return default_ext - guess = url.partition('?')[0].rpartition('.')[2] - if re.match(r'^[A-Za-z0-9]+$', guess): - return guess - # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download - elif guess.rstrip('/') in KNOWN_EXTENSIONS: - return guess.rstrip('/') - else: - return default_ext - - -def subtitles_filename(filename, sub_lang, sub_format): - return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format - - -def date_from_str(date_str): - """ - Return a datetime object from a string in the format YYYYMMDD or - (now|today)[+-][0-9](day|week|month|year)(s)?""" - today = datetime.date.today() - if date_str in ('now', 'today'): - return today - if date_str == 'yesterday': - return today - datetime.timedelta(days=1) - match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str) - if match is not None: - sign = match.group('sign') - time = int(match.group('time')) - if sign == '-': - time = -time - unit = match.group('unit') - # A bad approximation? - if unit == 'month': - unit = 'day' - time *= 30 - elif unit == 'year': - unit = 'day' - time *= 365 - unit += 's' - delta = datetime.timedelta(**{unit: time}) - return today + delta - return datetime.datetime.strptime(date_str, '%Y%m%d').date() - - -def hyphenate_date(date_str): - """ - Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format""" - match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str) - if match is not None: - return '-'.join(match.groups()) - else: - return date_str - - -class DateRange(object): - """Represents a time interval between two dates""" - - def __init__(self, start=None, end=None): - """start and end must be strings in the format accepted by date""" - if start is not None: - self.start = date_from_str(start) - else: - self.start = datetime.datetime.min.date() - if end is not None: - self.end = date_from_str(end) - else: - self.end = datetime.datetime.max.date() - if self.start > self.end: - raise ValueError('Date range: "%s" , the start date must be before the end date' % self) - - @classmethod - def day(cls, day): - """Returns a range that only contains the given day""" - return cls(day, day) - - def __contains__(self, date): - """Check if the date is in the range""" - if not isinstance(date, datetime.date): - date = date_from_str(date) - return self.start <= date <= self.end - - def __str__(self): - return '%s - %s' % (self.start.isoformat(), self.end.isoformat()) - - -def platform_name(): - """ Returns the platform name as a compat_str """ - res = platform.platform() - if isinstance(res, bytes): - res = res.decode(preferredencoding()) - - assert isinstance(res, compat_str) - return res - - -def _windows_write_string(s, out): - """ Returns True if the string was written using special methods, - False if it has yet to be written out.""" - # Adapted from http://stackoverflow.com/a/3259271/35070 - - import ctypes - import ctypes.wintypes - - WIN_OUTPUT_IDS = { - 1: -11, - 2: -12, - } - - try: - fileno = out.fileno() - except AttributeError: - # If the output stream doesn't have a fileno, it's virtual - return False - except io.UnsupportedOperation: - # Some strange Windows pseudo files? - return False - if fileno not in WIN_OUTPUT_IDS: - return False - - GetStdHandle = compat_ctypes_WINFUNCTYPE( - ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( - ('GetStdHandle', ctypes.windll.kernel32)) - h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) - - WriteConsoleW = compat_ctypes_WINFUNCTYPE( - ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, - ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), - ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32)) - written = ctypes.wintypes.DWORD(0) - - GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32)) - FILE_TYPE_CHAR = 0x0002 - FILE_TYPE_REMOTE = 0x8000 - GetConsoleMode = compat_ctypes_WINFUNCTYPE( - ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, - ctypes.POINTER(ctypes.wintypes.DWORD))( - ('GetConsoleMode', ctypes.windll.kernel32)) - INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value - - def not_a_console(handle): - if handle == INVALID_HANDLE_VALUE or handle is None: - return True - return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or - GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) - - if not_a_console(h): - return False - - def next_nonbmp_pos(s): - try: - return next(i for i, c in enumerate(s) if ord(c) > 0xffff) - except StopIteration: - return len(s) - - while s: - count = min(next_nonbmp_pos(s), 1024) - - ret = WriteConsoleW( - h, s, count if count else 2, ctypes.byref(written), None) - if ret == 0: - raise OSError('Failed to write string') - if not count: # We just wrote a non-BMP character - assert written.value == 2 - s = s[1:] - else: - assert written.value > 0 - s = s[written.value:] - return True - - -def write_string(s, out=None, encoding=None): - if out is None: - out = sys.stderr - assert type(s) == compat_str - - if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'): - if _windows_write_string(s, out): - return - - if ('b' in getattr(out, 'mode', '') or - sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr - byt = s.encode(encoding or preferredencoding(), 'ignore') - out.write(byt) - elif hasattr(out, 'buffer'): - enc = encoding or getattr(out, 'encoding', None) or preferredencoding() - byt = s.encode(enc, 'ignore') - out.buffer.write(byt) - else: - out.write(s) - out.flush() - - -def bytes_to_intlist(bs): - if not bs: - return [] - if isinstance(bs[0], int): # Python 3 - return list(bs) - else: - return [ord(c) for c in bs] - - -def intlist_to_bytes(xs): - if not xs: - return b'' - return compat_struct_pack('%dB' % len(xs), *xs) - - -# Cross-platform file locking -if sys.platform == 'win32': - import ctypes.wintypes - import msvcrt - - class OVERLAPPED(ctypes.Structure): - _fields_ = [ - ('Internal', ctypes.wintypes.LPVOID), - ('InternalHigh', ctypes.wintypes.LPVOID), - ('Offset', ctypes.wintypes.DWORD), - ('OffsetHigh', ctypes.wintypes.DWORD), - ('hEvent', ctypes.wintypes.HANDLE), - ] - - kernel32 = ctypes.windll.kernel32 - LockFileEx = kernel32.LockFileEx - LockFileEx.argtypes = [ - ctypes.wintypes.HANDLE, # hFile - ctypes.wintypes.DWORD, # dwFlags - ctypes.wintypes.DWORD, # dwReserved - ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow - ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh - ctypes.POINTER(OVERLAPPED) # Overlapped - ] - LockFileEx.restype = ctypes.wintypes.BOOL - UnlockFileEx = kernel32.UnlockFileEx - UnlockFileEx.argtypes = [ - ctypes.wintypes.HANDLE, # hFile - ctypes.wintypes.DWORD, # dwReserved - ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow - ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh - ctypes.POINTER(OVERLAPPED) # Overlapped - ] - UnlockFileEx.restype = ctypes.wintypes.BOOL - whole_low = 0xffffffff - whole_high = 0x7fffffff - - def _lock_file(f, exclusive): - overlapped = OVERLAPPED() - overlapped.Offset = 0 - overlapped.OffsetHigh = 0 - overlapped.hEvent = 0 - f._lock_file_overlapped_p = ctypes.pointer(overlapped) - handle = msvcrt.get_osfhandle(f.fileno()) - if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0, - whole_low, whole_high, f._lock_file_overlapped_p): - raise OSError('Locking file failed: %r' % ctypes.FormatError()) - - def _unlock_file(f): - assert f._lock_file_overlapped_p - handle = msvcrt.get_osfhandle(f.fileno()) - if not UnlockFileEx(handle, 0, - whole_low, whole_high, f._lock_file_overlapped_p): - raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) - -else: - # Some platforms, such as Jython, is missing fcntl - try: - import fcntl - - def _lock_file(f, exclusive): - fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) - - def _unlock_file(f): - fcntl.flock(f, fcntl.LOCK_UN) - except ImportError: - UNSUPPORTED_MSG = 'file locking is not supported on this platform' - - def _lock_file(f, exclusive): - raise IOError(UNSUPPORTED_MSG) - - def _unlock_file(f): - raise IOError(UNSUPPORTED_MSG) - - -class locked_file(object): - def __init__(self, filename, mode, encoding=None): - assert mode in ['r', 'a', 'w'] - self.f = io.open(filename, mode, encoding=encoding) - self.mode = mode - - def __enter__(self): - exclusive = self.mode != 'r' - try: - _lock_file(self.f, exclusive) - except IOError: - self.f.close() - raise - return self - - def __exit__(self, etype, value, traceback): - try: - _unlock_file(self.f) - finally: - self.f.close() - - def __iter__(self): - return iter(self.f) - - def write(self, *args): - return self.f.write(*args) - - def read(self, *args): - return self.f.read(*args) - - -def get_filesystem_encoding(): - encoding = sys.getfilesystemencoding() - return encoding if encoding is not None else 'utf-8' - - -def shell_quote(args): - quoted_args = [] - encoding = get_filesystem_encoding() - for a in args: - if isinstance(a, bytes): - # We may get a filename encoded with 'encodeFilename' - a = a.decode(encoding) - quoted_args.append(compat_shlex_quote(a)) - return ' '.join(quoted_args) - - -def smuggle_url(url, data): - """ Pass additional data in a URL for internal use. """ - - url, idata = unsmuggle_url(url, {}) - data.update(idata) - sdata = compat_urllib_parse_urlencode( - {'__youtubedl_smuggle': json.dumps(data)}) - return url + '#' + sdata - - -def unsmuggle_url(smug_url, default=None): - if '#__youtubedl_smuggle' not in smug_url: - return smug_url, default - url, _, sdata = smug_url.rpartition('#') - jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0] - data = json.loads(jsond) - return url, data - - -def format_bytes(bytes): - if bytes is None: - return 'N/A' - if type(bytes) is str: - bytes = float(bytes) - if bytes == 0.0: - exponent = 0 - else: - exponent = int(math.log(bytes, 1024.0)) - suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent] - converted = float(bytes) / float(1024 ** exponent) - return '%.2f%s' % (converted, suffix) - - -def lookup_unit_table(unit_table, s): - units_re = '|'.join(re.escape(u) for u in unit_table) - m = re.match( - r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s) - if not m: - return None - num_str = m.group('num').replace(',', '.') - mult = unit_table[m.group('unit')] - return int(float(num_str) * mult) - - -def parse_filesize(s): - if s is None: - return None - - # The lower-case forms are of course incorrect and unofficial, - # but we support those too - _UNIT_TABLE = { - 'B': 1, - 'b': 1, - 'bytes': 1, - 'KiB': 1024, - 'KB': 1000, - 'kB': 1024, - 'Kb': 1000, - 'kb': 1000, - 'kilobytes': 1000, - 'kibibytes': 1024, - 'MiB': 1024 ** 2, - 'MB': 1000 ** 2, - 'mB': 1024 ** 2, - 'Mb': 1000 ** 2, - 'mb': 1000 ** 2, - 'megabytes': 1000 ** 2, - 'mebibytes': 1024 ** 2, - 'GiB': 1024 ** 3, - 'GB': 1000 ** 3, - 'gB': 1024 ** 3, - 'Gb': 1000 ** 3, - 'gb': 1000 ** 3, - 'gigabytes': 1000 ** 3, - 'gibibytes': 1024 ** 3, - 'TiB': 1024 ** 4, - 'TB': 1000 ** 4, - 'tB': 1024 ** 4, - 'Tb': 1000 ** 4, - 'tb': 1000 ** 4, - 'terabytes': 1000 ** 4, - 'tebibytes': 1024 ** 4, - 'PiB': 1024 ** 5, - 'PB': 1000 ** 5, - 'pB': 1024 ** 5, - 'Pb': 1000 ** 5, - 'pb': 1000 ** 5, - 'petabytes': 1000 ** 5, - 'pebibytes': 1024 ** 5, - 'EiB': 1024 ** 6, - 'EB': 1000 ** 6, - 'eB': 1024 ** 6, - 'Eb': 1000 ** 6, - 'eb': 1000 ** 6, - 'exabytes': 1000 ** 6, - 'exbibytes': 1024 ** 6, - 'ZiB': 1024 ** 7, - 'ZB': 1000 ** 7, - 'zB': 1024 ** 7, - 'Zb': 1000 ** 7, - 'zb': 1000 ** 7, - 'zettabytes': 1000 ** 7, - 'zebibytes': 1024 ** 7, - 'YiB': 1024 ** 8, - 'YB': 1000 ** 8, - 'yB': 1024 ** 8, - 'Yb': 1000 ** 8, - 'yb': 1000 ** 8, - 'yottabytes': 1000 ** 8, - 'yobibytes': 1024 ** 8, - } - - return lookup_unit_table(_UNIT_TABLE, s) - - -def parse_count(s): - if s is None: - return None - - s = s.strip() - - if re.match(r'^[\d,.]+$', s): - return str_to_int(s) - - _UNIT_TABLE = { - 'k': 1000, - 'K': 1000, - 'm': 1000 ** 2, - 'M': 1000 ** 2, - 'kk': 1000 ** 2, - 'KK': 1000 ** 2, - } - - return lookup_unit_table(_UNIT_TABLE, s) - - -def parse_resolution(s): - if s is None: - return {} - - mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s) - if mobj: - return { - 'width': int(mobj.group('w')), - 'height': int(mobj.group('h')), - } - - mobj = re.search(r'\b(\d+)[pPiI]\b', s) - if mobj: - return {'height': int(mobj.group(1))} - - mobj = re.search(r'\b([48])[kK]\b', s) - if mobj: - return {'height': int(mobj.group(1)) * 540} - - return {} - - -def month_by_name(name, lang='en'): - """ Return the number of a month by (locale-independently) English name """ - - month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en']) - - try: - return month_names.index(name) + 1 - except ValueError: - return None - - -def month_by_abbreviation(abbrev): - """ Return the number of a month by (locale-independently) English - abbreviations """ - - try: - return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1 - except ValueError: - return None - - -def fix_xml_ampersands(xml_str): - """Replace all the '&' by '&' in XML""" - return re.sub( - r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)', - '&', - xml_str) - - -def setproctitle(title): - assert isinstance(title, compat_str) - - # ctypes in Jython is not complete - # http://bugs.jython.org/issue2148 - if sys.platform.startswith('java'): - return - - try: - libc = ctypes.cdll.LoadLibrary('libc.so.6') - except OSError: - return - except TypeError: - # LoadLibrary in Windows Python 2.7.13 only expects - # a bytestring, but since unicode_literals turns - # every string into a unicode string, it fails. - return - title_bytes = title.encode('utf-8') - buf = ctypes.create_string_buffer(len(title_bytes)) - buf.value = title_bytes - try: - libc.prctl(15, buf, 0, 0, 0) - except AttributeError: - return # Strange libc, just skip this - - -def remove_start(s, start): - return s[len(start):] if s is not None and s.startswith(start) else s - - -def remove_end(s, end): - return s[:-len(end)] if s is not None and s.endswith(end) else s - - -def remove_quotes(s): - if s is None or len(s) < 2: - return s - for quote in ('"', "'", ): - if s[0] == quote and s[-1] == quote: - return s[1:-1] - return s - - -def url_basename(url): - path = compat_urlparse.urlparse(url).path - return path.strip('/').split('/')[-1] - - -def base_url(url): - return re.match(r'https?://[^?#&]+/', url).group() - - -def urljoin(base, path): - if isinstance(path, bytes): - path = path.decode('utf-8') - if not isinstance(path, compat_str) or not path: - return None - if re.match(r'^(?:https?:)?//', path): - return path - if isinstance(base, bytes): - base = base.decode('utf-8') - if not isinstance(base, compat_str) or not re.match( - r'^(?:https?:)?//', base): - return None - return compat_urlparse.urljoin(base, path) - - -class HEADRequest(compat_urllib_request.Request): - def get_method(self): - return 'HEAD' - - -class PUTRequest(compat_urllib_request.Request): - def get_method(self): - return 'PUT' - - -def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): - if get_attr: - if v is not None: - v = getattr(v, get_attr, None) - if v == '': - v = None - if v is None: - return default - try: - return int(v) * invscale // scale - except ValueError: - return default - - -def str_or_none(v, default=None): - return default if v is None else compat_str(v) - - -def str_to_int(int_str): - """ A more relaxed version of int_or_none """ - if int_str is None: - return None - int_str = re.sub(r'[,\.\+]', '', int_str) - return int(int_str) - - -def float_or_none(v, scale=1, invscale=1, default=None): - if v is None: - return default - try: - return float(v) * invscale / scale - except ValueError: - return default - - -def bool_or_none(v, default=None): - return v if isinstance(v, bool) else default - - -def strip_or_none(v): - return None if v is None else v.strip() - - -def url_or_none(url): - if not url or not isinstance(url, compat_str): - return None - url = url.strip() - return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None - - -def parse_duration(s): - if not isinstance(s, compat_basestring): - return None - - s = s.strip() - - days, hours, mins, secs, ms = [None] * 5 - m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s) - if m: - days, hours, mins, secs, ms = m.groups() - else: - m = re.match( - r'''(?ix)(?:P? - (?: - [0-9]+\s*y(?:ears?)?\s* - )? - (?: - [0-9]+\s*m(?:onths?)?\s* - )? - (?: - [0-9]+\s*w(?:eeks?)?\s* - )? - (?: - (?P<days>[0-9]+)\s*d(?:ays?)?\s* - )? - T)? - (?: - (?P<hours>[0-9]+)\s*h(?:ours?)?\s* - )? - (?: - (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s* - )? - (?: - (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* - )?Z?$''', s) - if m: - days, hours, mins, secs, ms = m.groups() - else: - m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) - if m: - hours, mins = m.groups() - else: - return None - - duration = 0 - if secs: - duration += float(secs) - if mins: - duration += float(mins) * 60 - if hours: - duration += float(hours) * 60 * 60 - if days: - duration += float(days) * 24 * 60 * 60 - if ms: - duration += float(ms) - return duration - - -def prepend_extension(filename, ext, expected_real_ext=None): - name, real_ext = os.path.splitext(filename) - return ( - '{0}.{1}{2}'.format(name, ext, real_ext) - if not expected_real_ext or real_ext[1:] == expected_real_ext - else '{0}.{1}'.format(filename, ext)) - - -def replace_extension(filename, ext, expected_real_ext=None): - name, real_ext = os.path.splitext(filename) - return '{0}.{1}'.format( - name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename, - ext) - - -def check_executable(exe, args=[]): - """ Checks if the given binary is installed somewhere in PATH, and returns its name. - args can be a list of arguments for a short output (like -version) """ - try: - subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() - except OSError: - return False - return exe - - -def get_exe_version(exe, args=['--version'], - version_re=None, unrecognized='present'): - """ Returns the version of the specified executable, - or False if the executable is not present """ - try: - # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers - # SIGTTOU if youtube-dl is run in the background. - # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656 - out, _ = subprocess.Popen( - [encodeArgument(exe)] + args, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() - except OSError: - return False - if isinstance(out, bytes): # Python 2.x - out = out.decode('ascii', 'ignore') - return detect_exe_version(out, version_re, unrecognized) - - -def detect_exe_version(output, version_re=None, unrecognized='present'): - assert isinstance(output, compat_str) - if version_re is None: - version_re = r'version\s+([-0-9._a-zA-Z]+)' - m = re.search(version_re, output) - if m: - return m.group(1) - else: - return unrecognized - - -class PagedList(object): - def __len__(self): - # This is only useful for tests - return len(self.getslice()) - - -class OnDemandPagedList(PagedList): - def __init__(self, pagefunc, pagesize, use_cache=True): - self._pagefunc = pagefunc - self._pagesize = pagesize - self._use_cache = use_cache - if use_cache: - self._cache = {} - - def getslice(self, start=0, end=None): - res = [] - for pagenum in itertools.count(start // self._pagesize): - firstid = pagenum * self._pagesize - nextfirstid = pagenum * self._pagesize + self._pagesize - if start >= nextfirstid: - continue - - page_results = None - if self._use_cache: - page_results = self._cache.get(pagenum) - if page_results is None: - page_results = list(self._pagefunc(pagenum)) - if self._use_cache: - self._cache[pagenum] = page_results - - startv = ( - start % self._pagesize - if firstid <= start < nextfirstid - else 0) - - endv = ( - ((end - 1) % self._pagesize) + 1 - if (end is not None and firstid <= end <= nextfirstid) - else None) - - if startv != 0 or endv is not None: - page_results = page_results[startv:endv] - res.extend(page_results) - - # A little optimization - if current page is not "full", ie. does - # not contain page_size videos then we can assume that this page - # is the last one - there are no more ids on further pages - - # i.e. no need to query again. - if len(page_results) + startv < self._pagesize: - break - - # If we got the whole page, but the next page is not interesting, - # break out early as well - if end == nextfirstid: - break - return res - - -class InAdvancePagedList(PagedList): - def __init__(self, pagefunc, pagecount, pagesize): - self._pagefunc = pagefunc - self._pagecount = pagecount - self._pagesize = pagesize - - def getslice(self, start=0, end=None): - res = [] - start_page = start // self._pagesize - end_page = ( - self._pagecount if end is None else (end // self._pagesize + 1)) - skip_elems = start - start_page * self._pagesize - only_more = None if end is None else end - start - for pagenum in range(start_page, end_page): - page = list(self._pagefunc(pagenum)) - if skip_elems: - page = page[skip_elems:] - skip_elems = None - if only_more is not None: - if len(page) < only_more: - only_more -= len(page) - else: - page = page[:only_more] - res.extend(page) - break - res.extend(page) - return res - - -def uppercase_escape(s): - unicode_escape = codecs.getdecoder('unicode_escape') - return re.sub( - r'\\U[0-9a-fA-F]{8}', - lambda m: unicode_escape(m.group(0))[0], - s) - - -def lowercase_escape(s): - unicode_escape = codecs.getdecoder('unicode_escape') - return re.sub( - r'\\u[0-9a-fA-F]{4}', - lambda m: unicode_escape(m.group(0))[0], - s) - - -def escape_rfc3986(s): - """Escape non-ASCII characters as suggested by RFC 3986""" - if sys.version_info < (3, 0) and isinstance(s, compat_str): - s = s.encode('utf-8') - return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") - - -def escape_url(url): - """Escape URL as suggested by RFC 3986""" - url_parsed = compat_urllib_parse_urlparse(url) - return url_parsed._replace( - netloc=url_parsed.netloc.encode('idna').decode('ascii'), - path=escape_rfc3986(url_parsed.path), - params=escape_rfc3986(url_parsed.params), - query=escape_rfc3986(url_parsed.query), - fragment=escape_rfc3986(url_parsed.fragment) - ).geturl() - - -def read_batch_urls(batch_fd): - def fixup(url): - if not isinstance(url, compat_str): - url = url.decode('utf-8', 'replace') - BOM_UTF8 = '\xef\xbb\xbf' - if url.startswith(BOM_UTF8): - url = url[len(BOM_UTF8):] - url = url.strip() - if url.startswith(('#', ';', ']')): - return False - return url - - with contextlib.closing(batch_fd) as fd: - return [url for url in map(fixup, fd) if url] - - -def urlencode_postdata(*args, **kargs): - return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') - - -def update_url_query(url, query): - if not query: - return url - parsed_url = compat_urlparse.urlparse(url) - qs = compat_parse_qs(parsed_url.query) - qs.update(query) - return compat_urlparse.urlunparse(parsed_url._replace( - query=compat_urllib_parse_urlencode(qs, True))) - - -def update_Request(req, url=None, data=None, headers={}, query={}): - req_headers = req.headers.copy() - req_headers.update(headers) - req_data = data or req.data - req_url = update_url_query(url or req.get_full_url(), query) - req_get_method = req.get_method() - if req_get_method == 'HEAD': - req_type = HEADRequest - elif req_get_method == 'PUT': - req_type = PUTRequest - else: - req_type = compat_urllib_request.Request - new_req = req_type( - req_url, data=req_data, headers=req_headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - if hasattr(req, 'timeout'): - new_req.timeout = req.timeout - return new_req - - -def _multipart_encode_impl(data, boundary): - content_type = 'multipart/form-data; boundary=%s' % boundary - - out = b'' - for k, v in data.items(): - out += b'--' + boundary.encode('ascii') + b'\r\n' - if isinstance(k, compat_str): - k = k.encode('utf-8') - if isinstance(v, compat_str): - v = v.encode('utf-8') - # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 - # suggests sending UTF-8 directly. Firefox sends UTF-8, too - content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n' - if boundary.encode('ascii') in content: - raise ValueError('Boundary overlaps with data') - out += content - - out += b'--' + boundary.encode('ascii') + b'--\r\n' - - return out, content_type - - -def multipart_encode(data, boundary=None): - ''' - Encode a dict to RFC 7578-compliant form-data - - data: - A dict where keys and values can be either Unicode or bytes-like - objects. - boundary: - If specified a Unicode object, it's used as the boundary. Otherwise - a random boundary is generated. - - Reference: https://tools.ietf.org/html/rfc7578 - ''' - has_specified_boundary = boundary is not None - - while True: - if boundary is None: - boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) - - try: - out, content_type = _multipart_encode_impl(data, boundary) - break - except ValueError: - if has_specified_boundary: - raise - boundary = None - - return out, content_type - - -def dict_get(d, key_or_keys, default=None, skip_false_values=True): - if isinstance(key_or_keys, (list, tuple)): - for key in key_or_keys: - if key not in d or d[key] is None or skip_false_values and not d[key]: - continue - return d[key] - return default - return d.get(key_or_keys, default) - - -def try_get(src, getter, expected_type=None): - if not isinstance(getter, (list, tuple)): - getter = [getter] - for get in getter: - try: - v = get(src) - except (AttributeError, KeyError, TypeError, IndexError): - pass - else: - if expected_type is None or isinstance(v, expected_type): - return v - - -def merge_dicts(*dicts): - merged = {} - for a_dict in dicts: - for k, v in a_dict.items(): - if v is None: - continue - if (k not in merged or - (isinstance(v, compat_str) and v and - isinstance(merged[k], compat_str) and - not merged[k])): - merged[k] = v - return merged - - -def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): - return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) - - -US_RATINGS = { - 'G': 0, - 'PG': 10, - 'PG-13': 13, - 'R': 16, - 'NC': 18, -} - - -TV_PARENTAL_GUIDELINES = { - 'TV-Y': 0, - 'TV-Y7': 7, - 'TV-G': 0, - 'TV-PG': 0, - 'TV-14': 14, - 'TV-MA': 17, -} - - -def parse_age_limit(s): - if type(s) == int: - return s if 0 <= s <= 21 else None - if not isinstance(s, compat_basestring): - return None - m = re.match(r'^(?P<age>\d{1,2})\+?$', s) - if m: - return int(m.group('age')) - if s in US_RATINGS: - return US_RATINGS[s] - m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s) - if m: - return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] - return None - - -def strip_jsonp(code): - return re.sub( - r'''(?sx)^ - (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*) - (?:\s*&&\s*(?P=func_name))? - \s*\(\s*(?P<callback_data>.*)\);? - \s*?(?://[^\n]*)*$''', - r'\g<callback_data>', code) - - -def js_to_json(code): - COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*' - SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) - INTEGER_TABLE = ( - (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), - (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), - ) - - def fix_kv(m): - v = m.group(0) - if v in ('true', 'false', 'null'): - return v - elif v.startswith('/*') or v.startswith('//') or v == ',': - return "" - - if v[0] in ("'", '"'): - v = re.sub(r'(?s)\\.|"', lambda m: { - '"': '\\"', - "\\'": "'", - '\\\n': '', - '\\x': '\\u00', - }.get(m.group(0), m.group(0)), v[1:-1]) - - for regex, base in INTEGER_TABLE: - im = re.match(regex, v) - if im: - i = int(im.group(1), base) - return '"%d":' % i if v.endswith(':') else '%d' % i - - return '"%s"' % v - - return re.sub(r'''(?sx) - "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| - '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - {comment}|,(?={skip}[\]}}])| - (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| - \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| - [0-9]+(?={skip}:) - '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) - - -def qualities(quality_ids): - """ Get a numeric quality value out of a list of possible values """ - def q(qid): - try: - return quality_ids.index(qid) - except ValueError: - return -1 - return q - - -DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s' - - -def limit_length(s, length): - """ Add ellipses to overly long strings """ - if s is None: - return None - ELLIPSES = '...' - if len(s) > length: - return s[:length - len(ELLIPSES)] + ELLIPSES - return s - - -def version_tuple(v): - return tuple(int(e) for e in re.split(r'[-.]', v)) - - -def is_outdated_version(version, limit, assume_new=True): - if not version: - return not assume_new - try: - return version_tuple(version) < version_tuple(limit) - except ValueError: - return not assume_new - - -def ytdl_is_updateable(): - """ Returns if youtube-dl can be updated with -U """ - from zipimport import zipimporter - - return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen') - - -def args_to_str(args): - # Get a short string representation for a subprocess command - return ' '.join(compat_shlex_quote(a) for a in args) - - -def error_to_compat_str(err): - err_str = str(err) - # On python 2 error byte string must be decoded with proper - # encoding rather than ascii - if sys.version_info[0] < 3: - err_str = err_str.decode(preferredencoding()) - return err_str - - -def mimetype2ext(mt): - if mt is None: - return None - - ext = { - 'audio/mp4': 'm4a', - # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as - # it's the most popular one - 'audio/mpeg': 'mp3', - }.get(mt) - if ext is not None: - return ext - - _, _, res = mt.rpartition('/') - res = res.split(';')[0].strip().lower() - - return { - '3gpp': '3gp', - 'smptett+xml': 'tt', - 'ttaf+xml': 'dfxp', - 'ttml+xml': 'ttml', - 'x-flv': 'flv', - 'x-mp4-fragmented': 'mp4', - 'x-ms-sami': 'sami', - 'x-ms-wmv': 'wmv', - 'mpegurl': 'm3u8', - 'x-mpegurl': 'm3u8', - 'vnd.apple.mpegurl': 'm3u8', - 'dash+xml': 'mpd', - 'f4m+xml': 'f4m', - 'hds+xml': 'f4m', - 'vnd.ms-sstr+xml': 'ism', - 'quicktime': 'mov', - 'mp2t': 'ts', - }.get(res, res) - - -def parse_codecs(codecs_str): - # http://tools.ietf.org/html/rfc6381 - if not codecs_str: - return {} - splited_codecs = list(filter(None, map( - lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) - vcodec, acodec = None, None - for full_codec in splited_codecs: - codec = full_codec.split('.')[0] - if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01'): - if not vcodec: - vcodec = full_codec - elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): - if not acodec: - acodec = full_codec - else: - write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) - if not vcodec and not acodec: - if len(splited_codecs) == 2: - return { - 'vcodec': vcodec, - 'acodec': acodec, - } - elif len(splited_codecs) == 1: - return { - 'vcodec': 'none', - 'acodec': vcodec, - } - else: - return { - 'vcodec': vcodec or 'none', - 'acodec': acodec or 'none', - } - return {} - - -def urlhandle_detect_ext(url_handle): - getheader = url_handle.headers.get - - cd = getheader('Content-Disposition') - if cd: - m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) - if m: - e = determine_ext(m.group('filename'), default_ext=None) - if e: - return e - - return mimetype2ext(getheader('Content-Type')) - - -def encode_data_uri(data, mime_type): - return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) - - -def age_restricted(content_limit, age_limit): - """ Returns True iff the content should be blocked """ - - if age_limit is None: # No limit set - return False - if content_limit is None: - return False # Content available for everyone - return age_limit < content_limit - - -def is_html(first_bytes): - """ Detect whether a file contains HTML by examining its first bytes. """ - - BOMS = [ - (b'\xef\xbb\xbf', 'utf-8'), - (b'\x00\x00\xfe\xff', 'utf-32-be'), - (b'\xff\xfe\x00\x00', 'utf-32-le'), - (b'\xff\xfe', 'utf-16-le'), - (b'\xfe\xff', 'utf-16-be'), - ] - for bom, enc in BOMS: - if first_bytes.startswith(bom): - s = first_bytes[len(bom):].decode(enc, 'replace') - break - else: - s = first_bytes.decode('utf-8', 'replace') - - return re.match(r'^\s*<', s) - - -def determine_protocol(info_dict): - protocol = info_dict.get('protocol') - if protocol is not None: - return protocol - - url = info_dict['url'] - if url.startswith('rtmp'): - return 'rtmp' - elif url.startswith('mms'): - return 'mms' - elif url.startswith('rtsp'): - return 'rtsp' - - ext = determine_ext(url) - if ext == 'm3u8': - return 'm3u8' - elif ext == 'f4m': - return 'f4m' - - return compat_urllib_parse_urlparse(url).scheme - - -def render_table(header_row, data): - """ Render a list of rows, each as a list of values """ - table = [header_row] + data - max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)] - format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s' - return '\n'.join(format_str % tuple(row) for row in table) - - -def _match_one(filter_part, dct): - COMPARISON_OPERATORS = { - '<': operator.lt, - '<=': operator.le, - '>': operator.gt, - '>=': operator.ge, - '=': operator.eq, - '!=': operator.ne, - } - operator_rex = re.compile(r'''(?x)\s* - (?P<key>[a-z_]+) - \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* - (?: - (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)| - (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)| - (?P<strval>(?![0-9.])[a-z0-9A-Z]*) - ) - \s*$ - ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))) - m = operator_rex.search(filter_part) - if m: - op = COMPARISON_OPERATORS[m.group('op')] - actual_value = dct.get(m.group('key')) - if (m.group('quotedstrval') is not None or - m.group('strval') is not None or - # If the original field is a string and matching comparisonvalue is - # a number we should respect the origin of the original field - # and process comparison value as a string (see - # https://github.com/rg3/youtube-dl/issues/11082). - actual_value is not None and m.group('intval') is not None and - isinstance(actual_value, compat_str)): - if m.group('op') not in ('=', '!='): - raise ValueError( - 'Operator %s does not support string values!' % m.group('op')) - comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval') - quote = m.group('quote') - if quote is not None: - comparison_value = comparison_value.replace(r'\%s' % quote, quote) - else: - try: - comparison_value = int(m.group('intval')) - except ValueError: - comparison_value = parse_filesize(m.group('intval')) - if comparison_value is None: - comparison_value = parse_filesize(m.group('intval') + 'B') - if comparison_value is None: - raise ValueError( - 'Invalid integer value %r in filter part %r' % ( - m.group('intval'), filter_part)) - if actual_value is None: - return m.group('none_inclusive') - return op(actual_value, comparison_value) - - UNARY_OPERATORS = { - '': lambda v: (v is True) if isinstance(v, bool) else (v is not None), - '!': lambda v: (v is False) if isinstance(v, bool) else (v is None), - } - operator_rex = re.compile(r'''(?x)\s* - (?P<op>%s)\s*(?P<key>[a-z_]+) - \s*$ - ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys()))) - m = operator_rex.search(filter_part) - if m: - op = UNARY_OPERATORS[m.group('op')] - actual_value = dct.get(m.group('key')) - return op(actual_value) - - raise ValueError('Invalid filter part %r' % filter_part) - - -def match_str(filter_str, dct): - """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """ - - return all( - _match_one(filter_part, dct) for filter_part in filter_str.split('&')) - - -def match_filter_func(filter_str): - def _match_func(info_dict): - if match_str(filter_str, info_dict): - return None - else: - video_title = info_dict.get('title', info_dict.get('id', 'video')) - return '%s does not pass filter %s, skipping ..' % (video_title, filter_str) - return _match_func - - -def parse_dfxp_time_expr(time_expr): - if not time_expr: - return - - mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr) - if mobj: - return float(mobj.group('time_offset')) - - mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr) - if mobj: - return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.')) - - -def srt_subtitles_timecode(seconds): - return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) - - -def dfxp2srt(dfxp_data): - ''' - @param dfxp_data A bytes-like object containing DFXP data - @returns A unicode object containing converted SRT data - ''' - LEGACY_NAMESPACES = ( - (b'http://www.w3.org/ns/ttml', [ - b'http://www.w3.org/2004/11/ttaf1', - b'http://www.w3.org/2006/04/ttaf1', - b'http://www.w3.org/2006/10/ttaf1', - ]), - (b'http://www.w3.org/ns/ttml#styling', [ - b'http://www.w3.org/ns/ttml#style', - ]), - ) - - SUPPORTED_STYLING = [ - 'color', - 'fontFamily', - 'fontSize', - 'fontStyle', - 'fontWeight', - 'textDecoration' - ] - - _x = functools.partial(xpath_with_ns, ns_map={ - 'xml': 'http://www.w3.org/XML/1998/namespace', - 'ttml': 'http://www.w3.org/ns/ttml', - 'tts': 'http://www.w3.org/ns/ttml#styling', - }) - - styles = {} - default_style = {} - - class TTMLPElementParser(object): - _out = '' - _unclosed_elements = [] - _applied_styles = [] - - def start(self, tag, attrib): - if tag in (_x('ttml:br'), 'br'): - self._out += '\n' - else: - unclosed_elements = [] - style = {} - element_style_id = attrib.get('style') - if default_style: - style.update(default_style) - if element_style_id: - style.update(styles.get(element_style_id, {})) - for prop in SUPPORTED_STYLING: - prop_val = attrib.get(_x('tts:' + prop)) - if prop_val: - style[prop] = prop_val - if style: - font = '' - for k, v in sorted(style.items()): - if self._applied_styles and self._applied_styles[-1].get(k) == v: - continue - if k == 'color': - font += ' color="%s"' % v - elif k == 'fontSize': - font += ' size="%s"' % v - elif k == 'fontFamily': - font += ' face="%s"' % v - elif k == 'fontWeight' and v == 'bold': - self._out += '<b>' - unclosed_elements.append('b') - elif k == 'fontStyle' and v == 'italic': - self._out += '<i>' - unclosed_elements.append('i') - elif k == 'textDecoration' and v == 'underline': - self._out += '<u>' - unclosed_elements.append('u') - if font: - self._out += '<font' + font + '>' - unclosed_elements.append('font') - applied_style = {} - if self._applied_styles: - applied_style.update(self._applied_styles[-1]) - applied_style.update(style) - self._applied_styles.append(applied_style) - self._unclosed_elements.append(unclosed_elements) - - def end(self, tag): - if tag not in (_x('ttml:br'), 'br'): - unclosed_elements = self._unclosed_elements.pop() - for element in reversed(unclosed_elements): - self._out += '</%s>' % element - if unclosed_elements and self._applied_styles: - self._applied_styles.pop() - - def data(self, data): - self._out += data - - def close(self): - return self._out.strip() - - def parse_node(node): - target = TTMLPElementParser() - parser = xml.etree.ElementTree.XMLParser(target=target) - parser.feed(xml.etree.ElementTree.tostring(node)) - return parser.close() - - for k, v in LEGACY_NAMESPACES: - for ns in v: - dfxp_data = dfxp_data.replace(ns, k) - - dfxp = compat_etree_fromstring(dfxp_data) - out = [] - paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') - - if not paras: - raise ValueError('Invalid dfxp/TTML subtitle') - - repeat = False - while True: - for style in dfxp.findall(_x('.//ttml:style')): - style_id = style.get('id') or style.get(_x('xml:id')) - if not style_id: - continue - parent_style_id = style.get('style') - if parent_style_id: - if parent_style_id not in styles: - repeat = True - continue - styles[style_id] = styles[parent_style_id].copy() - for prop in SUPPORTED_STYLING: - prop_val = style.get(_x('tts:' + prop)) - if prop_val: - styles.setdefault(style_id, {})[prop] = prop_val - if repeat: - repeat = False - else: - break - - for p in ('body', 'div'): - ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p]) - if ele is None: - continue - style = styles.get(ele.get('style')) - if not style: - continue - default_style.update(style) - - for para, index in zip(paras, itertools.count(1)): - begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) - end_time = parse_dfxp_time_expr(para.attrib.get('end')) - dur = parse_dfxp_time_expr(para.attrib.get('dur')) - if begin_time is None: - continue - if not end_time: - if not dur: - continue - end_time = begin_time + dur - out.append('%d\n%s --> %s\n%s\n\n' % ( - index, - srt_subtitles_timecode(begin_time), - srt_subtitles_timecode(end_time), - parse_node(para))) - - return ''.join(out) - - -def cli_option(params, command_option, param): - param = params.get(param) - if param: - param = compat_str(param) - return [command_option, param] if param is not None else [] - - -def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): - param = params.get(param) - if param is None: - return [] - assert isinstance(param, bool) - if separator: - return [command_option + separator + (true_value if param else false_value)] - return [command_option, true_value if param else false_value] - - -def cli_valueless_option(params, command_option, param, expected_value=True): - param = params.get(param) - return [command_option] if param == expected_value else [] - - -def cli_configuration_args(params, param, default=[]): - ex_args = params.get(param) - if ex_args is None: - return default - assert isinstance(ex_args, list) - return ex_args - - -class ISO639Utils(object): - # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt - _lang_map = { - 'aa': 'aar', - 'ab': 'abk', - 'ae': 'ave', - 'af': 'afr', - 'ak': 'aka', - 'am': 'amh', - 'an': 'arg', - 'ar': 'ara', - 'as': 'asm', - 'av': 'ava', - 'ay': 'aym', - 'az': 'aze', - 'ba': 'bak', - 'be': 'bel', - 'bg': 'bul', - 'bh': 'bih', - 'bi': 'bis', - 'bm': 'bam', - 'bn': 'ben', - 'bo': 'bod', - 'br': 'bre', - 'bs': 'bos', - 'ca': 'cat', - 'ce': 'che', - 'ch': 'cha', - 'co': 'cos', - 'cr': 'cre', - 'cs': 'ces', - 'cu': 'chu', - 'cv': 'chv', - 'cy': 'cym', - 'da': 'dan', - 'de': 'deu', - 'dv': 'div', - 'dz': 'dzo', - 'ee': 'ewe', - 'el': 'ell', - 'en': 'eng', - 'eo': 'epo', - 'es': 'spa', - 'et': 'est', - 'eu': 'eus', - 'fa': 'fas', - 'ff': 'ful', - 'fi': 'fin', - 'fj': 'fij', - 'fo': 'fao', - 'fr': 'fra', - 'fy': 'fry', - 'ga': 'gle', - 'gd': 'gla', - 'gl': 'glg', - 'gn': 'grn', - 'gu': 'guj', - 'gv': 'glv', - 'ha': 'hau', - 'he': 'heb', - 'iw': 'heb', # Replaced by he in 1989 revision - 'hi': 'hin', - 'ho': 'hmo', - 'hr': 'hrv', - 'ht': 'hat', - 'hu': 'hun', - 'hy': 'hye', - 'hz': 'her', - 'ia': 'ina', - 'id': 'ind', - 'in': 'ind', # Replaced by id in 1989 revision - 'ie': 'ile', - 'ig': 'ibo', - 'ii': 'iii', - 'ik': 'ipk', - 'io': 'ido', - 'is': 'isl', - 'it': 'ita', - 'iu': 'iku', - 'ja': 'jpn', - 'jv': 'jav', - 'ka': 'kat', - 'kg': 'kon', - 'ki': 'kik', - 'kj': 'kua', - 'kk': 'kaz', - 'kl': 'kal', - 'km': 'khm', - 'kn': 'kan', - 'ko': 'kor', - 'kr': 'kau', - 'ks': 'kas', - 'ku': 'kur', - 'kv': 'kom', - 'kw': 'cor', - 'ky': 'kir', - 'la': 'lat', - 'lb': 'ltz', - 'lg': 'lug', - 'li': 'lim', - 'ln': 'lin', - 'lo': 'lao', - 'lt': 'lit', - 'lu': 'lub', - 'lv': 'lav', - 'mg': 'mlg', - 'mh': 'mah', - 'mi': 'mri', - 'mk': 'mkd', - 'ml': 'mal', - 'mn': 'mon', - 'mr': 'mar', - 'ms': 'msa', - 'mt': 'mlt', - 'my': 'mya', - 'na': 'nau', - 'nb': 'nob', - 'nd': 'nde', - 'ne': 'nep', - 'ng': 'ndo', - 'nl': 'nld', - 'nn': 'nno', - 'no': 'nor', - 'nr': 'nbl', - 'nv': 'nav', - 'ny': 'nya', - 'oc': 'oci', - 'oj': 'oji', - 'om': 'orm', - 'or': 'ori', - 'os': 'oss', - 'pa': 'pan', - 'pi': 'pli', - 'pl': 'pol', - 'ps': 'pus', - 'pt': 'por', - 'qu': 'que', - 'rm': 'roh', - 'rn': 'run', - 'ro': 'ron', - 'ru': 'rus', - 'rw': 'kin', - 'sa': 'san', - 'sc': 'srd', - 'sd': 'snd', - 'se': 'sme', - 'sg': 'sag', - 'si': 'sin', - 'sk': 'slk', - 'sl': 'slv', - 'sm': 'smo', - 'sn': 'sna', - 'so': 'som', - 'sq': 'sqi', - 'sr': 'srp', - 'ss': 'ssw', - 'st': 'sot', - 'su': 'sun', - 'sv': 'swe', - 'sw': 'swa', - 'ta': 'tam', - 'te': 'tel', - 'tg': 'tgk', - 'th': 'tha', - 'ti': 'tir', - 'tk': 'tuk', - 'tl': 'tgl', - 'tn': 'tsn', - 'to': 'ton', - 'tr': 'tur', - 'ts': 'tso', - 'tt': 'tat', - 'tw': 'twi', - 'ty': 'tah', - 'ug': 'uig', - 'uk': 'ukr', - 'ur': 'urd', - 'uz': 'uzb', - 've': 'ven', - 'vi': 'vie', - 'vo': 'vol', - 'wa': 'wln', - 'wo': 'wol', - 'xh': 'xho', - 'yi': 'yid', - 'ji': 'yid', # Replaced by yi in 1989 revision - 'yo': 'yor', - 'za': 'zha', - 'zh': 'zho', - 'zu': 'zul', - } - - @classmethod - def short2long(cls, code): - """Convert language code from ISO 639-1 to ISO 639-2/T""" - return cls._lang_map.get(code[:2]) - - @classmethod - def long2short(cls, code): - """Convert language code from ISO 639-2/T to ISO 639-1""" - for short_name, long_name in cls._lang_map.items(): - if long_name == code: - return short_name - - -class ISO3166Utils(object): - # From http://data.okfn.org/data/core/country-list - _country_map = { - 'AF': 'Afghanistan', - 'AX': 'Åland Islands', - 'AL': 'Albania', - 'DZ': 'Algeria', - 'AS': 'American Samoa', - 'AD': 'Andorra', - 'AO': 'Angola', - 'AI': 'Anguilla', - 'AQ': 'Antarctica', - 'AG': 'Antigua and Barbuda', - 'AR': 'Argentina', - 'AM': 'Armenia', - 'AW': 'Aruba', - 'AU': 'Australia', - 'AT': 'Austria', - 'AZ': 'Azerbaijan', - 'BS': 'Bahamas', - 'BH': 'Bahrain', - 'BD': 'Bangladesh', - 'BB': 'Barbados', - 'BY': 'Belarus', - 'BE': 'Belgium', - 'BZ': 'Belize', - 'BJ': 'Benin', - 'BM': 'Bermuda', - 'BT': 'Bhutan', - 'BO': 'Bolivia, Plurinational State of', - 'BQ': 'Bonaire, Sint Eustatius and Saba', - 'BA': 'Bosnia and Herzegovina', - 'BW': 'Botswana', - 'BV': 'Bouvet Island', - 'BR': 'Brazil', - 'IO': 'British Indian Ocean Territory', - 'BN': 'Brunei Darussalam', - 'BG': 'Bulgaria', - 'BF': 'Burkina Faso', - 'BI': 'Burundi', - 'KH': 'Cambodia', - 'CM': 'Cameroon', - 'CA': 'Canada', - 'CV': 'Cape Verde', - 'KY': 'Cayman Islands', - 'CF': 'Central African Republic', - 'TD': 'Chad', - 'CL': 'Chile', - 'CN': 'China', - 'CX': 'Christmas Island', - 'CC': 'Cocos (Keeling) Islands', - 'CO': 'Colombia', - 'KM': 'Comoros', - 'CG': 'Congo', - 'CD': 'Congo, the Democratic Republic of the', - 'CK': 'Cook Islands', - 'CR': 'Costa Rica', - 'CI': 'Côte d\'Ivoire', - 'HR': 'Croatia', - 'CU': 'Cuba', - 'CW': 'Curaçao', - 'CY': 'Cyprus', - 'CZ': 'Czech Republic', - 'DK': 'Denmark', - 'DJ': 'Djibouti', - 'DM': 'Dominica', - 'DO': 'Dominican Republic', - 'EC': 'Ecuador', - 'EG': 'Egypt', - 'SV': 'El Salvador', - 'GQ': 'Equatorial Guinea', - 'ER': 'Eritrea', - 'EE': 'Estonia', - 'ET': 'Ethiopia', - 'FK': 'Falkland Islands (Malvinas)', - 'FO': 'Faroe Islands', - 'FJ': 'Fiji', - 'FI': 'Finland', - 'FR': 'France', - 'GF': 'French Guiana', - 'PF': 'French Polynesia', - 'TF': 'French Southern Territories', - 'GA': 'Gabon', - 'GM': 'Gambia', - 'GE': 'Georgia', - 'DE': 'Germany', - 'GH': 'Ghana', - 'GI': 'Gibraltar', - 'GR': 'Greece', - 'GL': 'Greenland', - 'GD': 'Grenada', - 'GP': 'Guadeloupe', - 'GU': 'Guam', - 'GT': 'Guatemala', - 'GG': 'Guernsey', - 'GN': 'Guinea', - 'GW': 'Guinea-Bissau', - 'GY': 'Guyana', - 'HT': 'Haiti', - 'HM': 'Heard Island and McDonald Islands', - 'VA': 'Holy See (Vatican City State)', - 'HN': 'Honduras', - 'HK': 'Hong Kong', - 'HU': 'Hungary', - 'IS': 'Iceland', - 'IN': 'India', - 'ID': 'Indonesia', - 'IR': 'Iran, Islamic Republic of', - 'IQ': 'Iraq', - 'IE': 'Ireland', - 'IM': 'Isle of Man', - 'IL': 'Israel', - 'IT': 'Italy', - 'JM': 'Jamaica', - 'JP': 'Japan', - 'JE': 'Jersey', - 'JO': 'Jordan', - 'KZ': 'Kazakhstan', - 'KE': 'Kenya', - 'KI': 'Kiribati', - 'KP': 'Korea, Democratic People\'s Republic of', - 'KR': 'Korea, Republic of', - 'KW': 'Kuwait', - 'KG': 'Kyrgyzstan', - 'LA': 'Lao People\'s Democratic Republic', - 'LV': 'Latvia', - 'LB': 'Lebanon', - 'LS': 'Lesotho', - 'LR': 'Liberia', - 'LY': 'Libya', - 'LI': 'Liechtenstein', - 'LT': 'Lithuania', - 'LU': 'Luxembourg', - 'MO': 'Macao', - 'MK': 'Macedonia, the Former Yugoslav Republic of', - 'MG': 'Madagascar', - 'MW': 'Malawi', - 'MY': 'Malaysia', - 'MV': 'Maldives', - 'ML': 'Mali', - 'MT': 'Malta', - 'MH': 'Marshall Islands', - 'MQ': 'Martinique', - 'MR': 'Mauritania', - 'MU': 'Mauritius', - 'YT': 'Mayotte', - 'MX': 'Mexico', - 'FM': 'Micronesia, Federated States of', - 'MD': 'Moldova, Republic of', - 'MC': 'Monaco', - 'MN': 'Mongolia', - 'ME': 'Montenegro', - 'MS': 'Montserrat', - 'MA': 'Morocco', - 'MZ': 'Mozambique', - 'MM': 'Myanmar', - 'NA': 'Namibia', - 'NR': 'Nauru', - 'NP': 'Nepal', - 'NL': 'Netherlands', - 'NC': 'New Caledonia', - 'NZ': 'New Zealand', - 'NI': 'Nicaragua', - 'NE': 'Niger', - 'NG': 'Nigeria', - 'NU': 'Niue', - 'NF': 'Norfolk Island', - 'MP': 'Northern Mariana Islands', - 'NO': 'Norway', - 'OM': 'Oman', - 'PK': 'Pakistan', - 'PW': 'Palau', - 'PS': 'Palestine, State of', - 'PA': 'Panama', - 'PG': 'Papua New Guinea', - 'PY': 'Paraguay', - 'PE': 'Peru', - 'PH': 'Philippines', - 'PN': 'Pitcairn', - 'PL': 'Poland', - 'PT': 'Portugal', - 'PR': 'Puerto Rico', - 'QA': 'Qatar', - 'RE': 'Réunion', - 'RO': 'Romania', - 'RU': 'Russian Federation', - 'RW': 'Rwanda', - 'BL': 'Saint Barthélemy', - 'SH': 'Saint Helena, Ascension and Tristan da Cunha', - 'KN': 'Saint Kitts and Nevis', - 'LC': 'Saint Lucia', - 'MF': 'Saint Martin (French part)', - 'PM': 'Saint Pierre and Miquelon', - 'VC': 'Saint Vincent and the Grenadines', - 'WS': 'Samoa', - 'SM': 'San Marino', - 'ST': 'Sao Tome and Principe', - 'SA': 'Saudi Arabia', - 'SN': 'Senegal', - 'RS': 'Serbia', - 'SC': 'Seychelles', - 'SL': 'Sierra Leone', - 'SG': 'Singapore', - 'SX': 'Sint Maarten (Dutch part)', - 'SK': 'Slovakia', - 'SI': 'Slovenia', - 'SB': 'Solomon Islands', - 'SO': 'Somalia', - 'ZA': 'South Africa', - 'GS': 'South Georgia and the South Sandwich Islands', - 'SS': 'South Sudan', - 'ES': 'Spain', - 'LK': 'Sri Lanka', - 'SD': 'Sudan', - 'SR': 'Suriname', - 'SJ': 'Svalbard and Jan Mayen', - 'SZ': 'Swaziland', - 'SE': 'Sweden', - 'CH': 'Switzerland', - 'SY': 'Syrian Arab Republic', - 'TW': 'Taiwan, Province of China', - 'TJ': 'Tajikistan', - 'TZ': 'Tanzania, United Republic of', - 'TH': 'Thailand', - 'TL': 'Timor-Leste', - 'TG': 'Togo', - 'TK': 'Tokelau', - 'TO': 'Tonga', - 'TT': 'Trinidad and Tobago', - 'TN': 'Tunisia', - 'TR': 'Turkey', - 'TM': 'Turkmenistan', - 'TC': 'Turks and Caicos Islands', - 'TV': 'Tuvalu', - 'UG': 'Uganda', - 'UA': 'Ukraine', - 'AE': 'United Arab Emirates', - 'GB': 'United Kingdom', - 'US': 'United States', - 'UM': 'United States Minor Outlying Islands', - 'UY': 'Uruguay', - 'UZ': 'Uzbekistan', - 'VU': 'Vanuatu', - 'VE': 'Venezuela, Bolivarian Republic of', - 'VN': 'Viet Nam', - 'VG': 'Virgin Islands, British', - 'VI': 'Virgin Islands, U.S.', - 'WF': 'Wallis and Futuna', - 'EH': 'Western Sahara', - 'YE': 'Yemen', - 'ZM': 'Zambia', - 'ZW': 'Zimbabwe', - } - - @classmethod - def short2full(cls, code): - """Convert an ISO 3166-2 country code to the corresponding full name""" - return cls._country_map.get(code.upper()) - - -class GeoUtils(object): - # Major IPv4 address blocks per country - _country_ip_map = { - 'AD': '85.94.160.0/19', - 'AE': '94.200.0.0/13', - 'AF': '149.54.0.0/17', - 'AG': '209.59.64.0/18', - 'AI': '204.14.248.0/21', - 'AL': '46.99.0.0/16', - 'AM': '46.70.0.0/15', - 'AO': '105.168.0.0/13', - 'AP': '159.117.192.0/21', - 'AR': '181.0.0.0/12', - 'AS': '202.70.112.0/20', - 'AT': '84.112.0.0/13', - 'AU': '1.128.0.0/11', - 'AW': '181.41.0.0/18', - 'AZ': '5.191.0.0/16', - 'BA': '31.176.128.0/17', - 'BB': '65.48.128.0/17', - 'BD': '114.130.0.0/16', - 'BE': '57.0.0.0/8', - 'BF': '129.45.128.0/17', - 'BG': '95.42.0.0/15', - 'BH': '37.131.0.0/17', - 'BI': '154.117.192.0/18', - 'BJ': '137.255.0.0/16', - 'BL': '192.131.134.0/24', - 'BM': '196.12.64.0/18', - 'BN': '156.31.0.0/16', - 'BO': '161.56.0.0/16', - 'BQ': '161.0.80.0/20', - 'BR': '152.240.0.0/12', - 'BS': '24.51.64.0/18', - 'BT': '119.2.96.0/19', - 'BW': '168.167.0.0/16', - 'BY': '178.120.0.0/13', - 'BZ': '179.42.192.0/18', - 'CA': '99.224.0.0/11', - 'CD': '41.243.0.0/16', - 'CF': '196.32.200.0/21', - 'CG': '197.214.128.0/17', - 'CH': '85.0.0.0/13', - 'CI': '154.232.0.0/14', - 'CK': '202.65.32.0/19', - 'CL': '152.172.0.0/14', - 'CM': '165.210.0.0/15', - 'CN': '36.128.0.0/10', - 'CO': '181.240.0.0/12', - 'CR': '201.192.0.0/12', - 'CU': '152.206.0.0/15', - 'CV': '165.90.96.0/19', - 'CW': '190.88.128.0/17', - 'CY': '46.198.0.0/15', - 'CZ': '88.100.0.0/14', - 'DE': '53.0.0.0/8', - 'DJ': '197.241.0.0/17', - 'DK': '87.48.0.0/12', - 'DM': '192.243.48.0/20', - 'DO': '152.166.0.0/15', - 'DZ': '41.96.0.0/12', - 'EC': '186.68.0.0/15', - 'EE': '90.190.0.0/15', - 'EG': '156.160.0.0/11', - 'ER': '196.200.96.0/20', - 'ES': '88.0.0.0/11', - 'ET': '196.188.0.0/14', - 'EU': '2.16.0.0/13', - 'FI': '91.152.0.0/13', - 'FJ': '144.120.0.0/16', - 'FM': '119.252.112.0/20', - 'FO': '88.85.32.0/19', - 'FR': '90.0.0.0/9', - 'GA': '41.158.0.0/15', - 'GB': '25.0.0.0/8', - 'GD': '74.122.88.0/21', - 'GE': '31.146.0.0/16', - 'GF': '161.22.64.0/18', - 'GG': '62.68.160.0/19', - 'GH': '45.208.0.0/14', - 'GI': '85.115.128.0/19', - 'GL': '88.83.0.0/19', - 'GM': '160.182.0.0/15', - 'GN': '197.149.192.0/18', - 'GP': '104.250.0.0/19', - 'GQ': '105.235.224.0/20', - 'GR': '94.64.0.0/13', - 'GT': '168.234.0.0/16', - 'GU': '168.123.0.0/16', - 'GW': '197.214.80.0/20', - 'GY': '181.41.64.0/18', - 'HK': '113.252.0.0/14', - 'HN': '181.210.0.0/16', - 'HR': '93.136.0.0/13', - 'HT': '148.102.128.0/17', - 'HU': '84.0.0.0/14', - 'ID': '39.192.0.0/10', - 'IE': '87.32.0.0/12', - 'IL': '79.176.0.0/13', - 'IM': '5.62.80.0/20', - 'IN': '117.192.0.0/10', - 'IO': '203.83.48.0/21', - 'IQ': '37.236.0.0/14', - 'IR': '2.176.0.0/12', - 'IS': '82.221.0.0/16', - 'IT': '79.0.0.0/10', - 'JE': '87.244.64.0/18', - 'JM': '72.27.0.0/17', - 'JO': '176.29.0.0/16', - 'JP': '126.0.0.0/8', - 'KE': '105.48.0.0/12', - 'KG': '158.181.128.0/17', - 'KH': '36.37.128.0/17', - 'KI': '103.25.140.0/22', - 'KM': '197.255.224.0/20', - 'KN': '198.32.32.0/19', - 'KP': '175.45.176.0/22', - 'KR': '175.192.0.0/10', - 'KW': '37.36.0.0/14', - 'KY': '64.96.0.0/15', - 'KZ': '2.72.0.0/13', - 'LA': '115.84.64.0/18', - 'LB': '178.135.0.0/16', - 'LC': '192.147.231.0/24', - 'LI': '82.117.0.0/19', - 'LK': '112.134.0.0/15', - 'LR': '41.86.0.0/19', - 'LS': '129.232.0.0/17', - 'LT': '78.56.0.0/13', - 'LU': '188.42.0.0/16', - 'LV': '46.109.0.0/16', - 'LY': '41.252.0.0/14', - 'MA': '105.128.0.0/11', - 'MC': '88.209.64.0/18', - 'MD': '37.246.0.0/16', - 'ME': '178.175.0.0/17', - 'MF': '74.112.232.0/21', - 'MG': '154.126.0.0/17', - 'MH': '117.103.88.0/21', - 'MK': '77.28.0.0/15', - 'ML': '154.118.128.0/18', - 'MM': '37.111.0.0/17', - 'MN': '49.0.128.0/17', - 'MO': '60.246.0.0/16', - 'MP': '202.88.64.0/20', - 'MQ': '109.203.224.0/19', - 'MR': '41.188.64.0/18', - 'MS': '208.90.112.0/22', - 'MT': '46.11.0.0/16', - 'MU': '105.16.0.0/12', - 'MV': '27.114.128.0/18', - 'MW': '105.234.0.0/16', - 'MX': '187.192.0.0/11', - 'MY': '175.136.0.0/13', - 'MZ': '197.218.0.0/15', - 'NA': '41.182.0.0/16', - 'NC': '101.101.0.0/18', - 'NE': '197.214.0.0/18', - 'NF': '203.17.240.0/22', - 'NG': '105.112.0.0/12', - 'NI': '186.76.0.0/15', - 'NL': '145.96.0.0/11', - 'NO': '84.208.0.0/13', - 'NP': '36.252.0.0/15', - 'NR': '203.98.224.0/19', - 'NU': '49.156.48.0/22', - 'NZ': '49.224.0.0/14', - 'OM': '5.36.0.0/15', - 'PA': '186.72.0.0/15', - 'PE': '186.160.0.0/14', - 'PF': '123.50.64.0/18', - 'PG': '124.240.192.0/19', - 'PH': '49.144.0.0/13', - 'PK': '39.32.0.0/11', - 'PL': '83.0.0.0/11', - 'PM': '70.36.0.0/20', - 'PR': '66.50.0.0/16', - 'PS': '188.161.0.0/16', - 'PT': '85.240.0.0/13', - 'PW': '202.124.224.0/20', - 'PY': '181.120.0.0/14', - 'QA': '37.210.0.0/15', - 'RE': '139.26.0.0/16', - 'RO': '79.112.0.0/13', - 'RS': '178.220.0.0/14', - 'RU': '5.136.0.0/13', - 'RW': '105.178.0.0/15', - 'SA': '188.48.0.0/13', - 'SB': '202.1.160.0/19', - 'SC': '154.192.0.0/11', - 'SD': '154.96.0.0/13', - 'SE': '78.64.0.0/12', - 'SG': '152.56.0.0/14', - 'SI': '188.196.0.0/14', - 'SK': '78.98.0.0/15', - 'SL': '197.215.0.0/17', - 'SM': '89.186.32.0/19', - 'SN': '41.82.0.0/15', - 'SO': '197.220.64.0/19', - 'SR': '186.179.128.0/17', - 'SS': '105.235.208.0/21', - 'ST': '197.159.160.0/19', - 'SV': '168.243.0.0/16', - 'SX': '190.102.0.0/20', - 'SY': '5.0.0.0/16', - 'SZ': '41.84.224.0/19', - 'TC': '65.255.48.0/20', - 'TD': '154.68.128.0/19', - 'TG': '196.168.0.0/14', - 'TH': '171.96.0.0/13', - 'TJ': '85.9.128.0/18', - 'TK': '27.96.24.0/21', - 'TL': '180.189.160.0/20', - 'TM': '95.85.96.0/19', - 'TN': '197.0.0.0/11', - 'TO': '175.176.144.0/21', - 'TR': '78.160.0.0/11', - 'TT': '186.44.0.0/15', - 'TV': '202.2.96.0/19', - 'TW': '120.96.0.0/11', - 'TZ': '156.156.0.0/14', - 'UA': '93.72.0.0/13', - 'UG': '154.224.0.0/13', - 'US': '3.0.0.0/8', - 'UY': '167.56.0.0/13', - 'UZ': '82.215.64.0/18', - 'VA': '212.77.0.0/19', - 'VC': '24.92.144.0/20', - 'VE': '186.88.0.0/13', - 'VG': '172.103.64.0/18', - 'VI': '146.226.0.0/16', - 'VN': '14.160.0.0/11', - 'VU': '202.80.32.0/20', - 'WF': '117.20.32.0/21', - 'WS': '202.4.32.0/19', - 'YE': '134.35.0.0/16', - 'YT': '41.242.116.0/22', - 'ZA': '41.0.0.0/11', - 'ZM': '165.56.0.0/13', - 'ZW': '41.85.192.0/19', - } - - @classmethod - def random_ipv4(cls, code_or_block): - if len(code_or_block) == 2: - block = cls._country_ip_map.get(code_or_block.upper()) - if not block: - return None - else: - block = code_or_block - addr, preflen = block.split('/') - addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] - addr_max = addr_min | (0xffffffff >> int(preflen)) - return compat_str(socket.inet_ntoa( - compat_struct_pack('!L', random.randint(addr_min, addr_max)))) - - -class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): - def __init__(self, proxies=None): - # Set default handlers - for type in ('http', 'https'): - setattr(self, '%s_open' % type, - lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: - meth(r, proxy, type)) - compat_urllib_request.ProxyHandler.__init__(self, proxies) - - def proxy_open(self, req, proxy, type): - req_proxy = req.headers.get('Ytdl-request-proxy') - if req_proxy is not None: - proxy = req_proxy - del req.headers['Ytdl-request-proxy'] - - if proxy == '__noproxy__': - return None # No Proxy - if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): - req.add_header('Ytdl-socks-proxy', proxy) - # youtube-dl's http/https handlers do wrapping the socket with socks - return None - return compat_urllib_request.ProxyHandler.proxy_open( - self, req, proxy, type) - - -# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is -# released into Public Domain -# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387 - -def long_to_bytes(n, blocksize=0): - """long_to_bytes(n:long, blocksize:int) : string - Convert a long integer to a byte string. - - If optional blocksize is given and greater than zero, pad the front of the - byte string with binary zeros so that the length is a multiple of - blocksize. - """ - # after much testing, this algorithm was deemed to be the fastest - s = b'' - n = int(n) - while n > 0: - s = compat_struct_pack('>I', n & 0xffffffff) + s - n = n >> 32 - # strip off leading zeros - for i in range(len(s)): - if s[i] != b'\000'[0]: - break - else: - # only happens when n == 0 - s = b'\000' - i = 0 - s = s[i:] - # add back some pad bytes. this could be done more efficiently w.r.t. the - # de-padding being done above, but sigh... - if blocksize > 0 and len(s) % blocksize: - s = (blocksize - len(s) % blocksize) * b'\000' + s - return s - - -def bytes_to_long(s): - """bytes_to_long(string) : long - Convert a byte string to a long integer. - - This is (essentially) the inverse of long_to_bytes(). - """ - acc = 0 - length = len(s) - if length % 4: - extra = (4 - length % 4) - s = b'\000' * extra + s - length = length + extra - for i in range(0, length, 4): - acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0] - return acc - - -def ohdave_rsa_encrypt(data, exponent, modulus): - ''' - Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/ - - Input: - data: data to encrypt, bytes-like object - exponent, modulus: parameter e and N of RSA algorithm, both integer - Output: hex string of encrypted data - - Limitation: supports one block encryption only - ''' - - payload = int(binascii.hexlify(data[::-1]), 16) - encrypted = pow(payload, exponent, modulus) - return '%x' % encrypted - - -def pkcs1pad(data, length): - """ - Padding input data with PKCS#1 scheme - - @param {int[]} data input data - @param {int} length target length - @returns {int[]} padded data - """ - if len(data) > length - 11: - raise ValueError('Input data too long for PKCS#1 padding') - - pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)] - return [0, 2] + pseudo_random + [0] + data - - -def encode_base_n(num, n, table=None): - FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' - if not table: - table = FULL_TABLE[:n] - - if n > len(table): - raise ValueError('base %d exceeds table length %d' % (n, len(table))) - - if num == 0: - return table[0] - - ret = '' - while num: - ret = table[num % n] + ret - num = num // n - return ret - - -def decode_packed_codes(code): - mobj = re.search(PACKED_CODES_RE, code) - obfucasted_code, base, count, symbols = mobj.groups() - base = int(base) - count = int(count) - symbols = symbols.split('|') - symbol_table = {} - - while count: - count -= 1 - base_n_count = encode_base_n(count, base) - symbol_table[base_n_count] = symbols[count] or base_n_count - - return re.sub( - r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], - obfucasted_code) - - -def parse_m3u8_attributes(attrib): - info = {} - for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib): - if val.startswith('"'): - val = val[1:-1] - info[key] = val - return info - - -def urshift(val, n): - return val >> n if val >= 0 else (val + 0x100000000) >> n - - -# Based on png2str() written by @gdkchan and improved by @yokrysty -# Originally posted at https://github.com/rg3/youtube-dl/issues/9706 -def decode_png(png_data): - # Reference: https://www.w3.org/TR/PNG/ - header = png_data[8:] - - if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': - raise IOError('Not a valid PNG file.') - - int_map = {1: '>B', 2: '>H', 4: '>I'} - unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0] - - chunks = [] - - while header: - length = unpack_integer(header[:4]) - header = header[4:] - - chunk_type = header[:4] - header = header[4:] - - chunk_data = header[:length] - header = header[length:] - - header = header[4:] # Skip CRC - - chunks.append({ - 'type': chunk_type, - 'length': length, - 'data': chunk_data - }) - - ihdr = chunks[0]['data'] - - width = unpack_integer(ihdr[:4]) - height = unpack_integer(ihdr[4:8]) - - idat = b'' - - for chunk in chunks: - if chunk['type'] == b'IDAT': - idat += chunk['data'] - - if not idat: - raise IOError('Unable to read PNG data.') - - decompressed_data = bytearray(zlib.decompress(idat)) - - stride = width * 3 - pixels = [] - - def _get_pixel(idx): - x = idx % stride - y = idx // stride - return pixels[y][x] - - for y in range(height): - basePos = y * (1 + stride) - filter_type = decompressed_data[basePos] - - current_row = [] - - pixels.append(current_row) - - for x in range(stride): - color = decompressed_data[1 + basePos + x] - basex = y * stride + x - left = 0 - up = 0 - - if x > 2: - left = _get_pixel(basex - 3) - if y > 0: - up = _get_pixel(basex - stride) - - if filter_type == 1: # Sub - color = (color + left) & 0xff - elif filter_type == 2: # Up - color = (color + up) & 0xff - elif filter_type == 3: # Average - color = (color + ((left + up) >> 1)) & 0xff - elif filter_type == 4: # Paeth - a = left - b = up - c = 0 - - if x > 2 and y > 0: - c = _get_pixel(basex - stride - 3) - - p = a + b - c - - pa = abs(p - a) - pb = abs(p - b) - pc = abs(p - c) - - if pa <= pb and pa <= pc: - color = (color + a) & 0xff - elif pb <= pc: - color = (color + b) & 0xff - else: - color = (color + c) & 0xff - - current_row.append(color) - - return width, height, pixels - - -def write_xattr(path, key, value): - # This mess below finds the best xattr tool for the job - try: - # try the pyxattr module... - import xattr - - if hasattr(xattr, 'set'): # pyxattr - # Unicode arguments are not supported in python-pyxattr until - # version 0.5.0 - # See https://github.com/rg3/youtube-dl/issues/5498 - pyxattr_required_version = '0.5.0' - if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version): - # TODO: fallback to CLI tools - raise XAttrUnavailableError( - 'python-pyxattr is detected but is too old. ' - 'youtube-dl requires %s or above while your version is %s. ' - 'Falling back to other xattr implementations' % ( - pyxattr_required_version, xattr.__version__)) - - setxattr = xattr.set - else: # xattr - setxattr = xattr.setxattr - - try: - setxattr(path, key, value) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - - except ImportError: - if compat_os_name == 'nt': - # Write xattrs to NTFS Alternate Data Streams: - # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 - assert ':' not in key - assert os.path.exists(path) - - ads_fn = path + ':' + key - try: - with open(ads_fn, 'wb') as f: - f.write(value) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - else: - user_has_setfattr = check_executable('setfattr', ['--version']) - user_has_xattr = check_executable('xattr', ['-h']) - - if user_has_setfattr or user_has_xattr: - - value = value.decode('utf-8') - if user_has_setfattr: - executable = 'setfattr' - opts = ['-n', key, '-v', value] - elif user_has_xattr: - executable = 'xattr' - opts = ['-w', key, value] - - cmd = ([encodeFilename(executable, True)] + - [encodeArgument(o) for o in opts] + - [encodeFilename(path, True)]) - - try: - p = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - stdout, stderr = p.communicate() - stderr = stderr.decode('utf-8', 'replace') - if p.returncode != 0: - raise XAttrMetadataError(p.returncode, stderr) - - else: - # On Unix, and can't find pyxattr, setfattr, or xattr. - if sys.platform.startswith('linux'): - raise XAttrUnavailableError( - "Couldn't find a tool to set the xattrs. " - "Install either the python 'pyxattr' or 'xattr' " - "modules, or the GNU 'attr' package " - "(which contains the 'setfattr' tool).") - else: - raise XAttrUnavailableError( - "Couldn't find a tool to set the xattrs. " - "Install either the python 'xattr' module, " - "or the 'xattr' binary.") - - -def random_birthday(year_field, month_field, day_field): - start_date = datetime.date(1950, 1, 1) - end_date = datetime.date(1995, 12, 31) - offset = random.randint(0, (end_date - start_date).days) - random_date = start_date + datetime.timedelta(offset) - return { - year_field: str(random_date.year), - month_field: str(random_date.month), - day_field: str(random_date.day), - } diff --git a/youtube_dl/version.py b/youtube_dl/version.py deleted file mode 100644 index c7083cf..0000000 --- a/youtube_dl/version.py +++ /dev/null @@ -1,3 +0,0 @@ -from __future__ import unicode_literals - -__version__ = '2018.07.10' |