Diffstat (limited to 'hypervideo_dl/extractor/common.py')
-rw-r--r--   hypervideo_dl/extractor/common.py | 392
1 file changed, 264 insertions(+), 128 deletions(-)
diff --git a/hypervideo_dl/extractor/common.py b/hypervideo_dl/extractor/common.py index 4b56307..5a561a2 100644 --- a/hypervideo_dl/extractor/common.py +++ b/hypervideo_dl/extractor/common.py @@ -13,6 +13,7 @@ import netrc import os import random import re +import subprocess import sys import time import types @@ -21,9 +22,21 @@ import urllib.request import xml.etree.ElementTree from ..compat import functools # isort: split -from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name +from ..compat import ( + compat_etree_fromstring, + compat_expanduser, + compat_os_name, + urllib_req_to_req, +) from ..cookies import LenientSimpleCookie from ..downloader.f4m import get_base_url, remove_encrypted_media +from ..downloader.hls import HlsFD +from ..networking import HEADRequest, Request +from ..networking.exceptions import ( + HTTPError, + IncompleteRead, + network_exceptions, +) from ..utils import ( IDENTITY, JSON_LD_RE, @@ -33,6 +46,7 @@ from ..utils import ( GeoRestrictedError, GeoUtils, LenientJSONDecoder, + Popen, RegexNotFoundError, RetryManager, UnsupportedError, @@ -55,7 +69,7 @@ from ..utils import ( join_nonempty, js_to_json, mimetype2ext, - network_exceptions, + netrc_from_content, orderedSet, parse_bitrate, parse_codecs, @@ -65,21 +79,20 @@ from ..utils import ( parse_resolution, sanitize_filename, sanitize_url, - sanitized_Request, smuggle_url, str_or_none, str_to_int, strip_or_none, traverse_obj, + truncate_string, try_call, try_get, unescapeHTML, unified_strdate, unified_timestamp, - update_Request, - update_url_query, url_basename, url_or_none, + urlhandle_detect_ext, urljoin, variadic, xpath_element, @@ -129,6 +142,7 @@ class InfoExtractor: is parsed from a string (in case of fragmented media) for MSS - URL of the ISM manifest. + * request_data Data to send in POST request to the URL * manifest_url The URL of the manifest file in case of fragmented media: @@ -216,7 +230,19 @@ class InfoExtractor: width : height ratio as float. * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. - * has_drm The format has DRM and cannot be downloaded. Boolean + * has_drm True if the format has DRM and cannot be downloaded. + 'maybe' if the format may have DRM and has to be tested before download. + * extra_param_to_segment_url A query string to append to each + fragment's URL, or to update each existing query string + with. Only applied by the native HLS/DASH downloaders. + * hls_aes A dictionary of HLS AES-128 decryption information + used by the native HLS downloader to override the + values in the media playlist when an '#EXT-X-KEY' tag + is present in the playlist: + * uri The URI from which the key will be downloaded + * key The key (as hex) used to decrypt fragments. + If `key` is given, any key URI will be ignored + * iv The IV (as hex) used to decrypt fragments * downloader_options A dictionary of downloader options (For internal use only) * http_chunk_size Chunk size for HTTP downloads @@ -271,6 +297,7 @@ class InfoExtractor: channel_id: Id of the channel. channel_url: Full URL to a channel webpage. channel_follower_count: Number of followers of the channel. + channel_is_verified: Whether the channel is verified on the platform. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. 
"tag" is usually a language code, and @@ -299,6 +326,11 @@ class InfoExtractor: * "author" - human-readable name of the comment author * "author_id" - user ID of the comment author * "author_thumbnail" - The thumbnail of the comment author + * "author_url" - The url to the comment author's page + * "author_is_verified" - Whether the author is verified + on the platform + * "author_is_uploader" - Whether the comment is made by + the video uploader * "id" - Comment ID * "html" - Comment as HTML * "text" - Plain text of the comment @@ -310,8 +342,8 @@ class InfoExtractor: * "dislike_count" - Number of negative ratings of the comment * "is_favorited" - Whether the comment is marked as favorite by the video uploader - * "author_is_uploader" - Whether the comment is made by - the video uploader + * "is_pinned" - Whether the comment is pinned to + the top of the comments age_limit: Age restriction for the video, as an integer (years) webpage_url: The URL to the video webpage, if given to hypervideo it should allow to get the same result again. (It will be set @@ -335,6 +367,10 @@ class InfoExtractor: * "start_time" - The start time of the chapter in seconds * "end_time" - The end time of the chapter in seconds * "title" (optional, string) + heatmap: A list of dictionaries, with the following entries: + * "start_time" - The start time of the data point in seconds + * "end_time" - The end time of the data point in seconds + * "value" - The normalized value of the data point (float between 0 and 1) playable_in_embed: Whether this video is allowed to play in embedded players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string @@ -446,8 +482,8 @@ class InfoExtractor: Subclasses of this should also be added to the list of extractors and - should define a _VALID_URL regexp and, re-define the _real_extract() and - (optionally) _real_initialize() methods. + should define _VALID_URL as a regexp or a Sequence of regexps, and + re-define the _real_extract() and (optionally) _real_initialize() methods. 
Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs @@ -510,7 +546,7 @@ class InfoExtractor: _EMBED_REGEX = [] def _login_hint(self, method=NO_DEFAULT, netrc=None): - password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' + password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' return { None: '', 'any': f'Use --cookies, --cookies-from-browser, {password_hint}', @@ -537,8 +573,8 @@ class InfoExtractor: # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) + cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL))) + return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None) @classmethod def suitable(cls, url): @@ -674,7 +710,8 @@ class InfoExtractor: for _ in range(2): try: self.initialize() - self.write_debug('Extracting URL: %s' % url) + self.to_screen('Extracting URL: %s' % ( + url if self.get_param('verbose') else truncate_string(url, 100, 20))) ie_result = self._real_extract(url) if ie_result is None: return None @@ -692,11 +729,11 @@ class InfoExtractor: except UnsupportedError: raise except ExtractorError as e: - e.video_id = e.video_id or self.get_temp_id(url), + e.video_id = e.video_id or self.get_temp_id(url) e.ie = e.ie or self.IE_NAME, e.traceback = e.traceback or sys.exc_info()[2] raise - except http.client.IncompleteRead as e: + except IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url)) @@ -755,20 +792,25 @@ class InfoExtractor: @staticmethod def __can_accept_status_code(err, expected_status): - assert isinstance(err, urllib.error.HTTPError) + assert isinstance(err, HTTPError) if expected_status is None: return False elif callable(expected_status): - return expected_status(err.code) is True + return expected_status(err.status) is True else: - return err.code in variadic(expected_status) + return err.status in variadic(expected_status) def _create_request(self, url_or_request, data=None, headers=None, query=None): if isinstance(url_or_request, urllib.request.Request): - return update_Request(url_or_request, data=data, headers=headers, query=query) - if query: - url_or_request = update_url_query(url_or_request, query) - return sanitized_Request(url_or_request, data, headers or {}) + self._downloader.deprecation_warning( + 'Passing a urllib.request.Request to _create_request() is deprecated. 
' + 'Use hypervideo_dl.networking.common.Request instead.') + url_or_request = urllib_req_to_req(url_or_request) + elif not isinstance(url_or_request, Request): + url_or_request = Request(url_or_request) + + url_or_request.update(data=data, headers=headers, query=query) + return url_or_request def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None): """ @@ -804,14 +846,9 @@ class InfoExtractor: try: return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) except network_exceptions as err: - if isinstance(err, urllib.error.HTTPError): + if isinstance(err, HTTPError): if self.__can_accept_status_code(err, expected_status): - # Retain reference to error to prevent file object from - # being closed before it can be read. Works around the - # effects of <https://bugs.python.org/issue15002> - # introduced in Python 3.4.1. - err.fp._error = err - return err.fp + return err.response if errnote is False: return False @@ -943,11 +980,11 @@ class InfoExtractor: if prefix is not None: webpage_bytes = prefix + webpage_bytes if self.get_param('dump_intermediate_pages', False): - self.to_screen('Dumping request to ' + urlh.geturl()) + self.to_screen('Dumping request to ' + urlh.url) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self.get_param('write_pages'): - filename = self._request_dump_filename(urlh.geturl(), video_id) + filename = self._request_dump_filename(urlh.url, video_id) self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) @@ -1005,7 +1042,7 @@ class InfoExtractor: fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): if self.get_param('load_pages'): url_or_request = self._create_request(url_or_request, data, headers, query) - filename = self._request_dump_filename(url_or_request.full_url, video_id) + filename = self._request_dump_filename(url_or_request.url, video_id) self.to_screen(f'Loading request from {filename}') try: with open(filename, 'rb') as dumpf: @@ -1079,7 +1116,7 @@ class InfoExtractor: while True: try: return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs) - except http.client.IncompleteRead as e: + except IncompleteRead as e: try_count += 1 if try_count >= tries: raise e @@ -1260,51 +1297,53 @@ class InfoExtractor: Like _search_regex, but strips HTML tags and unescapes entities. 
""" res = self._search_regex(pattern, string, name, default, fatal, flags, group) - if res: - return clean_html(res).strip() - else: - return res + if isinstance(res, tuple): + return tuple(map(clean_html, res)) + return clean_html(res) def _get_netrc_login_info(self, netrc_machine=None): - username = None - password = None netrc_machine = netrc_machine or self._NETRC_MACHINE - if self.get_param('usenetrc', False): - try: - netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') - if os.path.isdir(netrc_file): - netrc_file = os.path.join(netrc_file, '.netrc') - info = netrc.netrc(file=netrc_file).authenticators(netrc_machine) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError( - 'No authenticators for %s' % netrc_machine) - except (OSError, netrc.NetrcParseError) as err: - self.report_warning( - 'parsing .netrc: %s' % error_to_compat_str(err)) + cmd = self.get_param('netrc_cmd') + if cmd: + cmd = cmd.replace('{}', netrc_machine) + self.to_screen(f'Executing command: {cmd}') + stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE) + if ret != 0: + raise OSError(f'Command returned error code {ret}') + info = netrc_from_content(stdout).authenticators(netrc_machine) + + elif self.get_param('usenetrc', False): + netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') + if os.path.isdir(netrc_file): + netrc_file = os.path.join(netrc_file, '.netrc') + info = netrc.netrc(netrc_file).authenticators(netrc_machine) - return username, password + else: + return None, None + if not info: + raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}') + return info[0], info[2] def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): """ Get the login info as (username, password) First look for the manually specified credentials using username_option and password_option as keys in params dictionary. If no such credentials - available look in the netrc file using the netrc_machine or _NETRC_MACHINE - value. + are available try the netrc_cmd if it is defined or look in the + netrc file using the netrc_machine or _NETRC_MACHINE value. If there's no info available, return (None, None) """ - # Attempt to use provided username and password or .netrc data username = self.get_param(username_option) if username is not None: password = self.get_param(password_option) else: - username, password = self._get_netrc_login_info(netrc_machine) - + try: + username, password = self._get_netrc_login_info(netrc_machine) + except (OSError, netrc.NetrcParseError) as err: + self.report_warning(f'Failed to parse .netrc: {err}') + return None, None return username, password def _get_tfa_info(self, note='two-factor verification code'): @@ -1324,7 +1363,7 @@ class InfoExtractor: # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))' property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r'<meta[^>]+?%s[^>]+?%s' @@ -1394,10 +1433,16 @@ class InfoExtractor: # And then there are the jokers who advertise that they use RTA, but actually don't. 
AGE_LIMIT_MARKERS = [ r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', + r'>[^<]*you acknowledge you are at least (\d+) years old', + r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:ยง+\s*)?2257\b', ] - if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS): - return 18 - return 0 + + age_limit = 0 + for marker in AGE_LIMIT_MARKERS: + mobj = re.search(marker, html) + if mobj: + age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18))) + return age_limit def _media_rating_search(self, html): # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ @@ -1650,11 +1695,8 @@ class InfoExtractor: if js is None: return {} - args = dict(zip(arg_keys.split(','), arg_vals.split(','))) - - for key, val in args.items(): - if val in ('undefined', 'void 0'): - args[key] = 'null' + args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json( + f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ()))) ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) return traverse_obj(ret, traverse) or {} @@ -1757,6 +1799,9 @@ class InfoExtractor: def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), fatal=True, m3u8_id=None, data=None, headers={}, query={}): + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_xml_handle( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', @@ -1768,7 +1813,7 @@ class InfoExtractor: return [] manifest, urlh = res - manifest_url = urlh.geturl() + manifest_url = urlh.url return self._parse_f4m_formats( manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id, @@ -1906,6 +1951,17 @@ class InfoExtractor: errnote=None, fatal=True, live=False, data=None, headers={}, query={}): + if self.get_param('ignore_no_formats_error'): + fatal = False + + if not m3u8_url: + if errnote is not False: + errnote = errnote or 'Failed to obtain m3u8 URL' + if fatal: + raise ExtractorError(errnote, video_id=video_id) + self.report_warning(f'{errnote}{bug_reports_message()}') + return [], {} + res = self._download_webpage_handle( m3u8_url, video_id, note='Downloading m3u8 information' if note is None else note, @@ -1916,7 +1972,7 @@ class InfoExtractor: return [], {} m3u8_doc, urlh = res - m3u8_url = urlh.geturl() + m3u8_url = urlh.url return self._parse_m3u8_formats_and_subtitles( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, @@ -1930,11 +1986,7 @@ class InfoExtractor: errnote=None, fatal=True, data=None, headers={}, query={}, video_id=None): formats, subtitles = [], {} - - has_drm = re.search('|'.join([ - r'#EXT-X-FAXS-CM:', # Adobe Flash Access - r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay - ]), m3u8_doc) + has_drm = HlsFD._has_drm(m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) @@ -2032,6 +2084,7 @@ class InfoExtractor: 'protocol': entry_protocol, 'preference': preference, 'quality': quality, + 'has_drm': has_drm, 'vcodec': 'none' if media_type == 'AUDIO' else None, } for idx in _extract_m3u8_playlist_indices(manifest_url)) @@ -2091,6 +2144,7 @@ class InfoExtractor: 'protocol': entry_protocol, 'preference': preference, 'quality': quality, + 'has_drm': has_drm, } resolution = last_stream_inf.get('RESOLUTION') if resolution: @@ -2157,13 
+2211,23 @@ class InfoExtractor: return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id) def _parse_m3u8_vod_duration(self, m3u8_vod, video_id): - if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod: + if '#EXT-X-ENDLIST' not in m3u8_vod: return None return int(sum( float(line[len('#EXTINF:'):].split(',')[0]) for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None + def _extract_mpd_vod_duration( + self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}): + + mpd_doc = self._download_xml( + mpd_url, video_id, + note='Downloading MPD VOD manifest' if note is None else note, + errnote='Failed to download VOD manifest' if errnote is None else errnote, + fatal=False, data=data, headers=headers, query=query) or {} + return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration'))) + @staticmethod def _xpath_ns(path, namespace=None): if not namespace: @@ -2177,22 +2241,17 @@ class InfoExtractor: return '/'.join(out) def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) if res is False: assert not fatal return [], {} - smil, urlh = res - smil_url = urlh.geturl() - namespace = self._parse_smil_namespace(smil) - - fmts = self._parse_smil_formats( - smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subs = self._parse_smil_subtitles( - smil, namespace=namespace) - - return fmts, subs + return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params, + namespace=self._parse_smil_namespace(smil)) def _extract_smil_formats(self, *args, **kwargs): fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs) @@ -2206,7 +2265,7 @@ class InfoExtractor: return {} smil, urlh = res - smil_url = urlh.geturl() + smil_url = urlh.url return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) @@ -2218,9 +2277,8 @@ class InfoExtractor: def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) - formats = self._parse_smil_formats( + formats, subtitles = self._parse_smil_formats_and_subtitles( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) - subtitles = self._parse_smil_subtitles(smil, namespace=namespace) video_id = os.path.splitext(url_basename(smil_url))[0] title = None @@ -2259,7 +2317,14 @@ class InfoExtractor: return self._search_regex( r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + def _parse_smil_formats(self, *args, **kwargs): + fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs) + if subs: + self._report_ignoring_subs('SMIL') + return fmts + + def _parse_smil_formats_and_subtitles( + self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): b = meta.get('base') or meta.get('httpBase') @@ -2267,7 +2332,7 @@ class InfoExtractor: base = b break - formats = [] + formats, subtitles = [], {} rtmp_count = 0 http_count = 0 m3u8_count = 0 @@ -2287,7 +2352,8 @@ class InfoExtractor: height = int_or_none(medium.get('height')) proto = medium.get('proto') ext = medium.get('ext') - src_ext = determine_ext(src) + src_ext = 
determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext( + self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False)) streamer = medium.get('streamer') or base if proto == 'rtmp' or streamer.startswith('rtmp'): @@ -2314,8 +2380,9 @@ class InfoExtractor: src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( + m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles( src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + self._merge_subtitles(m3u8_subs, target=subtitles) if len(m3u8_formats) == 1: m3u8_count += 1 m3u8_formats[0].update({ @@ -2336,11 +2403,15 @@ class InfoExtractor: f4m_url += urllib.parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) elif src_ext == 'mpd': - formats.extend(self._extract_mpd_formats( - src_url, video_id, mpd_id='dash', fatal=False)) + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles( + src_url, video_id, mpd_id='dash', fatal=False) + formats.extend(mpd_formats) + self._merge_subtitles(mpd_subs, target=subtitles) elif re.search(r'\.ism/[Mm]anifest', src_url): - formats.extend(self._extract_ism_formats( - src_url, video_id, ism_id='mss', fatal=False)) + ism_formats, ism_subs = self._extract_ism_formats_and_subtitles( + src_url, video_id, ism_id='mss', fatal=False) + formats.extend(ism_formats) + self._merge_subtitles(ism_subs, target=subtitles) elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ @@ -2371,7 +2442,10 @@ class InfoExtractor: 'format_note': 'SMIL storyboards', }) - return formats + smil_subs = self._parse_smil_subtitles(smil, namespace=namespace) + self._merge_subtitles(smil_subs, target=subtitles) + + return formats, subtitles def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): urls = [] @@ -2397,7 +2471,7 @@ class InfoExtractor: return [] xspf, urlh = res - xspf_url = urlh.geturl() + xspf_url = urlh.url return self._parse_xspf( xspf, playlist_id, xspf_url=xspf_url, @@ -2452,6 +2526,10 @@ class InfoExtractor: def _extract_mpd_formats_and_subtitles( self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_xml_handle( mpd_url, video_id, note='Downloading MPD manifest' if note is None else note, @@ -2464,7 +2542,7 @@ class InfoExtractor: return [], {} # We could have been redirected to a new url when we retrieved our mpd file. 
- mpd_url = urlh.geturl() + mpd_url = urlh.url mpd_base_url = base_url(mpd_url) return self._parse_mpd_formats_and_subtitles( @@ -2821,6 +2899,9 @@ class InfoExtractor: return fmts def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): + if self.get_param('ignore_no_formats_error'): + fatal = False + res = self._download_xml_handle( ism_url, video_id, note='Downloading ISM manifest' if note is None else note, @@ -2832,7 +2913,7 @@ class InfoExtractor: if ism_doc is None: return [], {} - return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id) + return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id) def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): """ @@ -2928,6 +3009,8 @@ class InfoExtractor: 'protocol': 'ism', 'fragments': fragments, 'has_drm': ism_doc.find('Protection') is not None, + 'language': stream_language, + 'audio_channels': int_or_none(track.get('Channels')), '_download_params': { 'stream_type': stream_type, 'duration': duration, @@ -3190,7 +3273,7 @@ class InfoExtractor: def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): mobj = re.search( - r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', + r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''', webpage) if mobj: try: @@ -3211,19 +3294,20 @@ class InfoExtractor: def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - # JWPlayer backward compatibility: flattened playlists - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 - if 'playlist' not in jwplayer_data: - jwplayer_data = {'playlist': [jwplayer_data]} - entries = [] + if not isinstance(jwplayer_data, dict): + return entries - # JWPlayer backward compatibility: single playlist item + playlist_items = jwplayer_data.get('playlist') + # JWPlayer backward compatibility: single playlist item/flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 - if not isinstance(jwplayer_data['playlist'], list): - jwplayer_data['playlist'] = [jwplayer_data['playlist']] + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 + if not isinstance(playlist_items, list): + playlist_items = (playlist_items or jwplayer_data, ) - for video_data in jwplayer_data['playlist']: + for video_data in playlist_items: + if not isinstance(video_data, dict): + continue # JWPlayer backward compatibility: flattened sources # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 if 'sources' not in video_data: @@ -3261,6 +3345,13 @@ class InfoExtractor: 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, + 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ... 
+ 'genre': clean_html(video_data.get('genre')), + 'channel': clean_html(dict_get(video_data, ('category', 'channel'))), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'release_year': int_or_none(video_data.get('releasedate')), + 'age_limit': int_or_none(video_data.get('age_restriction')), } # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): @@ -3278,7 +3369,7 @@ class InfoExtractor: def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): - urls = [] + urls = set() formats = [] for source in jwplayer_sources_data: if not isinstance(source, dict): @@ -3287,14 +3378,14 @@ class InfoExtractor: base_url, self._proto_relative_url(source.get('file'))) if not source_url or source_url in urls: continue - urls.append(source_url) + urls.add(source_url) source_type = source.get('type') or '' ext = mimetype2ext(source_type) or determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': + if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url: formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif source_type == 'dash' or ext == 'mpd': + elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url: formats.extend(self._extract_mpd_formats( source_url, video_id, mpd_id=mpd_id, fatal=False)) elif ext == 'smil': @@ -3309,13 +3400,12 @@ class InfoExtractor: 'ext': ext, }) else: + format_id = str_or_none(source.get('label')) height = int_or_none(source.get('height')) - if height is None: + if height is None and format_id: # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. 
- height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''), - 'height', default=None)) + height = parse_resolution(format_id).get('height') a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), @@ -3323,6 +3413,7 @@ class InfoExtractor: 'tbr': int_or_none(source.get('bitrate'), scale=1000), 'filesize': int_or_none(source.get('filesize')), 'ext': ext, + 'format_id': format_id } if source_url.startswith('rtmp'): a_format['ext'] = 'flv' @@ -3375,7 +3466,7 @@ class InfoExtractor: def _get_cookies(self, url): """ Return a http.cookies.SimpleCookie with the cookies for the url """ - return LenientSimpleCookie(self._downloader._calc_cookies(url)) + return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ @@ -3416,13 +3507,17 @@ class InfoExtractor: continue t['name'] = cls.ie_key() yield t + if getattr(cls, '__wrapped__', None): + yield from cls.__wrapped__.get_testcases(include_onlymatching) @classmethod def get_webpage_testcases(cls): tests = vars(cls).get('_WEBPAGE_TESTS', []) for t in tests: t['name'] = cls.ie_key() - return tests + yield t + if getattr(cls, '__wrapped__', None): + yield from cls.__wrapped__.get_webpage_testcases() @classproperty(cache=True) def age_limit(cls): @@ -3446,8 +3541,8 @@ class InfoExtractor: @classmethod def is_single_video(cls, url): """Returns whether the URL is of a single video, None if unknown""" - assert cls.suitable(url), 'The URL must be suitable for the extractor' - return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) + if cls.suitable(url): + return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) @classmethod def is_suitable(cls, age_limit): @@ -3460,7 +3555,7 @@ class InfoExtractor: desc = '' if cls._NETRC_MACHINE: if markdown: - desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]' + desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")' else: desc += f' [{cls._NETRC_MACHINE}]' if cls.IE_DESC is False: @@ -3468,7 +3563,7 @@ class InfoExtractor: elif cls.IE_DESC: desc += f' {cls.IE_DESC}' if cls.SEARCH_KEY: - desc += f'; "{cls.SEARCH_KEY}:" prefix' + desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix' if search_examples: _COUNTS = ('', '5', '10', 'all') desc += f' (e.g. 
"{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")' @@ -3582,6 +3677,42 @@ class InfoExtractor: or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) or default) + def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True): + if not duration: + return + chapter_list = [{ + 'start_time': start_function(chapter), + 'title': title_function(chapter), + } for chapter in chapter_list or []] + if strict: + warn = self.report_warning + else: + warn = self.write_debug + chapter_list.sort(key=lambda c: c['start_time'] or 0) + + chapters = [{'start_time': 0}] + for idx, chapter in enumerate(chapter_list): + if chapter['start_time'] is None: + warn(f'Incomplete chapter {idx}') + elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: + chapters.append(chapter) + elif chapter not in chapters: + issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration + else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}') + warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"') + return chapters[1:] + + def _extract_chapters_from_description(self, description, duration): + duration_re = r'(?:\d+:)?\d{1,2}:\d{2}' + sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$' + return self._extract_chapters_helper( + re.findall(sep_re % (duration_re, r'.+?'), description or ''), + start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1], + duration=duration, strict=False) or self._extract_chapters_helper( + re.findall(sep_re % (r'.+?', duration_re), description or ''), + start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0], + duration=duration, strict=False) + @staticmethod def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): all_known = all(map( @@ -3684,10 +3815,12 @@ class InfoExtractor: if plugin_name: mro = inspect.getmro(cls) super_class = cls.__wrapped__ = mro[mro.index(cls) + 1] - cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key + cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key + cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}' while getattr(super_class, '__wrapped__', None): super_class = super_class.__wrapped__ setattr(sys.modules[super_class.__module__], super_class.__name__, cls) + _PLUGIN_OVERRIDES[super_class].append(cls) return super().__init_subclass__(**kwargs) @@ -3744,3 +3877,6 @@ class UnsupportedURLIE(InfoExtractor): def _real_extract(self, url): raise UnsupportedError(url) + + +_PLUGIN_OVERRIDES = collections.defaultdict(list) |