From 5122028a4bcac4ae577ef7fbd55ccad5cb34ef5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs?= Date: Mon, 18 Oct 2021 15:24:21 -0500 Subject: update from upstream --- hypervideo_dl/utils.py | 1053 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 880 insertions(+), 173 deletions(-) (limited to 'hypervideo_dl/utils.py') diff --git a/hypervideo_dl/utils.py b/hypervideo_dl/utils.py index fc62f09..0199f4c 100644 --- a/hypervideo_dl/utils.py +++ b/hypervideo_dl/utils.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 from __future__ import unicode_literals @@ -16,6 +16,9 @@ import email.header import errno import functools import gzip +import hashlib +import hmac +import importlib.util import io import itertools import json @@ -50,6 +53,7 @@ from .compat import ( compat_html_entities_html5, compat_http_client, compat_integer_types, + compat_numeric_types, compat_kwargs, compat_os_name, compat_parse_qs, @@ -61,6 +65,9 @@ from .compat import ( compat_urllib_parse, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, + compat_urllib_parse_urlunparse, + compat_urllib_parse_quote, + compat_urllib_parse_quote_plus, compat_urllib_parse_unquote_plus, compat_urllib_request, compat_urlparse, @@ -1735,12 +1742,16 @@ DATE_FORMATS = ( '%b %dth %Y %I:%M', '%Y %m %d', '%Y-%m-%d', + '%Y.%m.%d.', '%Y/%m/%d', '%Y/%m/%d %H:%M', '%Y/%m/%d %H:%M:%S', + '%Y%m%d%H%M', + '%Y%m%d%H%M%S', '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', + '%Y-%m-%d %H:%M:%S:%f', '%d.%m.%Y %H:%M', '%d.%m.%Y %H.%M', '%Y-%m-%dT%H:%M:%SZ', @@ -1753,6 +1764,7 @@ DATE_FORMATS = ( '%b %d %Y at %H:%M:%S', '%B %d %Y at %H:%M', '%B %d %Y at %H:%M:%S', + '%H:%M %d-%b-%Y', ) DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) @@ -1985,6 +1997,7 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True): class HTMLAttributeParser(compat_HTMLParser): """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): self.attrs = {} compat_HTMLParser.__init__(self) @@ -2086,7 +2099,9 @@ def sanitize_filename(s, restricted=False, is_id=False): def replace_insane(char): if restricted and char in ACCENT_CHARS: return ACCENT_CHARS[char] - if char == '?' or ord(char) < 32 or ord(char) == 127: + elif not restricted and char == '\n': + return ' ' + elif char == '?' or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': return '' if restricted else '\'' @@ -2100,6 +2115,8 @@ def sanitize_filename(s, restricted=False, is_id=False): return '_' return char + if s == '': + return '' # Handle timestamps s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) result = ''.join(map(replace_insane, s)) @@ -2118,13 +2135,18 @@ def sanitize_filename(s, restricted=False, is_id=False): return result -def sanitize_path(s): +def sanitize_path(s, force=False): """Sanitizes and normalizes path on Windows""" - if sys.platform != 'win32': + if sys.platform == 'win32': + force = False + drive_or_unc, _ = os.path.splitdrive(s) + if sys.version_info < (2, 7) and not drive_or_unc: + drive_or_unc, _ = os.path.splitunc(s) + elif force: + drive_or_unc = '' + else: return s - drive_or_unc, _ = os.path.splitdrive(s) - if sys.version_info < (2, 7) and not drive_or_unc: - drive_or_unc, _ = os.path.splitunc(s) + norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep) if drive_or_unc: norm_path.pop(0) @@ -2133,6 +2155,8 @@ def sanitize_path(s): for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) + elif force and s[0] == os.path.sep: + sanitized_path.insert(0, os.path.sep) return os.path.join(*sanitized_path) @@ -2154,8 +2178,24 @@ def sanitize_url(url): return url +def extract_basic_auth(url): + parts = compat_urlparse.urlsplit(url) + if parts.username is None: + return url, None + url = compat_urlparse.urlunsplit(parts._replace(netloc=( + parts.hostname if parts.port is None + else '%s:%d' % (parts.hostname, parts.port)))) + auth_payload = base64.b64encode( + ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8')) + return url, 'Basic ' + auth_payload.decode('utf-8') + + def sanitized_Request(url, *args, **kwargs): - return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) + url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) + if auth_header is not None: + headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {}) + headers['Authorization'] = auth_header + return compat_urllib_request.Request(url, *args, **kwargs) def expand_path(s): @@ -2212,6 +2252,26 @@ def unescapeHTML(s): r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) +def escapeHTML(text): + return ( + text + .replace('&', '&') + .replace('<', '<') + .replace('>', '>') + .replace('"', '"') + .replace("'", ''') + ) + + +def process_communicate_or_kill(p, *args, **kwargs): + try: + return p.communicate(*args, **kwargs) + except BaseException: # Including KeyboardInterrupt + p.kill() + p.wait() + raise + + def get_subprocess_encoding(): if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: # For subprocess calls, encode with locale encoding @@ -2282,49 +2342,68 @@ def decodeOption(optval): return optval -def formatSeconds(secs): +def formatSeconds(secs, delim=':', msec=False): if secs > 3600: - return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60) + ret = '%d%s%02d%s%02d' % (secs // 3600, delim, (secs % 3600) // 60, delim, secs % 60) elif secs > 60: - return '%d:%02d' % (secs // 60, secs % 60) + ret = '%d%s%02d' % (secs // 60, delim, secs % 60) else: - return '%d' % secs + ret = '%d' % secs + return '%s.%03d' % (ret, secs % 1) if msec else ret -def make_HTTPS_handler(params, **kwargs): - opts_no_check_certificate = params.get('nocheckcertificate', False) - if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9 - context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) - if opts_no_check_certificate: - context.check_hostname = False - context.verify_mode = ssl.CERT_NONE +def _ssl_load_windows_store_certs(ssl_context, storename): + # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py + try: + certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename) + if encoding == 'x509_asn' and ( + trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)] + except PermissionError: + return + for cert in certs: try: - return YoutubeDLHTTPSHandler(params, context=context, **kwargs) - except TypeError: - # Python 2.7.8 - # (create_default_context present but HTTPSHandler has no context=) + ssl_context.load_verify_locations(cadata=cert) + except ssl.SSLError: pass - if sys.version_info < (3, 2): - return YoutubeDLHTTPSHandler(params, **kwargs) - else: # Python < 3.4 - context = ssl.SSLContext(ssl.PROTOCOL_TLSv1) - context.verify_mode = (ssl.CERT_NONE - if opts_no_check_certificate - else ssl.CERT_REQUIRED) - context.set_default_verify_paths() - return YoutubeDLHTTPSHandler(params, context=context, **kwargs) - -def bug_reports_message(): +def make_HTTPS_handler(params, **kwargs): + opts_check_certificate = not params.get('nocheckcertificate') + context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + context.check_hostname = opts_check_certificate + context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE + if opts_check_certificate: + try: + context.load_default_certs() + # Work around the issue in load_default_certs when there are bad certificates. See: + # https://github.com/hypervideo/hypervideo/issues/1060, + # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312 + except ssl.SSLError: + # enum_certificates is not present in mingw python. See https://github.com/hypervideo/hypervideo/issues/1151 + if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'): + # Create a new context to discard any certificates that were already loaded + context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED + for storename in ('CA', 'ROOT'): + _ssl_load_windows_store_certs(context, storename) + context.set_default_verify_paths() + return YoutubeDLHTTPSHandler(params, context=context, **kwargs) + + +def bug_reports_message(before=';'): if ytdl_is_updateable(): update_cmd = 'type doas pacman -Sy hypervideo to update' else: - update_cmd = 'see https://yt-dl.org/update on how to update' - msg = '; please report this issue on https://yt-dl.org/bug .' + update_cmd = 'see https://git.conocimientoslibres.ga/software/hypervideo.git/about/#how-do-i-update-hypervideo' + msg = 'please report this issue on https://github.com/hypervideo/hypervideo .' msg += ' Make sure you are using the latest version; %s.' % update_cmd msg += ' Be sure to call hypervideo with the --verbose flag and include its complete output.' - return msg + + before = before.rstrip() + if not before or before.endswith(('.', '!', '?')): + msg = msg[0].title() + msg[1:] + + return (before + ' ' if before else '') + msg class YoutubeDLError(Exception): @@ -2332,28 +2411,36 @@ class YoutubeDLError(Exception): pass +network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error] +if hasattr(ssl, 'CertificateError'): + network_exceptions.append(ssl.CertificateError) +network_exceptions = tuple(network_exceptions) + + class ExtractorError(YoutubeDLError): """Error during info extraction.""" - def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): + def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None): """ tb, if given, is the original traceback (so that it can be printed out). If expected is set, this is a normal error message and most likely not a bug in hypervideo. """ - - if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + if sys.exc_info()[0] in network_exceptions: expected = True - if video_id is not None: - msg = video_id + ': ' + msg - if cause: - msg += ' (caused by %r)' % cause - if not expected: - msg += bug_reports_message() - super(ExtractorError, self).__init__(msg) + self.msg = str(msg) self.traceback = tb - self.exc_info = sys.exc_info() # preserve original exception + self.expected = expected self.cause = cause self.video_id = video_id + self.ie = ie + self.exc_info = sys.exc_info() # preserve original exception + + super(ExtractorError, self).__init__(''.join(( + format_field(ie, template='[%s] '), + format_field(video_id, template='%s: '), + self.msg, + format_field(cause, template=' (caused by %r)'), + '' if expected else bug_reports_message()))) def format_traceback(self): if self.traceback is None: @@ -2379,6 +2466,7 @@ class GeoRestrictedError(ExtractorError): This exception may be thrown when a video is not available from your geographic location due to geographic restrictions imposed by a website. """ + def __init__(self, msg, countries=None): super(GeoRestrictedError, self).__init__(msg, expected=True) self.msg = msg @@ -2399,6 +2487,15 @@ class DownloadError(YoutubeDLError): self.exc_info = exc_info +class EntryNotInPlaylist(YoutubeDLError): + """Entry not in playlist exception. + + This exception will be thrown by YoutubeDL when a requested entry + is not found in the playlist info_dict + """ + pass + + class SameFileError(YoutubeDLError): """Same File exception. @@ -2420,6 +2517,21 @@ class PostProcessingError(YoutubeDLError): self.msg = msg +class ExistingVideoReached(YoutubeDLError): + """ --max-downloads limit has been reached. """ + pass + + +class RejectedVideoReached(YoutubeDLError): + """ --max-downloads limit has been reached. """ + pass + + +class ThrottledDownload(YoutubeDLError): + """ Download speed below --throttled-rate. """ + pass + + class MaxDownloadsReached(YoutubeDLError): """ --max-downloads limit has been reached. """ pass @@ -2582,6 +2694,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): @staticmethod def deflate(data): + if not data: + return data try: return zlib.decompress(data, -zlib.MAX_WBITS) except zlib.error: @@ -2938,8 +3052,16 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): def extract_timezone(date_str): m = re.search( - r'^.{8,}?(?PZ$| ?(?P\+|-)(?P[0-9]{2}):?(?P[0-9]{2})$)', - date_str) + r'''(?x) + ^.{8,}? # >=8 char non-TZ prefix, if present + (?PZ| # just the UTC Z, or + (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or + (?= 4 alpha or 2 digits + [ ]? # optional space + (?P\+|-) # +/- + (?P[0-9]{2}):?(?P[0-9]{2}) # hh[:]mm + $) + ''', date_str) if not m: timezone = datetime.timedelta() else: @@ -3055,33 +3177,83 @@ def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext) -def date_from_str(date_str): +def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): """ Return a datetime object from a string in the format YYYYMMDD or - (now|today)[+-][0-9](day|week|month|year)(s)?""" - today = datetime.date.today() + (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)? + + format: string date format used to return datetime object from + precision: round the time portion of a datetime object. + auto|microsecond|second|minute|hour|day. + auto: round to the unit provided in date_str (if applicable). + """ + auto_precision = False + if precision == 'auto': + auto_precision = True + precision = 'microsecond' + today = datetime_round(datetime.datetime.now(), precision) if date_str in ('now', 'today'): return today if date_str == 'yesterday': return today - datetime.timedelta(days=1) - match = re.match(r'(now|today)(?P[+-])(?P