diff options
Diffstat (limited to 'yt_dlp')
-rw-r--r--	yt_dlp/cookies.py            |  6 +++---
-rw-r--r--	yt_dlp/networking/_urllib.py |  7 ++++---
-rw-r--r--	yt_dlp/networking/common.py  |  5 ++---
-rw-r--r--	yt_dlp/utils/_legacy.py      |  4 +++-
-rw-r--r--	yt_dlp/utils/_utils.py       | 17 -----------------
-rw-r--r--	yt_dlp/utils/networking.py   | 38 ++++++++++++++++++++++++++++++++++++++
6 files changed, 50 insertions(+), 27 deletions(-)
diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 80428c747..157f5b0c2 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -33,7 +33,6 @@ from .minicurses import MultilinePrinter, QuietMultilinePrinter from .utils import ( Popen, error_to_str, - escape_url, expand_path, is_path_like, sanitize_url, @@ -42,6 +41,7 @@ from .utils import ( write_string, ) from .utils._utils import _YDLLogger +from .utils.networking import normalize_url CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} @@ -1308,7 +1308,7 @@ class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): def get_cookie_header(self, url): """Generate a Cookie HTTP header for a given url""" - cookie_req = urllib.request.Request(escape_url(sanitize_url(url))) + cookie_req = urllib.request.Request(normalize_url(sanitize_url(url))) self.add_cookie_header(cookie_req) return cookie_req.get_header('Cookie') @@ -1317,7 +1317,7 @@ class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): # Policy `_now` attribute must be set before calling `_cookies_for_request` # Ref: https://github.com/python/cpython/blob/3.7/Lib/http/cookiejar.py#L1360 self._policy._now = self._now = int(time.time()) - return self._cookies_for_request(urllib.request.Request(escape_url(sanitize_url(url)))) + return self._cookies_for_request(urllib.request.Request(normalize_url(sanitize_url(url)))) def clear(self, *args, **kwargs): with contextlib.suppress(KeyError): diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 3fe5fa52e..0c4794954 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -41,7 +41,8 @@ from .exceptions import ( from ..dependencies import brotli from ..socks import ProxyError as SocksProxyError from ..socks import sockssocket -from ..utils import escape_url, update_url_query +from ..utils import update_url_query +from ..utils.networking import normalize_url 
SUPPORTED_ENCODINGS = ['gzip', 'deflate'] CONTENT_DECODE_ERRORS = [zlib.error, OSError] @@ -179,7 +180,7 @@ class HTTPHandler(urllib.request.AbstractHTTPHandler): # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) # the code of this workaround has been moved here from YoutubeDL.urlopen() url = req.get_full_url() - url_escaped = escape_url(url) + url_escaped = normalize_url(url) # Substitute URL if any change after escaping if url != url_escaped: @@ -212,7 +213,7 @@ class HTTPHandler(urllib.request.AbstractHTTPHandler): if location: # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 location = location.encode('iso-8859-1').decode() - location_escaped = escape_url(location) + location_escaped = normalize_url(location) if location != location_escaped: del resp.headers['Location'] resp.headers['Location'] = location_escaped diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index 3164df49b..792e062fd 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -27,10 +27,9 @@ from ..utils import ( classproperty, deprecation_warning, error_to_str, - escape_url, update_url_query, ) -from ..utils.networking import HTTPHeaderDict +from ..utils.networking import HTTPHeaderDict, normalize_url if typing.TYPE_CHECKING: RequestData = bytes | Iterable[bytes] | typing.IO | None @@ -372,7 +371,7 @@ class Request: raise TypeError('url must be a string') elif url.startswith('//'): url = 'http:' + url - self._url = escape_url(url) + self._url = normalize_url(url) @property def method(self): diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index 077000971..dde02092c 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -8,6 +8,8 @@ import urllib.request import zlib from ._utils import Popen, decode_base_n, preferredencoding +from .networking import escape_rfc3986 # noqa: F401 +from .networking import normalize_url as escape_url # noqa: F401 from .traversal 
import traverse_obj from ..dependencies import certifi, websockets from ..networking._helper import make_ssl_context @@ -197,7 +199,7 @@ def request_to_url(req): def sanitized_Request(url, *args, **kwargs): - from ..utils import escape_url, extract_basic_auth, sanitize_url + from ..utils import extract_basic_auth, sanitize_url url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) if auth_header is not None: headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {}) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index abae0f17e..f5552ce80 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2464,23 +2464,6 @@ def lowercase_escape(s): s) -def escape_rfc3986(s): - """Escape non-ASCII characters as suggested by RFC 3986""" - return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") - - -def escape_url(url): - """Escape URL as suggested by RFC 3986""" - url_parsed = urllib.parse.urlparse(url) - return url_parsed._replace( - netloc=url_parsed.netloc.encode('idna').decode('ascii'), - path=escape_rfc3986(url_parsed.path), - params=escape_rfc3986(url_parsed.params), - query=escape_rfc3986(url_parsed.query), - fragment=escape_rfc3986(url_parsed.fragment) - ).geturl() - - def parse_qs(url, **kwargs): return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs) diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py index e6515ec8e..bbcea84d2 100644 --- a/yt_dlp/utils/networking.py +++ b/yt_dlp/utils/networking.py @@ -121,3 +121,41 @@ def clean_headers(headers: HTTPHeaderDict): if 'Youtubedl-No-Compression' in headers: # compat del headers['Youtubedl-No-Compression'] headers['Accept-Encoding'] = 'identity' + + +def remove_dot_segments(path): + # Implements RFC3986 5.2.4 remote_dot_segments + # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4 + # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263 + output = [] + segments = 
path.split('/') + for s in segments: + if s == '.': + continue + elif s == '..': + if output: + output.pop() + else: + output.append(s) + if not segments[0] and (not output or output[0]): + output.insert(0, '') + if segments[-1] in ('.', '..'): + output.append('') + return '/'.join(output) + + +def escape_rfc3986(s): + """Escape non-ASCII characters as suggested by RFC 3986""" + return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") + + +def normalize_url(url): + """Normalize URL as suggested by RFC 3986""" + url_parsed = urllib.parse.urlparse(url) + return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), + path=escape_rfc3986(remove_dot_segments(url_parsed.path)), + params=escape_rfc3986(url_parsed.params), + query=escape_rfc3986(url_parsed.query), + fragment=escape_rfc3986(url_parsed.fragment) + ).geturl() |