diff options
author | coletdjnz <coletdjnz@protonmail.com> | 2023-07-29 10:40:20 +1200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-28 22:40:20 +0000 |
commit | 4bf912282a34b58b6b35d8f7e6be535770c89c76 (patch) | |
tree | 829a0271e2e709a8a79f2a9de29f72dea8108d05 /yt_dlp/utils | |
parent | a15fcd299e767a510debd8dc1646fe863b96ce0e (diff) | |
download | hypervideo-pre-4bf912282a34b58b6b35d8f7e6be535770c89c76.tar.lz hypervideo-pre-4bf912282a34b58b6b35d8f7e6be535770c89c76.tar.xz hypervideo-pre-4bf912282a34b58b6b35d8f7e6be535770c89c76.zip |
[networking] Remove dot segments during URL normalization (#7662)
This implements RFC3986 5.2.4 remove_dot_segments during the URL normalization process.
Closes #3355, #6526
Authored by: coletdjnz
Diffstat (limited to 'yt_dlp/utils')
-rw-r--r-- | yt_dlp/utils/_legacy.py | 4 | ||||
-rw-r--r-- | yt_dlp/utils/_utils.py | 17 | ||||
-rw-r--r-- | yt_dlp/utils/networking.py | 38 |
3 files changed, 41 insertions, 18 deletions
```diff
diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py
index 077000971..dde02092c 100644
--- a/yt_dlp/utils/_legacy.py
+++ b/yt_dlp/utils/_legacy.py
@@ -8,6 +8,8 @@ import urllib.request
 import zlib
 
 from ._utils import Popen, decode_base_n, preferredencoding
+from .networking import escape_rfc3986  # noqa: F401
+from .networking import normalize_url as escape_url  # noqa: F401
 from .traversal import traverse_obj
 from ..dependencies import certifi, websockets
 from ..networking._helper import make_ssl_context
@@ -197,7 +199,7 @@ def request_to_url(req):
 
 
 def sanitized_Request(url, *args, **kwargs):
-    from ..utils import escape_url, extract_basic_auth, sanitize_url
+    from ..utils import extract_basic_auth, sanitize_url
     url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
     if auth_header is not None:
         headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index abae0f17e..f5552ce80 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -2464,23 +2464,6 @@ def lowercase_escape(s):
         s)
 
 
-def escape_rfc3986(s):
-    """Escape non-ASCII characters as suggested by RFC 3986"""
-    return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
-
-
-def escape_url(url):
-    """Escape URL as suggested by RFC 3986"""
-    url_parsed = urllib.parse.urlparse(url)
-    return url_parsed._replace(
-        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
-        path=escape_rfc3986(url_parsed.path),
-        params=escape_rfc3986(url_parsed.params),
-        query=escape_rfc3986(url_parsed.query),
-        fragment=escape_rfc3986(url_parsed.fragment)
-    ).geturl()
-
-
 def parse_qs(url, **kwargs):
     return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
 
diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py
index e6515ec8e..bbcea84d2 100644
--- a/yt_dlp/utils/networking.py
+++ b/yt_dlp/utils/networking.py
@@ -121,3 +121,41 @@ def clean_headers(headers: HTTPHeaderDict):
     if 'Youtubedl-No-Compression' in headers:  # compat
         del headers['Youtubedl-No-Compression']
         headers['Accept-Encoding'] = 'identity'
+
+
+def remove_dot_segments(path):
+    # Implements RFC3986 5.2.4 remote_dot_segments
+    # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
+    # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
+    output = []
+    segments = path.split('/')
+    for s in segments:
+        if s == '.':
+            continue
+        elif s == '..':
+            if output:
+                output.pop()
+        else:
+            output.append(s)
+    if not segments[0] and (not output or output[0]):
+        output.insert(0, '')
+    if segments[-1] in ('.', '..'):
+        output.append('')
+    return '/'.join(output)
+
+
+def escape_rfc3986(s):
+    """Escape non-ASCII characters as suggested by RFC 3986"""
+    return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
+
+
+def normalize_url(url):
+    """Normalize URL as suggested by RFC 3986"""
+    url_parsed = urllib.parse.urlparse(url)
+    return url_parsed._replace(
+        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
+        path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
+        params=escape_rfc3986(url_parsed.params),
+        query=escape_rfc3986(url_parsed.query),
+        fragment=escape_rfc3986(url_parsed.fragment)
+    ).geturl()
```