diff options
author | coletdjnz <coletdjnz@protonmail.com> | 2023-07-29 10:40:20 +1200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-28 22:40:20 +0000 |
commit | 4bf912282a34b58b6b35d8f7e6be535770c89c76 (patch) | |
tree | 829a0271e2e709a8a79f2a9de29f72dea8108d05 /yt_dlp/utils/networking.py | |
parent | a15fcd299e767a510debd8dc1646fe863b96ce0e (diff) | |
download | hypervideo-pre-4bf912282a34b58b6b35d8f7e6be535770c89c76.tar.lz hypervideo-pre-4bf912282a34b58b6b35d8f7e6be535770c89c76.tar.xz hypervideo-pre-4bf912282a34b58b6b35d8f7e6be535770c89c76.zip |
[networking] Remove dot segments during URL normalization (#7662)
This implements RFC3986 5.2.4 remove_dot_segments during the URL normalization process.
Closes #3355, #6526
Authored by: coletdjnz
Diffstat (limited to 'yt_dlp/utils/networking.py')
-rw-r--r-- | yt_dlp/utils/networking.py | 38 |
1 files changed, 38 insertions, 0 deletions
# Reconstructed from a flattened unified diff of yt_dlp/utils/networking.py
# (index e6515ec8e..bbcea84d2 100644, hunk @@ -121,3 +121,41 @@).
# The hunk's leading context is the tail of clean_headers(headers: HTTPHeaderDict),
# whose beginning lies outside this view:
#     if 'Youtubedl-No-Compression' in headers:  # compat
#     del headers['Youtubedl-No-Compression']
#     headers['Accept-Encoding'] = 'identity'
# NOTE(review): the nesting of those three statements cannot be recovered from
# the flattened text - confirm against the real file.


def remove_dot_segments(path):
    """Remove "." and ".." segments from a URL path (RFC 3986, section 5.2.4).

    The path is split on "/" and processed segment by segment:
    "." segments are dropped, ".." drops the previously kept segment (if any),
    and every other segment - including empty ones produced by "//" - is kept.

    @param path   The path component of a URL, as a str
    @returns      The path with dot segments resolved. A leading "/" in the
                  input is preserved, and a trailing "/" is added when the
                  input ended in a "." or ".." segment.
    """
    # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
    # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
    output = []
    segments = path.split('/')
    for s in segments:
        if s == '.':
            continue
        elif s == '..':
            # Going above the root is silently ignored
            if output:
                output.pop()
        else:
            output.append(s)

    # An absolute path starts with an empty segment. If ".." consumed it
    # (output is empty or now starts with a real segment), put it back so
    # the result still begins with "/".
    if not segments[0] and (not output or output[0]):
        output.insert(0, '')

    # "/a/." and "/a/.." refer to directories, so keep a trailing slash
    if segments[-1] in ('.', '..'):
        output.append('')
    return '/'.join(output)


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986

    "%" and the listed reserved characters are passed as safe to
    urllib.parse.quote, so existing percent-escapes are not escaped again.
    """
    return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def normalize_url(url):
    """Normalize URL as suggested by RFC 3986

    The netloc is encoded with the "idna" codec, dot segments are removed
    from the path, and the path, params, query and fragment are
    percent-escaped with escape_rfc3986. The scheme is left untouched.

    @param url    The URL to normalize, as a str
    @returns      The normalized URL, as a str
    @raises UnicodeError  if the netloc cannot be encoded by the "idna" codec
    """
    url_parsed = urllib.parse.urlparse(url)
    return url_parsed._replace(
        # The whole netloc (not only the hostname) goes through the codec
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()