aboutsummaryrefslogtreecommitdiffstats
path: root/yt_dlp/utils
diff options
context:
space:
mode:
author: coletdjnz <coletdjnz@protonmail.com>, 2023-07-29 10:40:20 +1200
committer: GitHub <noreply@github.com>, 2023-07-28 22:40:20 +0000
commit: 4bf912282a34b58b6b35d8f7e6be535770c89c76 (patch)
tree: 829a0271e2e709a8a79f2a9de29f72dea8108d05 /yt_dlp/utils
parent: a15fcd299e767a510debd8dc1646fe863b96ce0e (diff)
downloadhypervideo-pre-4bf912282a34b58b6b35d8f7e6be535770c89c76.tar.lz
hypervideo-pre-4bf912282a34b58b6b35d8f7e6be535770c89c76.tar.xz
hypervideo-pre-4bf912282a34b58b6b35d8f7e6be535770c89c76.zip
[networking] Remove dot segments during URL normalization (#7662)
This implements the RFC 3986 section 5.2.4 remove_dot_segments algorithm during the URL normalization process. Closes #3355, #6526. Authored by: coletdjnz
Diffstat (limited to 'yt_dlp/utils')
-rw-r--r-- yt_dlp/utils/_legacy.py 4
-rw-r--r-- yt_dlp/utils/_utils.py 17
-rw-r--r-- yt_dlp/utils/networking.py 38
3 files changed, 41 insertions, 18 deletions
diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py
index 077000971..dde02092c 100644
--- a/yt_dlp/utils/_legacy.py
+++ b/yt_dlp/utils/_legacy.py
@@ -8,6 +8,8 @@ import urllib.request
import zlib
from ._utils import Popen, decode_base_n, preferredencoding
+from .networking import escape_rfc3986 # noqa: F401
+from .networking import normalize_url as escape_url # noqa: F401
from .traversal import traverse_obj
from ..dependencies import certifi, websockets
from ..networking._helper import make_ssl_context
@@ -197,7 +199,7 @@ def request_to_url(req):
def sanitized_Request(url, *args, **kwargs):
- from ..utils import escape_url, extract_basic_auth, sanitize_url
+ from ..utils import extract_basic_auth, sanitize_url
url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
if auth_header is not None:
headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index abae0f17e..f5552ce80 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -2464,23 +2464,6 @@ def lowercase_escape(s):
s)
-def escape_rfc3986(s):
- """Escape non-ASCII characters as suggested by RFC 3986"""
- return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
-
-
-def escape_url(url):
- """Escape URL as suggested by RFC 3986"""
- url_parsed = urllib.parse.urlparse(url)
- return url_parsed._replace(
- netloc=url_parsed.netloc.encode('idna').decode('ascii'),
- path=escape_rfc3986(url_parsed.path),
- params=escape_rfc3986(url_parsed.params),
- query=escape_rfc3986(url_parsed.query),
- fragment=escape_rfc3986(url_parsed.fragment)
- ).geturl()
-
-
def parse_qs(url, **kwargs):
return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
diff --git a/yt_dlp/utils/networking.py b/yt_dlp/utils/networking.py
index e6515ec8e..bbcea84d2 100644
--- a/yt_dlp/utils/networking.py
+++ b/yt_dlp/utils/networking.py
@@ -121,3 +121,41 @@ def clean_headers(headers: HTTPHeaderDict):
if 'Youtubedl-No-Compression' in headers: # compat
del headers['Youtubedl-No-Compression']
headers['Accept-Encoding'] = 'identity'
+
+
+def remove_dot_segments(path):
+ # Implements RFC3986 5.2.4 remove_dot_segments
+ # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
+ # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
+ output = []
+ segments = path.split('/')
+ for s in segments:
+ if s == '.':
+ continue
+ elif s == '..':
+ if output:
+ output.pop()
+ else:
+ output.append(s)
+ if not segments[0] and (not output or output[0]):
+ output.insert(0, '')
+ if segments[-1] in ('.', '..'):
+ output.append('')
+ return '/'.join(output)
+
+
+def escape_rfc3986(s):
+ """Escape non-ASCII characters as suggested by RFC 3986"""
+ return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
+
+
+def normalize_url(url):
+ """Normalize URL as suggested by RFC 3986"""
+ url_parsed = urllib.parse.urlparse(url)
+ return url_parsed._replace(
+ netloc=url_parsed.netloc.encode('idna').decode('ascii'),
+ path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
+ params=escape_rfc3986(url_parsed.params),
+ query=escape_rfc3986(url_parsed.query),
+ fragment=escape_rfc3986(url_parsed.fragment)
+ ).geturl()