aboutsummaryrefslogtreecommitdiffstats
path: root/yt_dlp/utils/_utils.py
diff options
context:
space:
mode:
authorcoletdjnz <coletdjnz@protonmail.com>2023-05-27 19:06:13 +1200
committerGitHub <noreply@github.com>2023-05-27 07:06:13 +0000
commit08916a49c777cb6e000eec092881eb93ec22076c (patch)
tree7ed5b19bd12e959abafbe0adda4e749543318197 /yt_dlp/utils/_utils.py
parent66468bbf49562ff82670cbbd456c5e8448a6df34 (diff)
downloadhypervideo-pre-08916a49c777cb6e000eec092881eb93ec22076c.tar.lz
hypervideo-pre-08916a49c777cb6e000eec092881eb93ec22076c.tar.xz
hypervideo-pre-08916a49c777cb6e000eec092881eb93ec22076c.zip
[core] Improve HTTP redirect handling (#7094)
Aligns HTTP redirect handling with what browsers commonly do and RFC standards. Fixes issues https://github.com/yt-dlp/yt-dlp/commit/afac4caa7db30804bebac33e53c3cb0237958224 missed. Authored by: coletdjnz
Diffstat (limited to 'yt_dlp/utils/_utils.py')
-rw-r--r--yt_dlp/utils/_utils.py59
1 files changed, 21 insertions, 38 deletions
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 238b0fe69..d78022295 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -1664,61 +1664,44 @@ class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
The code is based on HTTPRedirectHandler implementation from CPython [1].
- This redirect handler solves two issues:
- - ensures redirect URL is always unicode under python 2
- - introduces support for experimental HTTP response status code
- 308 Permanent Redirect [2] used by some sites [3]
+ This redirect handler fixes and improves the logic to better align with RFC7231
+ and what browsers tend to do [2][3]
1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
- 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
- 3. https://github.com/ytdl-org/youtube-dl/issues/28768
+ 2. https://datatracker.ietf.org/doc/html/rfc7231
+ 3. https://github.com/python/cpython/issues/91306
"""
http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
def redirect_request(self, req, fp, code, msg, headers, newurl):
- """Return a Request or None in response to a redirect.
-
- This is called by the http_error_30x methods when a
- redirection response is received. If a redirection should
- take place, return a new Request to allow http_error_30x to
- perform the redirect. Otherwise, raise HTTPError if no-one
- else should try to handle this url. Return None if you can't
- but another Handler might.
- """
- m = req.get_method()
- if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
- or code in (301, 302, 303) and m == "POST")):
+ if code not in (301, 302, 303, 307, 308):
raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
- # Strictly (according to RFC 2616), 301 or 302 in response to
- # a POST MUST NOT cause a redirection without confirmation
- # from the user (of urllib.request, in this case). In practice,
- # essentially all clients do redirect in this case, so we do
- # the same.
-
- # Be conciliant with URIs containing a space. This is mainly
- # redundant with the more complete encoding done in http_error_302(),
- # but it is kept for compatibility with other callers.
- newurl = newurl.replace(' ', '%20')
-
- CONTENT_HEADERS = ("content-length", "content-type")
- # NB: don't use dict comprehension for python 2.6 compatibility
- newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
+ new_method = req.get_method()
+ new_data = req.data
+ remove_headers = []
# A 303 must either use GET or HEAD for subsequent request
# https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
- if code == 303 and m != 'HEAD':
- m = 'GET'
+ if code == 303 and req.get_method() != 'HEAD':
+ new_method = 'GET'
# 301 and 302 redirects are commonly turned into a GET from a POST
# for subsequent requests by browsers, so we'll do the same.
# https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
# https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
- if code in (301, 302) and m == 'POST':
- m = 'GET'
+ elif code in (301, 302) and req.get_method() == 'POST':
+ new_method = 'GET'
+
+ # only remove payload if method changed (e.g. POST to GET)
+ if new_method != req.get_method():
+ new_data = None
+ remove_headers.extend(['Content-Length', 'Content-Type'])
+
+ new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}
return urllib.request.Request(
- newurl, headers=newheaders, origin_req_host=req.origin_req_host,
- unverifiable=True, method=m)
+ newurl, headers=new_headers, origin_req_host=req.origin_req_host,
+ unverifiable=True, method=new_method, data=new_data)
def extract_timezone(date_str):