From 08916a49c777cb6e000eec092881eb93ec22076c Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 27 May 2023 19:06:13 +1200 Subject: [core] Improve HTTP redirect handling (#7094) Aligns HTTP redirect handling with what browsers commonly do and RFC standards. Fixes issues https://github.com/yt-dlp/yt-dlp/commit/afac4caa7db30804bebac33e53c3cb0237958224 missed. Authored by: coletdjnz --- yt_dlp/utils/_utils.py | 59 ++++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 38 deletions(-) (limited to 'yt_dlp/utils/_utils.py') diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 238b0fe69..d78022295 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1664,61 +1664,44 @@ class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler): The code is based on HTTPRedirectHandler implementation from CPython [1]. - This redirect handler solves two issues: - - ensures redirect URL is always unicode under python 2 - - introduces support for experimental HTTP response status code - 308 Permanent Redirect [2] used by some sites [3] + This redirect handler fixes and improves the logic to better align with RFC7231 + and what browsers tend to do [2][3] 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py - 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308 - 3. https://github.com/ytdl-org/youtube-dl/issues/28768 + 2. https://datatracker.ietf.org/doc/html/rfc7231 + 3. https://github.com/python/cpython/issues/91306 """ http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302 def redirect_request(self, req, fp, code, msg, headers, newurl): - """Return a Request or None in response to a redirect. - - This is called by the http_error_30x methods when a - redirection response is received. If a redirection should - take place, return a new Request to allow http_error_30x to - perform the redirect. 
Otherwise, raise HTTPError if no-one - else should try to handle this url. Return None if you can't - but another Handler might. - """ - m = req.get_method() - if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") - or code in (301, 302, 303) and m == "POST")): + if code not in (301, 302, 303, 307, 308): raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp) - # Strictly (according to RFC 2616), 301 or 302 in response to - # a POST MUST NOT cause a redirection without confirmation - # from the user (of urllib.request, in this case). In practice, - # essentially all clients do redirect in this case, so we do - # the same. - - # Be conciliant with URIs containing a space. This is mainly - # redundant with the more complete encoding done in http_error_302(), - # but it is kept for compatibility with other callers. - newurl = newurl.replace(' ', '%20') - - CONTENT_HEADERS = ("content-length", "content-type") - # NB: don't use dict comprehension for python 2.6 compatibility - newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS} + new_method = req.get_method() + new_data = req.data + remove_headers = [] # A 303 must either use GET or HEAD for subsequent request # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4 - if code == 303 and m != 'HEAD': - m = 'GET' + if code == 303 and req.get_method() != 'HEAD': + new_method = 'GET' # 301 and 302 redirects are commonly turned into a GET from a POST # for subsequent requests by browsers, so we'll do the same. # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3 - if code in (301, 302) and m == 'POST': - m = 'GET' + elif code in (301, 302) and req.get_method() == 'POST': + new_method = 'GET' + + # only remove payload if method changed (e.g. 
POST to GET) + if new_method != req.get_method(): + new_data = None + remove_headers.extend(['Content-Length', 'Content-Type']) + + new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers} return urllib.request.Request( - newurl, headers=newheaders, origin_req_host=req.origin_req_host, - unverifiable=True, method=m) + newurl, headers=new_headers, origin_req_host=req.origin_req_host, + unverifiable=True, method=new_method, data=new_data) def extract_timezone(date_str): -- cgit v1.2.3