| author | pukkandan <pukkandan.ytdlp@gmail.com> | 2023-07-15 14:30:08 +0530 |
|---|---|---|
| committer | pukkandan <pukkandan.ytdlp@gmail.com> | 2023-07-15 16:18:34 +0530 |
| commit | c365dba8430ee33abda85d31f95128605bf240eb (patch) | |
| tree | 174d35c23267863ca89a1a030935b445edf56799 /yt_dlp/networking/_urllib.py | |
| parent | 1b392f905d20ef1f1b300b180f867d43c9ce49b8 (diff) | |
[networking] Add module (#2861)
No actual changes - code is only moved around
Diffstat (limited to 'yt_dlp/networking/_urllib.py')
-rw-r--r-- | yt_dlp/networking/_urllib.py | 315 |
1 file changed, 315 insertions, 0 deletions
diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py
new file mode 100644
index 000000000..1f5871ae6
--- /dev/null
+++ b/yt_dlp/networking/_urllib.py
@@ -0,0 +1,315 @@
+import functools
+import gzip
+import http.client
+import io
+import socket
+import ssl
+import urllib.error
+import urllib.parse
+import urllib.request
+import urllib.response
+import zlib
+
+from ._helper import (
+    add_accept_encoding_header,
+    get_redirect_method,
+    make_socks_proxy_opts,
+)
+from ..dependencies import brotli
+from ..socks import sockssocket
+from ..utils import escape_url, update_url_query
+from ..utils.networking import clean_headers, std_headers
+
+SUPPORTED_ENCODINGS = ['gzip', 'deflate']
+
+if brotli:
+    SUPPORTED_ENCODINGS.append('br')
+
+
+def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
+    hc = http_class(*args, **kwargs)
+    source_address = ydl_handler._params.get('source_address')
+
+    if source_address is not None:
+        # This is to work around _create_connection() from socket, where it will try all
+        # address data from getaddrinfo() including IPv6. This filters the result from
+        # getaddrinfo() based on the source_address value.
+        # This is based on the cpython socket.create_connection() function.
+        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
+        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
+            host, port = address
+            err = None
+            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
+            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
+            ip_addrs = [addr for addr in addrs if addr[0] == af]
+            if addrs and not ip_addrs:
+                ip_version = 'v4' if af == socket.AF_INET else 'v6'
+                raise OSError(
+                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
+                    % (ip_version, source_address[0]))
+            for res in ip_addrs:
+                af, socktype, proto, canonname, sa = res
+                sock = None
+                try:
+                    sock = socket.socket(af, socktype, proto)
+                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
+                        sock.settimeout(timeout)
+                    sock.bind(source_address)
+                    sock.connect(sa)
+                    err = None  # Explicitly break reference cycle
+                    return sock
+                except OSError as _:
+                    err = _
+                    if sock is not None:
+                        sock.close()
+            if err is not None:
+                raise err
+            else:
+                raise OSError('getaddrinfo returns an empty list')
+        if hasattr(hc, '_create_connection'):
+            hc._create_connection = _create_connection
+        hc.source_address = (source_address, 0)
+
+    return hc
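The `_create_connection` override above filters `getaddrinfo()` results down to the address family of the configured source address, so that binding an IPv4 source address never attempts an IPv6 connect (and vice versa). A minimal standalone sketch of the same filtering idea, with illustrative names that are not part of the module:

```python
import socket

def connect_from(host, port, source_ip, timeout=10.0):
    # Pick the address family implied by the source IP; a '.' suggests IPv4,
    # mirroring the heuristic used by _create_connection above
    af = socket.AF_INET if '.' in source_ip else socket.AF_INET6
    # Asking getaddrinfo() for that family filters out incompatible results
    addrs = socket.getaddrinfo(host, port, af, socket.SOCK_STREAM)
    if not addrs:
        raise OSError(f'no {af.name} addresses available for {host}')
    af, socktype, proto, _, sa = addrs[0]
    sock = socket.socket(af, socktype, proto)
    sock.settimeout(timeout)
    sock.bind((source_ip, 0))  # bind the outgoing side to the source address
    sock.connect(sa)
    return sock
```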
+ """ + + def __init__(self, params, *args, **kwargs): + urllib.request.HTTPHandler.__init__(self, *args, **kwargs) + self._params = params + + def http_open(self, req): + conn_class = http.client.HTTPConnection + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + + return self.do_open(functools.partial( + _create_http_connection, self, conn_class, False), + req) + + @staticmethod + def deflate(data): + if not data: + return data + try: + return zlib.decompress(data, -zlib.MAX_WBITS) + except zlib.error: + return zlib.decompress(data) + + @staticmethod + def brotli(data): + if not data: + return data + return brotli.decompress(data) + + @staticmethod + def gz(data): + gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb') + try: + return gz.read() + except OSError as original_oserror: + # There may be junk add the end of the file + # See http://stackoverflow.com/q/4928560/35070 for details + for i in range(1, 1024): + try: + gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb') + return gz.read() + except OSError: + continue + else: + raise original_oserror + + def http_request(self, req): + # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not + # always respected by websites, some tend to give out URLs with non percent-encoded + # non-ASCII characters (see telemb.py, ard.py [#3412]) + # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) + # To work around aforementioned issue we will replace request's original URL with + # percent-encoded one + # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) + # the code of this workaround has been moved here from YoutubeDL.urlopen() + url = req.get_full_url() + url_escaped = escape_url(url) + + # Substitute URL if any change after escaping + if url != url_escaped: + req = update_Request(req, url=url_escaped) + + for h, v in self._params.get('http_headers', std_headers).items(): + # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 + # The dict keys are capitalized because of this bug by urllib + if h.capitalize() not in req.headers: + req.add_header(h, v) + + clean_headers(req.headers) + add_accept_encoding_header(req.headers, SUPPORTED_ENCODINGS) + return super().do_request_(req) + + def http_response(self, req, resp): + old_resp = resp + + # Content-Encoding header lists the encodings in order that they were applied [1]. + # To decompress, we simply do the reverse. + # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding + decoded_response = None + for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))): + if encoding == 'gzip': + decoded_response = self.gz(decoded_response or resp.read()) + elif encoding == 'deflate': + decoded_response = self.deflate(decoded_response or resp.read()) + elif encoding == 'br' and brotli: + decoded_response = self.brotli(decoded_response or resp.read()) + + if decoded_response is not None: + resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code) + resp.msg = old_resp.msg + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see + # https://github.com/ytdl-org/youtube-dl/issues/6457). 
+ if 300 <= resp.code < 400: + location = resp.headers.get('Location') + if location: + # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 + location = location.encode('iso-8859-1').decode() + location_escaped = escape_url(location) + if location != location_escaped: + del resp.headers['Location'] + resp.headers['Location'] = location_escaped + return resp + + https_request = http_request + https_response = http_response + + +def make_socks_conn_class(base_class, socks_proxy): + assert issubclass(base_class, ( + http.client.HTTPConnection, http.client.HTTPSConnection)) + + proxy_args = make_socks_proxy_opts(socks_proxy) + + class SocksConnection(base_class): + def connect(self): + self.sock = sockssocket() + self.sock.setproxy(**proxy_args) + if isinstance(self.timeout, (int, float)): + self.sock.settimeout(self.timeout) + self.sock.connect((self.host, self.port)) + + if isinstance(self, http.client.HTTPSConnection): + if hasattr(self, '_context'): # Python > 2.6 + self.sock = self._context.wrap_socket( + self.sock, server_hostname=self.host) + else: + self.sock = ssl.wrap_socket(self.sock) + + return SocksConnection + + +class RedirectHandler(urllib.request.HTTPRedirectHandler): + """YoutubeDL redirect handler + + The code is based on HTTPRedirectHandler implementation from CPython [1]. + + This redirect handler fixes and improves the logic to better align with RFC7261 + and what browsers tend to do [2][3] + + 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py + 2. https://datatracker.ietf.org/doc/html/rfc7231 + 3. https://github.com/python/cpython/issues/91306 + """ + + http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302 + + def redirect_request(self, req, fp, code, msg, headers, newurl): + if code not in (301, 302, 303, 307, 308): + raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp) + + new_data = req.data + + # Technically the Cookie header should be in unredirected_hdrs, + # however in practice some may set it in normal headers anyway. + # We will remove it here to prevent any leaks. + remove_headers = ['Cookie'] + + new_method = get_redirect_method(req.get_method(), code) + # only remove payload if method changed (e.g. 
+
+
+class ProxyHandler(urllib.request.ProxyHandler):
+    def __init__(self, proxies=None):
+        # Set default handlers
+        for type in ('http', 'https'):
+            setattr(self, '%s_open' % type,
+                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
+                    meth(r, proxy, type))
+        urllib.request.ProxyHandler.__init__(self, proxies)
+
+    def proxy_open(self, req, proxy, type):
+        req_proxy = req.headers.get('Ytdl-request-proxy')
+        if req_proxy is not None:
+            proxy = req_proxy
+            del req.headers['Ytdl-request-proxy']
+
+        if proxy == '__noproxy__':
+            return None  # No Proxy
+        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
+            req.add_header('Ytdl-socks-proxy', proxy)
+            # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS
+            return None
+        return urllib.request.ProxyHandler.proxy_open(
+            self, req, proxy, type)
+
+
+class PUTRequest(urllib.request.Request):
+    def get_method(self):
+        return 'PUT'
+
+
+class HEADRequest(urllib.request.Request):
+    def get_method(self):
+        return 'HEAD'
+
+
+def update_Request(req, url=None, data=None, headers=None, query=None):
+    req_headers = req.headers.copy()
+    req_headers.update(headers or {})
+    req_data = data or req.data
+    req_url = update_url_query(url or req.get_full_url(), query)
+    req_get_method = req.get_method()
+    if req_get_method == 'HEAD':
+        req_type = HEADRequest
+    elif req_get_method == 'PUT':
+        req_type = PUTRequest
+    else:
+        req_type = urllib.request.Request
+    new_req = req_type(
+        req_url, data=req_data, headers=req_headers,
+        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+    if hasattr(req, 'timeout'):
+        new_req.timeout = req.timeout
+    return new_req
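These handlers are designed to be installed together on a urllib `OpenerDirector`. A minimal usage sketch, assuming a yt-dlp-style `params` dict; the real wiring inside yt-dlp is more involved than this:

```python
import urllib.request

from yt_dlp.networking._urllib import HTTPHandler, ProxyHandler, RedirectHandler

params = {'http_headers': {'User-Agent': 'example/1.0'}}  # hypothetical params

opener = urllib.request.build_opener(
    ProxyHandler(),       # routes Ytdl-request-proxy and SOCKS proxy schemes
    HTTPHandler(params),  # standard headers plus gzip/deflate/br decoding
    RedirectHandler(),    # RFC 7231-style redirect method handling
)

with opener.open('https://example.com') as resp:
    print(resp.status, len(resp.read()))
```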