From b32330be4f15dd044e6212f526e52375f0a0f6c2 Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Thu, 21 Feb 2019 21:32:31 -0800
Subject: refactor common.py into 3 files

---
 youtube/util.py | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 youtube/util.py

(limited to 'youtube/util.py')

diff --git a/youtube/util.py b/youtube/util.py
new file mode 100644
index 0000000..2ebd8bc
--- /dev/null
+++ b/youtube/util.py
@@ -0,0 +1,153 @@
+import socks, sockshandler
+import gzip
+import brotli
+import urllib.parse
+import re
+import time
+import settings
+
+
+URL_ORIGIN = "/https://www.youtube.com"
+
+
+class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
+    '''Separate cookiejars for receiving and sending'''
+    def __init__(self, cookiejar_send=None, cookiejar_receive=None):
+        import http.cookiejar
+        self.cookiejar_send = cookiejar_send
+        self.cookiejar_receive = cookiejar_receive
+
+    def http_request(self, request):
+        if self.cookiejar_send is not None:
+            self.cookiejar_send.add_cookie_header(request)
+        return request
+
+    def http_response(self, request, response):
+        if self.cookiejar_receive is not None:
+            self.cookiejar_receive.extract_cookies(response, request)
+        return response
+
+    https_request = http_request
+    https_response = http_response
+
+
+def decode_content(content, encoding_header):
+    encodings = encoding_header.replace(' ', '').split(',')
+    for encoding in reversed(encodings):
+        if encoding == 'identity':
+            continue
+        if encoding == 'br':
+            content = brotli.decompress(content)
+        elif encoding == 'gzip':
+            content = gzip.decompress(content)
+    return content
+
+def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookiejar_send=None, cookiejar_receive=None, use_tor=True):
+    '''
+    When cookiejar_send is set to a CookieJar object,
+     those cookies will be sent in the request (but cookies in response will not be merged into it)
+    When cookiejar_receive is set to a CookieJar object,
+     cookies received in the response will be merged into the object (nothing will be sent from it)
+    When both are set to the same object, cookies will be sent from the object,
+     and response cookies will be merged into it.
+    '''
+    headers = dict(headers)     # Note: Calling dict() on a dict will make a copy
+    headers['Accept-Encoding'] = 'gzip, br'
+
+    # prevent python version being leaked by urllib if User-Agent isn't provided
+    # (urllib will use ex. Python-urllib/3.6 otherwise)
+    if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers:
+        headers['User-Agent'] = 'Python-urllib'
+
+    if data is not None:
+        if isinstance(data, str):
+            data = data.encode('ascii')
+        elif not isinstance(data, bytes):
+            data = urllib.parse.urlencode(data).encode('ascii')
+
+    start_time = time.time()
+
+
+    req = urllib.request.Request(url, data=data, headers=headers)
+
+    cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
+
+    if use_tor and settings.route_tor:
+        opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150), cookie_processor)
+    else:
+        opener = urllib.request.build_opener(cookie_processor)
+
+    response = opener.open(req, timeout=timeout)
+    response_time = time.time()
+
+
+    content = response.read()
+    read_finish = time.time()
+    if report_text:
+        print(report_text, '    Latency:', round(response_time - start_time,3), '    Read time:', round(read_finish - response_time,3))
+    content = decode_content(content, response.getheader('Content-Encoding', default='identity'))
+    return content
+
+mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
+mobile_ua = (('User-Agent', mobile_user_agent),)
+desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
+desktop_ua = (('User-Agent', desktop_user_agent),)
+
+
+
+
+
+
+
+
+
+
+def dict_add(*dicts):
+    for dictionary in dicts[1:]:
+        dicts[0].update(dictionary)
+    return dicts[0]
+
+def video_id(url):
+    url_parts = urllib.parse.urlparse(url)
+    return urllib.parse.parse_qs(url_parts.query)['v'][0]
+
+def default_multi_get(object, *keys, default):
+    ''' Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices. Last argument is the default value to use in case of any IndexErrors or KeyErrors '''
+    try:
+        for key in keys:
+            object = object[key]
+        return object
+    except (IndexError, KeyError):
+        return default
+
+
+# default, sddefault, mqdefault, hqdefault, hq720
+def get_thumbnail_url(video_id):
+    return "/i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
+
+def seconds_to_timestamp(seconds):
+    seconds = int(seconds)
+    hours, seconds = divmod(seconds, 3600)
+    minutes, seconds = divmod(seconds, 60)
+    if hours != 0:
+        timestamp = str(hours) + ":"
+        timestamp += str(minutes).zfill(2)  # zfill pads with zeros
+    else:
+        timestamp = str(minutes)
+
+    timestamp += ":" + str(seconds).zfill(2)
+    return timestamp
+
+
+
+def update_query_string(query_string, items):
+    parameters = urllib.parse.parse_qs(query_string)
+    parameters.update(items)
+    return urllib.parse.urlencode(parameters, doseq=True)
+
+
+
+def uppercase_escape(s):
+    return re.sub(
+        r'\\U([0-9a-fA-F]{8})',
+        lambda m: chr(int(m.group(1), base=16)), s)
\ No newline at end of file
--
cgit v1.2.3
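
For orientation between the patches: a minimal sketch of how the module created above might be called, assuming the youtube package is importable and settings is configured. The URLs are illustrative placeholders, not paths taken from the patches.

    import http.cookiejar
    from youtube import util

    # Plain fetch: decode_content() has already reversed the gzip/brotli
    # Content-Encoding, so `page` is the decompressed body as bytes.
    page = util.fetch_url('https://www.youtube.com/feed/trending',
                          headers=util.desktop_ua, report_text='trending')

    # Asymmetric cookie handling: send cookies from one jar while collecting
    # response cookies in another, so the outgoing jar is never polluted by
    # whatever the server sets back.
    send_jar = http.cookiejar.CookieJar()
    receive_jar = http.cookiejar.CookieJar()
    page = util.fetch_url('https://www.youtube.com/',
                          headers=util.mobile_ua,
                          cookiejar_send=send_jar,
                          cookiejar_receive=receive_jar)

Passing the same CookieJar as both arguments gives ordinary symmetric cookie behavior, per the fetch_url docstring.
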
From 1e1f55c9e5d98ba076bc67e7abe9e4d77d84c65b Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Mon, 25 Feb 2019 20:47:49 -0800
Subject: Use persistent connections

---
 youtube/util.py | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 87 insertions(+), 11 deletions(-)

(limited to 'youtube/util.py')

diff --git a/youtube/util.py b/youtube/util.py
index 2ebd8bc..e0f4a2c 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -1,14 +1,75 @@
+import settings
 import socks, sockshandler
 import gzip
 import brotli
 import urllib.parse
 import re
 import time
-import settings
 
+# The trouble with the requests library: It ships its own certificate bundle via certifi
+# instead of using the system certificate store, meaning self-signed certificates
+# configured by the user will not work. Some draconian networks block TLS unless a corporate
+# certificate is installed on the system. Additionally, some users install a self-signed cert
+# in order to use programs to modify or monitor requests made by programs on the system.
+
+# Finally, certificates expire and need to be updated, or are sometimes revoked. Sometimes
+# certificate authorities go rogue and need to be untrusted. Since we are going through Tor exit nodes,
+# this becomes all the more important. A rogue CA could issue a fake certificate for accounts.google.com, and a
+# malicious exit node could use this to decrypt traffic when logging in and retrieve passwords. Examples:
+# https://www.engadget.com/2015/10/29/google-warns-symantec-over-certificates/
+# https://nakedsecurity.sophos.com/2013/12/09/serious-security-google-finds-fake-but-trusted-ssl-certificates-for-its-domains-made-in-france/
+
+# In the requests documentation it says:
+# "Before version 2.16, Requests bundled a set of root CAs that it trusted, sourced from the Mozilla trust store.
+# The certificates were only updated once for each Requests version. When certifi was not installed,
+# this led to extremely out-of-date certificate bundles when using significantly older versions of Requests.
+# For the sake of security we recommend upgrading certifi frequently!"
+# (http://docs.python-requests.org/en/master/user/advanced/#ca-certificates)
+
+# Expecting users to remember to manually update certifi on Linux isn't reasonable in my view.
+# On Windows, this is even worse since I am distributing all dependencies. This program is not
+# updated frequently, and using requests would lead to outdated certificates. Certificates
+# should be updated with OS updates, instead of thousands of developers of different programs
+# being expected to do this correctly 100% of the time.
+
+# There is hope that this might be fixed eventually:
+# https://github.com/kennethreitz/requests/issues/2966
+
+# Until then, I will use a mix of urllib3 and urllib.
+import urllib3
+import urllib3.contrib.socks
 
 URL_ORIGIN = "/https://www.youtube.com"
 
+connection_pool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED')
+
+old_tor_connection_pool = None
+tor_connection_pool = urllib3.contrib.socks.SOCKSProxyManager('socks5://127.0.0.1:9150/', cert_reqs='CERT_REQUIRED')
+
+tor_pool_refresh_time = time.monotonic()    # prevent problems due to clock changes
+
+def get_pool(use_tor):
+    global old_tor_connection_pool
+    global tor_connection_pool
+    global tor_pool_refresh_time
+
+    if not use_tor:
+        return connection_pool
+
+    # Tor changes circuits after 10 minutes: https://tor.stackexchange.com/questions/262/for-how-long-does-a-circuit-stay-alive
+    current_time = time.monotonic()
+    if current_time - tor_pool_refresh_time > 300:  # close pool after 5 minutes
+        tor_connection_pool.clear()
+
+        # Keep a reference for 5 min to avoid it getting garbage collected while sockets are still in use
+        old_tor_connection_pool = tor_connection_pool
+
+        tor_connection_pool = urllib3.contrib.socks.SOCKSProxyManager('socks5://127.0.0.1:9150/', cert_reqs='CERT_REQUIRED')
+        tor_pool_refresh_time = current_time
+
+    return tor_connection_pool
+
+
 class HTTPAsymmetricCookieProcessor(urllib.request.BaseHandler):
     '''Separate cookiejars for receiving and sending'''
@@ -42,7 +103,7 @@ def decode_content(content, encoding_header):
             content = gzip.decompress(content)
     return content
 
-def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookiejar_send=None, cookiejar_receive=None, use_tor=True):
+def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookiejar_send=None, cookiejar_receive=None, use_tor=True, return_response=False):
     '''
     When cookiejar_send is set to a CookieJar object,
      those cookies will be sent in the request (but cookies in response will not be merged into it)
@@ -59,7 +120,9 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja
     if 'User-Agent' not in headers and 'user-agent' not in headers and 'User-agent' not in headers:
         headers['User-Agent'] = 'Python-urllib'
 
+    method = "GET"
     if data is not None:
+        method = "POST"
         if isinstance(data, str):
             data = data.encode('ascii')
         elif not isinstance(data, bytes):
@@ -67,25 +130,38 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja
 
     start_time = time.time()
 
+    if cookiejar_send is not None or cookiejar_receive is not None:     # Use urllib
+        req = urllib.request.Request(url, data=data, headers=headers)
 
-    req = urllib.request.Request(url, data=data, headers=headers)
+        cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
 
-    cookie_processor = HTTPAsymmetricCookieProcessor(cookiejar_send=cookiejar_send, cookiejar_receive=cookiejar_receive)
+        if use_tor and settings.route_tor:
+            opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150), cookie_processor)
+        else:
+            opener = urllib.request.build_opener(cookie_processor)
 
-    if use_tor and settings.route_tor:
-        opener = urllib.request.build_opener(sockshandler.SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150), cookie_processor)
-    else:
-    	opener = urllib.request.build_opener(cookie_processor)
+        response = opener.open(req, timeout=timeout)
+        response_time = time.time()
 
-    response = opener.open(req, timeout=timeout)
-    response_time = time.time()
+        content = response.read()
 
+    else:           # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
+        pool = get_pool(use_tor and settings.route_tor)
+
+        response = pool.request(method, url, headers=headers, timeout=timeout, preload_content=False)
+        response_time = time.time()
+
+        content = response.read()
+        response.release_conn()
 
-    content = response.read()
     read_finish = time.time()
     if report_text:
         print(report_text, '    Latency:', round(response_time - start_time,3), '    Read time:', round(read_finish - response_time,3))
     content = decode_content(content, response.getheader('Content-Encoding', default='identity'))
+
+    if return_response:
+        return content, response
     return content
 
 mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
--
cgit v1.2.3
From 89daa27be257c6170e7d730529bee9ce1294f73d Mon Sep 17 00:00:00 2001
From: James Taylor
Date: Tue, 26 Feb 2019 17:15:51 -0800
Subject: prevent urllib3 from decoding content

---
 youtube/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'youtube/util.py')

diff --git a/youtube/util.py b/youtube/util.py
index e0f4a2c..9950815 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -149,7 +149,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja
     else:           # Use a urllib3 pool. Cookies can't be used since urllib3 doesn't have easy support for them.
         pool = get_pool(use_tor and settings.route_tor)
 
-        response = pool.request(method, url, headers=headers, timeout=timeout, preload_content=False)
+        response = pool.request(method, url, headers=headers, timeout=timeout, preload_content=False, decode_content=False)
         response_time = time.time()
 
         content = response.read()
--
cgit v1.2.3
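
Context for the one-line change above, as I understand it: with urllib3's default decode_content=True, the library decompresses the body itself while the Content-Encoding header stays on the response, so the decode_content() helper in this file would then attempt a second gzip/brotli decompression of already-plain bytes and raise. A self-contained sketch of that failure mode, using only standard-library behavior rather than this codebase:

    import gzip

    body = gzip.compress(b'example page')

    # Correct, single decompression -- what util.decode_content() does when
    # it receives the raw bytes urllib3 was told not to touch:
    assert gzip.decompress(body) == b'example page'

    # The double decode the commit prevents: decompressing bytes that were
    # already decoded upstream fails, since they no longer carry gzip magic.
    try:
        gzip.decompress(gzip.decompress(body))
    except OSError as error:        # gzip.BadGzipFile subclasses OSError
        print('second decompress raises:', error)
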