diff options
author | coletdjnz <coletdjnz@protonmail.com> | 2023-05-27 22:40:05 +1200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-27 10:40:05 +0000 |
commit | daafbf49b3482edae4d70dd37070be99742a926e (patch) | |
tree | 5cc5037f4296eb66241e8daa59d85a82c4412236 /yt_dlp/utils/_utils.py | |
parent | 3f66b6fe50f8d5b545712f8b19d5ae62f5373980 (diff) | |
download | hypervideo-pre-daafbf49b3482edae4d70dd37070be99742a926e.tar.lz hypervideo-pre-daafbf49b3482edae4d70dd37070be99742a926e.tar.xz hypervideo-pre-daafbf49b3482edae4d70dd37070be99742a926e.zip |
[core] Support decoding multiple content encodings (#7142)
Authored by: coletdjnz
Diffstat (limited to 'yt_dlp/utils/_utils.py')
-rw-r--r-- | yt_dlp/utils/_utils.py | 61 |
1 files changed, 32 insertions, 29 deletions
```diff
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 6f4f22bb3..7c91faff8 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -1361,6 +1361,23 @@ class YoutubeDLHandler(urllib.request.HTTPHandler):
             return data
         return brotli.decompress(data)
 
+    @staticmethod
+    def gz(data):
+        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
+        try:
+            return gz.read()
+        except OSError as original_oserror:
+            # There may be junk add the end of the file
+            # See http://stackoverflow.com/q/4928560/35070 for details
+            for i in range(1, 1024):
+                try:
+                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
+                    return gz.read()
+                except OSError:
+                    continue
+            else:
+                raise original_oserror
+
     def http_request(self, req):
         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
         # always respected by websites, some tend to give out URLs with non percent-encoded
@@ -1394,35 +1411,21 @@ class YoutubeDLHandler(urllib.request.HTTPHandler):
 
     def http_response(self, req, resp):
         old_resp = resp
-        # gzip
-        if resp.headers.get('Content-encoding', '') == 'gzip':
-            content = resp.read()
-            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
-            try:
-                uncompressed = io.BytesIO(gz.read())
-            except OSError as original_ioerror:
-                # There may be junk add the end of the file
-                # See http://stackoverflow.com/q/4928560/35070 for details
-                for i in range(1, 1024):
-                    try:
-                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
-                        uncompressed = io.BytesIO(gz.read())
-                    except OSError:
-                        continue
-                    break
-                else:
-                    raise original_ioerror
-            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-        # deflate
-        if resp.headers.get('Content-encoding', '') == 'deflate':
-            gz = io.BytesIO(self.deflate(resp.read()))
-            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-        # brotli
-        if resp.headers.get('Content-encoding', '') == 'br':
-            resp = urllib.request.addinfourl(
-                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
+
+        # Content-Encoding header lists the encodings in order that they were applied [1].
+        # To decompress, we simply do the reverse.
+        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
+        decoded_response = None
+        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
+            if encoding == 'gzip':
+                decoded_response = self.gz(decoded_response or resp.read())
+            elif encoding == 'deflate':
+                decoded_response = self.deflate(decoded_response or resp.read())
+            elif encoding == 'br' and brotli:
+                decoded_response = self.brotli(decoded_response or resp.read())
+
+        if decoded_response is not None:
+            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
         # https://github.com/ytdl-org/youtube-dl/issues/6457).
```