[core] Support decoding multiple content encodings (#7142)

Authored by: coletdjnz
author: coletdjnz <coletdjnz@protonmail.com> 2023-05-27 22:40:05 +1200
committer: GitHub <noreply@github.com> 2023-05-27 10:40:05 +0000
commit: daafbf49b3482edae4d70dd37070be99742a926e (patch)
tree: 5cc5037f4296eb66241e8daa59d85a82c4412236 /yt_dlp/utils/_utils.py
parent: 3f66b6fe50f8d5b545712f8b19d5ae62f5373980 (diff)
download: hypervideo-pre-daafbf49b3482edae4d70dd37070be99742a926e.tar.lz
hypervideo-pre-daafbf49b3482edae4d70dd37070be99742a926e.tar.xz
hypervideo-pre-daafbf49b3482edae4d70dd37070be99742a926e.zip
1 files changed, 32 insertions, 29 deletions
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 6f4f22bb3..7c91faff8 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -1361,6 +1361,23 @@ class YoutubeDLHandler(urllib.request.HTTPHandler):
             return data
         return brotli.decompress(data)
 
+    @staticmethod
+    def gz(data):
+        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
+        try:
+            return gz.read()
+        except OSError as original_oserror:
+            # There may be junk add the end of the file
+            # See http://stackoverflow.com/q/4928560/35070 for details
+            for i in range(1, 1024):
+                try:
+                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
+                    return gz.read()
+                except OSError:
+                    continue
+            else:
+                raise original_oserror
+
     def http_request(self, req):
         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
         # always respected by websites, some tend to give out URLs with non percent-encoded
@@ -1394,35 +1411,21 @@ class YoutubeDLHandler(urllib.request.HTTPHandler):
 
     def http_response(self, req, resp):
         old_resp = resp
-        # gzip
-        if resp.headers.get('Content-encoding', '') == 'gzip':
-            content = resp.read()
-            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
-            try:
-                uncompressed = io.BytesIO(gz.read())
-            except OSError as original_ioerror:
-                # There may be junk add the end of the file
-                # See http://stackoverflow.com/q/4928560/35070 for details
-                for i in range(1, 1024):
-                    try:
-                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
-                        uncompressed = io.BytesIO(gz.read())
-                    except OSError:
-                        continue
-                    break
-                else:
-                    raise original_ioerror
-            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-        # deflate
-        if resp.headers.get('Content-encoding', '') == 'deflate':
-            gz = io.BytesIO(self.deflate(resp.read()))
-            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-        # brotli
-        if resp.headers.get('Content-encoding', '') == 'br':
-            resp = urllib.request.addinfourl(
-                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
+
+        # Content-Encoding header lists the encodings in order that they were applied [1].
+        # To decompress, we simply do the reverse.
+        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
+        decoded_response = None
+        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
+            if encoding == 'gzip':
+                decoded_response = self.gz(decoded_response or resp.read())
+            elif encoding == 'deflate':
+                decoded_response = self.deflate(decoded_response or resp.read())
+            elif encoding == 'br' and brotli:
+                decoded_response = self.brotli(decoded_response or resp.read())
+
+        if decoded_response is not None:
+            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
         # https://github.com/ytdl-org/youtube-dl/issues/6457).
author	coletdjnz <coletdjnz@protonmail.com>	2023-05-27 22:40:05 +1200
committer	GitHub <noreply@github.com>	2023-05-27 10:40:05 +0000
commit	daafbf49b3482edae4d70dd37070be99742a926e (patch)
tree	5cc5037f4296eb66241e8daa59d85a82c4412236 /yt_dlp/utils/_utils.py
parent	3f66b6fe50f8d5b545712f8b19d5ae62f5373980 (diff)
download	hypervideo-pre-daafbf49b3482edae4d70dd37070be99742a926e.tar.lz hypervideo-pre-daafbf49b3482edae4d70dd37070be99742a926e.tar.xz hypervideo-pre-daafbf49b3482edae4d70dd37070be99742a926e.zip