Fix a904a7f8c6edc42046f0a78fb279739d500d4887

author: pukkandan <pukkandan.ytdlp@gmail.com> 2022-07-15 21:44:07 +0530
committer: pukkandan <pukkandan.ytdlp@gmail.com> 2022-07-15 21:45:05 +0530
commit: 88f60feb32614c723f997b2cba20c8c10fbe9bd3 (patch)
tree: 46af3fa0a59a87b2054ae4b45843f4a1843a03af /yt_dlp/utils.py
parent: a904a7f8c6edc42046f0a78fb279739d500d4887 (diff)
download: hypervideo-pre-88f60feb32614c723f997b2cba20c8c10fbe9bd3.tar.lz
hypervideo-pre-88f60feb32614c723f997b2cba20c8c10fbe9bd3.tar.xz
hypervideo-pre-88f60feb32614c723f997b2cba20c8c10fbe9bd3.zip
1 file changed, 7 insertions, 24 deletions
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 5d4e607ab..7648b6fce 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3485,6 +3485,7 @@ def age_restricted(content_limit, age_limit):
     return age_limit < content_limit
 
 
+# List of known byte-order-marks (BOM)
 BOMS = [
     (b'\xef\xbb\xbf', 'utf-8'),
     (b'\x00\x00\xfe\xff', 'utf-32-be'),
@@ -3492,7 +3493,6 @@ BOMS = [
     (b'\xff\xfe', 'utf-16-le'),
     (b'\xfe\xff', 'utf-16-be'),
 ]
-""" List of known byte-order-marks (BOM) """
 
 
 def is_html(first_bytes):
@@ -5398,37 +5398,20 @@ def read_stdin(what):
 
 def determine_file_encoding(data):
     """
-    From the first 512 bytes of a given file,
-    it tries to detect the encoding to be used to read as text.
-
+    Detect the text encoding used
     @returns (encoding, bytes to skip)
     """
 
+    # BOM marks are given priority over declarations
     for bom, enc in BOMS:
-        # matching BOM beats any declaration
-        # BOMs are skipped to prevent any errors
         if data.startswith(bom):
             return enc, len(bom)
 
-    # strip off all null bytes to match even when UTF-16 or UTF-32 is used
-    # endians don't matter
+    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
+    # We ignore the endianness to get a good enough match
     data = data.replace(b'\0', b'')
-
-    PREAMBLES = [
-        # "# -*- coding: utf-8 -*-"
-        # "# coding: utf-8"
-        rb'(?m)^#(?:\s+-\*-)?\s*coding\s*:\s*(?P<encoding>\S+)(?:\s+-\*-)?\s*$',
-        # "# vi: set fileencoding=utf-8"
-        rb'^#\s+vi\s*:\s+set\s+fileencoding=(?P<encoding>[^\s,]+)'
-    ]
-    for pb in PREAMBLES:
-        mobj = re.match(pb, data)
-        if not mobj:
-            continue
-        # preambles aren't skipped since they're just ignored when reading as config
-        return mobj.group('encoding').decode(), 0
-
-    return None, 0
+    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
+    return mobj.group(1).decode() if mobj else None, 0
 
 
 class Config:
author	pukkandan <pukkandan.ytdlp@gmail.com>	2022-07-15 21:44:07 +0530
committer	pukkandan <pukkandan.ytdlp@gmail.com>	2022-07-15 21:45:05 +0530
commit	88f60feb32614c723f997b2cba20c8c10fbe9bd3 (patch)
tree	46af3fa0a59a87b2054ae4b45843f4a1843a03af /yt_dlp/utils.py
parent	a904a7f8c6edc42046f0a78fb279739d500d4887 (diff)
download	hypervideo-pre-88f60feb32614c723f997b2cba20c8c10fbe9bd3.tar.lz hypervideo-pre-88f60feb32614c723f997b2cba20c8c10fbe9bd3.tar.xz hypervideo-pre-88f60feb32614c723f997b2cba20c8c10fbe9bd3.zip