Allow users to specify the encoding in each config file (#4357)

Authored by: Lesmiscore
author: Lesmiscore <nao20010128@gmail.com> 2022-07-15 20:52:14 +0900
committer: GitHub <noreply@github.com> 2022-07-15 20:52:14 +0900
commit: a904a7f8c6edc42046f0a78fb279739d500d4887 (patch)
tree: b66716631ff4ff8ac2bf8f3aee46bf1784da6529 /yt_dlp/utils.py
parent: 49afc1d84a767ab2576d2c7d51d28c8920fc96f9 (diff)
download: hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.tar.lz
hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.tar.xz
hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.zip
1 files changed, 52 insertions, 10 deletions
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 6e0c31c01..5d4e607ab 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3485,17 +3485,19 @@ def age_restricted(content_limit, age_limit):
     return age_limit < content_limit
 
 
+BOMS = [  # (BOM bytes, codec name) pairs; callers test them in list order with startswith()
+    (b'\xef\xbb\xbf', 'utf-8'),
+    (b'\x00\x00\xfe\xff', 'utf-32-be'),
+    (b'\xff\xfe\x00\x00', 'utf-32-le'),  # keep before utf-16-le: it begins with the same b'\xff\xfe'
+    (b'\xff\xfe', 'utf-16-le'),
+    (b'\xfe\xff', 'utf-16-be'),
+]
+""" List of known byte-order-marks (BOM) """
+
+
 def is_html(first_bytes):
     """ Detect whether a file contains HTML by examining its first bytes. """
 
-    BOMS = [
-        (b'\xef\xbb\xbf', 'utf-8'),
-        (b'\x00\x00\xfe\xff', 'utf-32-be'),
-        (b'\xff\xfe\x00\x00', 'utf-32-le'),
-        (b'\xff\xfe', 'utf-16-le'),
-        (b'\xfe\xff', 'utf-16-be'),
-    ]
-
     encoding = 'utf-8'
     for bom, enc in BOMS:
         while first_bytes.startswith(bom):
@@ -5394,6 +5396,41 @@ def read_stdin(what):
     return sys.stdin
 
 
+def determine_file_encoding(data):
+    """
+    Guess the text encoding of a file from its leading bytes (e.g. the first 512 bytes).
+    A BOM wins; otherwise an encoding declaration comment is used only if the data begins with it.
+
+    @returns (encoding name or None if nothing was detected, number of leading bytes to skip)
+    """
+
+    for bom, enc in BOMS:
+        # matching BOM beats any declaration
+        # the BOM length is returned so that the caller can skip it and avoid decoding errors
+        if data.startswith(bom):
+            return enc, len(bom)
+
+    # strip off all null bytes so the patterns below match even when UTF-16 or UTF-32 is used
+    # (endianness doesn't matter for this purpose)
+    data = data.replace(b'\0', b'')
+
+    PREAMBLES = [
+        # "# -*- coding: utf-8 -*-"
+        # "# coding: utf-8"
+        rb'(?m)^#(?:\s+-\*-)?\s*coding\s*:\s*(?P<encoding>\S+)(?:\s+-\*-)?\s*$',
+        # "# vi: set fileencoding=utf-8"
+        rb'^#\s+vi\s*:\s+set\s+fileencoding=(?P<encoding>[^\s,]+)'
+    ]
+    for pb in PREAMBLES:
+        mobj = re.match(pb, data)  # re.match anchors at offset 0: the declaration must start the data
+        if not mobj:
+            continue
+        # preambles aren't skipped since they're just ignored when reading as config
+        return mobj.group('encoding').decode(), 0
+
+    return None, 0
+
+
 class Config:
     own_args = None
     parsed_args = None
@@ -5445,12 +5482,17 @@ class Config:
     @staticmethod
     def read_file(filename, default=[]):
         try:
-            optionf = open(filename)
+            optionf = open(filename, 'rb')
         except OSError:
             return default  # silently skip if file is not present
         try:
+            enc, skip = determine_file_encoding(optionf.read(512))
+            optionf.seek(skip, io.SEEK_SET)
+        except OSError:
+            enc = None  # silently skip read errors
+        try:
             # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
-            contents = optionf.read()
+            contents = optionf.read().decode(enc or preferredencoding())
             res = shlex.split(contents, comments=True)
         except Exception as err:
             raise ValueError(f'Unable to parse "{filename}": {err}')
author	Lesmiscore <nao20010128@gmail.com>	2022-07-15 20:52:14 +0900
committer	GitHub <noreply@github.com>	2022-07-15 20:52:14 +0900
commit	a904a7f8c6edc42046f0a78fb279739d500d4887 (patch)
tree	b66716631ff4ff8ac2bf8f3aee46bf1784da6529 /yt_dlp/utils.py
parent	49afc1d84a767ab2576d2c7d51d28c8920fc96f9 (diff)
download	hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.tar.lz hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.tar.xz hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.zip