about summary refs log tree commit diff stats
path: root/yt_dlp/utils.py
diff options
context:
space:
mode:
author Lesmiscore <nao20010128@gmail.com> 2022-07-15 20:52:14 +0900
committer GitHub <noreply@github.com> 2022-07-15 20:52:14 +0900
commit a904a7f8c6edc42046f0a78fb279739d500d4887 (patch)
tree b66716631ff4ff8ac2bf8f3aee46bf1784da6529 /yt_dlp/utils.py
parent 49afc1d84a767ab2576d2c7d51d28c8920fc96f9 (diff)
download hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.tar.lz
hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.tar.xz
hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.zip
Allow users to specify encoding in each config files (#4357)
Authored by: Lesmiscore
Diffstat (limited to 'yt_dlp/utils.py')
-rw-r--r-- yt_dlp/utils.py 62
1 files changed, 52 insertions, 10 deletions
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 6e0c31c01..5d4e607ab 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3485,17 +3485,19 @@ def age_restricted(content_limit, age_limit):
return age_limit < content_limit
+BOMS = [
+ (b'\xef\xbb\xbf', 'utf-8'),
+ (b'\x00\x00\xfe\xff', 'utf-32-be'),
+ (b'\xff\xfe\x00\x00', 'utf-32-le'),
+ (b'\xff\xfe', 'utf-16-le'),
+ (b'\xfe\xff', 'utf-16-be'),
+]
+""" List of known byte-order-marks (BOM) """
+
+
def is_html(first_bytes):
""" Detect whether a file contains HTML by examining its first bytes. """
- BOMS = [
- (b'\xef\xbb\xbf', 'utf-8'),
- (b'\x00\x00\xfe\xff', 'utf-32-be'),
- (b'\xff\xfe\x00\x00', 'utf-32-le'),
- (b'\xff\xfe', 'utf-16-le'),
- (b'\xfe\xff', 'utf-16-be'),
- ]
-
encoding = 'utf-8'
for bom, enc in BOMS:
while first_bytes.startswith(bom):
@@ -5394,6 +5396,41 @@ def read_stdin(what):
return sys.stdin
+def determine_file_encoding(data):
+ """
+ From the first 512 bytes of a given file,
+ it tries to detect the encoding to be used to read as text.
+
+ @returns (encoding, bytes to skip)
+ """
+
+ for bom, enc in BOMS:
+ # matching BOM beats any declaration
+ # BOMs are skipped to prevent any errors
+ if data.startswith(bom):
+ return enc, len(bom)
+
+ # strip off all null bytes to match even when UTF-16 or UTF-32 is used
+ # endians don't matter
+ data = data.replace(b'\0', b'')
+
+ PREAMBLES = [
+ # "# -*- coding: utf-8 -*-"
+ # "# coding: utf-8"
+ rb'(?m)^#(?:\s+-\*-)?\s*coding\s*:\s*(?P<encoding>\S+)(?:\s+-\*-)?\s*$',
+ # "# vi: set fileencoding=utf-8"
+ rb'^#\s+vi\s*:\s+set\s+fileencoding=(?P<encoding>[^\s,]+)'
+ ]
+ for pb in PREAMBLES:
+ mobj = re.match(pb, data)
+ if not mobj:
+ continue
+ # preambles aren't skipped since they're just ignored when reading as config
+ return mobj.group('encoding').decode(), 0
+
+ return None, 0
+
+
class Config:
own_args = None
parsed_args = None
@@ -5445,12 +5482,17 @@ class Config:
@staticmethod
def read_file(filename, default=[]):
try:
- optionf = open(filename)
+ optionf = open(filename, 'rb')
except OSError:
return default # silently skip if file is not present
try:
+ enc, skip = determine_file_encoding(optionf.read(512))
+ optionf.seek(skip, io.SEEK_SET)
+ except OSError:
+ enc = None # silently skip read errors
+ try:
# FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
- contents = optionf.read()
+ contents = optionf.read().decode(enc or preferredencoding())
res = shlex.split(contents, comments=True)
except Exception as err:
raise ValueError(f'Unable to parse "{filename}": {err}')