diff options
author | Lesmiscore <nao20010128@gmail.com> | 2022-07-15 20:52:14 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-07-15 20:52:14 +0900 |
commit | a904a7f8c6edc42046f0a78fb279739d500d4887 (patch) | |
tree | b66716631ff4ff8ac2bf8f3aee46bf1784da6529 /yt_dlp/utils.py | |
parent | 49afc1d84a767ab2576d2c7d51d28c8920fc96f9 (diff) | |
download | hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.tar.lz hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.tar.xz hypervideo-pre-a904a7f8c6edc42046f0a78fb279739d500d4887.zip |
Allow users to specify encoding in each config files (#4357)
Authored by: Lesmiscore
Diffstat (limited to 'yt_dlp/utils.py')
-rw-r--r-- | yt_dlp/utils.py | 62 |
1 files changed, 52 insertions, 10 deletions
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 6e0c31c01..5d4e607ab 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3485,17 +3485,19 @@ def age_restricted(content_limit, age_limit): return age_limit < content_limit +BOMS = [ + (b'\xef\xbb\xbf', 'utf-8'), + (b'\x00\x00\xfe\xff', 'utf-32-be'), + (b'\xff\xfe\x00\x00', 'utf-32-le'), + (b'\xff\xfe', 'utf-16-le'), + (b'\xfe\xff', 'utf-16-be'), +] +""" List of known byte-order-marks (BOM) """ + + def is_html(first_bytes): """ Detect whether a file contains HTML by examining its first bytes. """ - BOMS = [ - (b'\xef\xbb\xbf', 'utf-8'), - (b'\x00\x00\xfe\xff', 'utf-32-be'), - (b'\xff\xfe\x00\x00', 'utf-32-le'), - (b'\xff\xfe', 'utf-16-le'), - (b'\xfe\xff', 'utf-16-be'), - ] - encoding = 'utf-8' for bom, enc in BOMS: while first_bytes.startswith(bom): @@ -5394,6 +5396,41 @@ def read_stdin(what): return sys.stdin +def determine_file_encoding(data): + """ + From the first 512 bytes of a given file, + it tries to detect the encoding to be used to read as text. + + @returns (encoding, bytes to skip) + """ + + for bom, enc in BOMS: + # matching BOM beats any declaration + # BOMs are skipped to prevent any errors + if data.startswith(bom): + return enc, len(bom) + + # strip off all null bytes to match even when UTF-16 or UTF-32 is used + # endians don't matter + data = data.replace(b'\0', b'') + + PREAMBLES = [ + # "# -*- coding: utf-8 -*-" + # "# coding: utf-8" + rb'(?m)^#(?:\s+-\*-)?\s*coding\s*:\s*(?P<encoding>\S+)(?:\s+-\*-)?\s*$', + # "# vi: set fileencoding=utf-8" + rb'^#\s+vi\s*:\s+set\s+fileencoding=(?P<encoding>[^\s,]+)' + ] + for pb in PREAMBLES: + mobj = re.match(pb, data) + if not mobj: + continue + # preambles aren't skipped since they're just ignored when reading as config + return mobj.group('encoding').decode(), 0 + + return None, 0 + + class Config: own_args = None parsed_args = None @@ -5445,12 +5482,17 @@ class Config: @staticmethod def read_file(filename, default=[]): try: - optionf = open(filename) + optionf = open(filename, 'rb') except OSError: return default # silently skip if file is not present try: + enc, skip = determine_file_encoding(optionf.read(512)) + optionf.seek(skip, io.SEEK_SET) + except OSError: + enc = None # silently skip read errors + try: # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 - contents = optionf.read() + contents = optionf.read().decode(enc or preferredencoding()) res = shlex.split(contents, comments=True) except Exception as err: raise ValueError(f'Unable to parse "{filename}": {err}') |