diff options
author | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-07-15 21:44:07 +0530 |
---|---|---|
committer | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-07-15 21:45:05 +0530 |
commit | 88f60feb32614c723f997b2cba20c8c10fbe9bd3 (patch) | |
tree | 46af3fa0a59a87b2054ae4b45843f4a1843a03af /yt_dlp/utils.py | |
parent | a904a7f8c6edc42046f0a78fb279739d500d4887 (diff) | |
download | hypervideo-pre-88f60feb32614c723f997b2cba20c8c10fbe9bd3.tar.lz hypervideo-pre-88f60feb32614c723f997b2cba20c8c10fbe9bd3.tar.xz hypervideo-pre-88f60feb32614c723f997b2cba20c8c10fbe9bd3.zip |
Fix a904a7f8c6edc42046f0a78fb279739d500d4887
Diffstat (limited to 'yt_dlp/utils.py')
-rw-r--r-- | yt_dlp/utils.py | 31 |
1 file changed, 7 insertions, 24 deletions
```diff
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 5d4e607ab..7648b6fce 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3485,6 +3485,7 @@ def age_restricted(content_limit, age_limit):
     return age_limit < content_limit


+# List of known byte-order-marks (BOM)
 BOMS = [
     (b'\xef\xbb\xbf', 'utf-8'),
     (b'\x00\x00\xfe\xff', 'utf-32-be'),
@@ -3492,7 +3493,6 @@ BOMS = [
     (b'\xff\xfe', 'utf-16-le'),
     (b'\xfe\xff', 'utf-16-be'),
 ]
-""" List of known byte-order-marks (BOM) """


 def is_html(first_bytes):
@@ -5398,37 +5398,20 @@ def read_stdin(what):

 def determine_file_encoding(data):
     """
-    From the first 512 bytes of a given file,
-    it tries to detect the encoding to be used to read as text.
-
+    Detect the text encoding used
     @returns (encoding, bytes to skip)
     """

+    # BOM marks are given priority over declarations
     for bom, enc in BOMS:
-        # matching BOM beats any declaration
-        # BOMs are skipped to prevent any errors
         if data.startswith(bom):
             return enc, len(bom)

-    # strip off all null bytes to match even when UTF-16 or UTF-32 is used
-    # endians don't matter
+    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
+    # We ignore the endianness to get a good enough match
     data = data.replace(b'\0', b'')
-
-    PREAMBLES = [
-        # "# -*- coding: utf-8 -*-"
-        # "# coding: utf-8"
-        rb'(?m)^#(?:\s+-\*-)?\s*coding\s*:\s*(?P<encoding>\S+)(?:\s+-\*-)?\s*$',
-        # "# vi: set fileencoding=utf-8"
-        rb'^#\s+vi\s*:\s+set\s+fileencoding=(?P<encoding>[^\s,]+)'
-    ]
-    for pb in PREAMBLES:
-        mobj = re.match(pb, data)
-        if not mobj:
-            continue
-        # preambles aren't skipped since they're just ignored when reading as config
-        return mobj.group('encoding').decode(), 0
-
-    return None, 0
+    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
+    return mobj.group(1).decode() if mobj else None, 0


 class Config:
```