about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- yt_dlp/extractor/generic.py 15
-rw-r--r-- yt_dlp/utils.py 11
2 files changed, 20 insertions, 6 deletions
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index dda2b1eef..b0fc176ef 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -2527,6 +2527,21 @@ class GenericIE(InfoExtractor):
'upload_date': '20220504',
},
},
+ {
+ # Webpage contains double BOM
+ 'url': 'https://www.filmarkivet.se/movies/paris-d-moll/',
+ 'md5': 'df02cadc719dcc63d43288366f037754',
+ 'info_dict': {
+ 'id': 'paris-d-moll',
+ 'ext': 'mp4',
+ 'upload_date': '20220518',
+ 'title': 'Paris d-moll',
+ 'description': 'md5:319e37ea5542293db37e1e13072fe330',
+ 'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg',
+ 'timestamp': 1652833414,
+ 'age_limit': 0,
+ }
+ }
]
def report_following_redirect(self, new_url):
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 48a94415d..3b0e6750c 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -3290,14 +3290,13 @@ def is_html(first_bytes):
(b'\xff\xfe', 'utf-16-le'),
(b'\xfe\xff', 'utf-16-be'),
]
+
+ encoding = 'utf-8'
for bom, enc in BOMS:
- if first_bytes.startswith(bom):
- s = first_bytes[len(bom):].decode(enc, 'replace')
- break
- else:
- s = first_bytes.decode('utf-8', 'replace')
+ while first_bytes.startswith(bom):
+ encoding, first_bytes = enc, first_bytes[len(bom):]
- return re.match(r'^\s*<', s)
+ return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
def determine_protocol(info_dict):