diff options
author | Lesmiscore <nao20010128@gmail.com> | 2022-05-29 22:48:04 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-29 22:48:04 +0900 |
commit | 222a230871fe4fe63f35c49590379c9a77116819 (patch) | |
tree | 0067375281c6e356b8302db3f99d9e75fcad65b7 | |
parent | ee27297f82ccbd702ccd4721d1d3c9d67bbe187e (diff) | |
download | hypervideo-pre-222a230871fe4fe63f35c49590379c9a77116819.tar.lz hypervideo-pre-222a230871fe4fe63f35c49590379c9a77116819.tar.xz hypervideo-pre-222a230871fe4fe63f35c49590379c9a77116819.zip |
[extractor/common] Recognize `src` attribute from HTML5 media elements (#3899)
Authored by: Lesmiscore
-rw-r--r-- | test/test_InfoExtractor.py | 18 | ||||
-rw-r--r-- | yt_dlp/extractor/common.py | 7 |
2 files changed, 23 insertions, 2 deletions
```diff
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 257ea7dd3..928246668 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -502,6 +502,24 @@ class TestInfoExtractor(unittest.TestCase):
                 }],
             })
 
+        # from https://0000.studio/
+        # with type attribute but without extension in URL
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://0000.studio',
+                r'''
+                <video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
+                controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
+                    'ext': 'mp4',
+                }],
+            })
+
     def test_extract_jwplayer_data_realworld(self):
         # from http://www.suffolk.edu/sjc/
         expect_dict(
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 5767662ed..a589fb7fa 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -3197,7 +3197,8 @@ class InfoExtractor:
                 return f
             return {}
 
-        def _media_formats(src, cur_media_type, type_info={}):
+        def _media_formats(src, cur_media_type, type_info=None):
+            type_info = type_info or {}
             full_url = absolute_url(src)
             ext = type_info.get('ext') or determine_ext(full_url)
             if ext == 'm3u8':
@@ -3215,6 +3216,7 @@ class InfoExtractor:
                 formats = [{
                     'url': full_url,
                     'vcodec': 'none' if cur_media_type == 'audio' else None,
+                    'ext': ext,
                 }]
             return is_plain_url, formats
 
@@ -3241,7 +3243,8 @@ class InfoExtractor:
             media_attributes = extract_attributes(media_tag)
             src = strip_or_none(media_attributes.get('src'))
             if src:
-                _, formats = _media_formats(src, media_type)
+                f = parse_content_type(media_attributes.get('type'))
+                _, formats = _media_formats(src, media_type, f)
                 media_info['formats'].extend(formats)
             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
             if media_content:
```