diff options
| author | Yen Chi Hsuan <yan12125@gmail.com> | 2016-07-10 23:40:45 +0800 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2016-07-10 23:40:45 +0800 | 
| commit | 0b68de3cc1f99ce8c49a497245c02d4d03201aa8 (patch) | |
| tree | 885385b3b4968715eb2b7d51b4f66f3a12da7f46 | |
| parent | 39e9d524e5fe289936160d4c599a77f10f6e9061 (diff) | |
| parent | 59bbe4911acd4493bf407925bfdeb1ad03db6ef3 (diff) | |
| download | hypervideo-pre-0b68de3cc1f99ce8c49a497245c02d4d03201aa8.tar.lz hypervideo-pre-0b68de3cc1f99ce8c49a497245c02d4d03201aa8.tar.xz hypervideo-pre-0b68de3cc1f99ce8c49a497245c02d4d03201aa8.zip | |
Merge pull request #8876 from remitamine/html5_media
[extractor/common] add helper method to extract html5 media entries
| -rw-r--r-- | test/test_utils.py | 24 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 58 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 36 | 
3 files changed, 118 insertions, 0 deletions
| diff --git a/test/test_utils.py b/test/test_utils.py index afd273a65..2273b5a10 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -81,6 +81,7 @@ from youtube_dl.utils import (      cli_option,      cli_valueless_option,      cli_bool_option, +    parse_codecs,  )  from youtube_dl.compat import (      compat_chr, @@ -608,6 +609,29 @@ class TestUtil(unittest.TestCase):              limit_length('foo bar baz asd', 12).startswith('foo bar'))          self.assertTrue('...' in limit_length('foo bar baz asd', 12)) +    def test_parse_codecs(self): +        self.assertEqual(parse_codecs(''), {}) +        self.assertEqual(parse_codecs('avc1.77.30, mp4a.40.2'), { +            'vcodec': 'avc1.77.30', +            'acodec': 'mp4a.40.2', +        }) +        self.assertEqual(parse_codecs('mp4a.40.2'), { +            'vcodec': 'none', +            'acodec': 'mp4a.40.2', +        }) +        self.assertEqual(parse_codecs('mp4a.40.5,avc1.42001e'), { +            'vcodec': 'avc1.42001e', +            'acodec': 'mp4a.40.5', +        }) +        self.assertEqual(parse_codecs('avc3.640028'), { +            'vcodec': 'avc3.640028', +            'acodec': 'none', +        }) +        self.assertEqual(parse_codecs(', h264,,newcodec,aac'), { +            'vcodec': 'h264', +            'acodec': 'aac', +        }) +      def test_escape_rfc3986(self):          reserved = "!*'();:@&=+$,/?#[]"          unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~' diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 816baa424..df546da27 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -55,6 +55,8 @@ from ..utils import (      update_Request,      update_url_query,      parse_m3u8_attributes, +    extract_attributes, +    parse_codecs,  ) @@ -1635,6 +1637,62 @@ class InfoExtractor(object):                          self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)          return formats +    def _parse_html5_media_entries(self, base_url, webpage): +        def absolute_url(video_url): +            return compat_urlparse.urljoin(base_url, video_url) + +        def parse_content_type(content_type): +            if not content_type: +                return {} +            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type) +            if ctr: +                mimetype, codecs = ctr.groups() +                f = parse_codecs(codecs) +                f['ext'] = mimetype2ext(mimetype) +                return f +            return {} + +        entries = [] +        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage): +            media_info = { +                'formats': [], +                'subtitles': {}, +            } +            media_attributes = extract_attributes(media_tag) +            src = media_attributes.get('src') +            if src: +                media_info['formats'].append({ +                    'url': absolute_url(src), +                    'vcodec': 'none' if media_type == 'audio' else None, +                }) +            media_info['thumbnail'] = media_attributes.get('poster') +            if media_content: +                for source_tag in re.findall(r'<source[^>]+>', media_content): +                    source_attributes = extract_attributes(source_tag) +                    src = source_attributes.get('src') +                    if not src: +                        continue +                    f = parse_content_type(source_attributes.get('type')) +                    f.update({ +                        'url': absolute_url(src), +                        'vcodec': 'none' if media_type == 'audio' else None, +                    }) +                    media_info['formats'].append(f) +                for track_tag in re.findall(r'<track[^>]+>', media_content): +                    track_attributes = extract_attributes(track_tag) +                    kind = track_attributes.get('kind') +                    if not kind or kind == 'subtitles': +                        src = track_attributes.get('src') +                        if not src: +                            continue +                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') +                        media_info['subtitles'].setdefault(lang, []).append({ +                            'url': absolute_url(src), +                        }) +            if media_info['formats']: +                entries.append(media_info) +        return entries +      def _live_title(self, name):          """ Generate the title for a live video """          now = datetime.datetime.now() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3498697b6..4c1d0d526 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2126,6 +2126,42 @@ def mimetype2ext(mt):      }.get(res, res) +def parse_codecs(codecs_str): +    # http://tools.ietf.org/html/rfc6381 +    if not codecs_str: +        return {} +    splited_codecs = list(filter(None, map( +        lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) +    vcodec, acodec = None, None +    for full_codec in splited_codecs: +        codec = full_codec.split('.')[0] +        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): +            if not vcodec: +                vcodec = full_codec +        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'): +            if not acodec: +                acodec = full_codec +        else: +            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr) +    if not vcodec and not acodec: +        if len(splited_codecs) == 2: +            return { +                'vcodec': vcodec, +                'acodec': acodec, +            } +        elif len(splited_codecs) == 1: +            return { +                'vcodec': 'none', +                'acodec': vcodec, +            } +    else: +        return { +            'vcodec': vcodec or 'none', +            'acodec': acodec or 'none', +        } +    return {} + +  def urlhandle_detect_ext(url_handle):      getheader = url_handle.headers.get | 
