diff options
| -rw-r--r-- | yt_dlp/extractor/common.py | 4 | ||||
| -rw-r--r-- | yt_dlp/extractor/elonet.py | 85 | 
2 files changed, 34 insertions, 55 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index d0e57da23..af964c527 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1297,8 +1297,8 @@ class InfoExtractor(object):
     @staticmethod
     def _og_regexes(prop):
         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
-        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
-                       % {'prop': re.escape(prop)})
+        property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
+                       % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
         template = r'<meta[^>]+?%s[^>]+?%s'
         return [
             template % (property_re, content_re),
diff --git a/yt_dlp/extractor/elonet.py b/yt_dlp/extractor/elonet.py
index eefba4e24..9c6aea28e 100644
--- a/yt_dlp/extractor/elonet.py
+++ b/yt_dlp/extractor/elonet.py
@@ -1,30 +1,22 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..utils import (
-    base_url,
-    ExtractorError,
-    try_get,
-)
-from ..compat import compat_str
+from ..utils import determine_ext
 
 
 class ElonetIE(InfoExtractor):
     _VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)'
     _TESTS = [{
-        # m3u8 with subtitles
         'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867',
-        'md5': '8efc954b96c543711707f87de757caea',
         'info_dict': {
             'id': '107867',
             'ext': 'mp4',
             'title': 'Valkoinen peura',
-            'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...',
-            'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large',
+            'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_107867.+',
+            'description': 'md5:bded4201c9677fab10854884fe8f7312',
         },
+        'params': {'skip_download': 'dash'},
     }, {
         # DASH with subtitles
         'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539',
@@ -32,58 +24,45 @@ class ElonetIE(InfoExtractor):
             'id': '116539',
             'ext': 'mp4',
             'title': 'Minulla on tiikeri',
-            'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...',
-            'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr',
-        }
+            'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_116539.+',
+            'description': 'md5:5ab72b3fe76d3414e46cc8f277104419',
+        },
+        'params': {'skip_download': 'dash'},
+    }, {
+        # Page with multiple videos, download the main one
+        'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_117396',
+        'info_dict': {
+            'id': '117396',
+            'ext': 'mp4',
+            'title': 'Sampo',
+            'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_117396.+',
+            'description': 'md5:ec69572a5b054d0ecafe8086b1fa96f7',
+        },
+        'params': {'skip_download': 'dash'},
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        title = self._html_search_regex(
-            r'<meta .*property="og:title" .*content="(.+?)"', webpage, 'title')
-        description = self._html_search_regex(
-            r'<meta .*property="og:description" .*content="(.+?)"', webpage, 'description')
-        thumbnail = self._html_search_regex(
-            r'<meta .*property="og:image" .*content="(.+?)"', webpage, 'thumbnail')
+        src = self._parse_json(self._html_search_regex(
+            r'id=\'video-data\'[^>]+data-video-sources="([^"]+)"', webpage, 'json'), video_id)[0]['src']
+        ext = determine_ext(src)
 
-        json_s = self._html_search_regex(
-            r'data-video-sources="(.+?)"', webpage, 'json')
-        src = try_get(
-            self._parse_json(json_s, video_id),
-            lambda x: x[0]["src"], compat_str)
-        formats = []
-        subtitles = {}
-        if re.search(r'\.m3u8\??', src):
-            res = self._download_webpage_handle(
-                # elonet servers have certificate problems
-                src.replace('https:', 'http:'), video_id,
-                note='Downloading m3u8 information',
-                errnote='Failed to download m3u8 information')
-            if res:
-                doc, urlh = res
-                url = urlh.geturl()
-                formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url)
-                for f in formats:
-                    f['ext'] = 'mp4'
-        elif re.search(r'\.mpd\??', src):
-            res = self._download_xml_handle(
-                src, video_id,
-                note='Downloading MPD manifest',
-                errnote='Failed to download MPD manifest')
-            if res:
-                doc, urlh = res
-                url = base_url(urlh.geturl())
-                formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url)
+        if ext == 'm3u8':
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False)
+        elif ext == 'mpd':
+            formats, subtitles = self._extract_mpd_formats_and_subtitles(src, video_id, fatal=False)
         else:
-            raise ExtractorError("Unknown streaming format")
+            formats, subtitles = [], {}
+            self.raise_no_formats(f'Unknown streaming format {ext}')
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
             'formats': formats,
             'subtitles': subtitles,
         }
