diff options
Diffstat (limited to 'hypervideo_dl/extractor/gedidigital.py')
-rw-r--r-- | hypervideo_dl/extractor/gedidigital.py | 57 |
1 files changed, 53 insertions, 4 deletions
diff --git a/hypervideo_dl/extractor/gedidigital.py b/hypervideo_dl/extractor/gedidigital.py
index 6c4153b..ec386c2 100644
--- a/hypervideo_dl/extractor/gedidigital.py
+++ b/hypervideo_dl/extractor/gedidigital.py
@@ -5,18 +5,22 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    base_url,
     determine_ext,
     int_or_none,
+    url_basename,
+    urljoin,
 )
 
 
 class GediDigitalIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://video\.
+    _VALID_URL = r'''(?x)(?P<url>(?:https?:)//video\.
         (?:
             (?:
                 (?:espresso\.)?repubblica
                 |lastampa
                 |ilsecoloxix
+                |huffingtonpost
             )|
             (?:
                 iltirreno
@@ -32,12 +36,12 @@ class GediDigitalIE(InfoExtractor):
                 |corrierealpi
                 |lasentinella
             )\.gelocal
-        )\.it(?:/[^/]+){2,3}?/(?P<id>\d+)(?:[/?&#]|$)'''
+        )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*)'''
     _TESTS = [{
         'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683',
         'md5': '84658d7fb9e55a6e57ecc77b73137494',
         'info_dict': {
-            'id': '121559',
+            'id': '121683',
             'ext': 'mp4',
             'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso',
             'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca',
@@ -45,6 +49,9 @@ class GediDigitalIE(InfoExtractor):
             'duration': 125,
         },
     }, {
+        'url': 'https://video.huffingtonpost.it/embed/politica/cotticelli-non-so-cosa-mi-sia-successo-sto-cercando-di-capire-se-ho-avuto-un-malore/29312/29276?responsive=true&el=video971040871621586700',
+        'only_matching': True,
+    }, {
         'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360',
         'only_matching': True,
     }, {
@@ -94,9 +101,49 @@ class GediDigitalIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    @staticmethod
+    def _sanitize_urls(urls):
+        # add protocol if missing
+        for i, e in enumerate(urls):
+            if e.startswith('//'):
+                urls[i] = 'https:%s' % e
+        # clean iframes urls
+        for i, e in enumerate(urls):
+            urls[i] = urljoin(base_url(e), url_basename(e))
+        return urls
+
+    @staticmethod
+    def _extract_urls(webpage):
+        entries = [
+            mobj.group('eurl')
+            for mobj in re.finditer(r'''(?x)
+            (?:
+                data-frame-src=|
+                <iframe[^\n]+src=
+            )
+            (["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)]
+        return GediDigitalIE._sanitize_urls(entries)
+
+    @staticmethod
+    def _extract_url(webpage):
+        urls = GediDigitalIE._extract_urls(webpage)
+        return urls[0] if urls else None
+
+    @staticmethod
+    def _clean_formats(formats):
+        format_urls = set()
+        clean_formats = []
+        for f in formats:
+            if f['url'] not in format_urls:
+                if f.get('audio_ext') != 'none' and not f.get('acodec'):
+                    continue
+                format_urls.add(f['url'])
+                clean_formats.append(f)
+        formats[:] = clean_formats
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
-
+        url = self._match_valid_url(url).group('url')
         webpage = self._download_webpage(url, video_id)
         title = self._html_search_meta(
             ['twitter:title', 'og:title'], webpage, fatal=True)
@@ -129,6 +176,7 @@ class GediDigitalIE(InfoExtractor):
                     f.update({
                         'abr': abr,
                         'tbr': abr,
+                        'acodec': ext,
                         'vcodec': 'none'
                     })
                 else:
@@ -148,6 +196,7 @@ class GediDigitalIE(InfoExtractor):
                 elif n == 'videoDuration':
                     duration = int_or_none(v)
 
+        self._clean_formats(formats)
         self._sort_formats(formats)
 
         return {