aboutsummaryrefslogtreecommitdiffstats
path: root/hypervideo_dl/extractor/gedidigital.py
diff options
context:
space:
mode:
Diffstat (limited to 'hypervideo_dl/extractor/gedidigital.py')
-rw-r--r--hypervideo_dl/extractor/gedidigital.py57
1 files changed, 53 insertions, 4 deletions
diff --git a/hypervideo_dl/extractor/gedidigital.py b/hypervideo_dl/extractor/gedidigital.py
index 6c4153b..ec386c2 100644
--- a/hypervideo_dl/extractor/gedidigital.py
+++ b/hypervideo_dl/extractor/gedidigital.py
@@ -5,18 +5,22 @@ import re
from .common import InfoExtractor
from ..utils import (
+ base_url,
determine_ext,
int_or_none,
+ url_basename,
+ urljoin,
)
class GediDigitalIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://video\.
+ _VALID_URL = r'''(?x)(?P<url>(?:https?:)//video\.
(?:
(?:
(?:espresso\.)?repubblica
|lastampa
|ilsecoloxix
+ |huffingtonpost
)|
(?:
iltirreno
@@ -32,12 +36,12 @@ class GediDigitalIE(InfoExtractor):
|corrierealpi
|lasentinella
)\.gelocal
- )\.it(?:/[^/]+){2,3}?/(?P<id>\d+)(?:[/?&#]|$)'''
+ )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*)'''
_TESTS = [{
'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683',
'md5': '84658d7fb9e55a6e57ecc77b73137494',
'info_dict': {
- 'id': '121559',
+ 'id': '121683',
'ext': 'mp4',
'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso',
'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca',
@@ -45,6 +49,9 @@ class GediDigitalIE(InfoExtractor):
'duration': 125,
},
}, {
+ 'url': 'https://video.huffingtonpost.it/embed/politica/cotticelli-non-so-cosa-mi-sia-successo-sto-cercando-di-capire-se-ho-avuto-un-malore/29312/29276?responsive=true&el=video971040871621586700',
+ 'only_matching': True,
+ }, {
'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360',
'only_matching': True,
}, {
@@ -94,9 +101,49 @@ class GediDigitalIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def _sanitize_urls(urls):
+ # add protocol if missing
+ for i, e in enumerate(urls):
+ if e.startswith('//'):
+ urls[i] = 'https:%s' % e
+ # clean iframes urls
+ for i, e in enumerate(urls):
+ urls[i] = urljoin(base_url(e), url_basename(e))
+ return urls
+
+ @staticmethod
+ def _extract_urls(webpage):
+ entries = [
+ mobj.group('eurl')
+ for mobj in re.finditer(r'''(?x)
+ (?:
+ data-frame-src=|
+ <iframe[^\n]+src=
+ )
+ (["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)]
+ return GediDigitalIE._sanitize_urls(entries)
+
+ @staticmethod
+ def _extract_url(webpage):
+ urls = GediDigitalIE._extract_urls(webpage)
+ return urls[0] if urls else None
+
+ @staticmethod
+ def _clean_formats(formats):
+ format_urls = set()
+ clean_formats = []
+ for f in formats:
+ if f['url'] not in format_urls:
+ if f.get('audio_ext') != 'none' and not f.get('acodec'):
+ continue
+ format_urls.add(f['url'])
+ clean_formats.append(f)
+ formats[:] = clean_formats
+
def _real_extract(self, url):
video_id = self._match_id(url)
-
+ url = self._match_valid_url(url).group('url')
webpage = self._download_webpage(url, video_id)
title = self._html_search_meta(
['twitter:title', 'og:title'], webpage, fatal=True)
@@ -129,6 +176,7 @@ class GediDigitalIE(InfoExtractor):
f.update({
'abr': abr,
'tbr': abr,
+ 'acodec': ext,
'vcodec': 'none'
})
else:
@@ -148,6 +196,7 @@ class GediDigitalIE(InfoExtractor):
elif n == 'videoDuration':
duration = int_or_none(v)
+ self._clean_formats(formats)
self._sort_formats(formats)
return {