1 files changed, 53 insertions, 4 deletions
diff --git a/hypervideo_dl/extractor/gedidigital.py b/hypervideo_dl/extractor/gedidigital.py
index 6c4153b..ec386c2 100644
--- a/hypervideo_dl/extractor/gedidigital.py
+++ b/hypervideo_dl/extractor/gedidigital.py
@@ -5,18 +5,22 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    base_url,
     determine_ext,
     int_or_none,
+    url_basename,
+    urljoin,
 )
 
 
 class GediDigitalIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://video\.
+    _VALID_URL = r'''(?x)(?P<url>(?:https?:)//video\.
         (?:
             (?:
                 (?:espresso\.)?repubblica
                 |lastampa
                 |ilsecoloxix
+                |huffingtonpost
             )|
             (?:
                 iltirreno
@@ -32,12 +36,12 @@ class GediDigitalIE(InfoExtractor):
                 |corrierealpi
                 |lasentinella
             )\.gelocal
-        )\.it(?:/[^/]+){2,3}?/(?P<id>\d+)(?:[/?&#]|$)'''
+        )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*)'''
     _TESTS = [{
         'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683',
         'md5': '84658d7fb9e55a6e57ecc77b73137494',
         'info_dict': {
-            'id': '121559',
+            'id': '121683',
             'ext': 'mp4',
             'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso',
             'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca',
@@ -45,6 +49,9 @@ class GediDigitalIE(InfoExtractor):
             'duration': 125,
         },
     }, {
+        'url': 'https://video.huffingtonpost.it/embed/politica/cotticelli-non-so-cosa-mi-sia-successo-sto-cercando-di-capire-se-ho-avuto-un-malore/29312/29276?responsive=true&el=video971040871621586700',
+        'only_matching': True,
+    }, {
         'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360',
         'only_matching': True,
     }, {
@@ -94,9 +101,49 @@ class GediDigitalIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    @staticmethod
+    def _sanitize_urls(urls):
+        # add protocol if missing
+        for i, e in enumerate(urls):
+            if e.startswith('//'):
+                urls[i] = 'https:%s' % e
+        # clean iframes urls
+        for i, e in enumerate(urls):
+            urls[i] = urljoin(base_url(e), url_basename(e))
+        return urls
+
+    @staticmethod
+    def _extract_urls(webpage):
+        entries = [
+            mobj.group('eurl')
+            for mobj in re.finditer(r'''(?x)
+            (?:
+                data-frame-src=|
+                <iframe[^\n]+src=
+            )
+            (["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)]
+        return GediDigitalIE._sanitize_urls(entries)
+
+    @staticmethod
+    def _extract_url(webpage):
+        urls = GediDigitalIE._extract_urls(webpage)
+        return urls[0] if urls else None
+
+    @staticmethod
+    def _clean_formats(formats):
+        format_urls = set()
+        clean_formats = []
+        for f in formats:
+            if f['url'] not in format_urls:
+                if f.get('audio_ext') != 'none' and not f.get('acodec'):
+                    continue
+                format_urls.add(f['url'])
+                clean_formats.append(f)
+        formats[:] = clean_formats
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
-
+        url = self._match_valid_url(url).group('url')
         webpage = self._download_webpage(url, video_id)
         title = self._html_search_meta(
             ['twitter:title', 'og:title'], webpage, fatal=True)
@@ -129,6 +176,7 @@ class GediDigitalIE(InfoExtractor):
                         f.update({
                             'abr': abr,
                             'tbr': abr,
+                            'acodec': ext,
                             'vcodec': 'none'
                         })
                     else:
@@ -148,6 +196,7 @@ class GediDigitalIE(InfoExtractor):
                 elif n == 'videoDuration':
                     duration = int_or_none(v)
 
+        self._clean_formats(formats)
         self._sort_formats(formats)
 
         return {