diff options
Diffstat (limited to 'yt_dlp')
| -rw-r--r-- | yt_dlp/YoutubeDL.py | 20 | ||||
| -rw-r--r-- | yt_dlp/extractor/applepodcasts.py | 1 | ||||
| -rw-r--r-- | yt_dlp/extractor/bandcamp.py | 4 | ||||
| -rw-r--r-- | yt_dlp/extractor/bilibili.py | 17 | ||||
| -rw-r--r-- | yt_dlp/extractor/cbs.py | 4 | ||||
| -rw-r--r-- | yt_dlp/extractor/common.py | 3 | ||||
| -rw-r--r-- | yt_dlp/extractor/fujitv.py | 2 | ||||
| -rw-r--r-- | yt_dlp/extractor/lbry.py | 51 | ||||
| -rw-r--r-- | yt_dlp/extractor/peertube.py | 7 | ||||
| -rw-r--r-- | yt_dlp/extractor/pinterest.py | 4 | ||||
| -rw-r--r-- | yt_dlp/extractor/pornhub.py | 44 | ||||
| -rw-r--r-- | yt_dlp/extractor/rtve.py | 232 | ||||
| -rw-r--r-- | yt_dlp/extractor/shahid.py | 20 | ||||
| -rw-r--r-- | yt_dlp/extractor/southpark.py | 16 | ||||
| -rw-r--r-- | yt_dlp/extractor/sportdeutschland.py | 145 | ||||
| -rw-r--r-- | yt_dlp/extractor/tver.py | 11 | ||||
| -rw-r--r-- | yt_dlp/extractor/voxmedia.py | 26 | 
17 files changed, 355 insertions, 252 deletions
| diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 397d40503..0979252c9 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1794,14 +1794,18 @@ class YoutubeDL(object):          if 'display_id' not in info_dict and 'id' in info_dict:              info_dict['display_id'] = info_dict['id'] -        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: -            # Working around out-of-range timestamp values (e.g. negative ones on Windows, -            # see http://bugs.python.org/issue1646728) -            try: -                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) -                info_dict['upload_date'] = upload_date.strftime('%Y%m%d') -            except (ValueError, OverflowError, OSError): -                pass +        for ts_key, date_key in ( +                ('timestamp', 'upload_date'), +                ('release_timestamp', 'release_date'), +        ): +            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: +                # Working around out-of-range timestamp values (e.g. negative ones on Windows, +                # see http://bugs.python.org/issue1646728) +                try: +                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) +                    info_dict[date_key] = upload_date.strftime('%Y%m%d') +                except (ValueError, OverflowError, OSError): +                    pass          # Auto generate title fields corresponding to the *_number fields when missing          # in order to always have clean titles. This is very common for TV series. diff --git a/yt_dlp/extractor/applepodcasts.py b/yt_dlp/extractor/applepodcasts.py index 95758fece..6a74de758 100644 --- a/yt_dlp/extractor/applepodcasts.py +++ b/yt_dlp/extractor/applepodcasts.py @@ -42,6 +42,7 @@ class ApplePodcastsIE(InfoExtractor):          ember_data = self._parse_json(self._search_regex(              r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',              webpage, 'ember data'), episode_id) +        ember_data = ember_data.get(episode_id) or ember_data          episode = ember_data['data']['attributes']          description = episode.get('description') or {} diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 69e673a26..006aab3b4 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -49,6 +49,7 @@ class BandcampIE(InfoExtractor):              'uploader': 'Ben Prunty',              'timestamp': 1396508491,              'upload_date': '20140403', +            'release_timestamp': 1396483200,              'release_date': '20140403',              'duration': 260.877,              'track': 'Lanius (Battle)', @@ -69,6 +70,7 @@ class BandcampIE(InfoExtractor):              'uploader': 'Mastodon',              'timestamp': 1322005399,              'upload_date': '20111122', +            'release_timestamp': 1076112000,              'release_date': '20040207',              'duration': 120.79,              'track': 'Hail to Fire', @@ -197,7 +199,7 @@ class BandcampIE(InfoExtractor):              'thumbnail': thumbnail,              'uploader': artist,              'timestamp': timestamp, -            'release_date': unified_strdate(tralbum.get('album_release_date')), +            'release_timestamp': unified_timestamp(tralbum.get('album_release_date')),              'duration': duration,              'track': track,              'track_number': track_number, diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index c3e0a9262..6fcc4ac93 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -138,11 +138,6 @@ class BiliBiliIE(InfoExtractor):          anime_id = mobj.group('anime_id')          page_id = mobj.group('page')          webpage = self._download_webpage(url, video_id) -        headers = { -            'Referer': url, -            'Accept': '*/*' -        } -        headers.update(self.geo_verification_headers())          if 'anime/' not in url:              cid = self._search_regex( @@ -160,8 +155,12 @@ class BiliBiliIE(InfoExtractor):              if 'no_bangumi_tip' not in smuggled_data:                  self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run yt-dlp with %s' % (                      video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) +            headers = { +                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', +                'Referer': url +            } +            headers.update(self.geo_verification_headers()) -            headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'              js = self._download_json(                  'http://bangumi.bilibili.com/web_api/get_source', video_id,                  data=urlencode_postdata({'episode_id': video_id}), @@ -170,6 +169,12 @@ class BiliBiliIE(InfoExtractor):                  self._report_error(js)              cid = js['result']['cid'] +        headers = { +            'Accept': 'application/json', +            'Referer': url +        } +        headers.update(self.geo_verification_headers()) +          entries = []          RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index 503d2e6a6..38c8bbc80 100644 --- a/yt_dlp/extractor/cbs.py +++ b/yt_dlp/extractor/cbs.py @@ -27,7 +27,7 @@ class CBSBaseIE(ThePlatformFeedIE):  class CBSIE(CBSBaseIE): -    _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs\.com|paramountplus\.com)/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)' +    _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs|paramountplus)\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'      _TESTS = [{          'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -53,7 +53,7 @@ class CBSIE(CBSBaseIE):          'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',          'only_matching': True,      }, { -        'url': 'https://www.paramountplus.com/shows/star-trek-discovery/video/l5ANMH9wM7kxwV1qr4u1xn88XOhYMlZX/star-trek-discovery-the-vulcan-hello/', +        'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/',          'only_matching': True,      }] diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index b74a5dc01..65fcfcbf5 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -231,8 +231,9 @@ class InfoExtractor(object):      uploader:       Full name of the video uploader.      license:        License name the video is licensed under.      creator:        The creator of the video. +    release_timestamp: UNIX timestamp of the moment the video was released.      release_date:   The date (YYYYMMDD) when the video was released. -    timestamp:      UNIX timestamp of the moment the video became available. +    timestamp:      UNIX timestamp of the moment the video was uploaded      upload_date:    Video upload date (YYYYMMDD).                      If not explicitly set, calculated from timestamp.      uploader_id:    Nickname or id of the video uploader. diff --git a/yt_dlp/extractor/fujitv.py b/yt_dlp/extractor/fujitv.py index 39685e075..a02a94374 100644 --- a/yt_dlp/extractor/fujitv.py +++ b/yt_dlp/extractor/fujitv.py @@ -17,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url)          formats = self._extract_m3u8_formats( -            self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) +            self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4')          for f in formats:              wh = self._BITRATE_MAP.get(f.get('tbr'))              if wh: diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 051d94873..865cda761 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -6,8 +6,10 @@ import json  from .common import InfoExtractor  from ..compat import ( +    compat_parse_qs,      compat_str,      compat_urllib_parse_unquote, +    compat_urllib_parse_urlparse,  )  from ..utils import (      determine_ext, @@ -62,6 +64,7 @@ class LBRYBaseIE(InfoExtractor):              'description': stream_value.get('description'),              'license': stream_value.get('license'),              'timestamp': int_or_none(stream.get('timestamp')), +            'release_timestamp': int_or_none(stream_value.get('release_time')),              'tags': stream_value.get('tags'),              'duration': int_or_none(media.get('duration')),              'channel': try_get(signing_channel, lambda x: x['value']['title']), @@ -94,6 +97,8 @@ class LBRYIE(LBRYBaseIE):              'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',              'timestamp': 1595694354,              'upload_date': '20200725', +            'release_timestamp': 1595340697, +            'release_date': '20200721',              'width': 1280,              'height': 720,          } @@ -108,6 +113,8 @@ class LBRYIE(LBRYBaseIE):              'description': 'md5:661ac4f1db09f31728931d7b88807a61',              'timestamp': 1591312601,              'upload_date': '20200604', +            'release_timestamp': 1591312421, +            'release_date': '20200604',              'tags': list,              'duration': 2570,              'channel': 'The LBRY Foundation', @@ -189,17 +196,18 @@ class LBRYChannelIE(LBRYBaseIE):      }]      _PAGE_SIZE = 50 -    def _fetch_page(self, claim_id, url, page): +    def _fetch_page(self, claim_id, url, params, page):          page += 1 +        page_params = { +            'channel_ids': [claim_id], +            'claim_type': 'stream', +            'no_totals': True, +            'page': page, +            'page_size': self._PAGE_SIZE, +        } +        page_params.update(params)          result = self._call_api_proxy( -            'claim_search', claim_id, { -                'channel_ids': [claim_id], -                'claim_type': 'stream', -                'no_totals': True, -                'page': page, -                'page_size': self._PAGE_SIZE, -                'stream_types': self._SUPPORTED_STREAM_TYPES, -            }, 'page %d' % page) +            'claim_search', claim_id, page_params, 'page %d' % page)          for item in (result.get('items') or []):              stream_claim_name = item.get('name')              stream_claim_id = item.get('claim_id') @@ -220,8 +228,31 @@ class LBRYChannelIE(LBRYBaseIE):          result = self._resolve_url(              'lbry://' + display_id, display_id, 'channel')          claim_id = result['claim_id'] +        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) +        content = qs.get('content', [None])[0] +        params = { +            'fee_amount': qs.get('fee_amount', ['>=0'])[0], +            'order_by': { +                'new': ['release_time'], +                'top': ['effective_amount'], +                'trending': ['trending_group', 'trending_mixed'], +            }[qs.get('order', ['new'])[0]], +            'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, +        } +        duration = qs.get('duration', [None])[0] +        if duration: +            params['duration'] = { +                'long': '>=1200', +                'short': '<=240', +            }[duration] +        language = qs.get('language', ['all'])[0] +        if language != 'all': +            languages = [language] +            if language == 'en': +                languages.append('none') +            params['any_languages'] = languages          entries = OnDemandPagedList( -            functools.partial(self._fetch_page, claim_id, url), +            functools.partial(self._fetch_page, claim_id, url, params),              self._PAGE_SIZE)          result_value = result.get('value') or {}          return self.playlist_result( diff --git a/yt_dlp/extractor/peertube.py b/yt_dlp/extractor/peertube.py index 32ff51653..d9b13adc2 100644 --- a/yt_dlp/extractor/peertube.py +++ b/yt_dlp/extractor/peertube.py @@ -599,11 +599,13 @@ class PeerTubeIE(InfoExtractor):          else:              age_limit = None +        webpage_url = 'https://%s/videos/watch/%s' % (host, video_id) +          return {              'id': video_id,              'title': title,              'description': description, -            'thumbnail': urljoin(url, video.get('thumbnailPath')), +            'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')),              'timestamp': unified_timestamp(video.get('publishedAt')),              'uploader': account_data('displayName', compat_str),              'uploader_id': str_or_none(account_data('id', int)), @@ -621,5 +623,6 @@ class PeerTubeIE(InfoExtractor):              'tags': try_get(video, lambda x: x['tags'], list),              'categories': categories,              'formats': formats, -            'subtitles': subtitles +            'subtitles': subtitles, +            'webpage_url': webpage_url,          } diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py index 15c11a755..09aeea340 100644 --- a/yt_dlp/extractor/pinterest.py +++ b/yt_dlp/extractor/pinterest.py @@ -31,6 +31,7 @@ class PinterestBaseIE(InfoExtractor):          title = (data.get('title') or data.get('grid_title') or video_id).strip() +        urls = []          formats = []          duration = None          if extract_formats: @@ -38,8 +39,9 @@ class PinterestBaseIE(InfoExtractor):                  if not isinstance(format_dict, dict):                      continue                  format_url = url_or_none(format_dict.get('url')) -                if not format_url: +                if not format_url or format_url in urls:                      continue +                urls.append(format_url)                  duration = float_or_none(format_dict.get('duration'), scale=1000)                  ext = determine_ext(format_url)                  if 'hls' in format_id.lower() or ext == 'm3u8': diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index b7631e4e1..2a7818e41 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -167,6 +167,7 @@ class PornHubIE(PornHubBaseIE):          'params': {              'skip_download': True,          }, +        'skip': 'Video has been flagged for verification in accordance with our trust and safety policy',      }, {          # subtitles          'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', @@ -265,7 +266,8 @@ class PornHubIE(PornHubBaseIE):          webpage = dl_webpage('pc')          error_msg = self._html_search_regex( -            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>', +            (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>', +             r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'),              webpage, 'error message', default=None, group='error')          if error_msg:              error_msg = re.sub(r'\s+', ' ', error_msg) @@ -394,6 +396,21 @@ class PornHubIE(PornHubBaseIE):          upload_date = None          formats = [] + +        def add_format(format_url, height=None): +            tbr = None +            mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', format_url) +            if mobj: +                if not height: +                    height = int(mobj.group('height')) +                tbr = int(mobj.group('tbr')) +            formats.append({ +                'url': format_url, +                'format_id': '%dp' % height if height else None, +                'height': height, +                'tbr': tbr, +            }) +          for video_url, height in video_urls:              if not upload_date:                  upload_date = self._search_regex( @@ -410,18 +427,19 @@ class PornHubIE(PornHubBaseIE):                      video_url, video_id, 'mp4', entry_protocol='m3u8_native',                      m3u8_id='hls', fatal=False))                  continue -            tbr = None -            mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url) -            if mobj: -                if not height: -                    height = int(mobj.group('height')) -                tbr = int(mobj.group('tbr')) -            formats.append({ -                'url': video_url, -                'format_id': '%dp' % height if height else None, -                'height': height, -                'tbr': tbr, -            }) +            if '/video/get_media' in video_url: +                medias = self._download_json(video_url, video_id, fatal=False) +                if isinstance(medias, list): +                    for media in medias: +                        if not isinstance(media, dict): +                            continue +                        video_url = url_or_none(media.get('videoUrl')) +                        if not video_url: +                            continue +                        height = int_or_none(media.get('quality')) +                        add_format(video_url, height) +                continue +            add_format(video_url)          self._sort_formats(formats)          video_uploader = self._html_search_regex( diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py index ce9db0629..d2fb754cf 100644 --- a/yt_dlp/extractor/rtve.py +++ b/yt_dlp/extractor/rtve.py @@ -2,8 +2,9 @@  from __future__ import unicode_literals  import base64 +import io  import re -import time +import sys  from .common import InfoExtractor  from ..compat import ( @@ -14,56 +15,13 @@ from ..utils import (      determine_ext,      ExtractorError,      float_or_none, +    qualities,      remove_end,      remove_start, -    sanitized_Request,      std_headers,  ) - -def _decrypt_url(png): -    encrypted_data = compat_b64decode(png) -    text_index = encrypted_data.find(b'tEXt') -    text_chunk = encrypted_data[text_index - 4:] -    length = compat_struct_unpack('!I', text_chunk[:4])[0] -    # Use bytearray to get integers when iterating in both python 2.x and 3.x -    data = bytearray(text_chunk[8:8 + length]) -    data = [chr(b) for b in data if b != 0] -    hash_index = data.index('#') -    alphabet_data = data[:hash_index] -    url_data = data[hash_index + 1:] -    if url_data[0] == 'H' and url_data[3] == '%': -        # remove useless HQ%% at the start -        url_data = url_data[4:] - -    alphabet = [] -    e = 0 -    d = 0 -    for l in alphabet_data: -        if d == 0: -            alphabet.append(l) -            d = e = (e + 1) % 4 -        else: -            d -= 1 -    url = '' -    f = 0 -    e = 3 -    b = 1 -    for letter in url_data: -        if f == 0: -            l = int(letter) * 10 -            f = 1 -        else: -            if e == 0: -                l += int(letter) -                url += alphabet[l] -                e = (b + 3) % 4 -                f = 0 -                b += 1 -            else: -                e -= 1 - -    return url +_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x))  class RTVEALaCartaIE(InfoExtractor): @@ -79,28 +37,31 @@ class RTVEALaCartaIE(InfoExtractor):              'ext': 'mp4',              'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',              'duration': 5024.566, +            'series': 'Balonmano',          }, +        'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],      }, {          'note': 'Live stream',          'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/',          'info_dict': {              'id': '1694255', -            'ext': 'flv', -            'title': 'TODO', +            'ext': 'mp4', +            'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            'is_live': True, +        }, +        'params': { +            'skip_download': 'live stream',          }, -        'skip': 'The f4m manifest can\'t be used yet',      }, {          'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', -        'md5': 'e55e162379ad587e9640eda4f7353c0f', +        'md5': 'd850f3c8731ea53952ebab489cf81cbf',          'info_dict': {              'id': '4236788',              'ext': 'mp4', -            'title': 'Servir y proteger - Capítulo 104 ', +            'title': 'Servir y proteger - Capítulo 104',              'duration': 3222.0,          }, -        'params': { -            'skip_download': True,  # requires ffmpeg -        }, +        'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],      }, {          'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve',          'only_matching': True, @@ -111,58 +72,102 @@ class RTVEALaCartaIE(InfoExtractor):      def _real_initialize(self):          user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') -        manager_info = self._download_json( +        self._manager = self._download_json(              'http://www.rtve.es/odin/loki/' + user_agent_b64, -            None, 'Fetching manager info') -        self._manager = manager_info['manager'] +            None, 'Fetching manager info')['manager'] + +    @staticmethod +    def _decrypt_url(png): +        encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) +        while True: +            length = compat_struct_unpack('!I', encrypted_data.read(4))[0] +            chunk_type = encrypted_data.read(4) +            if chunk_type == b'IEND': +                break +            data = encrypted_data.read(length) +            if chunk_type == b'tEXt': +                alphabet_data, text = data.split(b'\0') +                quality, url_data = text.split(b'%%') +                alphabet = [] +                e = 0 +                d = 0 +                for l in _bytes_to_chr(alphabet_data): +                    if d == 0: +                        alphabet.append(l) +                        d = e = (e + 1) % 4 +                    else: +                        d -= 1 +                url = '' +                f = 0 +                e = 3 +                b = 1 +                for letter in _bytes_to_chr(url_data): +                    if f == 0: +                        l = int(letter) * 10 +                        f = 1 +                    else: +                        if e == 0: +                            l += int(letter) +                            url += alphabet[l] +                            e = (b + 3) % 4 +                            f = 0 +                            b += 1 +                        else: +                            e -= 1 + +                yield quality.decode(), url +            encrypted_data.read(4)  # CRC + +    def _extract_png_formats(self, video_id): +        png = self._download_webpage( +            'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), +            video_id, 'Downloading url information', query={'q': 'v2'}) +        q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) +        formats = [] +        for quality, video_url in self._decrypt_url(png): +            ext = determine_ext(video_url) +            if ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    video_url, video_id, 'mp4', 'm3u8_native', +                    m3u8_id='hls', fatal=False)) +            elif ext == 'mpd': +                formats.extend(self._extract_mpd_formats( +                    video_url, video_id, 'dash', fatal=False)) +            else: +                formats.append({ +                    'format_id': quality, +                    'quality': q(quality), +                    'url': video_url, +                }) +        self._sort_formats(formats) +        return formats      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          info = self._download_json(              'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,              video_id)['page']['items'][0]          if info['state'] == 'DESPU':              raise ExtractorError('The video is no longer available', expected=True) -        title = info['title'] -        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) -        png_request = sanitized_Request(png_url) -        png_request.add_header('Referer', url) -        png = self._download_webpage(png_request, video_id, 'Downloading url information') -        video_url = _decrypt_url(png) -        ext = determine_ext(video_url) - -        formats = [] -        if not video_url.endswith('.f4m') and ext != 'm3u8': -            if '?' not in video_url: -                video_url = video_url.replace('resources/', 'auth/resources/') -            video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') - -        if ext == 'm3u8': -            formats.extend(self._extract_m3u8_formats( -                video_url, video_id, ext='mp4', entry_protocol='m3u8_native', -                m3u8_id='hls', fatal=False)) -        elif ext == 'f4m': -            formats.extend(self._extract_f4m_formats( -                video_url, video_id, f4m_id='hds', fatal=False)) -        else: -            formats.append({ -                'url': video_url, -            }) -        self._sort_formats(formats) +        title = info['title'].strip() +        formats = self._extract_png_formats(video_id)          subtitles = None -        if info.get('sbtFile') is not None: -            subtitles = self.extract_subtitles(video_id, info['sbtFile']) +        sbt_file = info.get('sbtFile') +        if sbt_file: +            subtitles = self.extract_subtitles(video_id, sbt_file) + +        is_live = info.get('live') is True          return {              'id': video_id, -            'title': title, +            'title': self._live_title(title) if is_live else title,              'formats': formats,              'thumbnail': info.get('image'), -            'page_url': url,              'subtitles': subtitles, -            'duration': float_or_none(info.get('duration'), scale=1000), +            'duration': float_or_none(info.get('duration'), 1000), +            'is_live': is_live, +            'series': info.get('programTitle'),          }      def _get_subtitles(self, video_id, sub_file): @@ -174,48 +179,26 @@ class RTVEALaCartaIE(InfoExtractor):              for s in subs) -class RTVEInfantilIE(InfoExtractor): +class RTVEInfantilIE(RTVEALaCartaIE):      IE_NAME = 'rtve.es:infantil'      IE_DESC = 'RTVE infantil' -    _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P<show>[^/]*)/video/(?P<short_title>[^/]*)/(?P<id>[0-9]+)/' +    _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/'      _TESTS = [{          'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', -        'md5': '915319587b33720b8e0357caaa6617e6', +        'md5': '5747454717aedf9f9fdf212d1bcfc48d',          'info_dict': {              'id': '3040283',              'ext': 'mp4',              'title': 'Maneras de vivir', -            'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', +            'thumbnail': r're:https?://.+/1426182947956\.JPG',              'duration': 357.958,          }, +        'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],      }] -    def _real_extract(self, url): -        video_id = self._match_id(url) -        info = self._download_json( -            'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, -            video_id)['page']['items'][0] - -        webpage = self._download_webpage(url, video_id) -        vidplayer_id = self._search_regex( -            r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') -        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id -        png = self._download_webpage(png_url, video_id, 'Downloading url information') -        video_url = _decrypt_url(png) - -        return { -            'id': video_id, -            'ext': 'mp4', -            'title': info['title'], -            'url': video_url, -            'thumbnail': info.get('image'), -            'duration': float_or_none(info.get('duration'), scale=1000), -        } - - -class RTVELiveIE(InfoExtractor): +class RTVELiveIE(RTVEALaCartaIE):      IE_NAME = 'rtve.es:live'      IE_DESC = 'RTVE.es live streams'      _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' @@ -225,7 +208,7 @@ class RTVELiveIE(InfoExtractor):          'info_dict': {              'id': 'la-1',              'ext': 'mp4', -            'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', +            'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',          },          'params': {              'skip_download': 'live stream', @@ -234,29 +217,22 @@ class RTVELiveIE(InfoExtractor):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        start_time = time.gmtime()          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id)          title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')          title = remove_start(title, 'Estoy viendo ') -        title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time)          vidplayer_id = self._search_regex(              (r'playerId=player([0-9]+)',               r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)',               r'data-id=["\'](\d+)'),              webpage, 'internal video ID') -        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id -        png = self._download_webpage(png_url, video_id, 'Downloading url information') -        m3u8_url = _decrypt_url(png) -        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') -        self._sort_formats(formats)          return {              'id': video_id, -            'title': title, -            'formats': formats, +            'title': self._live_title(title), +            'formats': self._extract_png_formats(vidplayer_id),              'is_live': True,          } diff --git a/yt_dlp/extractor/shahid.py b/yt_dlp/extractor/shahid.py index c1d6aba2c..5768199bc 100644 --- a/yt_dlp/extractor/shahid.py +++ b/yt_dlp/extractor/shahid.py @@ -51,13 +51,16 @@ class ShahidIE(ShahidBaseIE):      _NETRC_MACHINE = 'shahid'      _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'      _TESTS = [{ -        'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286', +        'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924',          'info_dict': { -            'id': '275286', +            'id': '816924',              'ext': 'mp4', -            'title': 'مجلس الشباب الموسم 1 كليب 1', -            'timestamp': 1506988800, -            'upload_date': '20171003', +            'title': 'متحف الدحيح الموسم 1 كليب 1', +            'timestamp': 1602806400, +            'upload_date': '20201016', +            'description': 'برومو', +            'duration': 22, +            'categories': ['كوميديا'],          },          'params': {              # m3u8 download @@ -109,12 +112,15 @@ class ShahidIE(ShahidBaseIE):              page_type = 'episode'          playout = self._call_api( -            'playout/url/' + video_id, video_id)['playout'] +            'playout/new/url/' + video_id, video_id)['playout']          if not self._downloader.params.get('allow_unplayable_formats') and playout.get('drm'):              raise ExtractorError('This video is DRM protected.', expected=True) -        formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') +        formats = self._extract_m3u8_formats(re.sub( +            # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html +            r'aws\.manifestfilter=[\w:;,-]+&?', +            '', playout['url']), video_id, 'mp4')          self._sort_formats(formats)          # video = self._call_api( diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index 95e6d2890..9aedaa04a 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -6,9 +6,9 @@ from .mtv import MTVServicesInfoExtractor  class SouthParkIE(MTVServicesInfoExtractor):      IE_NAME = 'southpark.cc.com' -    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' +    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' -    _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' +    _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'      _TESTS = [{          'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', @@ -23,8 +23,20 @@ class SouthParkIE(MTVServicesInfoExtractor):      }, {          'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1',          'only_matching': True, +    }, { +        'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1', +        'only_matching': True,      }] +    def _get_feed_query(self, uri): +        return { +            'accountOverride': 'intl.mtvi.com', +            'arcEp': 'shared.southpark.global', +            'ep': '90877963', +            'imageEp': 'shared.southpark.global', +            'mgid': uri, +        } +  class SouthParkEsIE(SouthParkIE):      IE_NAME = 'southpark.cc.com:español' diff --git a/yt_dlp/extractor/sportdeutschland.py b/yt_dlp/extractor/sportdeutschland.py index 378fc7568..3e497a939 100644 --- a/yt_dlp/extractor/sportdeutschland.py +++ b/yt_dlp/extractor/sportdeutschland.py @@ -1,82 +1,105 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor +from ..compat import ( +    compat_parse_qs, +    compat_urllib_parse_urlparse, +)  from ..utils import ( +    clean_html, +    float_or_none, +    int_or_none,      parse_iso8601, -    sanitized_Request, +    strip_or_none, +    try_get,  )  class SportDeutschlandIE(InfoExtractor): -    _VALID_URL = r'https?://sportdeutschland\.tv/(?P<sport>[^/?#]+)/(?P<id>[^?#/]+)(?:$|[?#])' +    _VALID_URL = r'https?://sportdeutschland\.tv/(?P<id>(?:[^/]+/)?[^?#/&]+)'      _TESTS = [{          'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',          'info_dict': { -            'id': 're-live-deutsche-meisterschaften-2020-halbfinals', +            'id': '5318cac0275701382770543d7edaf0a0',              'ext': 'mp4', -            'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', -            'categories': ['Badminton-Deutschland'], -            'view_count': int, -            'thumbnail': r're:^https?://.*\.(?:jpg|png)$', -            'timestamp': int, -            'upload_date': '20200201', -            'description': 're:.*',  # meaningless description for THIS video +            'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1', +            'duration': 16106.36,          }, +        'params': { +            'noplaylist': True, +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', +        'info_dict': { +            'id': 'c6e2fdd01f63013854c47054d2ab776f', +            'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals', +            'description': 'md5:5263ff4c31c04bb780c9f91130b48530', +            'duration': 31397, +        }, +        'playlist_count': 2, +    }, { +        'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich', +        'only_matching': True,      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        sport_id = mobj.group('sport') - -        api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( -            sport_id, video_id) -        req = sanitized_Request(api_url, headers={ -            'Accept': 'application/vnd.vidibus.v2.html+json', -            'Referer': url, -        }) -        data = self._download_json(req, video_id) - +        display_id = self._match_id(url) +        data = self._download_json( +            'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, +            display_id, query={'access_token': 'true'})          asset = data['asset'] -        categories = [data['section']['title']] - -        formats = [] -        smil_url = asset['video'] -        if '.smil' in smil_url: -            m3u8_url = smil_url.replace('.smil', '.m3u8') -            formats.extend( -                self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')) +        title = (asset.get('title') or asset['label']).strip() +        asset_id = asset.get('id') or asset.get('uuid') +        info = { +            'id': asset_id, +            'title': title, +            'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), +            'duration': int_or_none(asset.get('seconds')), +        } +        videos = asset.get('videos') or [] +        if len(videos) > 1: +            playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0] +            if playlist_id: +                if self._downloader.params.get('noplaylist'): +                    videos = [videos[int(playlist_id)]] +                    self.to_screen('Downloading just a single video because of --no-playlist') +                else: +                    self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id) -            smil_doc = self._download_xml( -                smil_url, video_id, note='Downloading SMIL metadata') -            base_url_el = smil_doc.find('./head/meta') -            if base_url_el: -                base_url = base_url_el.attrib['base'] -            formats.extend([{ -                'format_id': 'rmtp', -                'url': base_url if base_url_el else n.attrib['src'], -                'play_path': n.attrib['src'], -                'ext': 'flv', -                'preference': -100, -                'format_note': 'Seems to fail at example stream', -            } for n in smil_doc.findall('./body/video')]) +            def entries(): +                for i, video in enumerate(videos, 1): +                    video_id = video.get('uuid') +                    video_url = video.get('url') +                    if not (video_id and video_url): +                        continue +                    formats = self._extract_m3u8_formats( +                        video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) +                    if not formats: +                        continue +                    yield { +                        'id': video_id, +                        'formats': formats, +                        'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), +                        'duration': float_or_none(video.get('duration')), +                    } +            info.update({ +                '_type': 'multi_video', +                'entries': entries(), +            })          else: -            formats.append({'url': smil_url}) - -        self._sort_formats(formats) - -        return { -            'id': video_id, -            'formats': formats, -            'title': asset['title'], -            'thumbnail': asset.get('image'), -            'description': asset.get('teaser'), -            'duration': asset.get('duration'), -            'categories': categories, -            'view_count': asset.get('views'), -            'rtmp_live': asset.get('live'), -            'timestamp': parse_iso8601(asset.get('date')), -        } +            formats = self._extract_m3u8_formats( +                videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') +            section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) +            info.update({ +                'formats': formats, +                'display_id': asset.get('permalink'), +                'thumbnail': try_get(asset, lambda x: x['images'][0]), +                'categories': [section_title] if section_title else None, +                'view_count': int_or_none(asset.get('views')), +                'is_live': asset.get('is_live') is True, +                'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), +            }) +        return info diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index 931d4d650..a54f49319 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -9,6 +9,7 @@ from ..utils import (      int_or_none,      remove_start,      smuggle_url, +    strip_or_none,      try_get,  ) @@ -25,6 +26,10 @@ class TVerIE(InfoExtractor):      }, {          'url': 'https://tver.jp/episode/79622438',          'only_matching': True, +    }, { +        # subtitle = ' ' +        'url': 'https://tver.jp/corner/f0068870', +        'only_matching': True,      }]      _TOKEN = None      BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' @@ -47,8 +52,12 @@ class TVerIE(InfoExtractor):          }          if service == 'cx': +            title = main['title'] +            subtitle = strip_or_none(main.get('subtitle')) +            if subtitle: +                title += ' - ' + subtitle              info.update({ -                'title': main.get('subtitle') or main['title'], +                'title': title,                  'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id),                  'ie_key': 'FujiTVFODPlus7',              }) diff --git a/yt_dlp/extractor/voxmedia.py b/yt_dlp/extractor/voxmedia.py index b318e15d4..661208125 100644 --- a/yt_dlp/extractor/voxmedia.py +++ b/yt_dlp/extractor/voxmedia.py @@ -7,6 +7,8 @@ from ..compat import compat_urllib_parse_unquote  from ..utils import (      ExtractorError,      int_or_none, +    try_get, +    unified_timestamp,  ) @@ -19,14 +21,17 @@ class VoxMediaVolumeIE(OnceIE):          setup = self._parse_json(self._search_regex(              r'setup\s*=\s*({.+});', webpage, 'setup'), video_id) -        video_data = setup.get('video') or {} +        player_setup = setup.get('player_setup') or setup +        video_data = player_setup.get('video') or {} +        formatted_metadata = video_data.get('formatted_metadata') or {}          info = {              'id': video_id, -            'title': video_data.get('title_short'), +            'title': player_setup.get('title') or video_data.get('title_short'),              'description': video_data.get('description_long') or video_data.get('description_short'), -            'thumbnail': video_data.get('brightcove_thumbnail') +            'thumbnail': formatted_metadata.get('thumbnail') or video_data.get('brightcove_thumbnail'), +            'timestamp': unified_timestamp(formatted_metadata.get('video_publish_date')),          } -        asset = setup.get('asset') or setup.get('params') or {} +        asset = try_get(setup, lambda x: x['embed_assets']['chorus'], dict) or {}          formats = []          hls_url = asset.get('hls_url') @@ -47,6 +52,7 @@ class VoxMediaVolumeIE(OnceIE):          if formats:              self._sort_formats(formats)              info['formats'] = formats +            info['duration'] = int_or_none(asset.get('duration'))              return info          for provider_video_type in ('ooyala', 'youtube', 'brightcove'): @@ -84,7 +90,7 @@ class VoxMediaIE(InfoExtractor):      }, {          # Volume embed, Youtube          'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', -        'md5': '4c8f4a0937752b437c3ebc0ed24802b5', +        'md5': 'fd19aa0cf3a0eea515d4fd5c8c0e9d68',          'info_dict': {              'id': 'Gy8Md3Eky38',              'ext': 'mp4', @@ -93,6 +99,7 @@ class VoxMediaIE(InfoExtractor):              'uploader_id': 'TheVerge',              'upload_date': '20141021',              'uploader': 'The Verge', +            'timestamp': 1413907200,          },          'add_ie': ['Youtube'],          'skip': 'similar to the previous test', @@ -100,13 +107,13 @@ class VoxMediaIE(InfoExtractor):          # Volume embed, Youtube          'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill',          'info_dict': { -            'id': 'YCjDnX-Xzhg', +            'id': '22986359b',              'ext': 'mp4',              'title': "Mississippi's laws are so bad that its anti-LGBTQ law isn't needed to allow discrimination",              'description': 'md5:fc1317922057de31cd74bce91eb1c66c', -            'uploader_id': 'voxdotcom',              'upload_date': '20150915', -            'uploader': 'Vox', +            'timestamp': 1442332800, +            'duration': 285,          },          'add_ie': ['Youtube'],          'skip': 'similar to the previous test', @@ -160,6 +167,9 @@ class VoxMediaIE(InfoExtractor):              'ext': 'mp4',              'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella',              'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.', +            'timestamp': 1402938000, +            'upload_date': '20140616', +            'duration': 4114,          },          'add_ie': ['VoxMediaVolume'],      }] | 
