diff options
Diffstat (limited to 'yt_dlp/extractor')
123 files changed, 5059 insertions, 2242 deletions
diff --git a/yt_dlp/extractor/adobetv.py b/yt_dlp/extractor/adobetv.py index 12b819206..3cfa1ff55 100644 --- a/yt_dlp/extractor/adobetv.py +++ b/yt_dlp/extractor/adobetv.py @@ -9,6 +9,7 @@ from ..utils import ( float_or_none, int_or_none, ISO639Utils, + join_nonempty, OnDemandPagedList, parse_duration, str_or_none, @@ -263,7 +264,7 @@ class AdobeTVVideoIE(AdobeTVBaseIE): continue formats.append({ 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000), - 'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])), + 'format_id': join_nonempty(source.get('format'), source.get('label')), 'height': int_or_none(source.get('height') or None), 'tbr': int_or_none(source.get('bitrate') or None), 'width': int_or_none(source.get('width') or None), diff --git a/yt_dlp/extractor/aljazeera.py b/yt_dlp/extractor/aljazeera.py index e829b45e4..7bcdb7afb 100644 --- a/yt_dlp/extractor/aljazeera.py +++ b/yt_dlp/extractor/aljazeera.py @@ -1,55 +1,86 @@ +# coding: utf-8 from __future__ import unicode_literals import json from .common import InfoExtractor +from ..utils import ( + try_get, +) class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' + _VALID_URL = r'https?://(?P<base>\w+\.aljazeera\.\w+)/(?P<type>programs?/[^/]+|(?:feature|video|new)s)?/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' _TESTS = [{ - 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', + 'url': 'https://balkans.aljazeera.net/videos/2021/11/6/pojedini-domovi-u-sarajevu-jos-pod-vodom-mjestanima-se-dostavlja-hrana', 'info_dict': { - 'id': '3792260579001', + 'id': '6280641530001', 'ext': 'mp4', - 'title': 'The Slum - Episode 1: Deliverance', - 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', - 'uploader_id': '665003303001', - 'timestamp': 1411116829, - 'upload_date': 
'20140919', - }, - 'add_ie': ['BrightcoveNew'], - 'skip': 'Not accessible from Travis CI server', - }, { - 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', - 'only_matching': True, + 'title': 'Pojedini domovi u Sarajevu još pod vodom, mještanima se dostavlja hrana', + 'timestamp': 1636219149, + 'description': 'U sarajevskim naseljima Rajlovac i Reljevo stambeni objekti, ali i industrijska postrojenja i dalje su pod vodom.', + 'upload_date': '20211106', + } }, { - 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', - 'only_matching': True, + 'url': 'https://balkans.aljazeera.net/videos/2021/11/6/djokovic-usao-u-finale-mastersa-u-parizu', + 'info_dict': { + 'id': '6280654936001', + 'ext': 'mp4', + 'title': 'Đoković ušao u finale Mastersa u Parizu', + 'timestamp': 1636221686, + 'description': 'Novak Đoković je u polufinalu Mastersa u Parizu nakon preokreta pobijedio Poljaka Huberta Hurkacza.', + 'upload_date': '20211106', + }, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + BRIGHTCOVE_URL_RE = r'https?://players.brightcove.net/(?P<account>\d+)/(?P<player_id>[a-zA-Z0-9]+)_(?P<embed>[^/]+)/index.html\?videoId=(?P<id>\d+)' def _real_extract(self, url): - post_type, name = self._match_valid_url(url).groups() + base, post_type, id = self._match_valid_url(url).groups() + wp = { + 'balkans.aljazeera.net': 'ajb', + 'chinese.aljazeera.net': 'chinese', + 'mubasher.aljazeera.net': 'ajm', + }.get(base) or 'aje' post_type = { 'features': 'post', 'program': 'episode', + 'programs': 'episode', 'videos': 'video', + 'news': 'news', }[post_type.split('/')[0]] video = self._download_json( - 'https://www.aljazeera.com/graphql', name, query={ + f'https://{base}/graphql', id, query={ + 'wp-site': wp, 'operationName': 'ArchipelagoSingleArticleQuery', 'variables': json.dumps({ - 'name': name, + 'name': id, 'postType': post_type, }), }, headers={ 
- 'wp-site': 'aje', - })['data']['article']['video'] - video_id = video['id'] - account_id = video.get('accountId') or '665003303001' - player_id = video.get('playerId') or 'BkeSH5BDb' - return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), - 'BrightcoveNew', video_id) + 'wp-site': wp, + }) + video = try_get(video, lambda x: x['data']['article']['video']) or {} + video_id = video.get('id') + account = video.get('accountId') or '911432371001' + player_id = video.get('playerId') or 'csvTfAlKW' + embed = 'default' + + if video_id is None: + webpage = self._download_webpage(url, id) + + account, player_id, embed, video_id = self._search_regex(self.BRIGHTCOVE_URL_RE, webpage, 'video id', + group=(1, 2, 3, 4), default=(None, None, None, None)) + + if video_id is None: + return { + '_type': 'url_transparent', + 'url': url, + 'ie_key': 'Generic' + } + + return { + '_type': 'url_transparent', + 'url': f'https://players.brightcove.net/{account}/{player_id}_{embed}/index.html?videoId={video_id}', + 'ie_key': 'BrightcoveNew' + } diff --git a/yt_dlp/extractor/amazon.py b/yt_dlp/extractor/amazon.py new file mode 100644 index 000000000..7c5d35f47 --- /dev/null +++ b/yt_dlp/extractor/amazon.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import int_or_none + + +class AmazonStoreIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)' + + _TESTS = [{ + 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', + 'info_dict': { + 'id': 'B098XNCHLD', + 'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed', + }, + 'playlist_mincount': 1, + 'playlist': [{ + 'info_dict': { + 'id': 'A1F83G8C2ARO7P', + 'ext': 'mp4', + 'title': 'mcdodo usb c cable 100W 5a', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }] + }, { + 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', + 'info_dict': { + 'id': 
'B0863TXGM3', + 'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff', + }, + 'playlist_mincount': 4, + }, { + 'url': 'https://www.amazon.com/dp/B0845NXCXF/', + 'info_dict': { + 'id': 'B0845NXCXF', + 'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171', + }, + 'playlist-mincount': 1, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) + entries = [{ + 'id': video['marketPlaceID'], + 'url': video['url'], + 'title': video.get('title'), + 'thumbnail': video.get('thumbUrl') or video.get('thumb'), + 'duration': video.get('durationSeconds'), + 'height': int_or_none(video.get('videoHeight')), + 'width': int_or_none(video.get('videoWidth')), + } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] + return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title']) diff --git a/yt_dlp/extractor/animeondemand.py b/yt_dlp/extractor/animeondemand.py index 54e097d2f..5694f7240 100644 --- a/yt_dlp/extractor/animeondemand.py +++ b/yt_dlp/extractor/animeondemand.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, extract_attributes, ExtractorError, + join_nonempty, url_or_none, urlencode_postdata, urljoin, @@ -140,15 +141,8 @@ class AnimeOnDemandIE(InfoExtractor): kind = self._search_regex( r'videomaterialurl/\d+/([^/]+)/', playlist_url, 'media kind', default=None) - format_id_list = [] - if lang: - format_id_list.append(lang) - if kind: - format_id_list.append(kind) - if not format_id_list and num is not None: - format_id_list.append(compat_str(num)) - format_id = '-'.join(format_id_list) - format_note = ', '.join(filter(None, (kind, lang_note))) + format_id = join_nonempty(lang, kind) if lang or kind else str(num) + format_note = join_nonempty(kind, lang_note, delim=', ') item_id_list = [] if format_id: item_id_list.append(format_id) @@ -195,12 
+189,10 @@ class AnimeOnDemandIE(InfoExtractor): if not file_: continue ext = determine_ext(file_) - format_id_list = [lang, kind] - if ext == 'm3u8': - format_id_list.append('hls') - elif source.get('type') == 'video/dash' or ext == 'mpd': - format_id_list.append('dash') - format_id = '-'.join(filter(None, format_id_list)) + format_id = join_nonempty( + lang, kind, + 'hls' if ext == 'm3u8' else None, + 'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None) if ext == 'm3u8': file_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py index d688e2c5b..0d444fc33 100644 --- a/yt_dlp/extractor/anvato.py +++ b/yt_dlp/extractor/anvato.py @@ -16,6 +16,7 @@ from ..utils import ( determine_ext, intlist_to_bytes, int_or_none, + join_nonempty, strip_jsonp, unescapeHTML, unsmuggle_url, @@ -303,13 +304,13 @@ class AnvatoIE(InfoExtractor): tbr = int_or_none(published_url.get('kbps')) a_format = { 'url': video_url, - 'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(), - 'tbr': tbr if tbr != 0 else None, + 'format_id': join_nonempty('http', published_url.get('cdn_name')).lower(), + 'tbr': tbr or None, } if media_format == 'm3u8' and tbr is not None: a_format.update({ - 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), + 'format_id': join_nonempty('hls', tbr), 'ext': 'mp4', }) elif media_format == 'm3u8-variant' or ext == 'm3u8': diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 048d30f27..f8d57109e 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -388,7 +388,13 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)' + _VALID_URL = r'''(?x)https:// + (?:(?:beta|www)\.)?ardmediathek\.de/ + 
(?:(?P<client>[^/]+)/)? + (?:player|live|video|(?P<playlist>sendung|sammlung))/ + (?:(?P<display_id>[^?#]+)/)? + (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)''' + _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -403,6 +409,18 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'upload_date': '20200805', 'ext': 'mp4', }, + 'skip': 'Error', + }, { + 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', + 'md5': 'f1837e563323b8a642a8ddeff0131f51', + 'info_dict': { + 'id': '10049223', + 'ext': 'mp4', + 'title': 'tagesschau, 20:00 Uhr', + 'timestamp': 1636398000, + 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b', + 'upload_date': '20211108', + }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'only_matching': True, @@ -426,6 +444,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): # playlist of type 'sammlung' 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): @@ -525,20 +549,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): return self.playlist_result(entries, playlist_title=display_id) def _real_extract(self, url): 
- mobj = self._match_valid_url(url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id - - if mobj.group('mode') in ('sendung', 'sammlung'): - # this is a playlist-URL - return self._ARD_extract_playlist( - url, video_id, display_id, - mobj.group('client'), - mobj.group('mode')) + video_id, display_id, playlist_type, client = self._match_valid_url(url).group( + 'id', 'display_id', 'playlist', 'client') + display_id, client = display_id or video_id, client or 'ard' + + if playlist_type: + return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', @@ -574,7 +590,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): } } } -}''' % (mobj.group('client'), video_id), +}''' % (client, video_id), }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index 8143eb4d7..6d843966a 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -24,9 +24,6 @@ class AtresPlayerIE(InfoExtractor): 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', 'duration': 3413, }, - 'params': { - 'format': 'bestvideo', - }, 'skip': 'This video is only available for registered users' }, { diff --git a/yt_dlp/extractor/bandaichannel.py b/yt_dlp/extractor/bandaichannel.py index d67285913..f1bcdef7a 100644 --- a/yt_dlp/extractor/bandaichannel.py +++ b/yt_dlp/extractor/bandaichannel.py @@ -21,7 +21,6 @@ class BandaiChannelIE(BrightcoveNewIE): 'duration': 1387.733, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }] diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 4e2dcd76b..672ed1ffe 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -451,9 +451,10 @@ class BBCCoUkIE(InfoExtractor): 
playlist = self._download_json( 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, playlist_id, 'Downloading playlist JSON') + formats = [] + subtitles = {} - version = playlist.get('defaultAvailableVersion') - if version: + for version in playlist.get('allAvailableVersions', []): smp_config = version['smpConfig'] title = smp_config['title'] description = smp_config['summary'] @@ -463,8 +464,18 @@ class BBCCoUkIE(InfoExtractor): continue programme_id = item.get('vpid') duration = int_or_none(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - return programme_id, title, description, duration, formats, subtitles + version_formats, version_subtitles = self._download_media_selector(programme_id) + types = version['types'] + for f in version_formats: + f['format_note'] = ', '.join(types) + if any('AudioDescribed' in x for x in types): + f['language_preference'] = -10 + formats += version_formats + for tag, subformats in (version_subtitles or {}).items(): + subtitles.setdefault(tag, []) + subtitles[tag] += subformats + + return programme_id, title, description, duration, formats, subtitles except ExtractorError as ee: if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): raise diff --git a/yt_dlp/extractor/blogger.py b/yt_dlp/extractor/blogger.py new file mode 100644 index 000000000..dba131cb0 --- /dev/null +++ b/yt_dlp/extractor/blogger.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from ..utils import ( + mimetype2ext, + parse_duration, + parse_qs, + str_or_none, + traverse_obj, +) +from .common import InfoExtractor + + +class BloggerIE(InfoExtractor): + IE_NAME = 'blogger.com' + _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)' + _VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''' + _TESTS = [{ + 'url': 
'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw', + 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', + 'info_dict': { + 'id': 'BLOGGER-video-3c740e3a49197e16-796', + 'title': 'BLOGGER-video-3c740e3a49197e16-796', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*', + 'duration': 76.068, + } + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall(BloggerIE._VALID_EMBED, webpage) + + def _real_extract(self, url): + token_id = self._match_id(url) + webpage = self._download_webpage(url, token_id) + data_json = self._search_regex(r'var\s+VIDEO_CONFIG\s*=\s*(\{.*)', webpage, 'JSON data') + data = self._parse_json(data_json.encode('utf-8').decode('unicode_escape'), token_id) + streams = data['streams'] + formats = [{ + 'ext': mimetype2ext(traverse_obj(parse_qs(stream['play_url']), ('mime', 0))), + 'url': stream['play_url'], + 'format_id': str_or_none(stream.get('format_id')), + } for stream in streams] + + return { + 'id': data.get('iframe_id', token_id), + 'title': data.get('iframe_id', token_id), + 'formats': formats, + 'thumbnail': data.get('thumbnail'), + 'duration': parse_duration(traverse_obj(parse_qs(streams[0]['play_url']), ('dur', 0))), + } diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py new file mode 100644 index 000000000..f50f719dc --- /dev/null +++ b/yt_dlp/extractor/breitbart.py @@ -0,0 +1,39 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BreitBartIE(InfoExtractor): + _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji', + 'md5': '0aa6d1d6e183ac5ca09207fe49f17ade', + 'info_dict': { + 'id': '5cOz1yup', + 'ext': 'mp4', + 'title': 'Watch \u2013 Clyburn: Statues in Congress Have to Go Because they Are Honoring Slavery', + 'description': 
'md5:bac35eb0256d1cb17f517f54c79404d5', + 'thumbnail': 'https://cdn.jwplayer.com/thumbs/5cOz1yup-1920.jpg', + 'age_limit': 0, + } + }, { + 'url': 'https://www.breitbart.com/videos/v/eaiZjVOn/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + formats = self._extract_m3u8_formats(f'https://cdn.jwplayer.com/manifests/{video_id}.m3u8', video_id, ext='mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'(?s)<title>(.*?)</title>', webpage, 'video title'), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'age_limit': self._rta_search(webpage), + 'formats': formats + } diff --git a/yt_dlp/extractor/canalalpha.py b/yt_dlp/extractor/canalalpha.py new file mode 100644 index 000000000..7287677c1 --- /dev/null +++ b/yt_dlp/extractor/canalalpha.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + try_get, + unified_strdate, +) + + +class CanalAlphaIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?canalalpha\.ch/play/[^/]+/[^/]+/(?P<id>\d+)/?.*' + + _TESTS = [{ + 'url': 'https://www.canalalpha.ch/play/le-journal/episode/24520/jeudi-28-octobre-2021', + 'info_dict': { + 'id': '24520', + 'ext': 'mp4', + 'title': 'Jeudi 28 octobre 2021', + 'description': 'md5:d30c6c3e53f8ad40d405379601973b30', + 'thumbnail': 'https://static.canalalpha.ch/poster/journal/journal_20211028.jpg', + 'upload_date': '20211028', + 'duration': 1125, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/le-journal/topic/24512/la-poste-fait-de-neuchatel-un-pole-cryptographique', + 'info_dict': { + 'id': '24512', + 'ext': 'mp4', + 'title': 'La Poste fait de Neuchâtel un pôle cryptographique', + 
'description': 'md5:4ba63ae78a0974d1a53d6703b6e1dedf', + 'thumbnail': 'https://static.canalalpha.ch/poster/news/news_39712.jpg', + 'upload_date': '20211028', + 'duration': 138, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/eureka/episode/24484/ces-innovations-qui-veulent-rendre-lagriculture-plus-durable', + 'info_dict': { + 'id': '24484', + 'ext': 'mp4', + 'title': 'Ces innovations qui veulent rendre l’agriculture plus durable', + 'description': 'md5:3de3f151180684621e85be7c10e4e613', + 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg', + 'upload_date': '20211026', + 'duration': 360, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/avec-le-temps/episode/23516/redonner-de-leclat-grace-au-polissage', + 'info_dict': { + 'id': '23516', + 'ext': 'mp4', + 'title': 'Redonner de l\'éclat grâce au polissage', + 'description': 'md5:0d8fbcda1a5a4d6f6daa3165402177e1', + 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_9990.png', + 'upload_date': '20210726', + 'duration': 360, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._search_regex( + r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;', + webpage, 'data_json'), id)['1']['data']['data'] + manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {} + subtitles = {} + formats = [{ + 'url': video['$url'], + 'ext': 'mp4', + 'width': try_get(video, lambda x: x['res']['width'], expected_type=int), + 'height': try_get(video, lambda x: x['res']['height'], expected_type=int), + } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')] + if manifests.get('hls'): + m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], id) + 
formats.extend(m3u8_frmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + if manifests.get('dash'): + dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash'], id) + formats.extend(dash_frmts) + subtitles = self._merge_subtitles(subtitles, dash_subs) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title').strip(), + 'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))), + 'thumbnail': data_json.get('poster'), + 'upload_date': unified_strdate(dict_get(data_json, ('webPublishAt', 'featuredAt', 'diffusionDate'))), + 'duration': try_get(data_json, lambda x: x['video']['duration'], expected_type=int), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py index 49e7e4e39..e97c91929 100644 --- a/yt_dlp/extractor/canvas.py +++ b/yt_dlp/extractor/canvas.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +import json from .common import InfoExtractor @@ -41,9 +42,9 @@ class CanvasIE(InfoExtractor): _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8', + 'HLS_AES': 'm3u8_native', } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' + _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -59,16 +60,21 @@ class CanvasIE(InfoExtractor): # New API endpoint if not data: + vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', + video_id, note='refreshtoken: Retrieve vrtnutoken', + errnote='refreshtoken failed')['vrtnutoken'] headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json'}) - token = self._download_json( + headers.update({'Content-Type': 'application/json; charset=utf-8'}) + vrtPlayerToken = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 
'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] + 'Downloading token', headers=headers, data=json.dumps({ + 'identityToken': vrtnutoken + }).encode('utf-8'))['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': token, - 'client': '%s@PROD' % site_id, + 'vrtPlayerToken': vrtPlayerToken, + 'client': 'null', }, expected_status=400) if not data.get('title'): code = data.get('code') @@ -264,7 +270,7 @@ class VrtNUIE(GigyaBaseIE): 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], }] _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_qhEcPa5JGFROVwu5SWKqJ4mVOIkwlFNMSKwzPDAh8QZOtHqu6L4nD5Q7lk0eXOOG' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' _CONTEXT_ID = 'R3595707040' def _real_initialize(self): @@ -275,16 +281,13 @@ class VrtNUIE(GigyaBaseIE): if username is None: return - auth_info = self._download_json( - 'https://accounts.vrt.be/accounts.login', None, - note='Login data', errnote='Could not get Login data', - headers={}, data=urlencode_postdata({ - 'loginID': username, - 'password': password, - 'sessionExpiration': '-2', - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - })) + auth_info = self._gigya_login({ + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + }) if auth_info.get('errorDetails'): raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) @@ -301,14 +304,15 @@ class VrtNUIE(GigyaBaseIE): 'UID': auth_info['UID'], 'UIDSignature': auth_info['UIDSignature'], 'signatureTimestamp': auth_info['signatureTimestamp'], - 'client_id': 'vrtnu-site', '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, } self._request_webpage( 'https://login.vrt.be/perform_login', - None, note='Requesting a token', errnote='Could not get a token', 
- headers={}, data=urlencode_postdata(post_data)) + None, note='Performing login', errnote='perform login failed', + headers={}, query={ + 'client_id': 'vrtnu-site' + }, data=urlencode_postdata(post_data)) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 4fcf2a9c1..413053499 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -390,7 +390,8 @@ class CBCGemPlaylistIE(InfoExtractor): show = match.group('show') show_info = self._download_json(self._API_BASE + show, season_id) season = int(match.group('season')) - season_info = try_get(show_info, lambda x: x['seasons'][season - 1]) + + season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) if season_info is None: raise ExtractorError(f'Couldn\'t find season {season} of {show}') diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index 5e04d38a2..f766dfbb7 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -20,22 +20,8 @@ from ..utils import ( class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', - 'info_dict': { - 'id': '61924494877246241', - 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Život v Grónsku', - 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { 'id': '61924494877028507', @@ -66,12 +52,58 @@ class 
CeskaTelevizeIE(InfoExtractor): }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', 'only_matching': True, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti. Připravil Peter Serge Butko', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494877311053', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 11.9, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, }] def _real_extract(self, url): playlist_id = self._match_id(url) - + parsed_url = compat_urllib_parse_urlparse(url) webpage = self._download_webpage(url, playlist_id) + site_name = self._og_search_property('site_name', webpage, fatal=False, default=None) + playlist_title = self._og_search_title(webpage, default=None) + if site_name and playlist_title: + playlist_title = playlist_title.replace(f' — {site_name}', '', 1) + playlist_description = self._og_search_description(webpage, default=None) + if playlist_description: + playlist_description = playlist_description.replace('\xa0', ' ') + + if parsed_url.path.startswith('/porady/'): + refer_url = update_url_query(unescapeHTML(self._search_regex( + (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + 
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), + webpage, 'iframe player url', group='url')), query={'autoStart': 'true'}) + webpage = self._download_webpage(refer_url, playlist_id) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s</p>' % NOT_AVAILABLE_STRING in webpage: @@ -100,7 +132,7 @@ class CeskaTelevizeIE(InfoExtractor): data = { 'playlist[0][type]': type_, 'playlist[0][id]': episode_id, - 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestUrl': parsed_url.path, 'requestSource': 'iVysilani', } @@ -108,7 +140,7 @@ class CeskaTelevizeIE(InfoExtractor): for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -130,9 +162,6 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage, default=None) - playlist_description = self._og_search_description(webpage, default=None) - playlist = self._download_json(req, playlist_id, fatal=False) if not playlist: continue @@ -237,54 +266,3 @@ class CeskaTelevizeIE(InfoExtractor): yield line return '\r\n'.join(_fix_subtitle(subtitles)) - - -class CeskaTelevizePoradyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' - _TESTS = [{ - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', - 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', - }, - 'playlist': [{ - 'info_dict': { - 'id': 
'61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # iframe embed - 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - data_url = update_url_query(unescapeHTML(self._search_regex( - (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), - webpage, 'iframe player url', group='url')), query={ - 'autoStart': 'true', - }) - - return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index aa98c0cc9..fc28bca2e 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import base64 +import collections import datetime import hashlib import itertools @@ -54,6 +55,7 @@ from ..utils import ( GeoRestrictedError, GeoUtils, int_or_none, + join_nonempty, js_to_json, JSON_LD_RE, mimetype2ext, @@ -341,6 +343,7 @@ class InfoExtractor(object): series, programme or podcast: series: Title of the series or programme the video episode belongs to. + series_id: Id of the series or programme the video episode belongs to, as a unicode string. season: Title of the season the video episode belongs to. season_number: Number of the season the video episode belongs to, as an integer. season_id: Id of the season the video episode belongs to, as a unicode string. 
@@ -441,11 +444,11 @@ class InfoExtractor(object): _WORKING = True _LOGIN_HINTS = { - 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials', + 'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials', 'cookies': ( 'Use --cookies-from-browser or --cookies for the authentication. ' 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'), - 'password': 'Use --username and --password or --netrc to provide account credentials', + 'password': 'Use --username and --password, or --netrc to provide account credentials', } def __init__(self, downloader=None): @@ -1449,6 +1452,9 @@ class InfoExtractor(object): item_type = e.get('@type') if expected_type is not None and expected_type != item_type: continue + rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) + if rating is not None: + info['average_rating'] = rating if item_type in ('TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ @@ -1495,6 +1501,13 @@ class InfoExtractor(object): break return dict((k, v) for k, v in info.items() if v is not None) + def _search_nextjs_data(self, webpage, video_id, **kw): + return self._parse_json( + self._search_regex( + r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', + webpage, 'next.js data', **kw), + video_id, **kw) + @staticmethod def _hidden_inputs(html): html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) @@ -1531,7 +1544,7 @@ class InfoExtractor(object): 'vcodec': {'type': 'ordered', 'regex': True, 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, 'acodec': {'type': 'ordered', 'regex': True, - 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']}, + 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, 'hdr': 
{'type': 'ordered', 'regex': True, 'field': 'dynamic_range', 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', @@ -1911,7 +1924,7 @@ class InfoExtractor(object): tbr = int_or_none(media_el.attrib.get('bitrate')) width = int_or_none(media_el.attrib.get('width')) height = int_or_none(media_el.attrib.get('height')) - format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) + format_id = join_nonempty(f4m_id, tbr or i) # If <bootstrapInfo> is present, the specified f4m is a # stream-level manifest, and only set-level manifests may refer to # external resources. See section 11.4 and section 4 of F4M spec @@ -1973,7 +1986,7 @@ class InfoExtractor(object): def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None): return { - 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), + 'format_id': join_nonempty(m3u8_id, 'meta'), 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', @@ -2026,10 +2039,10 @@ class InfoExtractor(object): video_id=None): formats, subtitles = [], {} - if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return formats, subtitles - - has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc) + has_drm = re.search('|'.join([ + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + ]), m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) @@ -2068,7 +2081,7 @@ class InfoExtractor(object): if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is formats = [{ - 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))), + 'format_id': join_nonempty(m3u8_id, idx), 'format_index': idx, 'url': m3u8_url, 'ext': ext, @@ -2117,7 +2130,7 @@ class InfoExtractor(object): if media_url: manifest_url = format_url(media_url) formats.extend({ - 'format_id': 
'-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))), + 'format_id': join_nonempty(m3u8_id, group_id, name, idx), 'format_note': name, 'format_index': idx, 'url': manifest_url, @@ -2174,9 +2187,9 @@ class InfoExtractor(object): # format_id intact. if not live: stream_name = build_stream_name() - format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats)) + format_id[1] = stream_name or '%d' % (tbr or len(formats)) f = { - 'format_id': '-'.join(map(str, filter(None, format_id))), + 'format_id': join_nonempty(*format_id), 'format_index': idx, 'url': manifest_url, 'manifest_url': m3u8_url, @@ -2640,7 +2653,7 @@ class InfoExtractor(object): mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats, subtitles = [], {} - stream_numbers = {'audio': 0, 'video': 0} + stream_numbers = collections.defaultdict(int) for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2706,10 +2719,8 @@ class InfoExtractor(object): 'format_note': 'DASH %s' % content_type, 'filesize': filesize, 'container': mimetype2ext(mime_type) + '_dash', - 'manifest_stream_number': stream_numbers[content_type] } f.update(parse_codecs(codecs)) - stream_numbers[content_type] += 1 elif content_type == 'text': f = { 'ext': mimetype2ext(mime_type), @@ -2876,7 +2887,9 @@ class InfoExtractor(object): else: # Assuming direct URL to unfragmented media. 
f['url'] = base_url - if content_type in ('video', 'audio') or mime_type == 'image/jpeg': + if content_type in ('video', 'audio', 'image/jpeg'): + f['manifest_stream_number'] = stream_numbers[f['url']] + stream_numbers[f['url']] += 1 formats.append(f) elif content_type == 'text': subtitles.setdefault(lang or 'und', []).append(f) @@ -2965,13 +2978,6 @@ class InfoExtractor(object): }) fragment_ctx['time'] += fragment_ctx['duration'] - format_id = [] - if ism_id: - format_id.append(ism_id) - if stream_name: - format_id.append(stream_name) - format_id.append(compat_str(tbr)) - if stream_type == 'text': subtitles.setdefault(stream_language, []).append({ 'ext': 'ismt', @@ -2990,7 +2996,7 @@ class InfoExtractor(object): }) elif stream_type in ('video', 'audio'): formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(ism_id, stream_name, tbr), 'url': ism_url, 'manifest_url': ism_url, 'ext': 'ismv' if stream_type == 'video' else 'isma', diff --git a/yt_dlp/extractor/corus.py b/yt_dlp/extractor/corus.py index 352951e20..119461375 100644 --- a/yt_dlp/extractor/corus.py +++ b/yt_dlp/extractor/corus.py @@ -55,7 +55,6 @@ class CorusIE(ThePlatformFeedIE): 'timestamp': 1486392197, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to parse JSON'], diff --git a/yt_dlp/extractor/cozytv.py b/yt_dlp/extractor/cozytv.py new file mode 100644 index 000000000..868d8d27d --- /dev/null +++ b/yt_dlp/extractor/cozytv.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class CozyTVIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?cozy\.tv/(?P<uploader>[^/]+)/replays/(?P<id>[^/$#&?]+)' + + _TESTS = [{ + 'url': 'https://cozy.tv/beardson/replays/2021-11-19_1', + 'info_dict': { + 'id': 'beardson-2021-11-19_1', + 'ext': 'mp4', + 'title': 'pokemon pt2', + 'uploader': 'beardson', + 'upload_date': '20211119', + 
'was_live': True, + 'duration': 7981, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + uploader, date = self._match_valid_url(url).groups() + id = f'{uploader}-{date}' + data_json = self._download_json(f'https://api.cozy.tv/cache/{uploader}/replay/{date}', id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://cozycdn.foxtrotstream.xyz/replays/{uploader}/{date}/index.m3u8', id, ext='mp4') + return { + 'id': id, + 'title': data_json.get('title'), + 'uploader': data_json.get('user') or uploader, + 'upload_date': unified_strdate(data_json.get('date')), + 'was_live': True, + 'duration': data_json.get('duration'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 511ac1b2c..cd35728e5 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -27,6 +27,7 @@ from ..utils import ( int_or_none, lowercase_escape, merge_dicts, + qualities, remove_end, sanitized_Request, try_get, @@ -478,19 +479,24 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], webpage, 'video_uploader', default=False) + requested_languages = self._configuration_arg('language') + requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')] + language_preference = qualities((requested_languages or [language or ''])[::-1]) + hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1]) + formats = [] for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') - hardsub_lang = stream.get('hardsub_lang') + audio_lang = stream.get('audio_lang') or '' + hardsub_lang = stream.get('hardsub_lang') or '' + if (requested_languages and audio_lang.lower() not in requested_languages + or requested_hardsubs and 
hardsub_lang.lower() not in requested_hardsubs): + continue vrv_formats = self._extract_vrv_formats( stream.get('url'), video_id, stream.get('format'), audio_lang, hardsub_lang) for f in vrv_formats: - f['language_preference'] = 1 if audio_lang == language else 0 - f['quality'] = ( - 1 if not hardsub_lang - else 0 if hardsub_lang == language - else -1) + f['language_preference'] = language_preference(audio_lang) + f['quality'] = hardsub_preference(hardsub_lang) formats.extend(vrv_formats) if not formats: available_fmts = [] diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py index 2e01aff48..c717aec3a 100644 --- a/yt_dlp/extractor/cspan.py +++ b/yt_dlp/extractor/cspan.py @@ -18,7 +18,7 @@ from ..utils import ( str_to_int, unescapeHTML, ) -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE from .ustream import UstreamIE diff --git a/yt_dlp/extractor/curiositystream.py b/yt_dlp/extractor/curiositystream.py index 034a5c92a..485b6031f 100644 --- a/yt_dlp/extractor/curiositystream.py +++ b/yt_dlp/extractor/curiositystream.py @@ -15,7 +15,6 @@ from ..utils import ( class CuriosityStreamBaseIE(InfoExtractor): _NETRC_MACHINE = 'curiositystream' _auth_token = None - _API_BASE_URL = 'https://api.curiositystream.com/v1/' def _handle_errors(self, result): error = result.get('error', {}).get('message') @@ -39,38 +38,44 @@ class CuriosityStreamBaseIE(InfoExtractor): if email is None: return result = self._download_json( - self._API_BASE_URL + 'login', None, data=urlencode_postdata({ + 'https://api.curiositystream.com/v1/login', None, + note='Logging in', data=urlencode_postdata({ 'email': email, 'password': password, })) self._handle_errors(result) - self._auth_token = result['message']['auth_token'] + CuriosityStreamBaseIE._auth_token = result['message']['auth_token'] class CuriosityStreamIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream' _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 
'url': 'https://app.curiositystream.com/video/2', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + 'channel': 'Curiosity Stream', + 'categories': ['Technology', 'Interview'], + 'average_rating': 96.79, + 'series_id': '2', }, 'params': { - 'format': 'bestvideo', # m3u8 download 'skip_download': True, }, - } + }] + + _API_BASE_URL = 'https://api.curiositystream.com/v1/media/' def _real_extract(self, url): video_id = self._match_id(url) formats = [] for encoding_format in ('m3u8', 'mpd'): - media = self._call_api('media/' + video_id, video_id, query={ + media = self._call_api(video_id, video_id, query={ 'encodingsNew': 'true', 'encodingsFormat': encoding_format, }) @@ -140,12 +145,33 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'duration': int_or_none(media.get('duration')), 'tags': media.get('tags'), 'subtitles': subtitles, + 'channel': media.get('producer'), + 'categories': [media.get('primary_category'), media.get('type')], + 'average_rating': media.get('rating_percentage'), + 'series_id': str(media.get('collection_id') or '') or None, } -class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): - IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P<id>\d+)' +class CuriosityStreamCollectionBaseIE(CuriosityStreamBaseIE): + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._call_api(collection_id, collection_id) + entries = [] + for media in collection.get('media', []): + media_id = compat_str(media.get('id')) + media_type, ie = ('series', CuriosityStreamSeriesIE) if media.get('is_collection') else ('video', CuriosityStreamIE) + entries.append(self.url_result( + 'https://curiositystream.com/%s/%s' % (media_type, media_id), + ie=ie.ie_key(), video_id=media_id)) + return self.playlist_result( + 
entries, collection_id, + collection.get('title'), collection.get('description')) + + +class CuriosityStreamCollectionsIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:collections' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/collections/(?P<id>\d+)' _API_BASE_URL = 'https://api.curiositystream.com/v2/collections/' _TESTS = [{ 'url': 'https://curiositystream.com/collections/86', @@ -156,7 +182,17 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 7, }, { - 'url': 'https://app.curiositystream.com/collection/2', + 'url': 'https://curiositystream.com/collections/36', + 'only_matching': True, + }] + + +class CuriosityStreamSeriesIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:series' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:series|collection)/(?P<id>\d+)' + _API_BASE_URL = 'https://api.curiositystream.com/v2/series/' + _TESTS = [{ + 'url': 'https://curiositystream.com/series/2', 'info_dict': { 'id': '2', 'title': 'Curious Minds: The Internet', @@ -164,23 +200,6 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 16, }, { - 'url': 'https://curiositystream.com/series/2', - 'only_matching': True, - }, { - 'url': 'https://curiositystream.com/collections/36', + 'url': 'https://curiositystream.com/collection/2', 'only_matching': True, }] - - def _real_extract(self, url): - collection_id = self._match_id(url) - collection = self._call_api(collection_id, collection_id) - entries = [] - for media in collection.get('media', []): - media_id = compat_str(media.get('id')) - media_type, ie = ('series', CuriosityStreamCollectionIE) if media.get('is_collection') else ('video', CuriosityStreamIE) - entries.append(self.url_result( - 'https://curiositystream.com/%s/%s' % (media_type, media_id), - ie=ie.ie_key(), video_id=media_id)) - return self.playlist_result( - entries, collection_id, - collection.get('title'), collection.get('description')) diff --git 
a/yt_dlp/extractor/discoverynetworks.py b/yt_dlp/extractor/discoverynetworks.py index f43c87160..4f8bdf0b9 100644 --- a/yt_dlp/extractor/discoverynetworks.py +++ b/yt_dlp/extractor/discoverynetworks.py @@ -19,7 +19,6 @@ class DiscoveryNetworksDeIE(DPlayIE): 'upload_date': '20190331', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { diff --git a/yt_dlp/extractor/discoveryplusindia.py b/yt_dlp/extractor/discoveryplusindia.py index 51801402c..8ec418a97 100644 --- a/yt_dlp/extractor/discoveryplusindia.py +++ b/yt_dlp/extractor/discoveryplusindia.py @@ -28,7 +28,6 @@ class DiscoveryPlusIndiaIE(DPlayIE): 'creator': 'Discovery Channel', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'skip': 'Cookies (not necessarily logged in) are needed' diff --git a/yt_dlp/extractor/disney.py b/yt_dlp/extractor/disney.py index f018cbe9d..0ad7b1f46 100644 --- a/yt_dlp/extractor/disney.py +++ b/yt_dlp/extractor/disney.py @@ -7,8 +7,8 @@ from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, - compat_str, determine_ext, + join_nonempty, update_url_query, ) @@ -119,18 +119,13 @@ class DisneyIE(InfoExtractor): continue formats.append(f) continue - format_id = [] - if flavor_format: - format_id.append(flavor_format) - if tbr: - format_id.append(compat_str(tbr)) ext = determine_ext(flavor_url) if flavor_format == 'applehttp' or ext == 'm3u8': ext = 'mp4' width = int_or_none(flavor.get('width')) height = int_or_none(flavor.get('height')) formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(flavor_format, tbr), 'url': flavor_url, 'width': width, 'height': height, diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index d62480810..525c8e243 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -46,7 +46,6 @@ class DPlayIE(InfoExtractor): 'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -67,7 +66,6 @@ class 
DPlayIE(InfoExtractor): 'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -87,7 +85,6 @@ class DPlayIE(InfoExtractor): 'episode_number': 7, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'skip': 'Available for Premium users', @@ -313,9 +310,6 @@ class HGTVDeIE(DPlayIE): 'season_number': 3, 'episode_number': 3, }, - 'params': { - 'format': 'bestvideo', - }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/dvtv.py b/yt_dlp/extractor/dvtv.py index de7f6d670..08663cffb 100644 --- a/yt_dlp/extractor/dvtv.py +++ b/yt_dlp/extractor/dvtv.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + join_nonempty, js_to_json, mimetype2ext, try_get, @@ -139,13 +140,9 @@ class DVTVIE(InfoExtractor): label = video.get('label') height = self._search_regex( r'^(\d+)[pP]', label or '', 'height', default=None) - format_id = ['http'] - for f in (ext, label): - if f: - format_id.append(f) formats.append({ 'url': video_url, - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty('http', ext, label), 'height': int_or_none(height), }) self._sort_formats(formats) diff --git a/yt_dlp/extractor/egghead.py b/yt_dlp/extractor/egghead.py index f6b50e7c2..b6b86768c 100644 --- a/yt_dlp/extractor/egghead.py +++ b/yt_dlp/extractor/egghead.py @@ -86,7 +86,6 @@ class EggheadLessonIE(EggheadBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, }, { 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py index d4a66c29f..dc50f3b8b 100644 --- a/yt_dlp/extractor/espn.py +++ b/yt_dlp/extractor/espn.py @@ -7,7 +7,9 @@ from .once import OnceIE from ..compat import compat_str from ..utils import ( determine_ext, + dict_get, int_or_none, + unified_strdate, unified_timestamp, ) @@ -236,3 +238,44 @@ class FiveThirtyEightIE(InfoExtractor): webpage, 'embed url') return 
self.url_result(embed_url, 'AbcNewsVideo') + + +class ESPNCricInfoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/video/[^#$&?/]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135', + 'info_dict': { + 'id': '1289135', + 'ext': 'mp4', + 'title': 'Finch: Chasing comes with \'risks\' despite World Cup trend', + 'description': 'md5:ea32373303e25efbb146efdfc8a37829', + 'upload_date': '20211113', + 'duration': 96, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + data_json = self._download_json(f'https://hs-consumer-api.espncricinfo.com/v1/pages/video/video-details?videoId={id}', id)['video'] + formats, subtitles = [], {} + for item in data_json.get('playbacks') or []: + if item.get('type') == 'HLS' and item.get('url'): + m3u8_frmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(item['url'], id) + formats.extend(m3u8_frmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + elif item.get('type') == 'AUDIO' and item.get('url'): + formats.append({ + 'url': item['url'], + 'vcodec': 'none', + }) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title'), + 'description': data_json.get('summary'), + 'upload_date': unified_strdate(dict_get(data_json, ('publishedAt', 'recordedAt'))), + 'duration': data_json.get('duration'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 9d963ee46..a4baad2da 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -50,6 +50,7 @@ from .animelab import ( AnimeLabIE, AnimeLabShowsIE, ) +from .amazon import AmazonStoreIE from .americastestkitchen import ( AmericasTestKitchenIE, AmericasTestKitchenSeasonIE, @@ -165,6 +166,7 @@ from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, ) +from .blogger import BloggerIE 
from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE @@ -177,6 +179,7 @@ from .br import ( ) from .bravotv import BravoTVIE from .breakcom import BreakIE +from .breitbart import BreitBartIE from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, @@ -192,6 +195,7 @@ from .camdemy import ( ) from .cammodels import CamModelsIE from .camwithher import CamWithHerIE +from .canalalpha import CanalAlphaIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import ( @@ -235,10 +239,7 @@ from .ccc import ( from .ccma import CCMAIE from .cctv import CCTVIE from .cda import CDAIE -from .ceskatelevize import ( - CeskaTelevizeIE, - CeskaTelevizePoradyIE, -) +from .ceskatelevize import CeskaTelevizeIE from .cgtn import CGTNIE from .channel9 import Channel9IE from .charlierose import CharlieRoseIE @@ -293,6 +294,7 @@ from .commonprotocols import ( from .condenast import CondeNastIE from .contv import CONtvIE from .corus import CorusIE +from .cozytv import CozyTVIE from .cracked import CrackedIE from .crackle import CrackleIE from .crooksandliars import CrooksAndLiarsIE @@ -309,7 +311,8 @@ from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .curiositystream import ( CuriosityStreamIE, - CuriosityStreamCollectionIE, + CuriosityStreamCollectionsIE, + CuriosityStreamSeriesIE, ) from .cwtv import CWTVIE from .dailymail import DailyMailIE @@ -419,6 +422,7 @@ from .espn import ( ESPNIE, ESPNArticleIE, FiveThirtyEightIE, + ESPNCricInfoIE, ) from .esri import EsriVideoIE from .europa import EuropaIE @@ -495,7 +499,10 @@ from .funimation import ( ) from .funk import FunkIE from .fusion import FusionIE -from .gab import GabTVIE +from .gab import ( + GabTVIE, + GabIE, +) from .gaia import GaiaIE from .gameinformer import GameInformerIE from .gamespot import GameSpotIE @@ -591,12 +598,16 @@ from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE from .instagram import ( 
InstagramIE, + InstagramIOSIE, InstagramUserIE, InstagramTagIE, ) from .internazionale import InternazionaleIE from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import IPrimaIE +from .iprima import ( + IPrimaIE, + IPrimaCNNIE +) from .iqiyi import IqiyiIE from .ir90tv import Ir90TvIE from .itv import ( @@ -696,6 +707,7 @@ from .line import ( LineLiveChannelIE, ) from .linkedin import ( + LinkedInIE, LinkedInLearningIE, LinkedInLearningCourseIE, ) @@ -787,6 +799,7 @@ from .mirrativ import ( ) from .mit import TechTVMITIE, OCWMITIE from .mitele import MiTeleIE +from .mixch import MixchIE from .mixcloud import ( MixcloudIE, MixcloudUserIE, @@ -839,7 +852,10 @@ from .myvi import ( ) from .myvideoge import MyVideoGeIE from .myvidster import MyVidsterIE -from .n1 import N1InfoIIE, N1InfoAssetIE +from .n1 import ( + N1InfoAssetIE, + N1InfoIIE, +) from .nationalgeographic import ( NationalGeographicVideoIE, NationalGeographicTVIE, @@ -873,7 +889,10 @@ from .ndr import ( NJoyEmbedIE, ) from .ndtv import NDTVIE -from .nebula import NebulaIE +from .nebula import ( + NebulaIE, + NebulaCollectionIE, +) from .nerdcubed import NerdCubedFeedIE from .netzkino import NetzkinoIE from .neteasemusic import ( @@ -927,7 +946,10 @@ from .niconico import ( NicovideoSearchIE, NicovideoSearchURLIE, ) -from .ninecninemedia import NineCNineMediaIE +from .ninecninemedia import ( + NineCNineMediaIE, + CPTwentyFourIE, +) from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE @@ -991,6 +1013,7 @@ from .oktoberfesttv import OktoberfestTVIE from .olympics import OlympicsReplayIE from .on24 import On24IE from .ondemandkorea import OnDemandKoreaIE +from .onefootball import OneFootballIE from .onet import ( OnetIE, OnetChannelIE, @@ -1051,6 +1074,7 @@ from .peertube import ( PeerTubeIE, PeerTubePlaylistIE, ) +from .peertv import PeerTVIE from .peloton import ( PelotonIE, PelotonLiveIE @@ -1075,6 +1099,7 @@ from .pinterest import ( 
PinterestCollectionIE, ) from .pladform import PladformIE +from .planetmarathi import PlanetMarathiIE from .platzi import ( PlatziIE, PlatziCourseIE, @@ -1096,9 +1121,14 @@ from .pokemon import ( PokemonIE, PokemonWatchIE, ) +from .polsatgo import PolsatGoIE from .polskieradio import ( PolskieRadioIE, PolskieRadioCategoryIE, + PolskieRadioPlayerIE, + PolskieRadioPodcastIE, + PolskieRadioPodcastListIE, + PolskieRadioRadioKierowcowIE, ) from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE @@ -1145,6 +1175,11 @@ from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE +from .radiozet import RadioZetPodcastIE +from .radiokapital import ( + RadioKapitalIE, + RadioKapitalShowIE, +) from .radlive import ( RadLiveIE, RadLiveChannelIE, @@ -1155,6 +1190,8 @@ from .rai import ( RaiPlayLiveIE, RaiPlayPlaylistIE, RaiIE, + RaiPlayRadioIE, + RaiPlayRadioPlaylistIE, ) from .raywenderlich import ( RayWenderlichIE, @@ -1178,10 +1215,8 @@ from .redbulltv import ( RedBullTVRrnContentIE, RedBullIE, ) -from .reddit import ( - RedditIE, - RedditRIE, -) +from .reddit import RedditIE +from .redgifs import RedGifsIE from .redtube import RedTubeIE from .regiotv import RegioTVIE from .rentv import ( @@ -1195,7 +1230,7 @@ from .rice import RICEIE from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE -from .roosterteeth import RoosterTeethIE +from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rozhlas import RozhlasIE @@ -1208,6 +1243,7 @@ from .rtl2 import ( RTL2YouSeriesIE, ) from .rtp import RTPIE +from .rtrfm import RTRFMIE from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE from .rtvnh import RTVNHIE @@ -1249,7 +1285,7 @@ from .scte import ( SCTECourseIE, ) from .seeker 
import SeekerIE -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE, SenateGovIE from .sendtonews import SendtoNewsIE from .servus import ServusIE from .sevenplus import SevenPlusIE @@ -1376,8 +1412,10 @@ from .streamable import StreamableIE from .streamanity import StreamanityIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .streamff import StreamFFIE from .streetvoice import StreetVoiceIE from .stretchinternet import StretchInternetIE +from .stripchat import StripchatIE from .stv import STVPlayerIE from .sunporno import SunPornoIE from .sverigesradio import ( @@ -1551,6 +1589,7 @@ from .tvnow import ( from .tvp import ( TVPEmbedIE, TVPIE, + TVPStreamIE, TVPWebsiteIE, ) from .tvplay import ( @@ -1761,6 +1800,7 @@ from .weibo import ( WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .willow import WillowIE from .wimtv import WimTVIE from .whowatch import WhoWatchIE from .wistia import ( @@ -1768,6 +1808,10 @@ from .wistia import ( WistiaPlaylistIE, ) from .worldstarhiphop import WorldStarHipHopIE +from .wppilot import ( + WPPilotIE, + WPPilotChannelsIE, +) from .wsj import ( WSJIE, WSJArticleIE, diff --git a/yt_dlp/extractor/fancode.py b/yt_dlp/extractor/fancode.py index 912feb702..f6733b124 100644 --- a/yt_dlp/extractor/fancode.py +++ b/yt_dlp/extractor/fancode.py @@ -21,7 +21,6 @@ class FancodeVodIE(InfoExtractor): 'url': 'https://fancode.com/video/15043/match-preview-pbks-vs-mi', 'params': { 'skip_download': True, - 'format': 'bestvideo' }, 'info_dict': { 'id': '6249806281001', diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index 3bbab69e6..bc5ef4df9 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -185,7 +185,7 @@ class FranceTVIE(InfoExtractor): 'vcodec': 'none', 'ext': 'mhtml', 'protocol': 'mhtml', - 'url': 'about:dummy', + 'url': 'about:invalid', 'fragments': [{ 'path': sheet, # XXX: not entirely accurate; each spritesheet seems to be diff 
--git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py index 382cbe159..96dad2ca3 100644 --- a/yt_dlp/extractor/funimation.py +++ b/yt_dlp/extractor/funimation.py @@ -10,6 +10,7 @@ from ..compat import compat_HTTPError from ..utils import ( determine_ext, int_or_none, + join_nonempty, js_to_json, orderedSet, qualities, @@ -275,7 +276,7 @@ class FunimationIE(FunimationBaseIE): def _get_subtitles(self, subtitles, experience_id, episode, display_id, format_name): if isinstance(episode, str): webpage = self._download_webpage( - f'https://www.funimation.com/player/{experience_id}', display_id, + f'https://www.funimation.com/player/{experience_id}/', display_id, fatal=False, note=f'Downloading player webpage for {format_name}') episode, _, _ = self._get_episode(webpage, episode_id=episode, fatal=False) @@ -288,10 +289,11 @@ class FunimationIE(FunimationBaseIE): sub_type = sub_type if sub_type != 'FULL' else None current_sub = { 'url': text_track['src'], - 'name': ' '.join(filter(None, (version, text_track.get('label'), sub_type))) + 'name': join_nonempty(version, text_track.get('label'), sub_type, delim=' ') } - lang = '_'.join(filter(None, ( - text_track.get('language', 'und'), version if version != 'Simulcast' else None, sub_type))) + lang = join_nonempty(text_track.get('language', 'und'), + version if version != 'Simulcast' else None, + sub_type, delim='_') if current_sub not in subtitles.get(lang, []): subtitles.setdefault(lang, []).append(current_sub) return subtitles diff --git a/yt_dlp/extractor/gab.py b/yt_dlp/extractor/gab.py index 25b5cb066..bde6e8624 100644 --- a/yt_dlp/extractor/gab.py +++ b/yt_dlp/extractor/gab.py @@ -6,7 +6,11 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, + parse_codecs, + parse_duration, str_to_int, + unified_timestamp ) @@ -32,8 +36,10 @@ class GabTVIE(InfoExtractor): channel_name = self._search_regex(r'data-channel-name=\"(?P<channel_id>[^\"]+)', webpage, 'channel_name') 
title = self._search_regex(r'data-episode-title=\"(?P<channel_id>[^\"]+)', webpage, 'title') view_key = self._search_regex(r'data-view-key=\"(?P<channel_id>[^\"]+)', webpage, 'view_key') - description = clean_html(self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None - available_resolutions = re.findall(r'<a\ data-episode-id=\"%s\"\ data-resolution=\"(?P<resolution>[^\"]+)' % id, webpage) + description = clean_html( + self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None + available_resolutions = re.findall(r'<a\ data-episode-id=\"%s\"\ data-resolution=\"(?P<resolution>[^\"]+)' % id, + webpage) formats = [] for resolution in available_resolutions: @@ -62,3 +68,80 @@ class GabTVIE(InfoExtractor): 'uploader_id': channel_id, 'thumbnail': f'https://tv.gab.com/image/{id}', } + + +class GabIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gab\.com/[^/]+/posts/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://gab.com/SomeBitchIKnow/posts/107163961867310434', + 'md5': '8ca34fb00f1e1033b5c5988d79ec531d', + 'info_dict': { + 'id': '107163961867310434-0', + 'ext': 'mp4', + 'title': 'L on Gab', + 'uploader_id': '946600', + 'uploader': 'SomeBitchIKnow', + 'description': 'md5:204055fafd5e1a519f5d6db953567ca3', + 'timestamp': 1635192289, + 'upload_date': '20211025', + } + }, { + 'url': 'https://gab.com/TheLonelyProud/posts/107045884469287653', + 'md5': 'f9cefcfdff6418e392611a828d47839d', + 'info_dict': { + 'id': '107045884469287653-0', + 'ext': 'mp4', + 'title': 'Jody Sadowski on Gab', + 'uploader_id': '1390705', + 'timestamp': 1633390571, + 'upload_date': '20211004', + 'uploader': 'TheLonelyProud', + } + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + json_data = self._download_json(f'https://gab.com/api/v1/statuses/{post_id}', post_id) + + entries = [] + for idx, media in enumerate(json_data['media_attachments']): + if media.get('type') not in 
('video', 'gifv'): + continue + metadata = media['meta'] + format_metadata = { + 'acodec': parse_codecs(metadata.get('audio_encode')).get('acodec'), + 'asr': int_or_none((metadata.get('audio_bitrate') or '').split(' ')[0]), + 'fps': metadata.get('fps'), + } + + formats = [{ + 'url': url, + 'width': f.get('width'), + 'height': f.get('height'), + 'tbr': int_or_none(f.get('bitrate'), scale=1000), + **format_metadata, + } for url, f in ((media.get('url'), metadata.get('original') or {}), + (media.get('source_mp4'), metadata.get('playable') or {})) if url] + + self._sort_formats(formats) + + author = json_data.get('account') or {} + entries.append({ + 'id': f'{post_id}-{idx}', + 'title': f'{json_data["account"]["display_name"]} on Gab', + 'timestamp': unified_timestamp(json_data.get('created_at')), + 'formats': formats, + 'description': clean_html(json_data.get('content')), + 'duration': metadata.get('duration') or parse_duration(metadata.get('length')), + 'like_count': json_data.get('favourites_count'), + 'comment_count': json_data.get('replies_count'), + 'repost_count': json_data.get('reblogs_count'), + 'uploader': author.get('username'), + 'uploader_id': author.get('id'), + 'uploader_url': author.get('url'), + }) + + if len(entries) > 1: + return self.playlist_result(entries, post_id) + + return entries[0] diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 0d279016b..51557f0f1 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -56,7 +56,7 @@ from .sportbox import SportBoxIE from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE @@ -135,6 +135,8 @@ from .arcpublishing import ArcPublishingIE from .medialaan import MedialaanIE from .simplecast import SimplecastIE from .wimtv import WimTVIE +from .tvp import TVPEmbedIE 
+from .blogger import BloggerIE class GenericIE(InfoExtractor): @@ -359,9 +361,6 @@ class GenericIE(InfoExtractor): 'formats': 'mincount:9', 'upload_date': '20130904', }, - 'params': { - 'format': 'bestvideo', - }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 { @@ -2175,6 +2174,17 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # blogger embed + 'url': 'https://blog.tomeuvizoso.net/2019/01/a-panfrost-milestone.html', + 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', + 'info_dict': { + 'id': 'BLOGGER-video-3c740e3a49197e16-796', + 'ext': 'mp4', + 'title': 'Blogger', + 'thumbnail': r're:^https?://.*', + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2334,12 +2344,43 @@ class GenericIE(InfoExtractor): 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', } }, + { + # Reddit-hosted video that will redirect and be processed by RedditIE + # Redirects to https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ + 'url': 'https://v.redd.it/zv89llsvexdz', + 'md5': '87f5f02f6c1582654146f830f21f8662', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'timestamp': 1501941939.0, + 'title': 'That small heart attack.', + 'upload_date': '20170805', + 'uploader': 'Antw87' + } + }, + { + # 1080p Reddit-hosted video that will redirect and be processed by RedditIE + 'url': 'https://v.redd.it/33hgok7dfbz71/', + 'md5': '7a1d587940242c9bb3bd6eb320b39258', + 'info_dict': { + 'id': '33hgok7dfbz71', + 'ext': 'mp4', + 'title': "The game Didn't want me to Knife that Guy I guess", + 'uploader': 'paraf1ve', + 'timestamp': 1636788683.0, + 'upload_date': '20211113' + } + } + # ] def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) + def report_detected(self, name): + self._downloader.write_debug(f'Identified a {name}') + def _extract_rss(self, url, 
video_id, doc): playlist_title = doc.find('./channel/title').text playlist_desc_el = doc.find('./channel/description') @@ -2555,10 +2596,13 @@ class GenericIE(InfoExtractor): content_type = head_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: + self.report_detected('direct video link') format_id = compat_str(m.group('format_id')) subtitles = {} if format_id.endswith('mpegurl'): formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') + elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): + formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id) elif format_id == 'f4m': formats = self._extract_f4m_formats(url, video_id) else: @@ -2595,6 +2639,7 @@ class GenericIE(InfoExtractor): # Is it an M3U playlist? if first_bytes.startswith(b'#EXTM3U'): + self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') self._sort_formats(info_dict['formats']) return info_dict @@ -2625,16 +2670,20 @@ class GenericIE(InfoExtractor): except compat_xml_parse_error: doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': + self.report_detected('RSS feed') return self._extract_rss(url, video_id, doc) elif doc.tag == 'SmoothStreamingMedia': info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) + self.report_detected('ISM manifest') self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): smil = self._parse_smil(doc, url, video_id) + self.report_detected('SMIL file') self._sort_formats(smil['formats']) return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': + self.report_detected('XSPF playlist') return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, @@ -2645,10 +2694,12 @@ 
class GenericIE(InfoExtractor): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) + self.report_detected('DASH manifest') self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) + self.report_detected('F4M manifest') self._sort_formats(info_dict['formats']) return info_dict except compat_xml_parse_error: @@ -2657,6 +2708,7 @@ class GenericIE(InfoExtractor): # Is it a Camtasia project? camtasia_res = self._extract_camtasia(url, video_id, webpage) if camtasia_res is not None: + self.report_detected('Camtasia video') return camtasia_res # Sometimes embedded video player is hidden behind percent encoding @@ -2707,6 +2759,8 @@ class GenericIE(InfoExtractor): 'age_limit': age_limit, }) + self._downloader.write_debug('Looking for video embeds') + # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: @@ -3204,6 +3258,11 @@ class GenericIE(InfoExtractor): if onionstudios_url: return self.url_result(onionstudios_url) + # Look for Blogger embeds + blogger_urls = BloggerIE._extract_urls(webpage) + if blogger_urls: + return self.playlist_from_matches(blogger_urls, video_id, video_title, ie=BloggerIE.ie_key()) + # Look for ViewLift embeds viewlift_url = ViewLiftEmbedIE._extract_url(webpage) if viewlift_url: @@ -3497,9 +3556,14 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) + tvp_urls = TVPEmbedIE._extract_urls(webpage) + if tvp_urls: + return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: + self.report_detected('HTML5 media') if len(entries) == 1: entries[0].update({ 'id': video_id, @@ -3519,6 +3583,7 @@ class 
GenericIE(InfoExtractor): webpage, video_id, transform_source=js_to_json) if jwplayer_data: if isinstance(jwplayer_data.get('playlist'), str): + self.report_detected('JW Player playlist') return { **info_dict, '_type': 'url', @@ -3528,6 +3593,7 @@ class GenericIE(InfoExtractor): try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) + self.report_detected('JW Player data') return merge_dicts(info, info_dict) except ExtractorError: # See https://github.com/ytdl-org/youtube-dl/pull/16735 @@ -3577,6 +3643,7 @@ class GenericIE(InfoExtractor): }, }) if formats or subtitles: + self.report_detected('video.js embed') self._sort_formats(formats) info_dict['formats'] = formats info_dict['subtitles'] = subtitles @@ -3585,6 +3652,7 @@ class GenericIE(InfoExtractor): # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url'): + self.report_detected('JSON LD') return merge_dicts(json_ld, info_dict) def check_video(vurl): @@ -3601,7 +3669,9 @@ class GenericIE(InfoExtractor): # Start with something easy: JW Player in SWFObject found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) - if not found: + if found: + self.report_detected('JW Player in SFWObject') + else: # Look for gorilla-vid style embedding found = filter_video(re.findall(r'''(?sx) (?: @@ -3611,10 +3681,13 @@ class GenericIE(InfoExtractor): ) .*? ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) + if found: + self.report_detected('JW Player embed') if not found: # Look for generic KVS player found = re.search(r'<script [^>]*?src="https://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage) if found: + self.report_detected('KWS Player') if found.group('maj_ver') not in ['4', '5']: self.report_warning('Untested major version (%s) in player engine--Download may fail.' 
% found.group('ver')) flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage) @@ -3660,10 +3733,14 @@ class GenericIE(InfoExtractor): if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) + if found: + self.report_detected('video file') if not found: # Broaden the findall a little bit: JWPlayer JS loader found = filter_video(re.findall( r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) + if found: + self.report_detected('JW Player JS loader') if not found: # Flow player found = filter_video(re.findall(r'''(?xs) @@ -3672,10 +3749,14 @@ class GenericIE(InfoExtractor): \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s* ["']?url["']?\s*:\s*["']([^"']+)["'] ''', webpage)) + if found: + self.report_detected('Flow Player') if not found: # Cinerama player found = re.findall( r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) + if found: + self.report_detected('Cinerama player') if not found: # Try to find twitter cards info # twitter:player:stream should be checked before twitter:player since @@ -3683,6 +3764,8 @@ class GenericIE(InfoExtractor): # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) found = filter_video(re.findall( r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) + if found: + self.report_detected('Twitter card') if not found: # We look for Open Graph info: # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) @@ -3690,6 +3773,8 @@ class GenericIE(InfoExtractor): # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage)) + if found: + self.report_detected('Open Graph video info') if not found: REDIRECT_REGEX = 
r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' found = re.search( @@ -3721,6 +3806,7 @@ class GenericIE(InfoExtractor): # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) embed_url = self._html_search_meta('twitter:player', webpage, default=None) if embed_url and embed_url != url: + self.report_detected('twitter:player iframe') return self.url_result(embed_url) if not found: diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index 12e6c53d4..0bdf772a1 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -230,6 +230,11 @@ class HotStarIE(HotStarBaseIE): if tags and 'encryption:plain' not in tags: for f in current_formats: f['has_drm'] = True + if tags and 'language' in tags: + lang = re.search(r'language:(?P<lang>[a-z]+)', tags).group('lang') + for f in current_formats: + if not f.get('langauge'): + f['language'] = lang formats.extend(current_formats) subs = self._merge_subtitles(subs, current_subs) if not formats and geo_restricted: diff --git a/yt_dlp/extractor/imdb.py b/yt_dlp/extractor/imdb.py index a31301985..24f1fde64 100644 --- a/yt_dlp/extractor/imdb.py +++ b/yt_dlp/extractor/imdb.py @@ -111,7 +111,7 @@ class ImdbIE(InfoExtractor): 'formats': formats, 'description': info.get('videoDescription'), 'thumbnail': url_or_none(try_get( - video_metadata, lambda x: x['videoSlate']['source'])), + info, lambda x: x['videoSlate']['source'])), 'duration': parse_duration(info.get('videoRuntime')), } diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index ccfcddd5b..1fcf97a19 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -1,4 +1,4 @@ -from __future__ import unicode_literals +# coding: utf-8 import itertools import hashlib @@ -8,7 +8,6 @@ import time from .common import InfoExtractor from ..compat import ( - compat_str, compat_HTTPError, ) from ..utils import ( @@ -18,16 +17,156 @@ from ..utils import ( int_or_none, lowercase_escape, std_headers, 
- try_get, + traverse_obj, url_or_none, - variadic, urlencode_postdata, ) -class InstagramIE(InfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))' +class InstagramBaseIE(InfoExtractor): _NETRC_MACHINE = 'instagram' + _IS_LOGGED_IN = False + + def _login(self): + username, password = self._get_login_info() + if username is None or self._IS_LOGGED_IN: + return + + login_webpage = self._download_webpage( + 'https://www.instagram.com/accounts/login/', None, + note='Downloading login webpage', errnote='Failed to download login webpage') + + shared_data = self._parse_json( + self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', + login_webpage, 'shared data', default='{}'), + None) + + login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ + 'Accept': '*/*', + 'X-IG-App-ID': '936619743392459', + 'X-ASBD-ID': '198387', + 'X-IG-WWW-Claim': '0', + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRFToken': shared_data['config']['csrf_token'], + 'X-Instagram-AJAX': shared_data['rollout_hash'], + 'Referer': 'https://www.instagram.com/', + }, data=urlencode_postdata({ + 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', + 'username': username, + 'queryParams': '{}', + 'optIntoOneTap': 'false', + 'stopDeletionNonce': '', + 'trustedDeviceRecords': '{}', + })) + + if not login.get('authenticated'): + if login.get('message'): + raise ExtractorError(f'Unable to login: {login["message"]}') + raise ExtractorError('Unable to login') + InstagramBaseIE._IS_LOGGED_IN = True + + def _real_initialize(self): + self._login() + + def _get_count(self, media, kind, *keys): + return traverse_obj( + media, (kind, 'count'), *((f'edge_media_{key}', 'count') for key in keys), + expected_type=int_or_none) + + def _get_dimension(self, name, media, webpage=None): + return ( + traverse_obj(media, ('dimensions', name), expected_type=int_or_none) + or 
int_or_none(self._html_search_meta( + (f'og:video:{name}', f'video:{name}'), webpage or '', default=None))) + + def _extract_nodes(self, nodes, is_direct=False): + for idx, node in enumerate(nodes, start=1): + if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: + continue + + video_id = node.get('shortcode') + + if is_direct: + info = { + 'id': video_id or node['id'], + 'url': node.get('video_url'), + 'width': self._get_dimension('width', node), + 'height': self._get_dimension('height', node), + 'http_headers': { + 'Referer': 'https://www.instagram.com/', + } + } + elif not video_id: + continue + else: + info = { + '_type': 'url', + 'ie_key': 'Instagram', + 'id': video_id, + 'url': f'https://instagram.com/p/{video_id}', + } + + yield { + **info, + 'title': node.get('title') or (f'Video {idx}' if is_direct else None), + 'description': traverse_obj( + node, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str), + 'thumbnail': traverse_obj( + node, 'display_url', 'thumbnail_src', 'display_src', expected_type=url_or_none), + 'duration': float_or_none(node.get('video_duration')), + 'timestamp': int_or_none(node.get('taken_at_timestamp')), + 'view_count': int_or_none(node.get('video_view_count')), + 'comment_count': self._get_count(node, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'), + 'like_count': self._get_count(node, 'likes', 'preview_like'), + } + + +class InstagramIOSIE(InfoExtractor): + IE_DESC = 'IOS instagram:// URL' + _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)' + _TESTS = [{ + 'url': 'instagram://media?id=482584233761418119', + 'md5': '0d2da106a9d2631273e192b372806516', + 'info_dict': { + 'id': 'aye83DjauH', + 'ext': 'mp4', + 'title': 'Video by naomipq', + 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, + 'timestamp': 1371748545, + 'upload_date': '20130620', + 'uploader_id': 'naomipq', + 'uploader': 'B E A U T Y F O R A S 
H E S', + 'like_count': int, + 'comment_count': int, + 'comments': list, + }, + 'add_ie': ['Instagram'] + }] + + def _get_id(self, id): + """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" + chrs = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' + media_id = int(id.split('_')[0]) + shortened_id = '' + while media_id > 0: + r = media_id % 64 + media_id = (media_id - r) // 64 + shortened_id = chrs[r] + shortened_id + return shortened_id + + def _real_extract(self, url): + return { + '_type': 'url_transparent', + 'url': f'http://instagram.com/tv/{self._get_id(self._match_id(url))}/', + 'ie_key': 'Instagram', + } + + +class InstagramIE(InstagramBaseIE): + _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -143,71 +282,23 @@ class InstagramIE(InfoExtractor): if mobj: return mobj.group('link') - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_webpage = self._download_webpage( - 'https://www.instagram.com/accounts/login/', None, - note='Downloading login webpage', errnote='Failed to download login webpage') - - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - login_webpage, 'shared data', default='{}'), - None) - - login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ - 'Accept': '*/*', - 'X-IG-App-ID': '936619743392459', - 'X-ASBD-ID': '198387', - 'X-IG-WWW-Claim': '0', - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRFToken': shared_data['config']['csrf_token'], - 'X-Instagram-AJAX': shared_data['rollout_hash'], - 'Referer': 'https://www.instagram.com/', - }, data=urlencode_postdata({ - 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', - 'username': username, - 
'queryParams': '{}', - 'optIntoOneTap': 'false', - 'stopDeletionNonce': '', - 'trustedDeviceRecords': '{}', - })) - - if not login.get('authenticated'): - if login.get('message'): - raise ExtractorError(f'Unable to login: {login["message"]}') - raise ExtractorError('Unable to login') - - def _real_initialize(self): - self._login() - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - url = mobj.group('url') - + video_id, url = self._match_valid_url(url).group('id', 'url') webpage, urlh = self._download_webpage_handle(url, video_id) - if 'www.instagram.com/accounts/login' in urlh.geturl().rstrip('/'): + if 'www.instagram.com/accounts/login' in urlh.geturl(): self.raise_login_required('You need to log in to access this content') - (media, video_url, description, thumbnail, timestamp, uploader, - uploader_id, like_count, comment_count, comments, height, - width) = [None] * 12 - shared_data = self._parse_json( self._search_regex( r'window\._sharedData\s*=\s*({.+?});', webpage, 'shared data', default='{}'), video_id, fatal=False) - if shared_data: - media = try_get( - shared_data, - (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], - lambda x: x['entry_data']['PostPage'][0]['media']), - dict) + media = traverse_obj( + shared_data, + ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), + ('entry_data', 'PostPage', 0, 'media'), + expected_type=dict) + # _sharedData.entry_data.PostPage is empty when authenticated (see # https://github.com/ytdl-org/youtube-dl/pull/22880) if not media: @@ -216,123 +307,78 @@ class InstagramIE(InfoExtractor): r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', webpage, 'additional data', default='{}'), video_id, fatal=False) - if additional_data: - media = try_get( - additional_data, lambda x: x['graphql']['shortcode_media'], - dict) - if media: - video_url = media.get('video_url') - height = int_or_none(media.get('dimensions', {}).get('height')) - 
width = int_or_none(media.get('dimensions', {}).get('width')) - description = try_get( - media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) or media.get('caption') - title = media.get('title') - thumbnail = media.get('display_src') or media.get('display_url') - duration = float_or_none(media.get('video_duration')) - timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) - uploader = media.get('owner', {}).get('full_name') - uploader_id = media.get('owner', {}).get('username') - - def get_count(keys, kind): - for key in variadic(keys): - count = int_or_none(try_get( - media, (lambda x: x['edge_media_%s' % key]['count'], - lambda x: x['%ss' % kind]['count']))) - if count is not None: - return count - - like_count = get_count('preview_like', 'like') - comment_count = get_count( - ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') - - comments = [] - for comment in try_get(media, lambda x: x['edge_media_to_parent_comment']['edges']): - comment_dict = comment.get('node', {}) - comment_text = comment_dict.get('text') - if comment_text: - comments.append({ - 'author': try_get(comment_dict, lambda x: x['owner']['username']), - 'author_id': try_get(comment_dict, lambda x: x['owner']['id']), - 'id': comment_dict.get('id'), - 'text': comment_text, - 'timestamp': int_or_none(comment_dict.get('created_at')), - }) - if not video_url: - edges = try_get( - media, lambda x: x['edge_sidecar_to_children']['edges'], - list) or [] - if edges: - entries = [] - for edge_num, edge in enumerate(edges, start=1): - node = try_get(edge, lambda x: x['node'], dict) - if not node: - continue - node_video_url = url_or_none(node.get('video_url')) - if not node_video_url: - continue - entries.append({ - 'id': node.get('shortcode') or node['id'], - 'title': node.get('title') or 'Video %d' % edge_num, - 'url': node_video_url, - 'thumbnail': node.get('display_url'), - 'duration': float_or_none(node.get('video_duration')), - 
'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), - 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), - 'view_count': int_or_none(node.get('video_view_count')), - }) - return self.playlist_result( - entries, video_id, - 'Post by %s' % uploader_id if uploader_id else None, - description) + media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), expected_type=dict) or {} - if not video_url: - video_url = self._og_search_video_url(webpage, secure=False) - - formats = [{ - 'url': video_url, - 'width': width, - 'height': height, - }] - - if not uploader_id: - uploader_id = self._search_regex( - r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', - webpage, 'uploader id', fatal=False) + uploader_id = traverse_obj(media, ('owner', 'username')) or self._search_regex( + r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'uploader id', fatal=False) + description = ( + traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str) + or media.get('caption')) if not description: description = self._search_regex( r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None) if description is not None: description = lowercase_escape(description) - if not thumbnail: - thumbnail = self._og_search_thumbnail(webpage) + video_url = media.get('video_url') + if not video_url: + nodes = traverse_obj(media, ('edge_sidecar_to_children', 'edges', ..., 'node'), expected_type=dict) or [] + if nodes: + return self.playlist_result( + self._extract_nodes(nodes, True), video_id, + 'Post by %s' % uploader_id if uploader_id else None, description) + + video_url = self._og_search_video_url(webpage, secure=False) + + formats = [{ + 'url': video_url, + 'width': self._get_dimension('width', media, webpage), + 'height': self._get_dimension('height', media, webpage), + }] + dash = traverse_obj(media, ('dash_info', 'video_dash_manifest')) + if dash: + 
formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash')) + self._sort_formats(formats) + + comments = [{ + 'author': traverse_obj(comment_dict, ('node', 'owner', 'username')), + 'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')), + 'id': traverse_obj(comment_dict, ('node', 'id')), + 'text': traverse_obj(comment_dict, ('node', 'text')), + 'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none), + } for comment_dict in traverse_obj(media, ('edge_media_to_parent_comment', 'edges'))] + + display_resources = ( + media.get('display_resources') + or [{'src': media.get(key)} for key in ('display_src', 'display_url')] + or [{'src': self._og_search_thumbnail(webpage)}]) + thumbnails = [{ + 'url': thumbnail['src'], + 'width': thumbnail.get('config_width'), + 'height': thumbnail.get('config_height'), + } for thumbnail in display_resources if thumbnail.get('src')] return { 'id': video_id, 'formats': formats, - 'ext': 'mp4', - 'title': title or 'Video by %s' % uploader_id, + 'title': media.get('title') or 'Video by %s' % uploader_id, 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - 'timestamp': timestamp, + 'duration': float_or_none(media.get('video_duration')), + 'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none), 'uploader_id': uploader_id, - 'uploader': uploader, - 'like_count': like_count, - 'comment_count': comment_count, + 'uploader': traverse_obj(media, ('owner', 'full_name')), + 'like_count': self._get_count(media, 'likes', 'preview_like'), + 'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'), 'comments': comments, + 'thumbnails': thumbnails, 'http_headers': { 'Referer': 'https://www.instagram.com/', } } -class InstagramPlaylistIE(InfoExtractor): - # A superclass for handling any kind of query based on GraphQL which - # results in a playlist. 
- +class InstagramPlaylistBaseIE(InstagramBaseIE): _gis_tmpl = None # used to cache GIS request type def _parse_graphql(self, webpage, item_id): @@ -344,10 +390,6 @@ class InstagramPlaylistIE(InfoExtractor): def _extract_graphql(self, data, url): # Parses GraphQL queries containing videos and generates a playlist. - def get_count(suffix): - return int_or_none(try_get( - node, lambda x: x['edge_media_' + suffix]['count'])) - uploader_id = self._match_id(url) csrf_token = data['config']['csrf_token'] rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' @@ -396,55 +438,14 @@ class InstagramPlaylistIE(InfoExtractor): continue raise - edges = media.get('edges') - if not edges or not isinstance(edges, list): + nodes = traverse_obj(media, ('edges', ..., 'node'), expected_type=dict) or [] + if not nodes: break + yield from self._extract_nodes(nodes) - for edge in edges: - node = edge.get('node') - if not node or not isinstance(node, dict): - continue - if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: - continue - video_id = node.get('shortcode') - if not video_id: - continue - - info = self.url_result( - 'https://instagram.com/p/%s/' % video_id, - ie=InstagramIE.ie_key(), video_id=video_id) - - description = try_get( - node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) - thumbnail = node.get('thumbnail_src') or node.get('display_src') - timestamp = int_or_none(node.get('taken_at_timestamp')) - - comment_count = get_count('to_comment') - like_count = get_count('preview_like') - view_count = int_or_none(node.get('video_view_count')) - - info.update({ - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'comment_count': comment_count, - 'like_count': like_count, - 'view_count': view_count, - }) - - yield info - - page_info = media.get('page_info') - if not page_info or not isinstance(page_info, dict): - break - - has_next_page = page_info.get('has_next_page') - if 
not has_next_page: - break - - cursor = page_info.get('end_cursor') - if not cursor or not isinstance(cursor, compat_str): + has_next_page = traverse_obj(media, ('page_info', 'has_next_page')) + cursor = traverse_obj(media, ('page_info', 'end_cursor'), expected_type=str) + if not has_next_page or not cursor: break def _real_extract(self, url): @@ -458,11 +459,11 @@ class InstagramPlaylistIE(InfoExtractor): self._extract_graphql(data, url), user_or_tag, user_or_tag) -class InstagramUserIE(InstagramPlaylistIE): +class InstagramUserIE(InstagramPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' - _TEST = { + _TESTS = [{ 'url': 'https://instagram.com/porsche', 'info_dict': { 'id': 'porsche', @@ -474,7 +475,7 @@ class InstagramUserIE(InstagramPlaylistIE): 'skip_download': True, 'playlistend': 5, } - } + }] _QUERY_HASH = '42323d64886122307be10013ad2dcc44', @@ -492,11 +493,11 @@ class InstagramUserIE(InstagramPlaylistIE): } -class InstagramTagIE(InstagramPlaylistIE): +class InstagramTagIE(InstagramPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)' IE_DESC = 'Instagram hashtag search' IE_NAME = 'instagram:tag' - _TEST = { + _TESTS = [{ 'url': 'https://instagram.com/explore/tags/lolcats', 'info_dict': { 'id': 'lolcats', @@ -508,7 +509,7 @@ class InstagramTagIE(InstagramPlaylistIE): 'skip_download': True, 'playlistend': 50, } - } + }] _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', diff --git a/yt_dlp/extractor/internazionale.py b/yt_dlp/extractor/internazionale.py index 676e8e269..45e2af690 100644 --- a/yt_dlp/extractor/internazionale.py +++ b/yt_dlp/extractor/internazionale.py @@ -20,9 +20,6 @@ class InternazionaleIE(InfoExtractor): 'upload_date': '20150219', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 
'https://www.internazionale.it/video/2018/08/29/telefono-stare-con-noi-stessi', 'md5': '9db8663704cab73eb972d1cee0082c79', @@ -36,9 +33,6 @@ class InternazionaleIE(InfoExtractor): 'upload_date': '20180829', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - 'format': 'bestvideo', - }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index 28e660972..347fec1d5 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -8,12 +8,19 @@ from .common import InfoExtractor from ..utils import ( determine_ext, js_to_json, + urlencode_postdata, + ExtractorError, + parse_qs ) class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' _GEO_BYPASS = False + _NETRC_MACHINE = 'iprima' + _LOGIN_URL = 'https://auth.iprima.cz/oauth2/login' + _TOKEN_URL = 'https://auth.iprima.cz/oauth2/token' + access_token = None _TESTS = [{ 'url': 'https://prima.iprima.cz/particka/92-epizoda', @@ -22,16 +29,8 @@ class IPrimaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Partička (92)', 'description': 'md5:859d53beae4609e6dd7796413f1b6cac', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, { - 'url': 'https://cnn.iprima.cz/videa/70-epizoda', - 'info_dict': { - 'id': 'p681554', - 'ext': 'mp4', - 'title': 'HLAVNÍ ZPRÁVY 3.5.2020', + 'upload_date': '20201103', + 'timestamp': 1604437480, }, 'params': { 'skip_download': True, # m3u8 download @@ -44,11 +43,9 @@ class IPrimaIE(InfoExtractor): 'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1', 'only_matching': True, }, { - # iframe api.play-backend.iprima.cz 'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2', 'only_matching': True, }, { - # iframe prima.iprima.cz 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha', 'only_matching': True, }, { @@ -66,9 +63,127 @@ class 
IPrimaIE(InfoExtractor): }, { 'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi', 'only_matching': True, - }, { - 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1', - 'only_matching': True, + }] + + def _login(self): + username, password = self._get_login_info() + + if username is None or password is None: + self.raise_login_required('Login is required to access any iPrima content', method='password') + + login_page = self._download_webpage( + self._LOGIN_URL, None, note='Downloading login page', + errnote='Downloading login page failed') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + '_email': username, + '_password': password}) + + _, login_handle = self._download_webpage_handle( + self._LOGIN_URL, None, data=urlencode_postdata(login_form), + note='Logging in') + + code = parse_qs(login_handle.geturl()).get('code')[0] + if not code: + raise ExtractorError('Login failed', expected=True) + + token_request_data = { + 'scope': 'openid+email+profile+phone+address+offline_access', + 'client_id': 'prima_sso', + 'grant_type': 'authorization_code', + 'code': code, + 'redirect_uri': 'https://auth.iprima.cz/sso/auth-check'} + + token_data = self._download_json( + self._TOKEN_URL, None, + note='Downloading token', errnote='Downloading token failed', + data=urlencode_postdata(token_request_data)) + + self.access_token = token_data.get('access_token') + if self.access_token is None: + raise ExtractorError('Getting token failed', expected=True) + + def _raise_access_error(self, error_code): + if error_code == 'PLAY_GEOIP_DENIED': + self.raise_geo_restricted(countries=['CZ'], metadata_available=True) + elif error_code is not None: + self.raise_no_formats('Access to stream infos forbidden', expected=True) + + def _real_initialize(self): + if not self.access_token: + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = 
self._html_search_meta( + ['og:title', 'twitter:title'], + webpage, 'title', default=None) + + video_id = self._search_regex(( + r'productId\s*=\s*([\'"])(?P<id>p\d+)\1', + r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1'), + webpage, 'real id', group='id') + + metadata = self._download_json( + f'https://api.play-backend.iprima.cz/api/v1//products/id-{video_id}/play', + video_id, note='Getting manifest URLs', errnote='Failed to get manifest URLs', + headers={'X-OTT-Access-Token': self.access_token}, + expected_status=403) + + self._raise_access_error(metadata.get('errorCode')) + + stream_infos = metadata.get('streamInfos') + formats = [] + if stream_infos is None: + self.raise_no_formats('Reading stream infos failed', expected=True) + else: + for manifest in stream_infos: + manifest_type = manifest.get('type') + manifest_url = manifest.get('url') + ext = determine_ext(manifest_url) + if manifest_type == 'HLS' or ext == 'm3u8': + formats += self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + elif manifest_type == 'DASH' or ext == 'mpd': + formats += self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash', fatal=False) + self._sort_formats(formats) + + final_result = self._search_json_ld(webpage, video_id) or {} + final_result.update({ + 'id': video_id, + 'title': title, + 'thumbnail': self._html_search_meta( + ['thumbnail', 'og:image', 'twitter:image'], + webpage, 'thumbnail', default=None), + 'formats': formats, + 'description': self._html_search_meta( + ['description', 'og:description', 'twitter:description'], + webpage, 'description', default=None)}) + + return final_result + + +class IPrimaCNNIE(InfoExtractor): + _VALID_URL = r'https?://cnn\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _GEO_BYPASS = False + + _TESTS = [{ + 'url': 'https://cnn.iprima.cz/porady/strunc/24072020-koronaviru-mam-plne-zuby-strasit-druhou-vlnou-je-absurdni-rika-senatorka-dernerova', + 'info_dict': { + 'id': 
'p716177', + 'ext': 'mp4', + 'title': 'md5:277c6b1ed0577e51b40ddd35602ff43e', + }, + 'params': { + 'skip_download': 'm3u8' + } }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/itv.py b/yt_dlp/extractor/itv.py index 6e6a3673c..bdd6af688 100644 --- a/yt_dlp/extractor/itv.py +++ b/yt_dlp/extractor/itv.py @@ -117,7 +117,7 @@ class ITVIE(InfoExtractor): # See: https://github.com/yt-dlp/yt-dlp/issues/986 platform_tag_subs, featureset_subs = next( ((platform_tag, featureset) - for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets + for platform_tag, featuresets in reversed(list(variants.items())) for featureset in featuresets if try_get(featureset, lambda x: x[2]) == 'outband-webvtt'), (None, None)) @@ -146,8 +146,8 @@ class ITVIE(InfoExtractor): # See: https://github.com/yt-dlp/yt-dlp/issues/986 platform_tag_video, featureset_video = next( ((platform_tag, featureset) - for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets - if try_get(featureset, lambda x: x[:2]) == ['hls', 'aes']), + for platform_tag, featuresets in reversed(list(variants.items())) for featureset in featuresets + if set(try_get(featureset, lambda x: x[:2]) or []) == {'aes', 'hls'}), (None, None)) if not platform_tag_video or not featureset_video: raise ExtractorError('No downloads available', expected=True, video_id=video_id) diff --git a/yt_dlp/extractor/kinopoisk.py b/yt_dlp/extractor/kinopoisk.py index 9e8d01f53..cdbb642e2 100644 --- a/yt_dlp/extractor/kinopoisk.py +++ b/yt_dlp/extractor/kinopoisk.py @@ -23,9 +23,6 @@ class KinoPoiskIE(InfoExtractor): 'duration': 4533, 'age_limit': 12, }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'https://www.kinopoisk.ru/film/81041', 'only_matching': True, diff --git a/yt_dlp/extractor/la7.py b/yt_dlp/extractor/la7.py index 363fbd6a5..de985e450 100644 --- a/yt_dlp/extractor/la7.py +++ b/yt_dlp/extractor/la7.py @@ -7,8 +7,9 @@ from .common import InfoExtractor from 
..utils import ( determine_ext, float_or_none, + HEADRequest, + int_or_none, parse_duration, - smuggle_url, unified_strdate, ) @@ -25,19 +26,38 @@ class LA7IE(InfoExtractor): 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', 'info_dict': { - 'id': '0_42j6wd36', + 'id': 'inccool8-02-10-2015-163722', 'ext': 'mp4', 'title': 'Inc.Cool8', 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', 'thumbnail': 're:^https?://.*', - 'uploader_id': 'kdla7pillole@iltrovatore.it', - 'timestamp': 1443814869, 'upload_date': '20151002', }, }, { 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', 'only_matching': True, }] + _HOST = 'https://awsvodpkg.iltrovatore.it' + + def _generate_mp4_url(self, quality, m3u8_formats): + for f in m3u8_formats: + if f['vcodec'] != 'none' and quality in f['url']: + http_url = '%s%s.mp4' % (self._HOST, quality) + + urlh = self._request_webpage( + HEADRequest(http_url), quality, + note='Check filesize', fatal=False) + if urlh: + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', 'https-'), + 'url': http_url, + 'protocol': 'https', + 'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)), + }) + return http_f + return None def _real_extract(self, url): video_id = self._match_id(url) @@ -46,22 +66,30 @@ class LA7IE(InfoExtractor): url = '%s//%s' % (self.http_scheme(), url) webpage = self._download_webpage(url, video_id) + video_path = self._search_regex(r'(/content/.*?).mp4', webpage, 'video_path') - player_data = self._search_regex( - [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'], - webpage, 'player data') - vid = self._search_regex(r'vid\s*:\s*"(.+?)",', player_data, 'vid') + formats = self._extract_mpd_formats( + 
f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd', + video_id, mpd_id='dash', fatal=False) + m3u8_formats = self._extract_m3u8_formats( + f'{self._HOST}/local/hls/,{video_path}.mp4.urlset/master.m3u8', + video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + + for q in filter(None, video_path.split(',')): + http_f = self._generate_mp4_url(q, m3u8_formats) + if http_f: + formats.append(http_f) + + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'url': smuggle_url('kaltura:103:%s' % vid, { - 'service_url': 'http://nkdam.iltrovatore.it', - }), 'id': video_id, 'title': self._og_search_title(webpage, default=None), 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'ie_key': 'Kaltura', + 'formats': formats, + 'upload_date': unified_strdate(self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False)) } diff --git a/yt_dlp/extractor/lego.py b/yt_dlp/extractor/lego.py index b9d8b167c..901f43bcf 100644 --- a/yt_dlp/extractor/lego.py +++ b/yt_dlp/extractor/lego.py @@ -8,6 +8,7 @@ from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, + join_nonempty, qualities, ) @@ -102,12 +103,8 @@ class LEGOIE(InfoExtractor): m3u8_id=video_source_format, fatal=False)) else: video_source_quality = video_source.get('Quality') - format_id = [] - for v in (video_source_format, video_source_quality): - if v: - format_id.append(v) f = { - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(video_source_format, video_source_quality), 'quality': q(video_source_quality), 'url': video_source_url, } diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index 3ce906e2f..bd76ae166 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -6,18 +6,54 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, + extract_attributes, 
ExtractorError, float_or_none, + get_element_by_class, int_or_none, srt_subtitles_timecode, + strip_or_none, + mimetype2ext, try_get, urlencode_postdata, urljoin, ) -class LinkedInLearningBaseIE(InfoExtractor): +class LinkedInBaseIE(InfoExtractor): _NETRC_MACHINE = 'linkedin' + _logged_in = False + + def _real_initialize(self): + if self._logged_in: + return + email, password = self._get_login_info() + if email is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + action_url = urljoin(self._LOGIN_URL, self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', + default='https://www.linkedin.com/uas/login-submit', group='url')) + data = self._hidden_inputs(login_page) + data.update({ + 'session_key': email, + 'session_password': password, + }) + login_submit_page = self._download_webpage( + action_url, None, 'Logging in', + data=urlencode_postdata(data)) + error = self._search_regex( + r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>', + login_submit_page, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + LinkedInBaseIE._logged_in = True + + +class LinkedInLearningBaseIE(LinkedInBaseIE): _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning' def _call_api(self, course_slug, fields, video_slug=None, resolution=None): @@ -34,6 +70,8 @@ class LinkedInLearningBaseIE(InfoExtractor): }) sub = ' %dp' % resolution api_url = 'https://www.linkedin.com/learning-api/detailedCourses' + if not self._get_cookies(api_url).get('JSESSIONID'): + self.raise_login_required() return self._download_json( api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, @@ -49,29 +87,47 @@ class LinkedInLearningBaseIE(InfoExtractor): def _get_video_id(self, video_data, course_slug, video_slug): return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) - def _real_initialize(self): - 
email, password = self._get_login_info() - if email is None: - return - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - action_url = urljoin(self._LOGIN_URL, self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', - default='https://www.linkedin.com/uas/login-submit', group='url')) - data = self._hidden_inputs(login_page) - data.update({ - 'session_key': email, - 'session_password': password, - }) - login_submit_page = self._download_webpage( - action_url, None, 'Logging in', - data=urlencode_postdata(data)) - error = self._search_regex( - r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>', - login_submit_page, 'error', default=None) - if error: - raise ExtractorError(error, expected=True) +class LinkedInIE(LinkedInBaseIE): + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/.+?(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20', + 'info_dict': { + 'id': '6850898786781339649', + 'ext': 'mp4', + 'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing', + 'description': 'md5:be125430bab1c574f16aeb186a4d5b19', + 'creator': 'Mishal K.' 
+ }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title') + description = clean_html(get_element_by_class('share-update-card__update-text', webpage)) + like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage)) + creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage))) + + sources = self._parse_json(extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video'))['data-sources'], video_id) + formats = [{ + 'url': source['src'], + 'ext': mimetype2ext(source.get('type')), + 'tbr': float_or_none(source.get('data-bitrate'), scale=1000), + } for source in sources] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'like_count': like_count, + 'creator': creator, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': description, + } class LinkedInLearningIE(LinkedInLearningBaseIE): @@ -102,7 +158,6 @@ class LinkedInLearningIE(LinkedInLearningBaseIE): def _real_extract(self, url): course_slug, video_slug = self._match_valid_url(url).groups() - video_data = None formats = [] for width, height in ((640, 360), (960, 540), (1280, 720)): video_data = self._call_api( diff --git a/yt_dlp/extractor/mdr.py b/yt_dlp/extractor/mdr.py index 0bdd62693..3ca174c2b 100644 --- a/yt_dlp/extractor/mdr.py +++ b/yt_dlp/extractor/mdr.py @@ -2,13 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( determine_ext, int_or_none, + join_nonempty, parse_duration, parse_iso8601, url_or_none, @@ -148,13 +146,9 @@ class MDRIE(InfoExtractor): abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) filesize = int_or_none(xpath_text(asset, 
'./fileSize', 'file size')) - format_id = [media_type] - if vbr or abr: - format_id.append(compat_str(vbr or abr)) - f = { 'url': video_url, - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(media_type, vbr or abr), 'filesize': filesize, 'abr': abr, 'vbr': vbr, diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py index b9b6d739f..18ff3befa 100644 --- a/yt_dlp/extractor/mediaklikk.py +++ b/yt_dlp/extractor/mediaklikk.py @@ -12,8 +12,8 @@ from ..compat import ( class MediaKlikkIE(InfoExtractor): - _VALID_URL = r'''(?x)^https?:\/\/(?:www\.)? - (?:mediaklikk|m4sport|hirado|petofilive)\.hu\/.*?videok?\/ + _VALID_URL = r'''(?x)https?://(?:www\.)? + (?:mediaklikk|m4sport|hirado|petofilive)\.hu/.*?(?:videok?|cikk)/ (?:(?P<year>[0-9]{4})/(?P<month>[0-9]{1,2})/(?P<day>[0-9]{1,2})/)? (?P<id>[^/#?_]+)''' diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py new file mode 100644 index 000000000..a99ddd172 --- /dev/null +++ b/yt_dlp/extractor/mixch.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, +) + + +class MixchIE(InfoExtractor): + IE_NAME = 'mixch' + _VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)' + + TESTS = [{ + 'url': 'https://mixch.tv/u/16236849/live', + 'skip': 'don\'t know if this live persists', + 'info_dict': { + 'id': '16236849', + 'title': '24配信シェア⭕️投票🙏💦', + 'comment_count': 13145, + 'view_count': 28348, + 'timestamp': 1636189377, + 'uploader': '🦥伊咲👶🏻#フレアワ', + 'uploader_id': '16236849', + } + }, { + 'url': 'https://mixch.tv/u/16137876/live', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://mixch.tv/u/{video_id}/live', video_id) + + initial_js_state = self._parse_json(self._search_regex( + r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id) + if not 
initial_js_state.get('liveInfo'): + raise ExtractorError('Livestream has ended.', expected=True) + + return { + 'id': video_id, + 'title': traverse_obj(initial_js_state, ('liveInfo', 'title')), + 'comment_count': traverse_obj(initial_js_state, ('liveInfo', 'comments')), + 'view_count': traverse_obj(initial_js_state, ('liveInfo', 'visitor')), + 'timestamp': traverse_obj(initial_js_state, ('liveInfo', 'created')), + 'uploader': traverse_obj(initial_js_state, ('broadcasterInfo', 'name')), + 'uploader_id': video_id, + 'formats': [{ + 'format_id': 'hls', + 'url': traverse_obj(initial_js_state, ('liveInfo', 'hls')) or 'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_%s.m3u8' % video_id, + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'is_live': True, + } diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py index 141dd7deb..be5de0a70 100644 --- a/yt_dlp/extractor/mtv.py +++ b/yt_dlp/extractor/mtv.py @@ -15,6 +15,7 @@ from ..utils import ( float_or_none, HEADRequest, int_or_none, + join_nonempty, RegexNotFoundError, sanitized_Request, strip_or_none, @@ -99,9 +100,9 @@ class MTVServicesInfoExtractor(InfoExtractor): formats.extend([{ 'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext, 'url': rtmp_video_url, - 'format_id': '-'.join(filter(None, [ + 'format_id': join_nonempty( 'rtmp' if rtmp_video_url.startswith('rtmp') else None, - rendition.get('bitrate')])), + rendition.get('bitrate')), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), }]) @@ -306,20 +307,22 @@ class MTVServicesInfoExtractor(InfoExtractor): mgid = self._extract_triforce_mgid(webpage) if not mgid: - mgid = self._search_regex( - r'"videoConfig":{"videoId":"(mgid:.*?)"', webpage, 'mgid', default=None) - - if not mgid: - mgid = self._search_regex( - r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None) - - if not mgid: data = self._parse_json(self._search_regex( r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) main_container = 
self._extract_child_with_type(data, 'MainContainer') ab_testing = self._extract_child_with_type(main_container, 'ABTesting') video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') - mgid = video_player['props']['media']['video']['config']['uri'] + if video_player: + mgid = try_get(video_player, lambda x: x['props']['media']['video']['config']['uri']) + else: + flex_wrapper = self._extract_child_with_type(ab_testing or main_container, 'FlexWrapper') + auth_suite_wrapper = self._extract_child_with_type(flex_wrapper, 'AuthSuiteWrapper') + player = self._extract_child_with_type(auth_suite_wrapper or flex_wrapper, 'Player') + if player: + mgid = try_get(player, lambda x: x['props']['videoDetail']['mgid']) + + if not mgid: + raise ExtractorError('Could not extract mgid') return mgid diff --git a/yt_dlp/extractor/n1.py b/yt_dlp/extractor/n1.py index 7a09c6779..fdb7f32db 100644 --- a/yt_dlp/extractor/n1.py +++ b/yt_dlp/extractor/n1.py @@ -3,8 +3,6 @@ from __future__ import unicode_literals import re -from .youtube import YoutubeIE -from .reddit import RedditRIE from .common import InfoExtractor from ..utils import ( unified_timestamp, @@ -40,7 +38,7 @@ class N1InfoAssetIE(InfoExtractor): class N1InfoIIE(InfoExtractor): IE_NAME = 'N1Info:article' - _VALID_URL = r'https?://(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)/(?:[^/]+/){1,2}(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)' _TESTS = [{ # Youtube embedded 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', @@ -90,10 +88,18 @@ class N1InfoIIE(InfoExtractor): 'uploader': 'YouLotWhatDontStop', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { + 'url': 'https://nova.rs/vesti/politika/zaklina-tatalovic-ani-brnabic-pricate-lazi-video/', + 'info_dict': { + 'id': 'tnjganabrnabicizaklinatatalovic100danavladegp-novas-worldwide', + 'ext': 'mp4', + 'title': 
'Žaklina Tatalović Ani Brnabić: Pričate laži (VIDEO)', + 'upload_date': '20211102', + 'timestamp': 1635861677, + }, + }, { 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', 'only_matching': True, }] @@ -116,16 +122,16 @@ class N1InfoIIE(InfoExtractor): 'title': title, 'thumbnail': video_data.get('data-thumbnail'), 'timestamp': timestamp, - 'ie_key': N1InfoAssetIE.ie_key()}) + 'ie_key': 'N1InfoAsset'}) embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) for embedded_video in embedded_videos: video_data = extract_attributes(embedded_video) - url = video_data.get('src') + url = video_data.get('src') or '' if url.startswith('https://www.youtube.com'): - entries.append(self.url_result(url, ie=YoutubeIE.ie_key())) + entries.append(self.url_result(url, ie='Youtube')) elif url.startswith('https://www.redditmedia.com'): - entries.append(self.url_result(url, ie=RedditRIE.ie_key())) + entries.append(self.url_result(url, ie='RedditR')) return { '_type': 'playlist', diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 9698a358e..d235805c3 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -1,22 +1,163 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import json import time +import urllib -from urllib.error import HTTPError -from .common import InfoExtractor -from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote from ..utils import ( ExtractorError, parse_iso8601, try_get, - urljoin, ) +from .common import InfoExtractor + + +class NebulaBaseIE(InfoExtractor): + _NETRC_MACHINE = 'watchnebula' + + _nebula_api_token = None + _nebula_bearer_token = None + _zype_access_token = None + + def _perform_nebula_auth(self): + username, password = self._get_login_info() + if not (username and password): + self.raise_login_required() + + data = json.dumps({'email': username, 'password': 
password}).encode('utf8') + response = self._download_json( + 'https://api.watchnebula.com/api/v1/auth/login/', + data=data, fatal=False, video_id=None, + headers={ + 'content-type': 'application/json', + # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint + 'cookie': '' + }, + note='Logging in to Nebula with supplied credentials', + errnote='Authentication failed or rejected') + if not response or not response.get('key'): + self.raise_login_required() + + # save nebula token as cookie + self._set_cookie( + 'nebula.app', 'nebula-auth', + urllib.parse.quote( + json.dumps({ + "apiToken": response["key"], + "isLoggingIn": False, + "isLoggingOut": False, + }, separators=(",", ":"))), + expire_time=int(time.time()) + 86400 * 365, + ) + + return response['key'] + + def _retrieve_nebula_api_token(self): + """ + Check cookie jar for valid token. Try to authenticate using credentials if no valid token + can be found in the cookie jar. + """ + nebula_cookies = self._get_cookies('https://nebula.app') + nebula_cookie = nebula_cookies.get('nebula-auth') + if nebula_cookie: + self.to_screen('Authenticating to Nebula with token from cookie jar') + nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value) + nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken') + if nebula_api_token: + return nebula_api_token + + return self._perform_nebula_auth() + def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''): + assert method in ('GET', 'POST',) + assert auth_type in ('api', 'bearer',) -class NebulaIE(InfoExtractor): + def inner_call(): + authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}' + return self._download_json( + url, video_id, note=note, headers={'Authorization': authorization}, + data=b'' if method == 'POST' else None) + + try: + return inner_call() + except ExtractorError as exc: + # if 401 or 403, attempt credential re-auth and 
retry + if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403): + self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') + self._login() + return inner_call() + else: + raise + + def _fetch_nebula_bearer_token(self): + """ + Get a Bearer token for the Nebula API. This will be required to fetch video meta data. + """ + response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/', + method='POST', + note='Authorizing to Nebula') + return response['token'] + def _fetch_zype_access_token(self): + """ + Get a Zype access token, which is required to access video streams -- in our case: to + generate video URLs. + """ + user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token') + + access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str) + if not access_token: + if try_get(user_object, lambda x: x['is_subscribed'], bool): + # TODO: Reimplement the same Zype token polling the Nebula frontend implements + # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 + raise ExtractorError( + 'Unable to extract Zype access token from Nebula API authentication endpoint. 
' + 'Open an arbitrary video in a browser with this account to generate a token', + expected=True) + raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') + return access_token + + def _build_video_info(self, episode): + zype_id = episode['zype_id'] + zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}' + channel_slug = episode['channel_slug'] + return { + 'id': episode['zype_id'], + 'display_id': episode['slug'], + '_type': 'url_transparent', + 'ie_key': 'Zype', + 'url': zype_video_url, + 'title': episode['title'], + 'description': episode['description'], + 'timestamp': parse_iso8601(episode['published_at']), + 'thumbnails': [{ + # 'id': tn.get('name'), # this appears to be null + 'url': tn['original'], + 'height': key, + } for key, tn in episode['assets']['thumbnail'].items()], + 'duration': episode['duration'], + 'channel': episode['channel_title'], + 'channel_id': channel_slug, + 'channel_url': f'https://nebula.app/{channel_slug}', + 'uploader': episode['channel_title'], + 'uploader_id': channel_slug, + 'uploader_url': f'https://nebula.app/{channel_slug}', + 'series': episode['channel_title'], + 'creator': episode['channel_title'], + } + + def _login(self): + self._nebula_api_token = self._retrieve_nebula_api_token() + self._nebula_bearer_token = self._fetch_nebula_bearer_token() + self._zype_access_token = self._fetch_zype_access_token() + + def _real_initialize(self): + self._login() + + +class NebulaIE(NebulaBaseIE): _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)' _TESTS = [ { @@ -30,12 +171,13 @@ class NebulaIE(InfoExtractor): 'upload_date': '20180731', 'timestamp': 1533009600, 'channel': 'Lindsay Ellis', + 'channel_id': 'lindsayellis', 'uploader': 'Lindsay Ellis', + 'uploader_id': 'lindsayellis', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 
'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', @@ -47,13 +189,14 @@ class NebulaIE(InfoExtractor): 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', 'upload_date': '20200327', 'timestamp': 1585348140, - 'channel': 'The Logistics of D-Day', - 'uploader': 'The Logistics of D-Day', + 'channel': 'Real Engineering', + 'channel_id': 'realengineering', + 'uploader': 'Real Engineering', + 'uploader_id': 'realengineering', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 'https://nebula.app/videos/money-episode-1-the-draw', @@ -66,173 +209,82 @@ class NebulaIE(InfoExtractor): 'upload_date': '20200323', 'timestamp': 1584980400, 'channel': 'Tom Scott Presents: Money', + 'channel_id': 'tom-scott-presents-money', 'uploader': 'Tom Scott Presents: Money', + 'uploader_id': 'tom-scott-presents-money', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', 'only_matching': True, }, ] - _NETRC_MACHINE = 'watchnebula' - _nebula_token = None + def _fetch_video_metadata(self, slug): + return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/', + video_id=slug, + auth_type='bearer', + note='Fetching video meta data') - def _retrieve_nebula_auth(self): - """ - Log in to Nebula, and returns a Nebula API token - """ + def _real_extract(self, url): + slug = self._match_id(url) + video = self._fetch_video_metadata(slug) + return self._build_video_info(video) - username, password = self._get_login_info() - if not (username and password): - self.raise_login_required() - self.report_login() - data = json.dumps({'email': username, 'password': password}).encode('utf8') - response = self._download_json( - 'https://api.watchnebula.com/api/v1/auth/login/', - data=data, fatal=False, video_id=None, - headers={ - 'content-type': 
'application/json', - # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint - 'cookie': '' +class NebulaCollectionIE(NebulaBaseIE): + IE_NAME = 'nebula:collection' + _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!videos/)(?P<id>[-\w]+)' + _TESTS = [ + { + 'url': 'https://nebula.app/tom-scott-presents-money', + 'info_dict': { + 'id': 'tom-scott-presents-money', + 'title': 'Tom Scott Presents: Money', + 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', }, - note='Authenticating to Nebula with supplied credentials', - errnote='Authentication failed or rejected') - if not response or not response.get('key'): - self.raise_login_required() - - # save nebula token as cookie - self._set_cookie( - 'nebula.app', 'nebula-auth', - compat_urllib_parse_quote( - json.dumps({ - "apiToken": response["key"], - "isLoggingIn": False, - "isLoggingOut": False, - }, separators=(",", ":"))), - expire_time=int(time.time()) + 86400 * 365, - ) - - return response['key'] - - def _retrieve_zype_api_key(self, page_url, display_id): - """ - Retrieves the Zype API key - """ - - # Find the js that has the API key from the webpage and download it - webpage = self._download_webpage(page_url, video_id=display_id) - main_script_relpath = self._search_regex( - r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage, - group='script_relpath', name='script relative path', fatal=True) - main_script_abspath = urljoin(page_url, main_script_relpath) - main_script = self._download_webpage(main_script_abspath, video_id=display_id, - note='Retrieving Zype API key') - - api_key = self._search_regex( - r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script, - group='api_key', name='API key', fatal=True) - - return api_key - - def _call_zype_api(self, path, params, video_id, api_key, note): - """ - A helper for making calls to the Zype API. 
- """ - query = {'api_key': api_key, 'per_page': 1} - query.update(params) - return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note) - - def _call_nebula_api(self, path, video_id, access_token, note): - """ - A helper for making calls to the Nebula API. - """ - return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={ - 'Authorization': 'Token {access_token}'.format(access_token=access_token) - }, note=note) - - def _fetch_zype_access_token(self, video_id): - try: - user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') - except ExtractorError as exc: - # if 401, attempt credential auth and retry - if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401: - self._nebula_token = self._retrieve_nebula_auth() - user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') - else: - raise - - access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) - if not access_token: - if try_get(user_object, lambda x: x['is_subscribed'], bool): - # TODO: Reimplement the same Zype token polling the Nebula frontend implements - # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 - raise ExtractorError( - 'Unable to extract Zype access token from Nebula API authentication endpoint. 
' - 'Open an arbitrary video in a browser with this account to generate a token', - expected=True) - raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') - return access_token - - def _extract_channel_title(self, video_meta): - # TODO: Implement the API calls giving us the channel list, - # so that we can do the title lookup and then figure out the channel URL - categories = video_meta.get('categories', []) if video_meta else [] - # the channel name is the value of the first category - for category in categories: - if category.get('value'): - return category['value'][0] - - def _real_initialize(self): - # check cookie jar for valid token - nebula_cookies = self._get_cookies('https://nebula.app') - nebula_cookie = nebula_cookies.get('nebula-auth') - if nebula_cookie: - self.to_screen('Authenticating to Nebula with token from cookie jar') - nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) - self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken') + 'playlist_count': 5, + 'params': { + 'usenetrc': True, + }, + }, { + 'url': 'https://nebula.app/lindsayellis', + 'info_dict': { + 'id': 'lindsayellis', + 'title': 'Lindsay Ellis', + 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', + }, + 'playlist_mincount': 100, + 'params': { + 'usenetrc': True, + }, + }, + ] - # try to authenticate using credentials if no valid token has been found - if not self._nebula_token: - self._nebula_token = self._retrieve_nebula_auth() + def _generate_playlist_entries(self, collection_id, channel): + episodes = channel['episodes']['results'] + for page_num in itertools.count(2): + for episode in episodes: + yield self._build_video_info(episode) + next_url = channel['episodes']['next'] + if not next_url: + break + channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer', + note=f'Retrieving channel page {page_num}') + episodes = 
channel['episodes']['results'] def _real_extract(self, url): - display_id = self._match_id(url) - api_key = self._retrieve_zype_api_key(url, display_id) - - response = self._call_zype_api('/videos', {'friendly_title': display_id}, - display_id, api_key, note='Retrieving metadata from Zype') - if len(response.get('response') or []) != 1: - raise ExtractorError('Unable to find video on Zype API') - video_meta = response['response'][0] - - video_id = video_meta['_id'] - zype_access_token = self._fetch_zype_access_token(display_id) + collection_id = self._match_id(url) + channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/' + channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel') + channel_details = channel['details'] - channel_title = self._extract_channel_title(video_meta) - - return { - 'id': video_id, - 'display_id': display_id, - '_type': 'url_transparent', - 'ie_key': 'Zype', - 'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token), - 'title': video_meta.get('title'), - 'description': video_meta.get('description'), - 'timestamp': parse_iso8601(video_meta.get('published_at')), - 'thumbnails': [{ - 'id': tn.get('name'), # this appears to be null - 'url': tn['url'], - 'width': tn.get('width'), - 'height': tn.get('height'), - } for tn in video_meta.get('thumbnails', [])], - 'duration': video_meta.get('duration'), - 'channel': channel_title, - 'uploader': channel_title, # we chose uploader = channel name - # TODO: uploader_url, channel_id, channel_url - } + return self.playlist_result( + entries=self._generate_playlist_entries(collection_id, channel), + playlist_id=collection_id, + playlist_title=channel_details['title'], + playlist_description=channel_details['description'] + ) diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py index bbbd9e8ee..1e1274ef0 100644 --- a/yt_dlp/extractor/newgrounds.py +++ 
b/yt_dlp/extractor/newgrounds.py @@ -6,7 +6,9 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, extract_attributes, + get_element_by_id, int_or_none, parse_count, parse_duration, @@ -29,7 +31,8 @@ class NewgroundsIE(InfoExtractor): 'timestamp': 1378878540, 'upload_date': '20130911', 'duration': 143, - 'description': 'md5:6d885138814015dfd656c2ddb00dacfc', + 'view_count': int, + 'description': 'md5:b8b3c2958875189f07d8e313462e8c4f', }, }, { 'url': 'https://www.newgrounds.com/portal/view/1', @@ -41,6 +44,7 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'Brian-Beaton', 'timestamp': 955064100, 'upload_date': '20000406', + 'view_count': int, 'description': 'Scrotum plays "catch."', 'age_limit': 17, }, @@ -54,7 +58,8 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'ZONE-SAMA', 'timestamp': 1487965140, 'upload_date': '20170224', - 'description': 'ZTV News Episode 8 (February 2017)', + 'view_count': int, + 'description': 'md5:aff9b330ec2e78ed93b1ad6d017accc6', 'age_limit': 17, }, 'params': { @@ -70,7 +75,8 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'Egoraptor', 'timestamp': 1140663240, 'upload_date': '20060223', - 'description': 'Metal Gear is awesome is so is this movie.', + 'view_count': int, + 'description': 'md5:9246c181614e23754571995104da92e0', 'age_limit': 13, } }, { @@ -80,7 +86,7 @@ class NewgroundsIE(InfoExtractor): 'id': '297383', 'ext': 'swf', 'title': 'Metal Gear Awesome', - 'description': 'Metal Gear is awesome is so is this movie.', + 'description': 'Metal Gear Awesome', 'uploader': 'Egoraptor', 'upload_date': '20060223', 'timestamp': 1140663240, @@ -145,10 +151,13 @@ class NewgroundsIE(InfoExtractor): (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)', r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp', default=None)) + duration = parse_duration(self._html_search_regex( r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage, 'duration', default=None)) + description = 
clean_html(get_element_by_id('author_comments', webpage)) or self._og_search_description(webpage) + view_count = parse_count(self._html_search_regex( r'(?s)<dt>\s*(?:Views|Listens)\s*</dt>\s*<dd>([\d\.,]+)</dd>', webpage, 'view count', default=None)) @@ -177,7 +186,7 @@ class NewgroundsIE(InfoExtractor): 'duration': duration, 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage), + 'description': description, 'age_limit': age_limit, 'view_count': view_count, } diff --git a/yt_dlp/extractor/nexx.py b/yt_dlp/extractor/nexx.py index 860d636e2..8aceebd49 100644 --- a/yt_dlp/extractor/nexx.py +++ b/yt_dlp/extractor/nexx.py @@ -385,8 +385,7 @@ class NexxIE(InfoExtractor): elif cdn == 'free': formats = self._extract_free_formats(video, video_id) else: - # TODO: reverse more cdns - assert False + self.raise_no_formats(f'{cdn} formats are currently not supported', video_id) self._sort_formats(formats) @@ -427,7 +426,6 @@ class NexxEmbedIE(InfoExtractor): 'upload_date': '20140305', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 950a3d0d4..4998fed83 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -73,6 +73,7 @@ class NhkBaseIE(InfoExtractor): m3u8_id='hls', fatal=False) for f in info['formats']: f['language'] = lang + self._sort_formats(info['formats']) else: info.update({ '_type': 'url_transparent', diff --git a/yt_dlp/extractor/ninecninemedia.py b/yt_dlp/extractor/ninecninemedia.py index 4aaf21a12..781842721 100644 --- a/yt_dlp/extractor/ninecninemedia.py +++ b/yt_dlp/extractor/ninecninemedia.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -99,3 +98,37 @@ class NineCNineMediaIE(InfoExtractor): } return info + + +class CPTwentyFourIE(InfoExtractor): + IE_NAME = 'cp24' + _GEO_COUNTRIES = ['CA'] 
+ _VALID_URL = r'https?://(?:www\.)?cp24\.com/news/(?P<id>[^?#]+)' + + _TESTS = [{ + 'url': 'https://www.cp24.com/news/video-shows-atm-being-ripped-out-of-business-by-pickup-truck-driver-in-mississauga-1.5676877', + 'info_dict': { + 'id': '2328005', + 'ext': 'mp4', + 'title': 'WATCH: Truck rips ATM from Mississauga business', + 'description': 'md5:cf7498480885f080a754389a2b2f7073', + 'timestamp': 1637618377, + 'episode_number': None, + 'season': 'Season 0', + 'season_number': 0, + 'season_id': 57974, + 'series': 'CTV News Toronto', + 'duration': 26.86, + 'thumbnail': 'http://images2.9c9media.com/image_asset/2014_11_5_2eb609a0-475b-0132-fbd6-34b52f6f1279_jpg_2000x1125.jpg', + 'upload_date': '20211122', + }, + 'params': {'skip_download': True, 'format': 'bv'} + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + id, destination = self._search_regex( + r'getAuthStates\("(?P<id>[^"]+)",\s?"(?P<destination>[^"]+)"\);', + webpage, 'video id and destination', group=('id', 'destination')) + return self.url_result(f'9c9media:{destination}:{id}', ie=NineCNineMediaIE.ie_key(), video_id=id) diff --git a/yt_dlp/extractor/nova.py b/yt_dlp/extractor/nova.py index 3acb88121..0007b6b12 100644 --- a/yt_dlp/extractor/nova.py +++ b/yt_dlp/extractor/nova.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, js_to_json, qualities, + traverse_obj, unified_strdate, url_or_none, ) @@ -17,30 +18,44 @@ from ..utils import ( class NovaEmbedIE(InfoExtractor): _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', - 'md5': 'ee009bafcc794541570edd44b71cbea3', 'info_dict': { 'id': '8o0n0r', - 'ext': 'mp4', 'title': '2180. 
díl', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2578, }, - } + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['DRM protected', 'Requested format is not available'], + }, { + 'url': 'https://media.cms.nova.cz/embed/KybpWYvcgOa', + 'info_dict': { + 'id': 'KybpWYvcgOa', + 'ext': 'mp4', + 'title': 'Borhyová oslavila 60? Soutěžící z pořadu odboural moderátora Ondřeje Sokola', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 114, + }, + 'params': {'skip_download': 'm3u8'}, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + has_drm = False duration = None formats = [] player = self._parse_json( self._search_regex( - r'Player\.init\s*\([^,]+,\s*(?:\w+\s*\?\s*{.+?}\s*:\s*)?({.+})\s*,\s*{.+?}\s*\)\s*;', - webpage, 'player', default='{}'), video_id, fatal=False) + r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)', + webpage, 'player', default='{}', group='json'), video_id, fatal=False) if player: for format_id, format_list in player['tracks'].items(): if not isinstance(format_list, list): @@ -48,6 +63,10 @@ class NovaEmbedIE(InfoExtractor): for format_dict in format_list: if not isinstance(format_dict, dict): continue + if (not self.get_param('allow_unplayable_formats') + and traverse_obj(format_dict, ('drm', 'keySystem'))): + has_drm = True + continue format_url = url_or_none(format_dict.get('src')) format_type = format_dict.get('type') ext = determine_ext(format_url) @@ -104,6 +123,8 @@ class NovaEmbedIE(InfoExtractor): f['format_id'] = f_id formats.append(f) + if not formats and has_drm: + self.report_drm(video_id) self._sort_formats(formats) title = self._og_search_title( diff --git a/yt_dlp/extractor/nrl.py b/yt_dlp/extractor/nrl.py index 22a2df8d3..0bd5086ae 100644 --- a/yt_dlp/extractor/nrl.py +++ b/yt_dlp/extractor/nrl.py @@ -16,7 +16,6 @@ class NRLTVIE(InfoExtractor): 'params': 
{ # m3u8 download 'skip_download': True, - 'format': 'bestvideo', }, } diff --git a/yt_dlp/extractor/olympics.py b/yt_dlp/extractor/olympics.py index 0bc9206ed..0aad836fa 100644 --- a/yt_dlp/extractor/olympics.py +++ b/yt_dlp/extractor/olympics.py @@ -2,22 +2,26 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + int_or_none, + try_get +) class OlympicsReplayIE(InfoExtractor): - _VALID_URL = r'(?:https?://)(?:www\.)?olympics\.com/tokyo-2020/(?:[a-z]{2}/)?replay/(?P<id>[^/#&?]+)' + _VALID_URL = r'https?://(?:www\.)?olympics\.com(?:/tokyo-2020)?/[a-z]{2}/(?:replay|video)/(?P<id>[^/#&?]+)' _TESTS = [{ - 'url': 'https://olympics.com/tokyo-2020/en/replay/300622eb-abc0-43ea-b03b-c5f2d429ec7b/jumping-team-qualifier', + 'url': 'https://olympics.com/fr/video/men-s-109kg-group-a-weightlifting-tokyo-2020-replays', 'info_dict': { - 'id': '300622eb-abc0-43ea-b03b-c5f2d429ec7b', + 'id': 'f6a0753c-8e6f-4b7d-a435-027054a4f8e9', 'ext': 'mp4', - 'title': 'Jumping Team Qualifier', - 'release_date': '20210806', - 'upload_date': '20210713', + 'title': '+109kg (H) Groupe A - Haltérophilie | Replay de Tokyo 2020', + 'upload_date': '20210801', + 'timestamp': 1627783200, + 'description': 'md5:c66af4a5bc7429dbcc43d15845ff03b3', }, 'params': { - 'format': 'bv', + 'skip_download': True, }, }, { 'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp', @@ -26,31 +30,41 @@ class OlympicsReplayIE(InfoExtractor): def _real_extract(self, url): id = self._match_id(url) - # The parameters are hardcoded in the webpage, it's not necessary to download the webpage just for these parameters. - # If in downloading webpage serves other functions aswell, then extract these parameters from it. 
- token_url = 'https://appovptok.ovpobs.tv/api/identity/app/token?api_key=OTk5NDcxOjpvY3N3LWFwaXVzZXI%3D&api_secret=ODY4ODM2MjE3ODMwYmVjNTAxMWZlMDJiMTYxZmY0MjFiMjMwMjllMjJmNDA1YWRiYzA5ODcxYTZjZTljZDkxOTo6NTM2NWIzNjRlMTM1ZmI2YWNjNmYzMGMzOGM3NzZhZTY%3D' - token = self._download_webpage(token_url, id) - headers = {'x-obs-app-token': token} - data_json = self._download_json(f'https://appocswtok.ovpobs.tv/api/schedule-sessions/{id}?include=stream', - id, headers=headers) - meta_data = data_json['data']['attributes'] - for t_dict in data_json['included']: - if t_dict.get('type') == 'Stream': - stream_data = t_dict['attributes'] + + webpage = self._download_webpage(url, id) + title = self._html_search_meta(('title', 'og:title', 'twitter:title'), webpage) + uuid = self._html_search_meta('episode_uid', webpage) + m3u8_url = self._html_search_meta('video_url', webpage) + json_ld = self._search_json_ld(webpage, uuid) + thumbnails_list = json_ld.get('image') + if not thumbnails_list: + thumbnails_list = self._html_search_regex( + r'["\']image["\']:\s*["\']([^"\']+)["\']', webpage, 'images', default='') + thumbnails_list = thumbnails_list.replace('[', '').replace(']', '').split(',') + thumbnails_list = [thumbnail.strip() for thumbnail in thumbnails_list] + thumbnails = [] + for thumbnail in thumbnails_list: + width_a, height_a, width = self._search_regex( + r'/images/image/private/t_(?P<width_a>\d+)-(?P<height_a>\d+)_(?P<width>\d+)/primary/[\W\w\d]+', + thumbnail, 'thumb', group=(1, 2, 3), default=(None, None, None)) + width_a, height_a, width = int_or_none(width_a), int_or_none(height_a), int_or_none(width) + thumbnails.append({ + 'url': thumbnail, + 'width': width, + 'height': int_or_none(try_get(width, lambda x: x * height_a / width_a)) + }) m3u8_url = self._download_json( - 'https://meteringtok.ovpobs.tv/api/playback-sessions', id, headers=headers, query={ - 'alias': stream_data['alias'], - 'stream': stream_data['stream'], - 'type': 'vod' - })['data']['attributes']['url'] - 
formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + f'https://olympics.com/tokenGenerator?url={m3u8_url}', uuid, note='Downloading m3u8 url') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, m3u8_id='hls') self._sort_formats(formats) return { - 'id': id, - 'title': meta_data['title'], - 'release_date': unified_strdate(meta_data.get('start') or meta_data.get('broadcastPublished')), - 'upload_date': unified_strdate(meta_data.get('publishedAt')), + 'id': uuid, + 'title': title, + 'timestamp': json_ld.get('timestamp'), + 'description': json_ld.get('description'), + 'thumbnails': thumbnails, + 'duration': json_ld.get('duration'), 'formats': formats, 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/onefootball.py b/yt_dlp/extractor/onefootball.py new file mode 100644 index 000000000..79501003d --- /dev/null +++ b/yt_dlp/extractor/onefootball.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OneFootballIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?onefootball\.com/[a-z]{2}/video/[^/&?#]+-(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://onefootball.com/en/video/highlights-fc-zuerich-3-3-fc-basel-34012334', + 'info_dict': { + 'id': '34012334', + 'ext': 'mp4', + 'title': 'Highlights: FC Zürich 3-3 FC Basel', + 'description': 'md5:33d9855cb790702c4fe42a513700aba8', + 'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34012334', + 'timestamp': 1635874604, + 'upload_date': '20211102' + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://onefootball.com/en/video/klopp-fumes-at-var-decisions-in-west-ham-defeat-34041020', + 'info_dict': { + 'id': '34041020', + 'ext': 'mp4', + 'title': 'Klopp fumes at VAR decisions in West Ham defeat', + 'description': 'md5:9c50371095a01ad3f63311c73d8f51a5', + 'thumbnail': 
'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34041020', + 'timestamp': 1636314103, + 'upload_date': '20211107' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._search_json_ld(webpage, id) + m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/.+\.m3u8)', webpage, 'm3u8_url') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title'), + 'description': data_json.get('description'), + 'thumbnail': data_json.get('thumbnail'), + 'timestamp': data_json.get('timestamp'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py index 428ec97e4..e2b703880 100644 --- a/yt_dlp/extractor/orf.py +++ b/yt_dlp/extractor/orf.py @@ -11,6 +11,7 @@ from ..utils import ( float_or_none, HEADRequest, int_or_none, + join_nonempty, orderedSet, remove_end, str_or_none, @@ -82,12 +83,7 @@ class ORFTVthekIE(InfoExtractor): src = url_or_none(fd.get('src')) if not src: continue - format_id_list = [] - for key in ('delivery', 'quality', 'quality_string'): - value = fd.get(key) - if value: - format_id_list.append(value) - format_id = '-'.join(format_id_list) + format_id = join_nonempty('delivery', 'quality', 'quality_string', from_dict=fd) ext = determine_ext(src) if ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( diff --git a/yt_dlp/extractor/paramountplus.py b/yt_dlp/extractor/paramountplus.py index 338b84d5b..17138985a 100644 --- a/yt_dlp/extractor/paramountplus.py +++ b/yt_dlp/extractor/paramountplus.py @@ -60,7 +60,6 @@ class ParamountPlusIE(CBSBaseIE): }, 'params': { 'skip_download': 'm3u8', - 'format': 'bestvideo', }, 'expected_warnings': ['Ignoring subtitle tracks'], # TODO: Investigate this }, 
{ @@ -76,7 +75,6 @@ class ParamountPlusIE(CBSBaseIE): }, 'params': { 'skip_download': 'm3u8', - 'format': 'bestvideo', }, 'expected_warnings': ['Ignoring subtitle tracks'], }, { diff --git a/yt_dlp/extractor/parliamentliveuk.py b/yt_dlp/extractor/parliamentliveuk.py index 869ebd865..974d65482 100644 --- a/yt_dlp/extractor/parliamentliveuk.py +++ b/yt_dlp/extractor/parliamentliveuk.py @@ -25,9 +25,6 @@ class ParliamentLiveUKIE(InfoExtractor): 'timestamp': 1395153872, 'upload_date': '20140318', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', 'only_matching': True, diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index c7d316efc..d3ee071e0 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -191,7 +191,7 @@ class PatreonIE(InfoExtractor): class PatreonUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?P<id>[-_\w\d]+)/?(?:posts/?)?' + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?P<id>[-\w]+)' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py index 0eabf9bee..ffaa6bf92 100644 --- a/yt_dlp/extractor/pbs.py +++ b/yt_dlp/extractor/pbs.py @@ -193,7 +193,7 @@ class PBSIE(InfoExtractor): # Article with embedded player (or direct video) (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player - (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ + (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+) ) ''' % '|'.join(list(zip(*_STATIONS))[0]) diff --git a/yt_dlp/extractor/peertv.py b/yt_dlp/extractor/peertv.py new file mode 100644 index 000000000..002d33a88 --- /dev/null +++ b/yt_dlp/extractor/peertv.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import js_to_json + + +class 
PeerTVIE(InfoExtractor): + IE_NAME = 'peer.tv' + _VALID_URL = r'https?://(?:www\.)?peer\.tv/(?:de|it|en)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.peer.tv/de/841', + 'info_dict': { + 'id': '841', + 'ext': 'mp4', + 'title': 'Die Brunnenburg', + 'description': 'md5:4395f6142b090338340ab88a3aae24ed', + }, + }, { + 'url': 'https://www.peer.tv/it/404', + 'info_dict': { + 'id': '404', + 'ext': 'mp4', + 'title': 'Cascate di ghiaccio in Val Gardena', + 'description': 'md5:e8e5907f236171842674e8090e3577b8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_key = self._html_search_regex(r'player\.peer\.tv/js/([a-zA-Z0-9]+)', webpage, 'video key') + + js = self._download_webpage(f'https://player.peer.tv/js/{video_key}/', video_id, + headers={'Referer': 'https://www.peer.tv/'}, note='Downloading session id') + + session_id = self._search_regex(r'["\']session_id["\']:\s*["\']([a-zA-Z0-9]+)["\']', js, 'session id') + + player_webpage = self._download_webpage( + f'https://player.peer.tv/jsc/{video_key}/{session_id}?jsr=aHR0cHM6Ly93d3cucGVlci50di9kZS84NDE=&cs=UTF-8&mq=2&ua=0&webm=p&mp4=p&hls=1', + video_id, note='Downloading player webpage') + + m3u8_url = self._search_regex(r'["\']playlist_url["\']:\s*(["\'][^"\']+["\'])', player_webpage, 'm3u8 url') + m3u8_url = self._parse_json(m3u8_url, video_id, transform_source=js_to_json) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title').replace('\xa0', ' '), + 'formats': formats, + 'description': self._html_search_meta(('og:description', 'description'), webpage), + 'thumbnail': self._html_search_meta(('og:image', 'image'), webpage) + } diff --git a/yt_dlp/extractor/peloton.py b/yt_dlp/extractor/peloton.py index 287d341c9..7d832253f 100644 --- a/yt_dlp/extractor/peloton.py +++ 
b/yt_dlp/extractor/peloton.py @@ -203,7 +203,6 @@ class PelotonLiveIE(InfoExtractor): 'chapters': 'count:3' }, 'params': { - 'format': 'bestvideo', 'skip_download': 'm3u8', }, '_skip': 'Account needed' diff --git a/yt_dlp/extractor/picarto.py b/yt_dlp/extractor/picarto.py index e6c51e16b..17d08d69e 100644 --- a/yt_dlp/extractor/picarto.py +++ b/yt_dlp/extractor/picarto.py @@ -111,7 +111,7 @@ class PicartoVodIE(InfoExtractor): vod_info = self._parse_json( self._search_regex( r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, - video_id), + 'vod player'), video_id, transform_source=js_to_json) formats = self._extract_m3u8_formats( diff --git a/yt_dlp/extractor/piksel.py b/yt_dlp/extractor/piksel.py index a362664b2..84c3de2f0 100644 --- a/yt_dlp/extractor/piksel.py +++ b/yt_dlp/extractor/piksel.py @@ -4,11 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( dict_get, ExtractorError, int_or_none, + join_nonempty, parse_iso8601, try_get, unescapeHTML, @@ -116,12 +116,8 @@ class PikselIE(InfoExtractor): elif asset_type == 'audio': tbr = abr - format_id = ['http'] - if tbr: - format_id.append(compat_str(tbr)) - formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty('http', tbr), 'url': unescapeHTML(http_url), 'vbr': vbr, 'abr': abr, @@ -167,7 +163,7 @@ class PikselIE(InfoExtractor): re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, transform_source=transform_source, fatal=False)) - self._sort_formats(formats) + self._sort_formats(formats, ('tbr', )) # Incomplete resolution information subtitles = {} for caption in video_data.get('captions', []): diff --git a/yt_dlp/extractor/planetmarathi.py b/yt_dlp/extractor/planetmarathi.py new file mode 100644 index 000000000..d1d9911f7 --- /dev/null +++ b/yt_dlp/extractor/planetmarathi.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor 
class PlanetMarathiIE(InfoExtractor):
    """Extractor for planetmarathi.com title pages.

    A title page is returned as a playlist holding one entry per asset
    reported by the site's ``/api/v1/titles/<slug>/assets`` endpoint.
    """

    _VALID_URL = r'(?:https?://)(?:www\.)?planetmarathi\.com/titles/(?P<id>[^/#&?$]+)'
    _TESTS = [{
        'url': 'https://www.planetmarathi.com/titles/ek-unad-divas',
        'playlist_mincount': 2,
        'info_dict': {
            'id': 'ek-unad-divas',
        },
        'playlist': [{
            'info_dict': {
                'id': 'ASSETS-MOVIE-ASSET-01_ek-unad-divas',
                'ext': 'mp4',
                'title': 'ek unad divas',
                'alt_title': 'चित्रपट',
                'description': 'md5:41c7ed6b041c2fea9820a3f3125bd881',
                'season_number': None,
                'episode_number': 1,
                'duration': 5539,
                'upload_date': '20210829',
            },
        }]  # Trailer skipped
    }, {
        'url': 'https://www.planetmarathi.com/titles/baap-beep-baap-season-1',
        'playlist_mincount': 10,
        'info_dict': {
            'id': 'baap-beep-baap-season-1',
        },
        'playlist': [{
            'info_dict': {
                'id': 'ASSETS-CHARACTER-PROFILE-SEASON-01-ASSET-01_baap-beep-baap-season-1',
                'ext': 'mp4',
                'title': 'Manohar Kanhere',
                'alt_title': 'मनोहर कान्हेरे',
                'description': 'md5:285ed45d5c0ab5522cac9a043354ebc6',
                'season_number': 1,
                'episode_number': 1,
                'duration': 29,
                'upload_date': '20210829',
            },
        }]  # Trailers, Episodes, other Character profiles skipped
    }]

    def _real_extract(self, url):
        """Return a playlist result with one entry for each asset of the title in *url*."""
        title_slug = self._match_id(url)
        assets = self._download_json(
            f'https://www.planetmarathi.com/api/v1/titles/{title_slug}/assets', title_slug)['assets']

        def build_entry(asset):
            # An asset named just "Movie" gets the de-hyphenated slug as its title.
            name = asset['mediaAssetName']['en']
            if name == 'Movie':
                name = title_slug.replace('-', ' ')
            # The "sk" key contains '#', which is swapped for '-' in the entry id.
            entry_id = f'{asset["sk"]}_{title_slug}'.replace('#', '-')
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                asset['mediaAssetURL'], entry_id)
            self._sort_formats(formats)
            return {
                'id': entry_id,
                'title': name,
                'alt_title': try_get(asset, lambda x: x['mediaAssetName']['mr']),
                'description': try_get(asset, lambda x: x['mediaAssetDescription']['en']),
                'season_number': asset.get('mediaAssetSeason'),
                'episode_number': asset.get('mediaAssetIndexForAssetType'),
                'duration': asset.get('mediaAssetDurationInSeconds'),
                'upload_date': unified_strdate(asset.get('created')),
                'formats': formats,
                'subtitles': subtitles,
            }

        return self.playlist_result(
            [build_entry(asset) for asset in assets], playlist_id=title_slug)
class PolskieRadioBaseExtractor(InfoExtractor):
    """Shared helper for extractors that scrape ``data-media`` players from Polskie Radio HTML."""

    def _extract_webpage_player_entries(self, webpage, playlist_id, base_data):
        """Yield one info dict per distinct media file referenced in *webpage*.

        webpage:     HTML text searched for tags carrying a ``data-media`` JSON attribute.
        playlist_id: id used when parsing the embedded JSON.
        base_data:   dict copied into every entry before the per-media fields are
                     applied (callers pass shared fields such as the title).

        Blobs without a ``file`` or ``desc`` value, blobs that do not parse, and
        repeated media URLs are skipped.
        """
        media_urls = set()

        for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage):
            media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False)
            # The parse is non-fatal, so a malformed blob does not raise here and
            # `media` may not be a dict; guard before calling .get() on it.
            if not media or not media.get('file') or not media.get('desc'):
                continue
            media_url = self._proto_relative_url(media['file'])
            if media_url in media_urls:
                continue
            media_urls.add(media_url)
            entry = base_data.copy()
            entry.update({
                'id': compat_str(media['id']),
                'url': media_url,
                'duration': int_or_none(media.get('length')),
                'vcodec': 'none' if media.get('provider') == 'audio' else None,
            })
            # A non-empty per-media description overrides the title from base_data.
            entry_title = compat_urllib_parse_unquote(media['desc'])
            if entry_title:
                entry['title'] = entry_title
            yield entry
23:01', }, - 'playlist_mincount': 15, }, { 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', 'only_matching': True, @@ -85,6 +109,9 @@ class PolskieRadioIE(InfoExtractor): # with mp4 video 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', 'only_matching': True, + }, { + 'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci', + 'only_matching': True, }] def _real_extract(self, url): @@ -94,39 +121,37 @@ class PolskieRadioIE(InfoExtractor): content = self._search_regex( r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', - webpage, 'content') + webpage, 'content', default=None) timestamp = unified_timestamp(self._html_search_regex( r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', - webpage, 'timestamp', fatal=False)) + webpage, 'timestamp', default=None)) - thumbnail_url = self._og_search_thumbnail(webpage) + thumbnail_url = self._og_search_thumbnail(webpage, default=None) - entries = [] + title = self._og_search_title(webpage).strip() - media_urls = set() + description = strip_or_none(self._og_search_description(webpage, default=None)) + description = description.replace('\xa0', ' ') if description is not None else None - for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', content): - media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False) - if not media.get('file') or not media.get('desc'): - continue - media_url = self._proto_relative_url(media['file'], 'http:') - if media_url in media_urls: - continue - media_urls.add(media_url) - entries.append({ - 'id': compat_str(media['id']), - 'url': media_url, - 'title': compat_urllib_parse_unquote(media['desc']), - 'duration': int_or_none(media.get('length')), - 'vcodec': 'none' if media.get('provider') == 'audio' else None, + if not content: + 
class PolskieRadioPlayerIE(InfoExtractor):
    """Extractor for live channels on player.polskieradio.pl."""

    IE_NAME = 'polskieradio:player'
    _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)'

    _BASE_URL = 'https://player.polskieradio.pl'
    _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js'
    _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje'

    _TESTS = [{
        'url': 'https://player.polskieradio.pl/anteny/trojka',
        'info_dict': {
            'id': '3',
            'ext': 'm4a',
            'title': 'Trójka',
        },
        'params': {
            'format': 'bestaudio',
            'skip_download': 'endless stream',
        },
    }]

    def _get_channel_list(self, channel_url='no_channel'):
        """Download the player's JS bundle and return the channel list embedded in it.

        *channel_url* is only used as the id for the download and parse steps.
        """
        player_code = self._download_webpage(
            self._PLAYER_URL, channel_url,
            note='Downloading js player')
        channel_list = js_to_json(self._search_regex(
            r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list'))
        return self._parse_json(channel_list, channel_url)

    def _real_extract(self, url):
        """Return a live-stream info dict for the channel slug matched in *url*.

        Raises ExtractorError when the slug is not in the player's channel list
        or when no station in the stations API matches the channel's name.
        """
        channel_url = self._match_id(url)
        channel_list = self._get_channel_list(channel_url)

        channel = next((c for c in channel_list if c.get('url') == channel_url), None)

        if not channel:
            raise ExtractorError('Channel not found')

        station_list = self._download_json(self._STATIONS_API_URL, channel_url,
                                           note='Downloading stream url list',
                                           headers={
                                               'Accept': 'application/json',
                                               'Referer': url,
                                               'Origin': self._BASE_URL,
                                           })
        # Stations are matched by name; the channel's streamName wins over its display name.
        station_name = channel.get('streamName') or channel.get('name')
        station = next((s for s in station_list if s.get('Name') == station_name), None)
        if not station:
            raise ExtractorError('Station not found even though we extracted channel')

        formats = []
        # A station without a 'Streams' key yields no formats instead of a KeyError.
        for stream_url in station.get('Streams') or []:
            stream_url = self._proto_relative_url(stream_url)
            if stream_url.endswith('/playlist.m3u8'):
                formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True))
            elif stream_url.endswith('/manifest.f4m'):
                # .f4m is an HDS manifest, so it needs the f4m parser rather than the DASH one.
                formats.extend(self._extract_f4m_formats(stream_url, channel_url))
            elif stream_url.endswith('/Manifest'):
                formats.extend(self._extract_ism_formats(stream_url, channel_url))
            else:
                formats.append({
                    'url': stream_url,
                })

        self._sort_formats(formats)

        return {
            'id': compat_str(channel['id']),
            'formats': formats,
            'title': channel.get('name') or channel.get('streamName'),
            'display_id': channel_url,
            'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png',
            'is_live': True,
        }
class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
    """Extractor for a single podcast track on podcasty.polskieradio.pl."""

    IE_NAME = 'polskieradio:podcast'
    _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
    _TESTS = [{
        'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32',
        'info_dict': {
            'id': '6eafe403-cb8f-4756-b896-4455c3713c32',
            'ext': 'mp3',
            'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
            'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
        },
    }]

    def _real_extract(self, url):
        """POST the track's guid to the ``/audio`` endpoint and parse the first item returned.

        Raises ExtractorError when the response contains no items.
        """
        podcast_id = self._match_id(url)
        data = self._download_json(
            f'{self._API_BASE}/audio',
            podcast_id, 'Downloading podcast metadata',
            data=json.dumps({
                'guids': [podcast_id],
            }).encode('utf-8'),
            headers={
                'Content-Type': 'application/json',
            })
        # An empty response would otherwise surface as a bare IndexError/KeyError on data[0].
        if not data:
            raise ExtractorError('No podcast episode found for this track id', expected=True)
        return self._parse_episode(data[0])
class RadioKapitalBaseIE(InfoExtractor):
    """Shared API and parsing helpers for the radiokapital.pl extractors."""

    def _call_api(self, resource, video_id, note='Downloading JSON metadata', qs=None):
        """Fetch ``/wp-json/kapital/v1/<resource>`` and return the decoded JSON.

        qs: optional mapping of query-string parameters (default: none).
        """
        # None replaces the mutable `{}` default; the URL built for "no parameters" is unchanged.
        return self._download_json(
            f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urlencode(qs or {})}',
            video_id, note=note)

    def _parse_episode(self, data):
        """Turn one API episode object into a ``url_transparent`` result pointing at Mixcloud."""
        published = data.get('published') or ''
        # NOTE(review): the original offsets (0, 3, 6) imply a DD?MM?YYYY string, but its
        # slices were one character too wide and pulled the separators into the result.
        # Take exactly the digits; confirm the format against a live API response.
        release = published[6:10] + published[3:5] + published[:2]
        return {
            '_type': 'url_transparent',
            'url': data['mixcloud_url'],
            'ie_key': 'Mixcloud',
            'title': unescapeHTML(data['title']),
            'description': clean_html(data.get('content')),
            'tags': traverse_obj(data, ('tags', ..., 'name')),
            # Only emit a value that is a well-formed YYYYMMDD.
            'release_date': release if len(release) == 8 and release.isdigit() else None,
            'series': traverse_obj(data, ('show', 'title')),
        }
'md5:3a557a1e0f31af612b0dcc85b1e0ca5c', + }, + 'playlist_mincount': 17, + }] + + def _get_episode_list(self, series_id, page_no): + return self._call_api( + 'episodes', series_id, + f'Downloading episode list page #{page_no}', qs={ + 'show': series_id, + 'page': page_no, + }) + + def _entries(self, series_id): + for page_no in itertools.count(1): + episode_list = self._get_episode_list(series_id, page_no) + yield from (self._parse_episode(ep) for ep in episode_list['items']) + if episode_list['next'] is None: + break + + def _real_extract(self, url): + series_id = self._match_id(url) + + show = self._call_api(f'shows/{series_id}', series_id, 'Downloading show metadata') + entries = self._entries(series_id) + return { + '_type': 'playlist', + 'entries': entries, + 'id': str(show['id']), + 'title': show.get('title'), + 'description': clean_html(show.get('content')), + } diff --git a/yt_dlp/extractor/radiozet.py b/yt_dlp/extractor/radiozet.py new file mode 100644 index 000000000..2e1ff36c2 --- /dev/null +++ b/yt_dlp/extractor/radiozet.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import ( + traverse_obj, + strip_or_none, +) + + +class RadioZetPodcastIE(InfoExtractor): + _VALID_URL = r'https?://player\.radiozet\.pl\/Podcasty/.*?/(?P<id>.+)' + _TEST = { + 'url': 'https://player.radiozet.pl/Podcasty/Nie-Ma-Za-Co/O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu', + 'md5': 'e03665c316b4fbc5f6a8f232948bbba3', + 'info_dict': { + 'id': '42154', + 'display_id': 'O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu', + 'title': 'O przedmiotach szkolnych, które przydają się w życiu', + 'description': 'md5:fa72bed49da334b09e5b2f79851f185c', + 'release_timestamp': 1592985480, + 'ext': 'mp3', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 83, + 'series': 'Nie Ma Za Co', + 'creator': 'Katarzyna Pakosińska', + } + } + + def _call_api(self, podcast_id, display_id): + return self._download_json( + 
class RaiPlayRadioBaseIE(InfoExtractor):
    """Shared playlist-page parsing for the raiplayradio.it extractors."""

    _BASE = 'https://www.raiplayradio.it'

    def get_playlist_iter(self, url, uid):
        """Download *url* and yield one audio entry per item returned by ``parse_list``.

        url: page to download; also the base for resolving relative media and image URLs.
        uid: id used for the download.
        """
        webpage = self._download_webpage(url, uid)
        for attrs in parse_list(webpage):
            title = attrs['data-title'].strip()
            audio_url = urljoin(url, attrs['data-mediapolis'])
            entry = {
                'url': audio_url,
                # Remove the literal prefix only. str.lstrip() treats its argument as a
                # character set and would also eat leading 'e' characters of a hex id.
                'id': remove_start(attrs['data-uniquename'], 'ContentItem-'),
                'title': title,
                'ext': 'mp3',
                'language': 'it',
            }
            if 'data-image' in attrs:
                entry['thumbnail'] = urljoin(url, attrs['data-image'])
            yield entry
entry['title'], + 'track_number': index, + 'artist': playlist_creator, + 'album': playlist_title + }) + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description, + creator=playlist_creator) diff --git a/yt_dlp/extractor/rcti.py b/yt_dlp/extractor/rcti.py index 31d9779dd..19b2f451c 100644 --- a/yt_dlp/extractor/rcti.py +++ b/yt_dlp/extractor/rcti.py @@ -85,9 +85,6 @@ class RCTIPlusIE(RCTIPlusBaseIE): 'series': 'iNews Malam', 'channel': 'INews', }, - 'params': { - 'format': 'bestvideo', - }, }, { # Missed event/replay 'url': 'https://www.rctiplus.com/missed-event/2507/mou-signing-ceremony-27-juli-2021-1400-wib', 'md5': '649c5f27250faed1452ca8b91e06922d', @@ -132,7 +129,6 @@ class RCTIPlusIE(RCTIPlusBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, }] _CONVIVA_JSON_TEMPLATE = { @@ -329,7 +325,6 @@ class RCTIPlusTVIE(RCTIPlusBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', } }, { # Returned video will always change diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index c75d95a8e..a042a59cc 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -8,46 +8,11 @@ from ..utils import ( try_get, unescapeHTML, url_or_none, + traverse_obj ) class RedditIE(InfoExtractor): - _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)' - _TEST = { - # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ - 'url': 'https://v.redd.it/zv89llsvexdz', - 'md5': '0a070c53eba7ec4534d95a5a1259e253', - 'info_dict': { - 'id': 'zv89llsvexdz', - 'ext': 'mp4', - 'title': 'zv89llsvexdz', - }, - 'params': { - 'format': 'bestvideo', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - formats = self._extract_m3u8_formats( - 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id, - 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - - formats.extend(self._extract_mpd_formats( - 
'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, - mpd_id='dash', fatal=False)) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_id, - 'formats': formats, - } - - -class RedditRIE(InfoExtractor): _VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', @@ -67,7 +32,6 @@ class RedditRIE(InfoExtractor): 'age_limit': 0, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -151,19 +115,53 @@ class RedditRIE(InfoExtractor): for resolution in resolutions: add_thumbnail(resolution) - return { - '_type': 'url_transparent', - 'url': video_url, + info = { 'title': data.get('title'), 'thumbnails': thumbnails, 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), - 'duration': int_or_none(try_get( - data, - (lambda x: x['media']['reddit_video']['duration'], - lambda x: x['secure_media']['reddit_video']['duration']))), 'like_count': int_or_none(data.get('ups')), 'dislike_count': int_or_none(data.get('downs')), 'comment_count': int_or_none(data.get('num_comments')), 'age_limit': age_limit, } + + # Check if media is hosted on reddit: + reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False) + if reddit_video: + playlist_urls = [ + try_get(reddit_video, lambda x: unescapeHTML(x[y])) + for y in ('dash_url', 'hls_url') + ] + + # Update video_id + display_id = video_id + video_id = self._search_regex( + r'https?://v\.redd\.it/(?P<id>[^/?#&]+)', reddit_video['fallback_url'], + 'video_id', default=display_id) + + dash_playlist_url = playlist_urls[0] or f'https://v.redd.it/{video_id}/DASHPlaylist.mpd' + hls_playlist_url = playlist_urls[1] or f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8' + + formats = self._extract_m3u8_formats( + hls_playlist_url, display_id, 'mp4', + entry_protocol='m3u8_native', 
class RedGifsIE(InfoExtractor):
    """Extractor for redgifs.com watch pages and thumbs media URLs."""

    _VALID_URL = r'https?://(?:(?:www|thumbs2?)\.)?redgifs\.com/(?:watch/)?(?P<id>[^-/?#\.]+)'
    # Format id -> maximum height of that rendition; None means the original height.
    _FORMATS = {
        'gif': 250,
        'sd': 480,
        'hd': None,
    }
    _TESTS = [{
        'url': 'https://www.redgifs.com/watch/squeakyhelplesswisent',
        'info_dict': {
            'id': 'squeakyhelplesswisent',
            'ext': 'mp4',
            'title': 'Hotwife Legs Thick',
            'timestamp': 1636287915,
            'upload_date': '20211107',
            'uploader': 'ignored52',
            'duration': 16,
            'view_count': int,
            'like_count': int,
            'categories': list,
            'age_limit': 18,
        }
    }, {
        'url': 'https://thumbs2.redgifs.com/SqueakyHelplessWisent-mobile.mp4#t=0',
        'info_dict': {
            'id': 'squeakyhelplesswisent',
            'ext': 'mp4',
            'title': 'Hotwife Legs Thick',
            'timestamp': 1636287915,
            'upload_date': '20211107',
            'uploader': 'ignored52',
            'duration': 16,
            'view_count': int,
            'like_count': int,
            'categories': list,
            'age_limit': 18,
        }
    }]

    def _real_extract(self, url):
        """Query the v2 gifs API for the lower-cased id and build the format list.

        Raises ExtractorError (expected) when the API response carries an ``error`` key.
        """
        video_id = self._match_id(url).lower()

        video_info = self._download_json(
            'https://api.redgifs.com/v2/gifs/%s' % video_id,
            video_id, 'Downloading video info')
        if 'error' in video_info:
            raise ExtractorError(f'RedGifs said: {video_info["error"]}', expected=True)

        gif = video_info['gif']
        urls = gif['urls']

        quality = qualities(tuple(self._FORMATS.keys()))

        orig_height = int_or_none(gif.get('height'))
        orig_width = int_or_none(gif.get('width'))
        # Width-over-height ratio, so that width = height * ratio. The previous
        # height-over-width ratio produced wrong widths for any non-square video.
        aspect_ratio = orig_width / orig_height if orig_width and orig_height else None

        formats = []
        for format_id, max_height in self._FORMATS.items():
            video_url = urls.get(format_id)
            if not video_url:
                continue
            # Cap at the original height when both are known; min() on None would raise.
            if orig_height and max_height:
                height = min(orig_height, max_height)
            else:
                height = orig_height or max_height
            formats.append({
                'url': video_url,
                'format_id': format_id,
                'width': round(height * aspect_ratio) if height and aspect_ratio else None,
                'height': height,
                'quality': quality(format_id),
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': ' '.join(gif.get('tags') or []) or 'RedGifs',
            'timestamp': int_or_none(gif.get('createDate')),
            'uploader': gif.get('userName'),
            'duration': int_or_none(gif.get('duration')),
            'view_count': int_or_none(gif.get('views')),
            'like_count': int_or_none(gif.get('likes')),
            'categories': gif.get('tags') or [],
            'age_limit': 18,
            'formats': formats,
        }
-class RoosterTeethIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' +class RoosterTeethBaseIE(InfoExtractor): _NETRC_MACHINE = 'roosterteeth' + _API_BASE = 'https://svod-be.roosterteeth.com' + _API_BASE_URL = f'{_API_BASE}/api/v1' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + if self._get_cookies(self._API_BASE_URL).get('rt_access_token'): + return + + try: + self._download_json( + 'https://auth.roosterteeth.com/oauth/token', + None, 'Logging in', data=urlencode_postdata({ + 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', + 'grant_type': 'password', + 'username': username, + 'password': password, + })) + except ExtractorError as e: + msg = 'Unable to login' + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read().decode(), None, fatal=False) + if resp: + error = resp.get('extra_info') or resp.get('error_description') or resp.get('error') + if error: + msg += ': ' + error + self.report_warning(msg) + + def _real_initialize(self): + self._login() + + def _extract_video_info(self, data): + thumbnails = [] + for image in traverse_obj(data, ('included', 'images')): + if image.get('type') not in ('episode_image', 'bonus_feature_image'): + continue + thumbnails.extend([{ + 'id': name, + 'url': url, + } for name, url in (image.get('attributes') or {}).items() if url_or_none(url)]) + + attributes = data.get('attributes') or {} + title = traverse_obj(attributes, 'title', 'display_title') + sub_only = attributes.get('is_sponsors_only') + + return { + 'id': str(data.get('id')), + 'display_id': attributes.get('slug'), + 'title': title, + 'description': traverse_obj(attributes, 'description', 'caption'), + 'series': attributes.get('show_title'), + 'season_number': int_or_none(attributes.get('season_number')), + 'season_id': attributes.get('season_id'), + 'episode': title, + 
'episode_number': int_or_none(attributes.get('number')), + 'episode_id': str_or_none(data.get('uuid')), + 'channel_id': attributes.get('channel_id'), + 'duration': int_or_none(attributes.get('length')), + 'thumbnails': thumbnails, + 'availability': self._availability( + needs_premium=sub_only, needs_subscription=sub_only, needs_auth=sub_only, + is_private=False, is_unlisted=False), + 'tags': attributes.get('genres') + } + + +class RoosterTeethIE(RoosterTeethBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', - 'md5': 'e2bd7764732d785ef797700a2489f212', 'info_dict': { 'id': '9156', 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', @@ -30,19 +99,20 @@ class RoosterTeethIE(InfoExtractor): 'series': 'Million Dollars, But...', 'episode': 'Million Dollars, But... The Game Announcement', }, + 'skip_download': 'm3u8', }, { 'url': 'https://roosterteeth.com/watch/rwby-bonus-25', - 'md5': 'fe8d9d976b272c18a24fe7f1f5830084', 'info_dict': { - 'id': '31', + 'id': '40432', 'display_id': 'rwby-bonus-25', - 'title': 'Volume 2, World of Remnant 3', - 'description': 'md5:8d58d3270292ea11da00ea712bbfb009', - 'episode': 'Volume 2, World of Remnant 3', - 'channel_id': 'fab60c1c-29cb-43bc-9383-5c3538d9e246', + 'title': 'Grimm', + 'description': 'md5:f30ff570741213418a8d2c19868b93ab', + 'episode': 'Grimm', + 'channel_id': '92f780eb-ebfe-4bf5-a3b5-c6ad5460a5f1', 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', 'ext': 'mp4', }, + 'skip_download': 'm3u8', }, { 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', 'only_matching': True, @@ -63,40 +133,10 @@ class RoosterTeethIE(InfoExtractor): 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', 
'only_matching': True, }] - _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/watch/' - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - try: - self._download_json( - 'https://auth.roosterteeth.com/oauth/token', - None, 'Logging in', data=urlencode_postdata({ - 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', - 'grant_type': 'password', - 'username': username, - 'password': password, - })) - except ExtractorError as e: - msg = 'Unable to login' - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - resp = self._parse_json(e.cause.read().decode(), None, fatal=False) - if resp: - error = resp.get('extra_info') or resp.get('error_description') or resp.get('error') - if error: - msg += ': ' + error - self.report_warning(msg) - - def _real_initialize(self): - if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'): - return - self._login() def _real_extract(self, url): display_id = self._match_id(url) - api_episode_url = self._EPISODE_BASE_URL + display_id + api_episode_url = f'{self._API_BASE_URL}/watch/{display_id}' try: video_data = self._download_json( @@ -118,36 +158,62 @@ class RoosterTeethIE(InfoExtractor): episode = self._download_json( api_episode_url, display_id, 'Downloading episode JSON metadata')['data'][0] - attributes = episode['attributes'] - title = attributes.get('title') or attributes['display_title'] - video_id = compat_str(episode['id']) - - thumbnails = [] - for image in episode.get('included', {}).get('images', []): - if image.get('type') in ('episode_image', 'bonus_feature_image'): - img_attributes = image.get('attributes') or {} - for k in ('thumb', 'small', 'medium', 'large'): - img_url = img_attributes.get(k) - if img_url: - thumbnails.append({ - 'id': k, - 'url': img_url, - }) return { - 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': attributes.get('description') or attributes.get('caption'), 
- 'thumbnails': thumbnails, - 'series': attributes.get('show_title'), - 'season_number': int_or_none(attributes.get('season_number')), - 'season_id': attributes.get('season_id'), - 'episode': title, - 'episode_number': int_or_none(attributes.get('number')), - 'episode_id': str_or_none(episode.get('uuid')), 'formats': formats, - 'channel_id': attributes.get('channel_id'), - 'duration': int_or_none(attributes.get('length')), - 'subtitles': subtitles + 'subtitles': subtitles, + **self._extract_video_info(episode) + } + + +class RoosterTeethSeriesIE(RoosterTeethBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/series/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://roosterteeth.com/series/rwby?season=7', + 'playlist_count': 13, + 'info_dict': { + 'id': 'rwby-7', + 'title': 'RWBY - Season 7', } + }, { + 'url': 'https://roosterteeth.com/series/role-initiative', + 'playlist_mincount': 16, + 'info_dict': { + 'id': 'role-initiative', + 'title': 'Role Initiative', + } + }, { + 'url': 'https://roosterteeth.com/series/let-s-play-minecraft?season=9', + 'playlist_mincount': 50, + 'info_dict': { + 'id': 'let-s-play-minecraft-9', + 'title': 'Let\'s Play Minecraft - Season 9', + } + }] + + def _entries(self, series_id, season_number): + display_id = join_nonempty(series_id, season_number) + # TODO: extract bonus material + for data in self._download_json( + f'{self._API_BASE_URL}/shows/{series_id}/seasons?order=asc&order_by', display_id)['data']: + idx = traverse_obj(data, ('attributes', 'number')) + if season_number and idx != season_number: + continue + season_url = update_url_query(urljoin(self._API_BASE, data['links']['episodes']), {'per_page': 1000}) + season = self._download_json(season_url, display_id, f'Downloading season {idx} JSON metadata')['data'] + for episode in season: + yield self.url_result( + f'https://www.roosterteeth.com{episode["canonical_links"]["self"]}', + RoosterTeethIE.ie_key(), + **self._extract_video_info(episode)) + + def 
_real_extract(self, url): + series_id = self._match_id(url) + season_number = traverse_obj(parse_qs(url), ('season', 0), expected_type=int_or_none) + + entries = LazyList(self._entries(series_id, season_number)) + return self.playlist_result( + entries, + join_nonempty(series_id, season_number), + join_nonempty(entries[0].get('series'), season_number, delim=' - Season ')) diff --git a/yt_dlp/extractor/rtrfm.py b/yt_dlp/extractor/rtrfm.py new file mode 100644 index 000000000..93d51e8ed --- /dev/null +++ b/yt_dlp/extractor/rtrfm.py @@ -0,0 +1,67 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RTRFMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtrfm\.com\.au/(?:shows|show-episode)/(?P<id>[^/?\#&]+)' + _TESTS = [ + { + 'url': 'https://rtrfm.com.au/shows/breakfast/', + 'md5': '46168394d3a5ce237cf47e85d0745413', + 'info_dict': { + 'id': 'breakfast-2021-11-16', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': r're:^Breakfast with Taylah \d{4}-\d{2}-\d{2}$', + 'description': 'md5:0979c3ab1febfbec3f1ccb743633c611', + }, + 'skip': 'ID and md5 changes daily', + }, + { + 'url': 'https://rtrfm.com.au/show-episode/breakfast-2021-11-11/', + 'md5': '396bedf1e40f96c62b30d4999202a790', + 'info_dict': { + 'id': 'breakfast-2021-11-11', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': 'Breakfast with Taylah 2021-11-11', + 'description': 'md5:0979c3ab1febfbec3f1ccb743633c611', + }, + }, + { + 'url': 'https://rtrfm.com.au/show-episode/breakfast-2020-06-01/', + 'md5': '594027f513ec36a24b15d65007a24dff', + 'info_dict': { + 'id': 'breakfast-2020-06-01', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': 'Breakfast with Taylah 2020-06-01', + 'description': r're:^Breakfast with Taylah ', + }, + 'skip': 'This audio has expired', + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show, date, title = self._search_regex( + 
r'''\.playShow(?:From)?\(['"](?P<show>[^'"]+)['"],\s*['"](?P<date>[0-9]{4}-[0-9]{2}-[0-9]{2})['"],\s*['"](?P<title>[^'"]+)['"]''', + webpage, 'details', group=('show', 'date', 'title')) + url = self._download_json( + 'https://restreams.rtrfm.com.au/rzz', + show, 'Downloading MP3 URL', query={'n': show, 'd': date})['u'] + # This is the only indicator of an error until trying to download the URL and + # downloads of mp4 URLs always fail (403 for current episodes, 404 for missing). + if '.mp4' in url: + url = None + self.raise_no_formats('Expired or no episode on this date', expected=True) + return { + 'id': '%s-%s' % (show, date), + 'title': '%s %s' % (title, date), + 'series': title, + 'url': url, + 'release_date': date, + 'description': self._og_search_description(webpage), + } diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 0a806ee4e..4090f6385 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -10,7 +10,14 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P<id>[0-9]+)' + _VALID_URL = r'''(?x) + https?://(?:www\.)?sbs\.com\.au/(?: + ondemand(?: + /video/(?:single/)?| + /movie/[^/]+/| + .*?\bplay=|/watch/ + )|news/(?:embeds/)?video/ + )(?P<id>[0-9]+)''' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -46,6 +53,13 @@ class SBSIE(InfoExtractor): }, { 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/movie/coherence/1469404227931', + 'only_matching': True, + }, { + 'note': 'Live stream', + 'url': 'https://www.sbs.com.au/ondemand/video/1726824003663/sbs-24x7-live-stream-nsw', + 'only_matching': True, }] def _real_extract(self, url): @@ -75,4 +89,5 @@ class SBSIE(InfoExtractor): 'ie_key': 'ThePlatform', 'id': video_id, 'url': 
smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), + 'is_live': player_params.get('streamType') == 'live', } diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py new file mode 100644 index 000000000..6f4240422 --- /dev/null +++ b/yt_dlp/extractor/senategov.py @@ -0,0 +1,213 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_qs, + unsmuggle_url, +) + +_COMMITTEES = { + 'ag': ('76440', 'http://ag-f.akamaihd.net'), + 'aging': ('76442', 'http://aging-f.akamaihd.net'), + 'approps': ('76441', 'http://approps-f.akamaihd.net'), + 'arch': ('', 'http://ussenate-f.akamaihd.net'), + 'armed': ('76445', 'http://armed-f.akamaihd.net'), + 'banking': ('76446', 'http://banking-f.akamaihd.net'), + 'budget': ('76447', 'http://budget-f.akamaihd.net'), + 'cecc': ('76486', 'http://srs-f.akamaihd.net'), + 'commerce': ('80177', 'http://commerce1-f.akamaihd.net'), + 'csce': ('75229', 'http://srs-f.akamaihd.net'), + 'dpc': ('76590', 'http://dpc-f.akamaihd.net'), + 'energy': ('76448', 'http://energy-f.akamaihd.net'), + 'epw': ('76478', 'http://epw-f.akamaihd.net'), + 'ethics': ('76449', 'http://ethics-f.akamaihd.net'), + 'finance': ('76450', 'http://finance-f.akamaihd.net'), + 'foreign': ('76451', 'http://foreign-f.akamaihd.net'), + 'govtaff': ('76453', 'http://govtaff-f.akamaihd.net'), + 'help': ('76452', 'http://help-f.akamaihd.net'), + 'indian': ('76455', 'http://indian-f.akamaihd.net'), + 'intel': ('76456', 'http://intel-f.akamaihd.net'), + 'intlnarc': ('76457', 'http://intlnarc-f.akamaihd.net'), + 'jccic': ('85180', 'http://jccic-f.akamaihd.net'), + 'jec': ('76458', 'http://jec-f.akamaihd.net'), + 'judiciary': ('76459', 'http://judiciary-f.akamaihd.net'), + 'rpc': ('76591', 'http://rpc-f.akamaihd.net'), + 'rules': ('76460', 'http://rules-f.akamaihd.net'), + 'saa': 
('76489', 'http://srs-f.akamaihd.net'), + 'smbiz': ('76461', 'http://smbiz-f.akamaihd.net'), + 'srs': ('75229', 'http://srs-f.akamaihd.net'), + 'uscc': ('76487', 'http://srs-f.akamaihd.net'), + 'vetaff': ('76462', 'http://vetaff-f.akamaihd.net'), +} + + +class SenateISVPIE(InfoExtractor): + _IE_NAME = 'senate.gov:isvp' + _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' + + _TESTS = [{ + 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', + 'info_dict': { + 'id': 'judiciary031715', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', + 'info_dict': { + 'id': 'commerce011514', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', + # checksum differs each time + 'info_dict': { + 'id': 'intel090613', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + } + }, { + # From http://www.c-span.org/video/?96791-1 + 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', + 'only_matching': True, + }] + + @staticmethod + def _search_iframe_url(webpage): + mobj = re.search( + r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + qs = compat_parse_qs(self._match_valid_url(url).group('qs')) + if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): + raise 
ExtractorError('Invalid URL', expected=True) + + video_id = re.sub(r'.mp4$', '', qs['filename'][0]) + + webpage = self._download_webpage(url, video_id) + + if smuggled_data.get('force_title'): + title = smuggled_data['force_title'] + else: + title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id) + poster = qs.get('poster') + thumbnail = poster[0] if poster else None + + video_type = qs['type'][0] + committee = video_type if video_type == 'arch' else qs['comm'][0] + + stream_num, domain = _COMMITTEES[committee] + + formats = [] + if video_type == 'arch': + filename = video_id if '.' in video_id else video_id + '.mp4' + m3u8_url = compat_urlparse.urljoin(domain, 'i/' + filename + '/master.m3u8') + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8') + else: + hdcore_sign = 'hdcore=3.1.0' + url_params = (domain, video_id, stream_num) + f4m_url = f'%s/z/%s_1@%s/manifest.f4m?{hdcore_sign}' % url_params + m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params + for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): + # URLs without the extra param induce an 404 error + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.append(entry) + for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): + mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url']) + if mobj: + entry['format_id'] += mobj.group('tag') + formats.append(entry) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } + + +class SenateGovIE(InfoExtractor): + _IE_NAME = 'senate.gov' + _VALID_URL = r'https?:\/\/(?:www\.)?(help|appropriations|judiciary|banking|armed-services|finance)\.senate\.gov' + _TESTS = [{ + 'url': 'https://www.help.senate.gov/hearings/vaccines-saving-lives-ensuring-confidence-and-protecting-public-health', + 'info_dict': { + 'id': 'help090920', + 'display_id': 
'vaccines-saving-lives-ensuring-confidence-and-protecting-public-health', + 'title': 'Vaccines: Saving Lives, Ensuring Confidence, and Protecting Public Health', + 'description': 'The U.S. Senate Committee on Health, Education, Labor & Pensions', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.appropriations.senate.gov/hearings/watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD', + 'info_dict': { + 'id': 'appropsA051518', + 'display_id': 'watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD', + 'title': 'Review of the FY2019 Budget Request for the U.S. Army', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.banking.senate.gov/hearings/21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization', + 'info_dict': { + 'id': 'banking041521', + 'display_id': '21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization', + 'title': '21st Century Communities: Public Transportation Infrastructure Investment and FAST Act Reauthorization', + 'description': 'The Official website of The United States Committee on Banking, Housing, and Urban Affairs', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._generic_id(url) + webpage = self._download_webpage(url, display_id) + parse_info = parse_qs(self._search_regex( + r'<iframe class="[^>"]*streaminghearing[^>"]*"\s[^>]*\bsrc="([^">]*)', webpage, 'hearing URL')) + + stream_num, stream_domain = _COMMITTEES[parse_info['comm'][-1]] + filename = parse_info['filename'][-1] + + formats = self._extract_m3u8_formats( + f'{stream_domain}/i/{filename}_1@{stream_num}/master.m3u8', + display_id, ext='mp4') + self._sort_formats(formats) + + title = self._html_search_regex( + (*self._og_regexes('title'), r'(?s)<title>([^<]*?)</title>'), webpage, 'video title') + + return { + 'id': re.sub(r'.mp4$', '', filename), + 
'display_id': display_id, + 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'age_limit': self._rta_search(webpage), + 'formats': formats + } diff --git a/yt_dlp/extractor/senateisvp.py b/yt_dlp/extractor/senateisvp.py deleted file mode 100644 index 8794d47ef..000000000 --- a/yt_dlp/extractor/senateisvp.py +++ /dev/null @@ -1,153 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unsmuggle_url, -) -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) - - -class SenateISVPIE(InfoExtractor): - _COMM_MAP = [ - ['ag', '76440', 'http://ag-f.akamaihd.net'], - ['aging', '76442', 'http://aging-f.akamaihd.net'], - ['approps', '76441', 'http://approps-f.akamaihd.net'], - ['armed', '76445', 'http://armed-f.akamaihd.net'], - ['banking', '76446', 'http://banking-f.akamaihd.net'], - ['budget', '76447', 'http://budget-f.akamaihd.net'], - ['cecc', '76486', 'http://srs-f.akamaihd.net'], - ['commerce', '80177', 'http://commerce1-f.akamaihd.net'], - ['csce', '75229', 'http://srs-f.akamaihd.net'], - ['dpc', '76590', 'http://dpc-f.akamaihd.net'], - ['energy', '76448', 'http://energy-f.akamaihd.net'], - ['epw', '76478', 'http://epw-f.akamaihd.net'], - ['ethics', '76449', 'http://ethics-f.akamaihd.net'], - ['finance', '76450', 'http://finance-f.akamaihd.net'], - ['foreign', '76451', 'http://foreign-f.akamaihd.net'], - ['govtaff', '76453', 'http://govtaff-f.akamaihd.net'], - ['help', '76452', 'http://help-f.akamaihd.net'], - ['indian', '76455', 'http://indian-f.akamaihd.net'], - ['intel', '76456', 'http://intel-f.akamaihd.net'], - ['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net'], - ['jccic', '85180', 'http://jccic-f.akamaihd.net'], - ['jec', '76458', 'http://jec-f.akamaihd.net'], - ['judiciary', '76459', 
'http://judiciary-f.akamaihd.net'], - ['rpc', '76591', 'http://rpc-f.akamaihd.net'], - ['rules', '76460', 'http://rules-f.akamaihd.net'], - ['saa', '76489', 'http://srs-f.akamaihd.net'], - ['smbiz', '76461', 'http://smbiz-f.akamaihd.net'], - ['srs', '75229', 'http://srs-f.akamaihd.net'], - ['uscc', '76487', 'http://srs-f.akamaihd.net'], - ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'], - ['arch', '', 'http://ussenate-f.akamaihd.net/'] - ] - _IE_NAME = 'senate.gov' - _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' - _TESTS = [{ - 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', - 'info_dict': { - 'id': 'judiciary031715', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', - 'info_dict': { - 'id': 'commerce011514', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player' - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', - # checksum differs each time - 'info_dict': { - 'id': 'intel090613', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player' - } - }, { - # From http://www.c-span.org/video/?96791-1 - 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', - 'only_matching': True, - }] - - @staticmethod - def _search_iframe_url(webpage): - mobj = re.search( - r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", - webpage) - if mobj: - return mobj.group('url') - - def _get_info_for_comm(self, committee): - for entry in self._COMM_MAP: - if entry[0] 
== committee: - return entry[1:] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - qs = compat_parse_qs(self._match_valid_url(url).group('qs')) - if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): - raise ExtractorError('Invalid URL', expected=True) - - video_id = re.sub(r'.mp4$', '', qs['filename'][0]) - - webpage = self._download_webpage(url, video_id) - - if smuggled_data.get('force_title'): - title = smuggled_data['force_title'] - else: - title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id) - poster = qs.get('poster') - thumbnail = poster[0] if poster else None - - video_type = qs['type'][0] - committee = video_type if video_type == 'arch' else qs['comm'][0] - stream_num, domain = self._get_info_for_comm(committee) - - formats = [] - if video_type == 'arch': - filename = video_id if '.' in video_id else video_id + '.mp4' - formats = [{ - # All parameters in the query string are necessary to prevent a 403 error - 'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=', - }] - else: - hdcore_sign = 'hdcore=3.1.0' - url_params = (domain, video_id, stream_num) - f4m_url = '%s/z/%s_1@%s/manifest.f4m?' 
% url_params + hdcore_sign - m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params - for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): - # URLs without the extra param induce an 404 error - entry.update({'extra_param_to_segment_url': hdcore_sign}) - formats.append(entry) - for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): - mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url']) - if mobj: - entry['format_id'] += mobj.group('tag') - formats.append(entry) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - } diff --git a/yt_dlp/extractor/sevenplus.py b/yt_dlp/extractor/sevenplus.py index 210c44ab2..9867961f0 100644 --- a/yt_dlp/extractor/sevenplus.py +++ b/yt_dlp/extractor/sevenplus.py @@ -35,7 +35,6 @@ class SevenPlusIE(BrightcoveNewIE): 'episode': 'Wind Surf', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, } }, { diff --git a/yt_dlp/extractor/slideslive.py b/yt_dlp/extractor/slideslive.py index 9409a0100..df6084647 100644 --- a/yt_dlp/extractor/slideslive.py +++ b/yt_dlp/extractor/slideslive.py @@ -35,9 +35,6 @@ class SlidesLiveIE(InfoExtractor): 'ext': 'mp4', 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', }, - 'params': { - 'format': 'bestvideo', - }, }, { # video_service_name = youtube 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 824528474..2bb449220 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -893,5 +893,6 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): break def _get_n_results(self, query, n): - tracks = self._get_collection('search/tracks', query, limit=n, q=query) - return self.playlist_result(tracks, query, query) + return self.playlist_result(itertools.islice( + 
self._get_collection('search/tracks', query, limit=n, q=query), + 0, None if n == float('inf') else n), query, query) diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index d49749467..942a52dcf 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -6,19 +6,18 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/((?:video-)?clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _TESTS = [{ - 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', + 'url': 'https://southpark.cc.com/video-clips/d7wr06/south-park-you-all-agreed-to-counseling', 'info_dict': { - 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30', 'ext': 'mp4', - 'title': 'South Park|Bat Daded', - 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', - 'timestamp': 1112760000, - 'upload_date': '20050406', + 'title': 'You All Agreed to Counseling', + 'description': 'Kenny, Cartman, Stan, and Kyle visit Mr. Mackey and ask for his help getting Mrs. Nelson to come back. Mr. 
Mackey reveals the only way to get things back to normal is to get the teachers vaccinated.', + 'timestamp': 1615352400, + 'upload_date': '20210310', }, }, { 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', @@ -40,11 +39,11 @@ class SouthParkIE(MTVServicesInfoExtractor): class SouthParkEsIE(SouthParkIE): IE_NAME = 'southpark.cc.com:español' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/es/episodios/(?P<id>.+?)(\?|#|$))' _LANG = 'es' _TESTS = [{ - 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'url': 'http://southpark.cc.com/es/episodios/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', 'info_dict': { 'title': 'Cartman Consigue Una Sonda Anal', 'description': 'Cartman Consigue Una Sonda Anal', diff --git a/yt_dlp/extractor/srgssr.py b/yt_dlp/extractor/srgssr.py index cbc1c47d2..f9919816d 100644 --- a/yt_dlp/extractor/srgssr.py +++ b/yt_dlp/extractor/srgssr.py @@ -7,6 +7,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + join_nonempty, parse_iso8601, qualities, try_get, @@ -94,11 +95,7 @@ class SRGSSRIE(InfoExtractor): continue protocol = source.get('protocol') quality = source.get('quality') - format_id = [] - for e in (protocol, source.get('encoding'), quality): - if e: - format_id.append(e) - format_id = '-'.join(format_id) + format_id = join_nonempty(protocol, source.get('encoding'), quality) if protocol in ('HDS', 'HLS'): if source.get('tokenType') == 'AKAMAI': diff --git a/yt_dlp/extractor/streamff.py b/yt_dlp/extractor/streamff.py new file mode 100644 index 000000000..6b190bb3b --- /dev/null +++ b/yt_dlp/extractor/streamff.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import 
int_or_none, parse_iso8601 + + +class StreamFFIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?streamff\.com/v/(?P<id>[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'https://streamff.com/v/55cc94', + 'md5': '8745a67bb5e5c570738efe7983826370', + 'info_dict': { + 'id': '55cc94', + 'ext': 'mp4', + 'title': '55cc94', + 'timestamp': 1634764643, + 'upload_date': '20211020', + 'view_count': int, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json(f'https://streamff.com/api/videos/{video_id}', video_id) + return { + 'id': video_id, + 'title': json_data.get('name') or video_id, + 'url': 'https://streamff.com/%s' % json_data['videoLink'], + 'view_count': int_or_none(json_data.get('views')), + 'timestamp': parse_iso8601(json_data.get('date')), + } diff --git a/yt_dlp/extractor/stripchat.py b/yt_dlp/extractor/stripchat.py new file mode 100644 index 000000000..efd0afc75 --- /dev/null +++ b/yt_dlp/extractor/stripchat.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + ExtractorError, + lowercase_escape, + try_get, +) + + +class StripchatIE(InfoExtractor): + _VALID_URL = r'https?://stripchat\.com/(?P<id>[0-9A-Za-z-_]+)' + _TESTS = [{ + 'url': 'https://stripchat.com/feel_me', + 'info_dict': { + 'id': 'feel_me', + 'ext': 'mp4', + 'title': 're:^feel_me [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': str, + 'is_live': True, + 'age_limit': 18, + }, + 'skip': 'Room is offline', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'https://stripchat.com/%s/' % video_id, video_id, + headers=self.geo_verification_headers()) + + data = self._parse_json( + self._search_regex( + r'<script\b[^>]*>\s*window\.__PRELOADED_STATE__\s*=(?P<value>.*?)<\/script>', + webpage, 'data', default='{}', group='value'), + video_id, 
transform_source=lowercase_escape, fatal=False) + if not data: + raise ExtractorError('Unable to find configuration for stream.') + + if try_get(data, lambda x: x['viewCam']['show'], dict): + raise ExtractorError('Model is in private show', expected=True) + elif not try_get(data, lambda x: x['viewCam']['model']['isLive'], bool): + raise ExtractorError('Model is offline', expected=True) + + server = try_get(data, lambda x: x['viewCam']['viewServers']['flashphoner-hls'], compat_str) + host = try_get(data, lambda x: x['config']['data']['hlsStreamHost'], compat_str) + model_id = try_get(data, lambda x: x['viewCam']['model']['id'], int) + + formats = self._extract_m3u8_formats( + 'https://b-%s.%s/hls/%d/%d.m3u8' % (server, host, model_id, model_id), + video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(video_id), + 'description': self._og_search_description(webpage), + 'is_live': True, + 'formats': formats, + # Stripchat declares the RTA meta-tag, but in an non-standard format so _rta_search() can't be used + 'age_limit': 18, + } diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 38e0086b3..489f197fe 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -168,7 +168,6 @@ class SVTPlayIE(SVTPlayBaseIE): }, }, 'params': { - 'format': 'bestvideo', # skip for now due to download test asserts that segment is > 10000 bytes and svt uses # init segments that are smaller # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B diff --git a/yt_dlp/extractor/telemundo.py b/yt_dlp/extractor/telemundo.py index 18552a0ef..e326bbdd5 100644 --- a/yt_dlp/extractor/telemundo.py +++ b/yt_dlp/extractor/telemundo.py @@ -1,4 +1,4 @@ -# coding=utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/yt_dlp/extractor/telequebec.py b/yt_dlp/extractor/telequebec.py index 
800d87b70..4bef2fe76 100644 --- a/yt_dlp/extractor/telequebec.py +++ b/yt_dlp/extractor/telequebec.py @@ -43,9 +43,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): 'uploader_id': '6150020952001', 'upload_date': '20200512', }, - 'params': { - 'format': 'bestvideo', - }, 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://zonevideo.telequebec.tv/media/55267/le-soleil/passe-partout', @@ -58,9 +55,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): 'upload_date': '20200625', 'timestamp': 1593090307, }, - 'params': { - 'format': 'bestvideo', - }, 'add_ie': ['BrightcoveNew'], }, { # no description @@ -157,9 +151,6 @@ class TeleQuebecEmissionIE(InfoExtractor): 'timestamp': 1588713424, 'uploader_id': '6150020952001', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', 'only_matching': True, @@ -220,9 +211,6 @@ class TeleQuebecVideoIE(TeleQuebecBaseIE): 'timestamp': 1603115930, 'uploader_id': '6101674910001', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'https://video.telequebec.tv/player-live/28527', 'only_matching': True, diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index c810cfd0d..5b3222ecf 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -58,7 +58,7 @@ class TenPlayIE(InfoExtractor): 'email': username, 'password': password, })) - return "Bearer " + data['jwt']['accessToken'] + return 'Bearer ' + data['jwt']['accessToken'] def _real_extract(self, url): content_id = self._match_id(url) diff --git a/yt_dlp/extractor/tf1.py b/yt_dlp/extractor/tf1.py index 669eb5015..44785bc65 100644 --- a/yt_dlp/extractor/tf1.py +++ b/yt_dlp/extractor/tf1.py @@ -29,7 +29,6 @@ class TF1IE(InfoExtractor): 'params': { # Sometimes wat serves the whole file with the --test option 'skip_download': True, - 'format': 'bestvideo', }, }, { 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', diff --git 
a/yt_dlp/extractor/threeqsdn.py b/yt_dlp/extractor/threeqsdn.py index bb7610352..e5c6a6de1 100644 --- a/yt_dlp/extractor/threeqsdn.py +++ b/yt_dlp/extractor/threeqsdn.py @@ -9,6 +9,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + join_nonempty, parse_iso8601, ) @@ -119,24 +120,16 @@ class ThreeQSDNIE(InfoExtractor): src = s.get('src') if not (src and self._is_valid_url(src, video_id)): continue - width = None - format_id = ['http'] ext = determine_ext(src) - if ext: - format_id.append(ext) height = int_or_none(s.get('height')) - if height: - format_id.append('%dp' % height) - if aspect: - width = int(height * aspect) formats.append({ 'ext': ext, - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty('http', ext, height and '%dp' % height), 'height': height, 'source_preference': 0, 'url': src, 'vcodec': 'none' if height == 0 else None, - 'width': width, + 'width': int(height * aspect) if height and aspect else None, }) # It seems like this would be correctly handled by default # However, unless someone can confirm this, the old diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 1db6327e2..7d79ad8d5 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -12,6 +12,7 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, int_or_none, + join_nonempty, str_or_none, traverse_obj, try_get, @@ -38,8 +39,8 @@ class TikTokBaseIE(InfoExtractor): 'build_number': self._APP_VERSION, 'manifest_version_code': self._MANIFEST_APP_VERSION, 'update_version_code': self._MANIFEST_APP_VERSION, - 'openudid': ''.join(random.choice('0123456789abcdef') for i in range(16)), - 'uuid': ''.join([random.choice(string.digits) for num in range(16)]), + 'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)), + 'uuid': ''.join([random.choice(string.digits) for _ in range(16)]), '_rticket': int(time.time() * 1000), 'ts': int(time.time()), 'device_brand': 'Google', @@ 
-66,7 +67,7 @@ class TikTokBaseIE(InfoExtractor): 'as': 'a1qwert123', 'cp': 'cbfhckdckkde1', } - self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for i in range(160))) + self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160))) return self._download_json( 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ @@ -107,8 +108,8 @@ class TikTokBaseIE(InfoExtractor): 'acodec': 'aac', 'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked **add_meta, **parsed_meta, - 'format_note': ' '.join(filter(None, ( - add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else ''))) + 'format_note': join_nonempty( + add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' ') } for url in addr.get('url_list') or []] # Hack: Add direct video links first to prioritize them when removing duplicate formats @@ -416,7 +417,7 @@ class TikTokUserIE(TikTokBaseIE): 'max_cursor': 0, 'min_cursor': 0, 'retry_type': 'no_retry', - 'device_id': ''.join(random.choice(string.digits) for i in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. + 'device_id': ''.join(random.choice(string.digits) for _ in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. 
} max_retries = self.get_param('extractor_retries', 3) @@ -437,6 +438,7 @@ class TikTokUserIE(TikTokBaseIE): **self._parse_aweme_video_app(video), 'ie_key': TikTokIE.ie_key(), 'extractor': 'TikTok', + 'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}', } if not post_list.get('has_more'): break diff --git a/yt_dlp/extractor/tokentube.py b/yt_dlp/extractor/tokentube.py index d6362117f..579623fed 100644 --- a/yt_dlp/extractor/tokentube.py +++ b/yt_dlp/extractor/tokentube.py @@ -6,7 +6,10 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, + get_element_by_class, parse_count, + remove_end, unified_strdate, js_to_json, OnDemandPagedList, @@ -35,7 +38,7 @@ class TokentubeIE(InfoExtractor): 'id': '3950239124', 'ext': 'mp4', 'title': 'Linux Ubuntu Studio perus käyttö', - 'description': 'md5:854ff1dc732ff708976de2880ea32050', + 'description': 'md5:46077d0daaba1974f2dc381257f9d64c', 'uploader': 'jyrilehtonen', 'upload_date': '20210825', }, @@ -45,7 +48,7 @@ class TokentubeIE(InfoExtractor): 'id': '3582463289', 'ext': 'mp4', 'title': 'Police for Freedom - toiminta aloitetaan Suomessa ❤️??', - 'description': 'md5:cd92e620d7f5fa162e8410d0fc9a08be', + 'description': 'md5:37ebf1cb44264e0bf23ed98b337ee63e', 'uploader': 'Voitontie', 'upload_date': '20210428', } @@ -90,7 +93,10 @@ class TokentubeIE(InfoExtractor): r'<a\s*class="place-left"[^>]+>(.+?)</a>', webpage, 'uploader', fatal=False) - description = self._html_search_meta('description', webpage) + description = (clean_html(get_element_by_class('p-d-txt', webpage)) + or self._html_search_meta(('og:description', 'description', 'twitter:description'), webpage)) + + description = remove_end(description, 'Category') self._sort_formats(formats) diff --git a/yt_dlp/extractor/tonline.py b/yt_dlp/extractor/tonline.py index cc11eae2a..9b6a40db5 100644 --- a/yt_dlp/extractor/tonline.py +++ b/yt_dlp/extractor/tonline.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from 
.common import InfoExtractor -from ..utils import int_or_none +from ..utils import int_or_none, join_nonempty class TOnlineIE(InfoExtractor): @@ -30,13 +30,8 @@ class TOnlineIE(InfoExtractor): asset_source = asset.get('source') or asset.get('source2') if not asset_source: continue - formats_id = [] - for field_key in ('type', 'profile'): - field_value = asset.get(field_key) - if field_value: - formats_id.append(field_value) formats.append({ - 'format_id': '-'.join(formats_id), + 'format_id': join_nonempty('type', 'profile', from_dict=asset), 'url': asset_source, }) diff --git a/yt_dlp/extractor/tv2.py b/yt_dlp/extractor/tv2.py index e0851531c..da351eeb0 100644 --- a/yt_dlp/extractor/tv2.py +++ b/yt_dlp/extractor/tv2.py @@ -19,7 +19,7 @@ from ..utils import ( class TV2IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/v\d*/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { @@ -33,6 +33,9 @@ class TV2IE(InfoExtractor): 'view_count': int, 'categories': list, }, + }, { + 'url': 'http://www.tv2.no/v2/916509', + 'only_matching': True, }] _PROTOCOLS = ('HLS', 'DASH') _GEO_COUNTRIES = ['NO'] diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index 1e42b33a4..48e2c6e76 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -2,35 +2,40 @@ from __future__ import unicode_literals import itertools +import random import re from .common import InfoExtractor from ..utils import ( - clean_html, determine_ext, + dict_get, ExtractorError, - get_element_by_attribute, + int_or_none, + js_to_json, orderedSet, + str_or_none, + try_get, ) class TVPIE(InfoExtractor): IE_NAME = 'tvp' IE_DESC = 'Telewizja Polska' - _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' _TESTS = [{ 
+ # TVPlayer 2 in js wrapper 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', - 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, odc. 13 – Władek', 'description': 'md5:437f48b93558370b031740546b696e24', + 'age_limit': 12, }, }, { + # TVPlayer legacy 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', - 'md5': 'b0005b542e5b4de643a9690326ab1257', 'info_dict': { 'id': '17916176', 'ext': 'mp4', @@ -38,16 +43,63 @@ class TVPIE(InfoExtractor): 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', }, }, { - # page id is not the same as video id(#7799) - 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', - 'md5': '84cd3c8aec4840046e5ab712416b73d0', + # TVPlayer 2 in iframe + 'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow', 'info_dict': { - 'id': '33908820', + 'id': '50725617', 'ext': 'mp4', - 'title': 'Wiadomości, 28.09.2017, 19:30', - 'description': 'Wydanie główne codziennego serwisu informacyjnego.' 
+ 'title': 'Dzieci na sprzedaż dla homoseksualistów', + 'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590', + 'age_limit': 12, }, - 'skip': 'HTTP Error 404: Not Found', + }, { + # TVPlayer 2 in client-side rendered website (regional; window.__newsData) + 'url': 'https://warszawa.tvp.pl/25804446/studio-yayo', + 'info_dict': { + 'id': '25804446', + 'ext': 'mp4', + 'title': 'Studio Yayo', + 'upload_date': '20160616', + 'timestamp': 1466075700, + } + }, { + # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData) + 'url': 'https://www.tvp.info/52880236/09042021-0800', + 'info_dict': { + 'id': '52880236', + 'ext': 'mp4', + 'title': '09.04.2021, 08:00', + }, + }, { + # client-side rendered (regional) program (playlist) page + 'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia', + 'info_dict': { + 'id': '9660819', + 'description': 'Od poniedziałku do piątku o 18:55', + 'title': 'Rozmowa dnia', + }, + 'playlist_mincount': 1800, + 'params': { + 'skip_download': True, + } + }, { + # ABC-specific video embeding + # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450 + 'url': 'https://abc.tvp.pl/48636269/zubry-odc-124', + 'info_dict': { + 'id': '48320456', + 'ext': 'mp4', + 'title': 'Teleranek, Żubr', + }, + 'skip': 'unavailable', + }, { + # yet another vue page + 'url': 'https://jp2.tvp.pl/46925618/filmy', + 'info_dict': { + 'id': '46925618', + 'title': 'Filmy', + }, + 'playlist_mincount': 19, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 'only_matching': True, @@ -66,137 +118,344 @@ class TVPIE(InfoExtractor): }, { 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', 'only_matching': True, + }, { + 'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej', + 'only_matching': True, + }, { + 'url': 
'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277', + 'only_matching': True, + }, { + 'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm', + 'only_matching': True, }] + def _parse_vue_website_data(self, webpage, page_id): + website_data = self._search_regex([ + # website - regiony, tvp.info + # directory - jp2.tvp.pl + r'window\.__(?:website|directory)Data\s*=\s*({(?:.|\s)+?});', + ], webpage, 'website data') + if not website_data: + return None + return self._parse_json(website_data, page_id, transform_source=js_to_json) + + def _extract_vue_video(self, video_data, page_id=None): + if isinstance(video_data, str): + video_data = self._parse_json(video_data, page_id, transform_source=js_to_json) + thumbnails = [] + image = video_data.get('image') + if image: + for thumb in (image if isinstance(image, list) else [image]): + thmb_url = str_or_none(thumb.get('url')) + if thmb_url: + thumbnails.append({ + 'url': thmb_url, + }) + is_website = video_data.get('type') == 'website' + if is_website: + url = video_data['url'] + fucked_up_url_parts = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url) + if fucked_up_url_parts: + url = f'https://vod.tvp.pl/website/{fucked_up_url_parts.group(2)},{fucked_up_url_parts.group(1)}' + else: + url = 'tvp:' + str_or_none(video_data.get('_id') or page_id) + return { + '_type': 'url_transparent', + 'id': str_or_none(video_data.get('_id') or page_id), + 'url': url, + 'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite', + 'title': str_or_none(video_data.get('title')), + 'description': str_or_none(video_data.get('lead')), + 'timestamp': int_or_none(video_data.get('release_date_long')), + 'duration': int_or_none(video_data.get('duration')), + 'thumbnails': thumbnails, + } + + def _handle_vuejs_page(self, url, webpage, page_id): + # vue client-side rendered sites (all regional pages + tvp.info) + video_data = 
self._search_regex([ + r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;', + ], webpage, 'video data', default=None) + if video_data: + return self._extract_vue_video(video_data, page_id=page_id) + # paged playlists + website_data = self._parse_vue_website_data(webpage, page_id) + if website_data: + entries = self._vuejs_entries(url, website_data, page_id) + + return { + '_type': 'playlist', + 'id': page_id, + 'title': str_or_none(website_data.get('title')), + 'description': str_or_none(website_data.get('lead')), + 'entries': entries, + } + raise ExtractorError('Could not extract video/website data') + + def _vuejs_entries(self, url, website_data, page_id): + + def extract_videos(wd): + if wd.get('latestVideo'): + yield self._extract_vue_video(wd['latestVideo']) + for video in wd.get('videos') or []: + yield self._extract_vue_video(video) + for video in wd.get('items') or []: + yield self._extract_vue_video(video) + + yield from extract_videos(website_data) + + if website_data.get('items_total_count') > website_data.get('items_per_page'): + for page in itertools.count(2): + page_website_data = self._parse_vue_website_data( + self._download_webpage(url, page_id, note='Downloading page #%d' % page, + query={'page': page}), + page_id) + if not page_website_data.get('videos') and not page_website_data.get('items'): + break + yield from extract_videos(page_website_data) + def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + webpage, urlh = self._download_webpage_handle(url, page_id) + + # The URL may redirect to a VOD + # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii + if TVPWebsiteIE.suitable(urlh.url): + return self.url_result(urlh.url, ie=TVPWebsiteIE.ie_key(), video_id=page_id) + + if re.search( + r'window\.__(?:video|news|website|directory)Data\s*=', + webpage): + return self._handle_vuejs_page(url, webpage, page_id) + + # classic server-side rendered sites video_id = 
self._search_regex([ + r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)', r'<iframe[^>]+src="[^"]*?object_id=(\d+)', r"object_id\s*:\s*'(\d+)'", - r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id) + r'data-video-id="(\d+)"', + + # abc.tvp.pl - somehow there are more than one video IDs that seem to be the same video? + # the first one is referenced to as "copyid", and seems to be unused by the website + r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>', + ], webpage, 'video id', default=page_id) return { '_type': 'url_transparent', 'url': 'tvp:' + video_id, 'description': self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage, default=None), + webpage, default=None) or (self._html_search_meta( + 'description', webpage, default=None) + if '//s.tvp.pl/files/portal/v' in webpage else None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'ie_key': 'TVPEmbed', } +class TVPStreamIE(InfoExtractor): + IE_NAME = 'tvp:stream' + _VALID_URL = r'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)' + _TESTS = [{ + # untestable as "video" id changes many times across a day + 'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455', + 'only_matching': True, + }, { + 'url': 'tvpstream:39821455', + 'only_matching': True, + }, { + # the default stream when you provide no channel_id, most probably TVP Info + 'url': 'tvpstream:', + 'only_matching': True, + }, { + 'url': 'https://tvpstream.vod.tvp.pl/', + 'only_matching': True, + }] + + _PLAYER_BOX_RE = r'<div\s[^>]*id\s*=\s*["\']?tvp_player_box["\']?[^>]+data-%s-id\s*=\s*["\']?(\d+)' + _BUTTON_RE = r'<div\s[^>]*data-channel-id=["\']?%s["\']?[^>]*\sdata-title=(?:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^"]*)"|\'([^\']*)\')' + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel_url = self._proto_relative_url('//tvpstream.vod.tvp.pl/?channel_id=%s' % 
channel_id or 'default') + webpage = self._download_webpage(channel_url, channel_id, 'Downloading channel webpage') + if not channel_id: + channel_id = self._search_regex(self._PLAYER_BOX_RE % 'channel', + webpage, 'default channel id') + video_id = self._search_regex(self._PLAYER_BOX_RE % 'video', + webpage, 'video id') + audition_title, station_name = self._search_regex( + self._BUTTON_RE % (re.escape(channel_id)), webpage, + 'audition title and station name', + group=(1, 2)) + return { + '_type': 'url_transparent', + 'id': channel_id, + 'url': 'tvp:%s' % video_id, + 'title': audition_title, + 'alt_title': station_name, + 'is_live': True, + 'ie_key': 'TVPEmbed', + } + + class TVPEmbedIE(InfoExtractor): IE_NAME = 'tvp:embed' IE_DESC = 'Telewizja Polska' - _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + (?: + tvp: + |https?:// + (?:[^/]+\.)? + (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/ + (?:sess/ + (?:tvplayer\.php\?.*?object_id + |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd]) + |shared/details\.php\?.*?object_id) + =) + (?P<id>\d+) + ''' _TESTS = [{ 'url': 'tvp:194536', - 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, odc. 
13 – Władek', + 'description': 'md5:76649d2014f65c99477be17f23a4dead', + 'age_limit': 12, }, }, { - # not available - 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', - 'md5': '8c9cd59d16edabf39331f93bf8a766c7', + 'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&autoplay=false', 'info_dict': { - 'id': '22670268', + 'id': '51247504', 'ext': 'mp4', - 'title': 'Panorama, 07.12.2015, 15:40', + 'title': 'Razmova 091220', }, - 'skip': 'Transmisja została zakończona lub materiał niedostępny', }, { - 'url': 'tvp:22670268', + # TVPlayer2 embed URL + 'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757', + 'only_matching': True, + }, { + 'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452', + 'only_matching': True, + }, { + # pulsembed on dziennik.pl + 'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html', 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage, **kw): + return [m.group('embed') for m in re.finditer( + r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:], + webpage)] + def _real_extract(self, url): video_id = self._match_id(url) + # it could be anything that is a valid JS function name + callback = random.choice(( + 'jebac_pis', + 'jebacpis', + 'ziobro', + 'sasin70', + 'sasin_przejebal_70_milionow_PLN', + 'tvp_is_a_state_propaganda_service', + )) + webpage = self._download_webpage( - 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) - - error = self._html_search_regex( - r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', - webpage, 'error', default=None) or clean_html( - get_element_by_attribute('class', 'msg error', webpage)) - if error: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, clean_html(error)), expected=True) - - title = self._search_regex( - 
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', - webpage, 'title', group='title') - series_title = self._search_regex( - r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1', - webpage, 'series', group='series', default=None) - if series_title: - title = '%s, %s' % (series_title, title) - - thumbnail = self._search_regex( - r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) - - video_url = self._search_regex( - r'0:{src:([\'"])(?P<url>.*?)\1', webpage, - 'formats', group='url', default=None) - if not video_url or 'material_niedostepny.mp4' in video_url: - video_url = self._download_json( - 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, - video_id)['video_url'] + ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s' + + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id) + + # stripping JSONP padding + datastr = webpage[15 + len(callback):-3] + if datastr.startswith('null,'): + error = self._parse_json(datastr[5:], video_id) + raise ExtractorError(error[0]['desc']) + + content = self._parse_json(datastr, video_id)['content'] + info = content['info'] + is_live = try_get(info, lambda x: x['isLive'], bool) formats = [] - video_url_base = self._search_regex( - r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', - video_url, 'video base url', default=None) - if video_url_base: - # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. - # It's not mentioned in MPEG-DASH standard. Figure that out. 
- # formats.extend(self._extract_mpd_formats( - # video_url_base + '.ism/video.mpd', - # video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_ism_formats( - video_url_base + '.ism/Manifest', - video_id, 'mss', fatal=False)) - formats.extend(self._extract_f4m_formats( - video_url_base + '.ism/video.f4m', - video_id, f4m_id='hds', fatal=False)) - m3u8_formats = self._extract_m3u8_formats( - video_url_base + '.ism/video.m3u8', video_id, - 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - formats.extend(m3u8_formats) - for i, m3u8_format in enumerate(m3u8_formats, 2): - http_url = '%s-%d.mp4' % (video_url_base, i) - if self._is_valid_url(http_url, video_id): - f = m3u8_format.copy() - f.update({ - 'url': http_url, - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - formats = [{ - 'format_id': 'direct', - 'url': video_url, - 'ext': determine_ext(video_url, 'mp4'), - }] + for file in content['files']: + video_url = file.get('url') + if not video_url: + continue + if video_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live)) + elif video_url.endswith('.mpd'): + if is_live: + # doesn't work with either ffmpeg or native downloader + continue + formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False)) + elif video_url.endswith('.f4m'): + formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False)) + elif video_url.endswith('.ism/manifest'): + formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False)) + else: + # mp4, wmv or something + quality = file.get('quality', {}) + formats.append({ + 'format_id': 'direct', + 'url': video_url, + 'ext': determine_ext(video_url, file['type']), + 'fps': 
int_or_none(quality.get('fps')), + 'tbr': int_or_none(quality.get('bitrate')), + 'width': int_or_none(quality.get('width')), + 'height': int_or_none(quality.get('height')), + }) self._sort_formats(formats) - return { + title = dict_get(info, ('subtitle', 'title', 'seoTitle')) + description = dict_get(info, ('description', 'seoDescription')) + thumbnails = [] + for thumb in content.get('posters') or (): + thumb_url = thumb.get('src') + if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url: + continue + thumbnails.append({ + 'url': thumb.get('src'), + 'width': thumb.get('width'), + 'height': thumb.get('height'), + }) + age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int) + if age_limit == 1: + age_limit = 0 + duration = try_get(info, lambda x: x['duration'], int) if not is_live else None + + subtitles = {} + for sub in content.get('subtitles') or []: + if not sub.get('url'): + continue + subtitles.setdefault(sub['lang'], []).append({ + 'url': sub['url'], + 'ext': sub.get('type'), + }) + + info_dict = { 'id': video_id, 'title': title, - 'thumbnail': thumbnail, + 'description': description, + 'thumbnails': thumbnails, + 'age_limit': age_limit, + 'is_live': is_live, + 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } + # vod.tvp.pl + if info.get('vortalName') == 'vod': + info_dict.update({ + 'title': '%s, %s' % (info.get('title'), info.get('subtitle')), + 'series': info.get('title'), + 'season': info.get('season'), + 'episode_number': info.get('episode'), + }) + + return info_dict + class TVPWebsiteIE(InfoExtractor): IE_NAME = 'tvp:series' @@ -204,18 +463,20 @@ class TVPWebsiteIE(InfoExtractor): _TESTS = [{ # series - 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', + 'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video', 'info_dict': { - 'id': '38678312', + 'id': '17069012', }, - 'playlist_count': 115, + 'playlist_count': 312, }, { # film - 'url': 
'https://vod.tvp.pl/website/gloria,35139666', + 'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466', 'info_dict': { - 'id': '36637049', + 'id': '51374509', 'ext': 'mp4', - 'title': 'Gloria, Gloria', + 'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie', + 'description': 'md5:2e80823f00f5fc263555482f76f8fa42', + 'age_limit': 12, }, 'params': { 'skip_download': True, diff --git a/yt_dlp/extractor/tvplay.py b/yt_dlp/extractor/tvplay.py index fbafb41f8..b5dbc5526 100644 --- a/yt_dlp/extractor/tvplay.py +++ b/yt_dlp/extractor/tvplay.py @@ -12,9 +12,9 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - parse_duration, parse_iso8601, qualities, + traverse_obj, try_get, update_url_query, url_or_none, @@ -369,7 +369,6 @@ class ViafreeIE(InfoExtractor): 'upload_date': '20201217' }, 'params': { - 'format': 'bestvideo', 'skip_download': True } }, { @@ -432,77 +431,96 @@ class ViafreeIE(InfoExtractor): class TVPlayHomeIE(InfoExtractor): - _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:tv3?)? + play\.(?:tv3|skaties)\.(?P<country>lv|lt|ee)/ + (?P<live>lives/)? + [^?#&]+(?:episode|programme|clip)-(?P<id>\d+) + ''' _TESTS = [{ - 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', + 'url': 'https://play.tv3.lt/series/gauju-karai-karveliai,serial-2343791/serija-8,episode-2343828', 'info_dict': { - 'id': '366367', + 'id': '2343828', 'ext': 'mp4', - 'title': 'Aferistai', - 'description': 'Aferistai. Kalėdinė pasaka.', - 'series': 'Aferistai [N-7]', - 'season': '1 sezonas', + 'title': 'Gaujų karai. Karveliai (2021) | S01E08: Serija 8', + 'description': 'md5:f6fcfbb236429f05531131640dfa7c81', + 'duration': 2710, + 'season': 'Gaujų karai. 
Karveliai', 'season_number': 1, - 'duration': 464, - 'timestamp': 1394209658, - 'upload_date': '20140307', - 'age_limit': 18, + 'release_year': 2021, + 'episode': 'Serija 8', + 'episode_number': 8, }, 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { - 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', - 'only_matching': True, + 'url': 'https://play.tv3.lt/series/moterys-meluoja-geriau-n-7,serial-2574652/serija-25,episode-3284937', + 'info_dict': { + 'id': '3284937', + 'ext': 'mp4', + 'season': 'Moterys meluoja geriau [N-7]', + 'season_number': 14, + 'release_year': 2021, + 'episode': 'Serija 25', + 'episode_number': 25, + 'title': 'Moterys meluoja geriau [N-7] (2021) | S14|E25: Serija 25', + 'description': 'md5:c6926e9710f1a126f028fbe121eddb79', + 'duration': 2440, + }, + 'skip': '404' }, { - 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', + 'url': 'https://play.tv3.lt/lives/tv6-lt,live-2838694/optibet-a-lygos-rungtynes-marijampoles-suduva--vilniaus-riteriai,programme-3422014', 'only_matching': True, }, { - 'url': 'https://play.tv3.lt/aferistai-10047125', + 'url': 'https://tv3play.skaties.lv/series/women-lie-better-lv,serial-1024464/women-lie-better-lv,episode-1038762', 'only_matching': True, }, { - 'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317', + 'url': 'https://play.tv3.ee/series/_,serial-2654462/_,episode-2654474', 'only_matching': True, }, { - 'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354', + 'url': 'https://tv3play.skaties.lv/clips/tv3-zinas-valsti-lidz-15novembrim-bus-majsede,clip-3464509', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + country, is_live, video_id = self._match_valid_url(url).groups() - asset = self._download_json( - urljoin(url, '/sb/public/asset/' + video_id), video_id) + api_path = 'lives/programmes' if is_live else 'vods' + data = self._download_json( + urljoin(url, 
f'/api/products/{api_path}/{video_id}?platform=BROWSER&lang={country.upper()}'), + video_id) - m3u8_url = asset['movie']['contentUrl'] - video_id = asset['assetId'] - asset_title = asset['title'] - title = asset_title['title'] - - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + video_type = 'CATCHUP' if is_live else 'MOVIE' + stream_id = data['programRecordingId'] if is_live else video_id + stream = self._download_json( + urljoin(url, f'/api/products/{stream_id}/videos/playlist?videoType={video_type}&platform=BROWSER'), video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + stream['sources']['HLS'][0]['src'], video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) - thumbnails = None - image_url = asset.get('imageUrl') - if image_url: - thumbnails = [{ - 'url': urljoin(url, image_url), - 'ext': 'jpg', - }] - - metadata = asset.get('metadata') or {} + thumbnails = set(traverse_obj( + data, (('galary', 'images', 'artworks'), ..., ..., ('miniUrl', 'mainUrl')), expected_type=url_or_none)) return { 'id': video_id, - 'title': title, - 'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'), - 'thumbnails': thumbnails, - 'duration': parse_duration(asset_title.get('runTime')), - 'series': asset.get('tvSeriesTitle'), - 'season': asset.get('tvSeasonTitle'), - 'season_number': int_or_none(metadata.get('seasonNumber')), - 'episode': asset_title.get('titleBrief'), - 'episode_number': int_or_none(metadata.get('episodeNumber')), + 'title': self._resolve_title(data), + 'description': traverse_obj(data, 'description', 'lead'), + 'duration': int_or_none(data.get('duration')), + 'season': traverse_obj(data, ('season', 'serial', 'title')), + 'season_number': int_or_none(traverse_obj(data, ('season', 'number'))), + 'episode': data.get('title'), + 'episode_number': int_or_none(data.get('episode')), + 'release_year': int_or_none(traverse_obj(data, ('season', 'serial', 
'year'))), + 'thumbnails': [{'url': url, 'ext': 'jpg'} for url in thumbnails], 'formats': formats, + 'subtitles': subtitles, } + + @staticmethod + def _resolve_title(data): + return try_get(data, lambda x: ( + f'{data["season"]["serial"]["title"]} ({data["season"]["serial"]["year"]}) | ' + f'S{data["season"]["number"]:02d}E{data["episode"]:02d}: {data["title"]}' + )) or data.get('title') diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index be70beed4..cd97f0a24 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -24,6 +24,8 @@ from ..utils import ( parse_iso8601, parse_qs, qualities, + str_or_none, + traverse_obj, try_get, unified_timestamp, update_url_query, @@ -52,6 +54,7 @@ class TwitchBaseIE(InfoExtractor): 'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11', 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687', + 'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41', } def _real_initialize(self): @@ -249,6 +252,38 @@ class TwitchVodIE(TwitchBaseIE): }, { 'url': 'https://player.twitch.tv/?video=480452374', 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/videos/635475444', + 'info_dict': { + 'id': 'v635475444', + 'ext': 'mp4', + 'title': 'Riot Games', + 'duration': 11643, + 'uploader': 'Riot Games', + 'uploader_id': 'riotgames', + 'timestamp': 1590770569, + 'upload_date': '20200529', + 'chapters': [ + { + 'start_time': 0, + 'end_time': 573, + 'title': 'League of Legends' + }, + { + 'start_time': 573, + 'end_time': 3922, + 'title': 'Legends of Runeterra' + }, + { + 'start_time': 3922, + 'end_time': 11643, + 'title': 'Art' + } + ], + }, + 'params': { + 'skip_download': True + } }] def _download_info(self, item_id): @@ -259,16 +294,24 @@ class TwitchVodIE(TwitchBaseIE): 'channelLogin': '', 
'videoID': item_id, }, + }, { + 'operationName': 'VideoPlayer_ChapterSelectButtonVideo', + 'variables': { + 'includePrivate': False, + 'videoID': item_id, + }, }], - 'Downloading stream metadata GraphQL')[0]['data'] - video = data.get('video') + 'Downloading stream metadata GraphQL') + + video = traverse_obj(data, (0, 'data', 'video')) + video['moments'] = traverse_obj(data, (1, 'data', 'video', 'moments', 'edges', ..., 'node')) + if video is None: raise ExtractorError( 'Video %s does not exist' % item_id, expected=True) return self._extract_info_gql(video, item_id) - @staticmethod - def _extract_info(info): + def _extract_info(self, info): status = info.get('status') if status == 'recording': is_live = True @@ -302,18 +345,39 @@ class TwitchVodIE(TwitchBaseIE): 'timestamp': parse_iso8601(info.get('recorded_at')), 'view_count': int_or_none(info.get('views')), 'is_live': is_live, + 'was_live': True, } - @staticmethod - def _extract_info_gql(info, item_id): + def _extract_moments(self, info, item_id): + for moment in info.get('moments') or []: + start_time = int_or_none(moment.get('positionMilliseconds'), 1000) + duration = int_or_none(moment.get('durationMilliseconds'), 1000) + name = str_or_none(moment.get('description')) + + if start_time is None or duration is None: + self.report_warning(f'Important chapter information missing for chapter {name}', item_id) + continue + yield { + 'start_time': start_time, + 'end_time': start_time + duration, + 'title': name, + } + + def _extract_info_gql(self, info, item_id): vod_id = info.get('id') or item_id # id backward compatibility for download archives if vod_id[0] != 'v': vod_id = 'v%s' % vod_id thumbnail = url_or_none(info.get('previewThumbnailURL')) + is_live = None if thumbnail: - for p in ('width', 'height'): - thumbnail = thumbnail.replace('{%s}' % p, '0') + if thumbnail.endswith('/404_processing_{width}x{height}.png'): + is_live, thumbnail = True, None + else: + is_live = False + for p in ('width', 'height'): + 
thumbnail = thumbnail.replace('{%s}' % p, '0') + return { 'id': vod_id, 'title': info.get('title') or 'Untitled Broadcast', @@ -324,6 +388,9 @@ class TwitchVodIE(TwitchBaseIE): 'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str), 'timestamp': unified_timestamp(info.get('publishedAt')), 'view_count': int_or_none(info.get('viewCount')), + 'chapters': list(self._extract_moments(info, item_id)), + 'is_live': is_live, + 'was_live': True, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/ustream.py b/yt_dlp/extractor/ustream.py index 8b758795f..4a7a8f879 100644 --- a/yt_dlp/extractor/ustream.py +++ b/yt_dlp/extractor/ustream.py @@ -13,6 +13,7 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, + join_nonempty, mimetype2ext, str_or_none, ) @@ -139,8 +140,8 @@ class UstreamIE(InfoExtractor): content_type = stream['contentType'] kind = content_type.split('/')[0] f = { - 'format_id': '-'.join(filter(None, [ - 'dash', kind, str_or_none(stream.get('bitrate'))])), + 'format_id': join_nonempty( + 'dash', kind, str_or_none(stream.get('bitrate'))), 'protocol': 'http_dash_segments', # TODO: generate a MPD doc for external players? 
'url': encode_data_uri(b'<MPD/>', 'text/xml'), diff --git a/yt_dlp/extractor/vice.py b/yt_dlp/extractor/vice.py index ca4d3edbd..c8c30559e 100644 --- a/yt_dlp/extractor/vice.py +++ b/yt_dlp/extractor/vice.py @@ -290,7 +290,6 @@ class ViceArticleIE(ViceBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, 'add_ie': [ViceIE.ie_key()], }, { diff --git a/yt_dlp/extractor/vidlii.py b/yt_dlp/extractor/vidlii.py index f4774256b..ce7487ec1 100644 --- a/yt_dlp/extractor/vidlii.py +++ b/yt_dlp/extractor/vidlii.py @@ -5,9 +5,11 @@ import re from .common import InfoExtractor from ..utils import ( + HEADRequest, float_or_none, get_element_by_id, int_or_none, + str_to_int, strip_or_none, unified_strdate, urljoin, @@ -36,6 +38,25 @@ class VidLiiIE(InfoExtractor): 'tags': ['Vidlii', 'Jan', 'Videogames'], } }, { + 'url': 'https://www.vidlii.com/watch?v=zTAtaAgOLKt', + 'md5': '5778f7366aa4c569b77002f8bf6b614f', + 'info_dict': { + 'id': 'zTAtaAgOLKt', + 'ext': 'mp4', + 'title': 'FULPTUBE SUCKS.', + 'description': 'md5:087b2ca355d4c8f8f77e97c43e72d711', + 'thumbnail': 'https://www.vidlii.com/usfi/thmp/zTAtaAgOLKt.jpg', + 'uploader': 'Homicide', + 'uploader_url': 'https://www.vidlii.com/user/Homicide', + 'upload_date': '20210612', + 'duration': 89, + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['News & Politics'], + 'tags': ['fulp', 'tube', 'sucks', 'bad', 'fulptube'], + }, + }, { 'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0', 'only_matching': True, }] @@ -45,10 +66,20 @@ class VidLiiIE(InfoExtractor): webpage = self._download_webpage( 'https://www.vidlii.com/watch?v=%s' % video_id, video_id) - - video_url = self._search_regex( - r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage, - 'video url', group='url') + formats = [] + + sources = [source[1] for source in re.findall( + r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', + webpage) or []] + for source in sources: + height = 
int(self._search_regex(r'(\d+).mp4', source, 'height', default=360)) + if self._request_webpage(HEADRequest(source), video_id, f'Checking {height}p url', errnote=False): + formats.append({ + 'url': source, + 'format_id': f'{height}p', + 'height': height, + }) + self._sort_formats(formats) title = self._search_regex( (r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage, @@ -82,9 +113,9 @@ class VidLiiIE(InfoExtractor): default=None) or self._search_regex( r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - (r'<strong>(\d+)</strong> views', - r'Views\s*:\s*<strong>(\d+)</strong>'), + view_count = str_to_int(self._search_regex( + (r'<strong>([,0-9]+)</strong> views', + r'Views\s*:\s*<strong>([,0-9]+)</strong>'), webpage, 'view count', fatal=False)) comment_count = int_or_none(self._search_regex( @@ -109,11 +140,11 @@ class VidLiiIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, + 'formats': formats, 'uploader_url': uploader_url, 'upload_date': upload_date, 'duration': duration, diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py index acb5ae550..6a3c5532d 100644 --- a/yt_dlp/extractor/viki.py +++ b/yt_dlp/extractor/viki.py @@ -135,9 +135,6 @@ class VikiIE(VikiBaseIE): 'uploader': 'FCC', 'upload_date': '20201127', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { @@ -151,9 +148,6 @@ class VikiIE(VikiBaseIE): 'duration': 3570, 'episode_number': 14, }, - 'params': { - 'format': 'bestvideo', - }, 'skip': 'Blocked in the US', }, { # clip @@ -203,9 +197,6 @@ class VikiIE(VikiBaseIE): 'age_limit': 13, 'episode_number': 1, }, - 'params': { - 'format': 'bestvideo', - }, }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -241,9 +232,6 @@ class VikiIE(VikiBaseIE): 'title': 
'Love In Magic', 'age_limit': 13, }, - 'params': { - 'format': 'bestvideo', - }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 04c504934..e2b86662b 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -18,6 +18,7 @@ from ..utils import ( determine_ext, ExtractorError, get_element_by_class, + HEADRequest, js_to_json, int_or_none, merge_dicts, @@ -35,6 +36,7 @@ from ..utils import ( urlencode_postdata, urljoin, unescapeHTML, + urlhandle_detect_ext, ) @@ -229,27 +231,26 @@ class VimeoBaseInfoExtractor(InfoExtractor): query['unlisted_hash'] = unlisted_hash download_data = self._download_json( url, video_id, fatal=False, query=query, - headers={'X-Requested-With': 'XMLHttpRequest'}) - if download_data: - source_file = download_data.get('source_file') - if isinstance(source_file, dict): - download_url = source_file.get('download_url') - if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): - source_name = source_file.get('public_name', 'Original') - if self._is_valid_url(download_url, video_id, '%s video' % source_name): - ext = (try_get( - source_file, lambda x: x['extension'], - compat_str) or determine_ext( - download_url, None) or 'mp4').lower() - return { - 'url': download_url, - 'ext': ext, - 'width': int_or_none(source_file.get('width')), - 'height': int_or_none(source_file.get('height')), - 'filesize': parse_filesize(source_file.get('size')), - 'format_id': source_name, - 'quality': 1, - } + headers={'X-Requested-With': 'XMLHttpRequest'}, + expected_status=(403, 404)) or {} + source_file = download_data.get('source_file') + download_url = try_get(source_file, lambda x: x['download_url']) + if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): + source_name = source_file.get('public_name', 'Original') + if self._is_valid_url(download_url, video_id, '%s video' % source_name): + ext = (try_get( + source_file, 
lambda x: x['extension'], + compat_str) or determine_ext( + download_url, None) or 'mp4').lower() + return { + 'url': download_url, + 'ext': ext, + 'width': int_or_none(source_file.get('width')), + 'height': int_or_none(source_file.get('height')), + 'filesize': parse_filesize(source_file.get('size')), + 'format_id': source_name, + 'quality': 1, + } jwt_response = self._download_json( 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {} @@ -258,15 +259,19 @@ class VimeoBaseInfoExtractor(InfoExtractor): headers = {'Authorization': 'jwt %s' % jwt_response['jwt']} original_response = self._download_json( f'https://api.vimeo.com/videos/{video_id}', video_id, - headers=headers, fatal=False) or {} - for download_data in original_response.get('download') or {}: + headers=headers, fatal=False, expected_status=(403, 404)) or {} + for download_data in original_response.get('download') or []: download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': continue - query = parse_qs(download_url) + ext = determine_ext(parse_qs(download_url).get('filename', [''])[0].lower(), default_ext=None) + if not ext: + urlh = self._request_webpage( + HEADRequest(download_url), video_id, fatal=False, note='Determining source extension') + ext = urlh and urlhandle_detect_ext(urlh) return { 'url': download_url, - 'ext': determine_ext(query.get('filename', [''])[0].lower()), + 'ext': ext or 'unknown_video', 'format_id': download_data.get('public_name', 'Original'), 'width': int_or_none(download_data.get('width')), 'height': int_or_none(download_data.get('height')), @@ -291,7 +296,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? vimeo(?:pro)?\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:.*?/)? + (?:[^/]+/)*? 
(?: (?: play_redirect_hls| @@ -362,7 +367,6 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'format': 'best[protocol=https]', }, - 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://vimeo.com/68375962', @@ -402,7 +406,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'upload_date': '20130928', 'duration': 187, }, - 'expected_warnings': ['Unable to download JSON metadata'], + 'params': {'format': 'http-1080p'}, }, { 'url': 'http://vimeo.com/76979871', @@ -424,7 +428,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'es': [{'ext': 'vtt'}], 'fr': [{'ext': 'vtt'}], }, - } + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], }, { # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ @@ -469,7 +474,6 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': 'md5:f2edc61af3ea7a5592681ddbb683db73', 'upload_date': '20200225', }, - 'expected_warnings': ['Unable to download JSON metadata'], }, { # only available via https://vimeo.com/channels/tributes/6213729 and @@ -491,7 +495,6 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, - 'expected_warnings': ['Unable to download JSON metadata'], }, { # redirects to ondemand extractor and should be passed through it @@ -511,7 +514,6 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, - 'expected_warnings': ['Unable to download JSON metadata'], 'skip': 'this page is no longer available.', }, { @@ -572,14 +574,55 @@ class VimeoIE(VimeoBaseInfoExtractor): 'only_matching': True, }, { + 'note': 'Direct URL with hash', 'url': 'https://vimeo.com/160743502/abd0e13fb4', - 'only_matching': True, + 'info_dict': { + 'id': '160743502', + 'ext': 'mp4', + 'uploader': 'Julian Tryba', + 'uploader_id': 'aliniamedia', + 'title': 'Harrisville New Hampshire', + 'timestamp': 1459259666, + 'upload_date': '20160329', + }, + 'params': {'skip_download': True}, + }, + { + 'url': 'https://vimeo.com/138909882', + 'info_dict': { + 'id': 
'138909882', + 'ext': 'mp4', + 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', + 'description': 'md5:5967e090768a831488f6e74b7821b3c1', + 'uploader_id': 'fireworkchampions', + 'uploader': 'Firework Champions', + 'upload_date': '20150910', + 'timestamp': 1441901895, + }, + 'params': { + 'skip_download': True, + 'format': 'Original', + }, + }, + { + 'url': 'https://vimeo.com/channels/staffpicks/143603739', + 'info_dict': { + 'id': '143603739', + 'ext': 'mp4', + 'uploader': 'Karim Huu Do', + 'timestamp': 1445846953, + 'upload_date': '20151026', + 'title': 'The Shoes - Submarine Feat. Blaine Harrison', + 'uploader_id': 'karimhd', + 'description': 'md5:8e2eea76de4504c2e8020a9bcfa1e843', + }, + 'params': {'skip_download': 'm3u8'}, }, { # requires passing unlisted_hash(a52724358e) to load_download_config request 'url': 'https://vimeo.com/392479337/a52724358e', 'only_matching': True, - } + }, # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header ] @@ -708,7 +751,8 @@ class VimeoIE(VimeoBaseInfoExtractor): headers['Referer'] = url # Extract ID from URL - video_id, unlisted_hash = self._match_valid_url(url).groups() + mobj = self._match_valid_url(url).groupdict() + video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash') if unlisted_hash: return self._extract_from_api(video_id, unlisted_hash) @@ -768,18 +812,19 @@ class VimeoIE(VimeoBaseInfoExtractor): timestamp = None video_description = None info_dict = {} + config_url = None channel_id = self._search_regex( r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) if channel_id: config_url = self._html_search_regex( - r'\bdata-config-url="([^"]+)"', webpage, 'config URL') + r'\bdata-config-url="([^"]+)"', webpage, 'config URL', default=None) video_description = clean_html(get_element_by_class('description', webpage)) info_dict.update({ 'channel_id': channel_id, 'channel_url': 'https://vimeo.com/channels/' + channel_id, }) - else: + if 
not config_url: page_config = self._parse_json(self._search_regex( r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', webpage, 'page config', default='{}'), video_id, fatal=False) @@ -1100,10 +1145,10 @@ class VimeoGroupsIE(VimeoChannelIE): IE_NAME = 'vimeo:group' _VALID_URL = r'https://vimeo\.com/groups/(?P<id>[^/]+)(?:/(?!videos?/\d+)|$)' _TESTS = [{ - 'url': 'https://vimeo.com/groups/kattykay', + 'url': 'https://vimeo.com/groups/meetup', 'info_dict': { - 'id': 'kattykay', - 'title': 'Katty Kay', + 'id': 'meetup', + 'title': 'Vimeo Meetup!', }, 'playlist_mincount': 27, }] @@ -1125,7 +1170,6 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'uploader_id': 'user21297594', 'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks", }, - 'expected_warnings': ['Unable to download JSON metadata'], }, { 'note': 'video player needs Referer', 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index d8a9b9ab4..9a5c9ee6b 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -51,7 +51,7 @@ class VKBaseIE(InfoExtractor): self._apply_first_set_cookie_header(url_handle, 'remixlhk') login_page = self._download_webpage( - 'https://login.vk.com/?act=login', None, + 'https://vk.com/login', None, note='Logging in', data=urlencode_postdata(login_form)) @@ -471,6 +471,13 @@ class VKIE(VKBaseIE): }) self._sort_formats(formats) + subtitles = {} + for sub in data.get('subs') or {}: + subtitles.setdefault(sub.get('lang', 'en'), []).append({ + 'ext': sub.get('title', '.srt').split('.')[-1], + 'url': url_or_none(sub.get('url')), + }) + return { 'id': video_id, 'formats': formats, @@ -484,6 +491,7 @@ class VKIE(VKBaseIE): 'like_count': int_or_none(mv_data.get('likes')), 'comment_count': int_or_none(mv_data.get('commcount')), 'is_live': is_live, + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/vlive.py 
b/yt_dlp/extractor/vlive.py index 4340b1d4c..8fccf1b63 100644 --- a/yt_dlp/extractor/vlive.py +++ b/yt_dlp/extractor/vlive.py @@ -12,6 +12,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + LazyList, merge_dicts, str_or_none, strip_or_none, @@ -363,11 +364,10 @@ class VLiveChannelIE(VLiveBaseIE): if board.get('boardType') not in ('STAR', 'VLIVE_PLUS'): raise ExtractorError(f'Board {board_name!r} is not supported', expected=True) - entries = self._entries(posts_id or channel_id, board_name) - first_video = next(entries) - channel_name = first_video['channel'] + entries = LazyList(self._entries(posts_id or channel_id, board_name)) + channel_name = entries[0]['channel'] return self.playlist_result( - itertools.chain([first_video], entries), + entries, f'{channel_id}-{posts_id}' if posts_id else channel_id, f'{channel_name} - {board_name}' if channel_name and board_name else channel_name) diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py index 419602148..7bc55f333 100644 --- a/yt_dlp/extractor/vrv.py +++ b/yt_dlp/extractor/vrv.py @@ -19,6 +19,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + join_nonempty, traverse_obj, ) @@ -141,14 +142,10 @@ class VRVIE(VRVBaseIE): def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): return [] - stream_id_list = [] - if audio_lang: - stream_id_list.append('audio-%s' % audio_lang) - if hardsub_lang: - stream_id_list.append('hardsub-%s' % hardsub_lang) - format_id = stream_format - if stream_id_list: - format_id += '-' + '-'.join(stream_id_list) + format_id = join_nonempty( + stream_format, + audio_lang and 'audio-%s' % audio_lang, + hardsub_lang and 'hardsub-%s' % hardsub_lang) if 'hls' in stream_format: adaptive_formats = self._extract_m3u8_formats( url, video_id, 'mp4', m3u8_id=format_id, diff --git a/yt_dlp/extractor/vupload.py b/yt_dlp/extractor/vupload.py index 
9846ababc..2229a6591 100644 --- a/yt_dlp/extractor/vupload.py +++ b/yt_dlp/extractor/vupload.py @@ -7,6 +7,7 @@ from ..utils import ( parse_filesize, extract_attributes, int_or_none, + js_to_json ) @@ -28,8 +29,11 @@ class VuploadIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') - video_e = self._html_search_regex(r'\|([a-z0-9]{60})\|', webpage, 'video') - video_url = f'https://wurize.megaupload.to/{video_e}/v.mp4' + video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json) + formats = [] + for source in video_json: + if source['src'].endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(source['src'], video_id, m3u8_id='hls')) duration = parse_duration(self._html_search_regex( r'<i\s*class=["\']fad\s*fa-clock["\']></i>\s*([\d:]+)\s*</div>', webpage, 'duration', fatal=False)) filesize_approx = parse_filesize(self._html_search_regex( @@ -40,7 +44,7 @@ class VuploadIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'duration': duration, 'filesize_approx': filesize_approx, 'width': int_or_none(extra_video_info.get('width')), diff --git a/yt_dlp/extractor/wakanim.py b/yt_dlp/extractor/wakanim.py index a61a630e2..a70a71961 100644 --- a/yt_dlp/extractor/wakanim.py +++ b/yt_dlp/extractor/wakanim.py @@ -25,7 +25,6 @@ class WakanimIE(InfoExtractor): 'episode_number': 2, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { diff --git a/yt_dlp/extractor/watchbox.py b/yt_dlp/extractor/watchbox.py index 7469fe962..d19d80102 100644 --- a/yt_dlp/extractor/watchbox.py +++ b/yt_dlp/extractor/watchbox.py @@ -30,7 +30,6 @@ class WatchBoxIE(InfoExtractor): 'release_year': 2009, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to download m3u8 information'], @@ -52,7 +51,6 @@ class WatchBoxIE(InfoExtractor): 
'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to download m3u8 information'], diff --git a/yt_dlp/extractor/wdr.py b/yt_dlp/extractor/wdr.py index f54aa6ff9..d3229d8af 100644 --- a/yt_dlp/extractor/wdr.py +++ b/yt_dlp/extractor/wdr.py @@ -22,7 +22,11 @@ from ..utils import ( class WDRIE(InfoExtractor): - _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js' + _VALID_URL = r'''(?x)https?:// + (?:deviceids-medp\.wdr\.de/ondemand/\d+/| + kinder\.wdr\.de/(?!mediathek/)[^#?]+-) + (?P<id>\d+)\.(?:js|assetjsonp) + ''' _GEO_COUNTRIES = ['DE'] _TEST = { 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', diff --git a/yt_dlp/extractor/webcaster.py b/yt_dlp/extractor/webcaster.py index e4b65f54f..a858e992c 100644 --- a/yt_dlp/extractor/webcaster.py +++ b/yt_dlp/extractor/webcaster.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + join_nonempty, xpath_text, ) @@ -34,12 +35,9 @@ class WebcasterIE(InfoExtractor): title = xpath_text(video, './/event_name', 'event name', fatal=True) - def make_id(parts, separator): - return separator.join(filter(None, parts)) - formats = [] for format_id in (None, 'noise'): - track_tag = make_id(('track', format_id), '_') + track_tag = join_nonempty('track', format_id, delim='_') for track in video.findall('.//iphone/%s' % track_tag): track_url = track.text if not track_url: @@ -48,7 +46,7 @@ class WebcasterIE(InfoExtractor): m3u8_formats = self._extract_m3u8_formats( track_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=make_id(('hls', format_id), '-'), fatal=False) + m3u8_id=join_nonempty('hls', format_id, delim='-'), fatal=False) for f in m3u8_formats: f.update({ 'source_preference': 0 if format_id == 'noise' else 1, diff --git a/yt_dlp/extractor/willow.py b/yt_dlp/extractor/willow.py new file mode 100644 index 000000000..4d3d62f95 --- /dev/null +++ 
b/yt_dlp/extractor/willow.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from ..utils import ExtractorError +from .common import InfoExtractor + + +class WillowIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?willow\.tv/videos/(?P<id>[0-9a-z-_]+)' + _GEO_COUNTRIES = ['US'] + + _TESTS = [{ + 'url': 'http://willow.tv/videos/d5winning-moment-eng-vs-ind-streaming-online-4th-test-india-tour-of-england-2021', + 'info_dict': { + 'id': '169662', + 'display_id': 'd5winning-moment-eng-vs-ind-streaming-online-4th-test-india-tour-of-england-2021', + 'ext': 'mp4', + 'title': 'Winning Moment: 4th Test, England vs India', + 'thumbnail': 'https://aimages.willow.tv/ytThumbnails/6748_D5winning_moment.jpg', + 'duration': 233, + 'timestamp': 1630947954, + 'upload_date': '20210906', + 'location': 'Kennington Oval, London', + 'series': 'India tour of England 2021', + }, + 'params': { + 'skip_download': True, # AES-encrypted m3u8 + }, + }, { + 'url': 'http://willow.tv/videos/highlights-short-ind-vs-nz-streaming-online-2nd-t20i-new-zealand-tour-of-india-2021', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = self._parse_json(self._html_search_regex( + r'var\s+data_js\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, + 'data_js'), video_id) + + video = next((v for v in video_data.get('trending_videos') or [] + if v.get('secureurl')), None) + if not video: + raise ExtractorError('No videos found') + + formats = self._extract_m3u8_formats(video['secureurl'], video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': str(video.get('content_id')), + 'display_id': video.get('video_slug'), + 'title': video.get('video_name') or self._html_search_meta('twitter:title', webpage), + 'formats': formats, + 'thumbnail': video.get('yt_thumb_url') or self._html_search_meta( + 'twitter:image', webpage, default=None), + 'duration': video.get('duration_seconds'), + 'timestamp': 
video.get('created_date'), + 'location': video.get('venue'), + 'series': video.get('series_name'), + } diff --git a/yt_dlp/extractor/wppilot.py b/yt_dlp/extractor/wppilot.py new file mode 100644 index 000000000..3003a0f10 --- /dev/null +++ b/yt_dlp/extractor/wppilot.py @@ -0,0 +1,177 @@ +# coding: utf-8 + +from .common import InfoExtractor +from ..utils import ( + try_get, + ExtractorError, +) + +import json +import random +import re + + +class WPPilotBaseIE(InfoExtractor): + _VIDEO_URL = 'https://pilot.wp.pl/api/v1/channel/%s' + _VIDEO_GUEST_URL = 'https://pilot.wp.pl/api/v1/guest/channel/%s' + + _HEADERS_WEB = { + 'Content-Type': 'application/json; charset=UTF-8', + 'Referer': 'https://pilot.wp.pl/tv/', + } + + def _get_channel_list(self, cache=True): + if cache is True: + cache_res = self._downloader.cache.load('wppilot', 'channel-list') + if cache_res: + return cache_res, True + webpage = self._download_webpage('https://pilot.wp.pl/tv/', None, 'Downloading webpage') + page_data_base_url = self._search_regex( + r'<script src="(https://wp-pilot-gatsby\.wpcdn\.pl/v[\d.-]+/desktop)', + webpage, 'gatsby build version') + '/page-data' + page_data = self._download_json(f'{page_data_base_url}/tv/page-data.json', None, 'Downloading page data') + for qhash in page_data['staticQueryHashes']: + qhash_content = self._download_json( + f'{page_data_base_url}/sq/d/{qhash}.json', None, + 'Searching for channel list') + channel_list = try_get(qhash_content, lambda x: x['data']['allChannels']['nodes']) + if channel_list is None: + continue + self._downloader.cache.store('wppilot', 'channel-list', channel_list) + return channel_list, False + raise ExtractorError('Unable to find the channel list') + + def _parse_channel(self, chan): + return { + 'id': str(chan['id']), + 'title': chan['name'], + 'is_live': True, + 'thumbnails': [{ + 'id': key, + 'url': chan[key], + } for key in ('thumbnail', 'thumbnail_mobile', 'icon') if chan.get(key)], + } + + +class WPPilotIE(WPPilotBaseIE): + 
_VALID_URL = r'(?:https?://pilot\.wp\.pl/tv/?#|wppilot:)(?P<id>[a-z\d-]+)' + IE_NAME = 'wppilot' + + _TESTS = [{ + 'url': 'https://pilot.wp.pl/tv/#telewizja-wp-hd', + 'info_dict': { + 'id': '158', + 'ext': 'mp4', + 'title': 'Telewizja WP HD', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + # audio only + 'url': 'https://pilot.wp.pl/tv/#radio-nowy-swiat', + 'info_dict': { + 'id': '238', + 'ext': 'm4a', + 'title': 'Radio Nowy Świat', + }, + 'params': { + 'format': 'bestaudio', + }, + }, { + 'url': 'wppilot:9', + 'only_matching': True, + }] + + def _get_channel(self, id_or_slug): + video_list, is_cached = self._get_channel_list(cache=True) + key = 'id' if re.match(r'^\d+$', id_or_slug) else 'slug' + for video in video_list: + if video.get(key) == id_or_slug: + return self._parse_channel(video) + # if cached channel not found, download and retry + if is_cached: + video_list, _ = self._get_channel_list(cache=False) + for video in video_list: + if video.get(key) == id_or_slug: + return self._parse_channel(video) + raise ExtractorError('Channel not found') + + def _real_extract(self, url): + video_id = self._match_id(url) + + channel = self._get_channel(video_id) + video_id = str(channel['id']) + + is_authorized = next((c for c in self._downloader.cookiejar if c.name == 'netviapisessid'), None) + # cookies starting with "g:" are assigned to guests + is_authorized = True if is_authorized is not None and not is_authorized.value.startswith('g:') else False + + video = self._download_json( + (self._VIDEO_URL if is_authorized else self._VIDEO_GUEST_URL) % video_id, + video_id, query={ + 'device_type': 'web', + }, headers=self._HEADERS_WEB, + expected_status=(200, 422)) + + stream_token = try_get(video, lambda x: x['_meta']['error']['info']['stream_token']) + if stream_token: + close = self._download_json( + 'https://pilot.wp.pl/api/v1/channels/close', video_id, + 'Invalidating previous stream session', headers=self._HEADERS_WEB, + data=json.dumps({ + 'channelId': 
video_id, + 't': stream_token, + }).encode('utf-8')) + if try_get(close, lambda x: x['data']['status']) == 'ok': + return self.url_result(url, ie=WPPilotIE.ie_key()) + + formats = [] + + for fmt in video['data']['stream_channel']['streams']: + # live DASH does not work for now + # if fmt['type'] == 'dash@live:abr': + # formats.extend( + # self._extract_mpd_formats( + # random.choice(fmt['url']), video_id)) + if fmt['type'] == 'hls@live:abr': + formats.extend( + self._extract_m3u8_formats( + random.choice(fmt['url']), + video_id, live=True)) + + self._sort_formats(formats) + + channel['formats'] = formats + return channel + + +class WPPilotChannelsIE(WPPilotBaseIE): + _VALID_URL = r'(?:https?://pilot\.wp\.pl/(?:tv/?)?(?:\?[^#]*)?#?|wppilot:)$' + IE_NAME = 'wppilot:channels' + + _TESTS = [{ + 'url': 'wppilot:', + 'info_dict': { + 'id': 'wppilot', + 'title': 'WP Pilot', + }, + 'playlist_mincount': 100, + }, { + 'url': 'https://pilot.wp.pl/', + 'only_matching': True, + }] + + def _entries(self): + channel_list, _ = self._get_channel_list() + for chan in channel_list: + entry = self._parse_channel(chan) + entry.update({ + '_type': 'url_transparent', + 'url': f'wppilot:{chan["id"]}', + 'ie_key': WPPilotIE.ie_key(), + }) + yield entry + + def _real_extract(self, url): + return self.playlist_result(self._entries(), 'wppilot', 'WP Pilot') diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index 8fc64914c..ab07f01af 100644 --- a/yt_dlp/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py @@ -19,7 +19,7 @@ class XVideosIE(InfoExtractor): (?: (?:[^/]+\.)?xvideos2?\.com/video| (?:www\.)?xvideos\.es/video| - flashservice\.xvideos\.com/embedframe/| + (?:www|flashservice)\.xvideos\.com/embedframe/| static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= ) (?P<id>[0-9]+) @@ -38,6 +38,9 @@ class XVideosIE(InfoExtractor): 'url': 'https://flashservice.xvideos.com/embedframe/4588838', 'only_matching': True, }, { + 'url': 
'https://www.xvideos.com/embedframe/4588838', + 'only_matching': True, + }, { 'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=4588838', 'only_matching': True, }, { @@ -80,9 +83,7 @@ class XVideosIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage( - 'https://www.xvideos.com/video%s/' % video_id, video_id) + webpage = self._download_webpage(url, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index 9974d65d6..67095f2fd 100644 --- a/yt_dlp/extractor/yandexvideo.py +++ b/yt_dlp/extractor/yandexvideo.py @@ -7,6 +7,7 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + extract_attributes, int_or_none, try_get, url_or_none, @@ -148,7 +149,7 @@ class YandexVideoIE(InfoExtractor): class ZenYandexIE(InfoExtractor): - _VALID_URL = r'https?://zen\.yandex\.ru/media/(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-(?P<id>[a-z0-9-]+)' + _VALID_URL = r'https?://zen\.yandex\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P<id>[a-z0-9-]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/media/popmech/izverjenie-vulkana-iz-spichek-zreliscnyi-opyt-6002240ff8b1af50bb2da5e3', 'info_dict': { @@ -156,19 +157,38 @@ class ZenYandexIE(InfoExtractor): 'ext': 'mp4', 'title': 'Извержение вулкана из спичек: зрелищный опыт', 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633', - 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/3558619/pub_6002240ff8b1af50bb2da5e3_600bad814d953e4132a30b5e/orig', + 'thumbnail': 're:^https://avatars.mds.yandex.net/', 'uploader': 'Популярная механика', }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7', 'info_dict': { 'id': '60c7c443da18892ebfe85ed7', 'ext': 'mp4', 'title': 'ВОТ ЭТО Focus. 
Деды Морозы на гидроциклах', - 'description': 'md5:8684912f6086f298f8078d4af0e8a600', - 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/4410519/pub_60c7c443da18892ebfe85ed7_60c7c48e060a163121f42cc3/orig', + 'description': 'md5:f3db3d995763b9bbb7b56d4ccdedea89', + 'thumbnail': 're:^https://avatars.mds.yandex.net/', 'uploader': 'AcademeG DailyStream' }, + 'params': { + 'skip_download': 'm3u8', + 'format': 'bestvideo', + }, + }, { + 'url': 'https://zen.yandex.ru/video/watch/6002240ff8b1af50bb2da5e3', + 'info_dict': { + 'id': '6002240ff8b1af50bb2da5e3', + 'ext': 'mp4', + 'title': 'Извержение вулкана из спичек: зрелищный опыт', + 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633', + 'uploader': 'Популярная механика', + }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/novyi-samsung-fold-3-moskvich-barahlit-612f93b7f8d48e7e945792a2?from=channel&rid=2286618386.482.1630817595976.42360', 'only_matching': True, @@ -177,23 +197,37 @@ class ZenYandexIE(InfoExtractor): def _real_extract(self, url): id = self._match_id(url) webpage = self._download_webpage(url, id) - data_json = self._parse_json(self._search_regex(r'w\._data\s?=\s?({.+?});', webpage, 'metadata'), id) - stream_json = try_get(data_json, lambda x: x['publication']['content']['gifContent'], dict) - stream_url = stream_json.get('stream') or try_get(stream_json, lambda x: x['streams']['url']) - formats = self._extract_m3u8_formats(stream_url, id) + data_json = self._parse_json( + self._search_regex(r'data\s*=\s*({["\']_*serverState_*video.+?});', webpage, 'metadata'), id) + serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', + webpage, 'server state').replace('State', 'Settings') + uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)', + webpage, 'uploader', default='<a>') + uploader_name = extract_attributes(uploader).get('aria-label') + video_json = try_get(data_json, lambda x: 
x[serverstate]['exportData']['video'], dict) + stream_urls = try_get(video_json, lambda x: x['video']['streams']) + formats = [] + for s_url in stream_urls: + ext = determine_ext(s_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats(s_url, id, mpd_id='dash')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats(s_url, id, 'mp4')) self._sort_formats(formats) return { 'id': id, - 'title': try_get(data_json, (lambda x: x['og']['title'], lambda x: x['publication']['content']['preview']['title'])), - 'uploader': data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), - 'description': try_get(data_json, lambda x: x['og']['description']), - 'thumbnail': try_get(data_json, lambda x: x['og']['imageUrl']), + 'title': video_json.get('title') or self._og_search_title(webpage), 'formats': formats, + 'duration': int_or_none(video_json.get('duration')), + 'view_count': int_or_none(video_json.get('views')), + 'uploader': uploader_name or data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), + 'description': self._og_search_description(webpage) or try_get(data_json, lambda x: x['og']['description']), + 'thumbnail': self._og_search_thumbnail(webpage) or try_get(data_json, lambda x: x['og']['imageUrl']), } class ZenYandexChannelIE(InfoExtractor): - _VALID_URL = r'https?://zen\.yandex\.ru/(?!media)(?:id/)?(?P<id>[a-z0-9-_]+)' + _VALID_URL = r'https?://zen\.yandex\.ru/(?!media|video)(?:id/)?(?P<id>[a-z0-9-_]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/tok_media', 'info_dict': { diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 658b45fe1..ba135613b 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -9,6 +9,7 @@ import datetime import hashlib import itertools import json +import math import os.path import random import re @@ -28,6 +29,7 @@ from ..compat import ( ) from ..jsinterp import JSInterpreter from ..utils import ( + bug_reports_message, 
bytes_to_intlist, clean_html, datetime_from_str, @@ -39,8 +41,10 @@ from ..utils import ( int_or_none, intlist_to_bytes, is_html, + join_nonempty, mimetype2ext, network_exceptions, + NO_DEFAULT, orderedSet, parse_codecs, parse_count, @@ -65,6 +69,10 @@ from ..utils import ( ) +def get_first(obj, keys, **kwargs): + return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) + + # any clients starting with _ cannot be explicity requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -258,6 +266,70 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False + _INVIDIOUS_SITES = ( + # invidious-redirect websites + r'(?:www\.)?redirect\.invidious\.io', + r'(?:(?:www|dev)\.)?invidio\.us', + # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md + r'(?:www\.)?invidious\.pussthecat\.org', + r'(?:www\.)?invidious\.zee\.li', + r'(?:www\.)?invidious\.ethibox\.fr', + r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', + # youtube-dl invidious instances list + r'(?:(?:www|no)\.)?invidiou\.sh', + r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', + r'(?:www\.)?invidious\.kabi\.tk', + r'(?:www\.)?invidious\.mastodon\.host', + r'(?:www\.)?invidious\.zapashcanon\.fr', + r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', + r'(?:www\.)?invidious\.tube', + r'(?:www\.)?invidiou\.site', + r'(?:www\.)?invidious\.site', + r'(?:www\.)?invidious\.xyz', + r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', + r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', + r'(?:www\.)?tube\.poal\.co', + r'(?:www\.)?tube\.connect\.cafe', + r'(?:www\.)?vid\.wxzm\.sx', + r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', + r'(?:www\.)?yewtu\.be', + 
r'(?:www\.)?yt\.elukerio\.org', + r'(?:www\.)?yt\.lelux\.fi', + r'(?:www\.)?invidious\.ggc-project\.de', + r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', + r'(?:www\.)?invidious\.13ad\.de', + r'(?:www\.)?invidious\.toot\.koeln', + r'(?:www\.)?invidious\.fdn\.fr', + r'(?:www\.)?watch\.nettohikari\.com', + r'(?:www\.)?invidious\.namazso\.eu', + r'(?:www\.)?invidious\.silkky\.cloud', + r'(?:www\.)?invidious\.exonip\.de', + r'(?:www\.)?invidious\.riverside\.rocks', + r'(?:www\.)?invidious\.blamefran\.net', + r'(?:www\.)?invidious\.moomoo\.de', + r'(?:www\.)?ytb\.trom\.tf', + r'(?:www\.)?yt\.cyberhost\.uk', + r'(?:www\.)?kgg2m7yk5aybusll\.onion', + r'(?:www\.)?qklhadlycap4cnod\.onion', + r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', + r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', + r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', + r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', + r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', + r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', + r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', + r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', + r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', + r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', + ) + def _login(self): """ Attempt to log in to YouTube. 
@@ -437,9 +509,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): Extracts visitorData from an API response or ytcfg Appears to be used to track session state """ - return traverse_obj( - args, (..., ('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))), - expected_type=compat_str, get_all=False) + return get_first( + args, (('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))), + expected_type=str) @property def is_authenticated(self): @@ -696,69 +768,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube' - _INVIDIOUS_SITES = ( - # invidious-redirect websites - r'(?:www\.)?redirect\.invidious\.io', - r'(?:(?:www|dev)\.)?invidio\.us', - # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md - r'(?:www\.)?invidious\.pussthecat\.org', - r'(?:www\.)?invidious\.zee\.li', - r'(?:www\.)?invidious\.ethibox\.fr', - r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', - # youtube-dl invidious instances list - r'(?:(?:www|no)\.)?invidiou\.sh', - r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', - r'(?:www\.)?invidious\.kabi\.tk', - r'(?:www\.)?invidious\.mastodon\.host', - r'(?:www\.)?invidious\.zapashcanon\.fr', - r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', - r'(?:www\.)?invidious\.tinfoil-hat\.net', - r'(?:www\.)?invidious\.himiko\.cloud', - r'(?:www\.)?invidious\.reallyancient\.tech', - r'(?:www\.)?invidious\.tube', - r'(?:www\.)?invidiou\.site', - r'(?:www\.)?invidious\.site', - r'(?:www\.)?invidious\.xyz', - r'(?:www\.)?invidious\.nixnet\.xyz', - r'(?:www\.)?invidious\.048596\.xyz', - r'(?:www\.)?invidious\.drycat\.fr', - r'(?:www\.)?inv\.skyn3t\.in', - r'(?:www\.)?tube\.poal\.co', - r'(?:www\.)?tube\.connect\.cafe', - r'(?:www\.)?vid\.wxzm\.sx', - r'(?:www\.)?vid\.mint\.lgbt', - r'(?:www\.)?vid\.puffyan\.us', - 
r'(?:www\.)?yewtu\.be', - r'(?:www\.)?yt\.elukerio\.org', - r'(?:www\.)?yt\.lelux\.fi', - r'(?:www\.)?invidious\.ggc-project\.de', - r'(?:www\.)?yt\.maisputain\.ovh', - r'(?:www\.)?ytprivate\.com', - r'(?:www\.)?invidious\.13ad\.de', - r'(?:www\.)?invidious\.toot\.koeln', - r'(?:www\.)?invidious\.fdn\.fr', - r'(?:www\.)?watch\.nettohikari\.com', - r'(?:www\.)?invidious\.namazso\.eu', - r'(?:www\.)?invidious\.silkky\.cloud', - r'(?:www\.)?invidious\.exonip\.de', - r'(?:www\.)?invidious\.riverside\.rocks', - r'(?:www\.)?invidious\.blamefran\.net', - r'(?:www\.)?invidious\.moomoo\.de', - r'(?:www\.)?ytb\.trom\.tf', - r'(?:www\.)?yt\.cyberhost\.uk', - r'(?:www\.)?kgg2m7yk5aybusll\.onion', - r'(?:www\.)?qklhadlycap4cnod\.onion', - r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', - r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', - r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', - r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', - r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', - r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', - r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', - r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', - r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', - r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', - ) _VALID_URL = r"""(?x)^ ( (?:https?://|//) # http(s):// or protocol-independent URL @@ -792,7 +801,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?(1).+)? 
# if we found the ID, everything can follow (?:\#|$)""" % { - 'invidious': '|'.join(_INVIDIOUS_SITES), + 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), } _PLAYER_INFO_RE = ( r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', @@ -1666,7 +1675,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # shorts 'url': 'https://www.youtube.com/shorts/BGQWPY4IigY', 'only_matching': True, - }, + }, { + 'note': 'Storyboards', + 'url': 'https://www.youtube.com/watch?v=5KLPxDtMqe8', + 'info_dict': { + 'id': '5KLPxDtMqe8', + 'ext': 'mhtml', + 'format_id': 'sb0', + 'title': 'Your Brain is Plastic', + 'uploader_id': 'scishow', + 'description': 'md5:89cd86034bdb5466cd87c6ba206cd2bc', + 'upload_date': '20140324', + 'uploader': 'SciShow', + }, 'params': {'format': 'mhtml', 'skip_download': True} + } ] @classmethod @@ -1720,7 +1742,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError('Cannot identify player %r' % player_url) return id_m.group('id') - def _load_player(self, video_id, player_url, fatal=True) -> bool: + def _load_player(self, video_id, player_url, fatal=True): player_id = self._extract_player_info(player_url) if player_id not in self._code_cache: code = self._download_webpage( @@ -1729,7 +1751,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): errnote='Download of %s failed' % player_url) if code: self._code_cache[player_id] = code - return player_id in self._code_cache + return self._code_cache.get(player_id) def _extract_signature_function(self, video_id, player_url, example_sig): player_id = self._extract_player_info(player_url) @@ -1743,8 +1765,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) - if self._load_player(video_id, player_url): - code = self._code_cache[player_id] + code = self._load_player(video_id, player_url) + if code: res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) @@ -1755,6 +1777,9 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): return res def _print_sig_code(self, func, example_sig): + if not self.get_param('youtube_print_sig_code'): + return + def gen_sig_code(idxs): def _genslice(start, end, step): starts = '' if start == 0 else str(start) @@ -1831,13 +1856,58 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) self._player_cache[player_id] = func func = self._player_cache[player_id] - if self.get_param('youtube_print_sig_code'): - self._print_sig_code(func, s) + self._print_sig_code(func, s) return func(s) except Exception as e: - tb = traceback.format_exc() - raise ExtractorError( - 'Signature extraction failed: ' + tb, cause=e) + raise ExtractorError('Signature extraction failed: ' + traceback.format_exc(), cause=e) + + def _decrypt_nsig(self, s, video_id, player_url): + """Turn the encrypted n field into a working signature""" + if player_url is None: + raise ExtractorError('Cannot decrypt nsig without player_url') + if player_url.startswith('//'): + player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) + + sig_id = ('nsig_value', s) + if sig_id in self._player_cache: + return self._player_cache[sig_id] + + try: + player_id = ('nsig', player_url) + if player_id not in self._player_cache: + self._player_cache[player_id] = self._extract_n_function(video_id, player_url) + func = self._player_cache[player_id] + self._player_cache[sig_id] = func(s) + self.write_debug(f'Decrypted nsig {s} => {self._player_cache[sig_id]}') + return self._player_cache[sig_id] + except Exception as e: + raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + + def _extract_n_function_name(self, jscode): + return self._search_regex( + (r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',), + jscode, 'Initial JS player n function name', group='nfunc') + + def _extract_n_function(self, video_id, player_url): + player_id = 
self._extract_player_info(player_url) + func_code = self._downloader.cache.load('youtube-nsig', player_id) + + if func_code: + jsi = JSInterpreter(func_code) + else: + jscode = self._load_player(video_id, player_url) + funcname = self._extract_n_function_name(jscode) + jsi = JSInterpreter(jscode) + func_code = jsi.extract_function_code(funcname) + self._downloader.cache.store('youtube-nsig', player_id, func_code) + + if self.get_param('youtube_print_sig_code'): + self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') + + return lambda s: jsi.extract_function_from_code(*func_code)([s]) def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): """ @@ -1856,18 +1926,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError(error_msg) self.report_warning(error_msg) return - if self._load_player(video_id, player_url, fatal=fatal): - player_id = self._extract_player_info(player_url) - code = self._code_cache[player_id] + code = self._load_player(video_id, player_url, fatal=fatal) + if code: sts = int_or_none(self._search_regex( r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code, 'JS player signature timestamp', group='sts', fatal=fatal)) return sts def _mark_watched(self, video_id, player_responses): - playback_url = traverse_obj( - player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'), - expected_type=url_or_none, get_all=False) + playback_url = get_first( + player_responses, ('playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'), + expected_type=url_or_none) if not playback_url: self.report_warning('Unable to mark watched') return @@ -2290,18 +2359,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _get_requested_clients(self, url, smuggled_data): requested_clients = [] + default = ['android', 'web'] allowed_clients = sorted( [client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'], key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) 
for client in self._configuration_arg('player_client'): if client in allowed_clients: requested_clients.append(client) + elif client == 'default': + requested_clients.extend(default) elif client == 'all': requested_clients.extend(allowed_clients) else: self.report_warning(f'Skipping unsupported client {client}') if not requested_clients: - requested_clients = ['android', 'web'] + requested_clients = default if smuggled_data.get('is_music_url') or self.is_music_url(url): requested_clients.extend( @@ -2387,7 +2459,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return prs, player_url def _extract_formats(self, streaming_data, video_id, player_url, is_live): - itags, stream_ids = [], [] + itags, stream_ids = {}, [] itag_qualities, res_qualities = {}, {} q = qualities([ # Normally tiny is the smallest video-only formats. But @@ -2440,8 +2512,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' fmt_url += '&' + sp + '=' + signature + query = parse_qs(fmt_url) + throttled = False + if query.get('ratebypass') != ['yes'] and query.get('n'): + try: + fmt_url = update_url_query(fmt_url, { + 'n': self._decrypt_nsig(query['n'][0], video_id, player_url)}) + except ExtractorError as e: + self.report_warning( + f'nsig extraction failed: You may experience throttling for some formats\n' + f'n = {query["n"][0]} ; player = {player_url}\n{e}', only_once=True) + throttled = True + if itag: - itags.append(itag) + itags[itag] = 'https' stream_ids.append(stream_id) tbr = float_or_none( @@ -2450,11 +2534,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': itag, - 'format_note': ', '.join(filter(None, ( + 'format_note': join_nonempty( '%s%s' % (audio_track.get('displayName') or '', ' (default)' if audio_track.get('audioIsDefault') else ''), - fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))), - 'fps': 
int_or_none(fmt.get('fps')), + fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), + throttled and 'THROTTLED', delim=', '), + 'source_preference': -10 if throttled else -1, + 'fps': int_or_none(fmt.get('fps')) or None, 'height': height, 'quality': q(quality), 'tbr': tbr, @@ -2489,46 +2575,71 @@ class YoutubeIE(YoutubeBaseInfoExtractor): and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)) get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True) - def guess_quality(f): - for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)): - if val in qdict: - return q(qdict[val]) - return -1 + def process_manifest_format(f, proto, itag): + if itag in itags: + if itags[itag] == proto or f'{itag}-{proto}' in itags: + return False + itag = f'{itag}-{proto}' + if itag: + f['format_id'] = itag + itags[itag] = proto + + f['quality'] = next(( + q(qdict[val]) + for val, qdict in ((f.get('format_id', '').split('-')[0], itag_qualities), (f.get('height'), res_qualities)) + if val in qdict), -1) + return True for sd in streaming_data: hls_manifest_url = get_hls and sd.get('hlsManifestUrl') if hls_manifest_url: for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False): - itag = self._search_regex( - r'/itag/(\d+)', f['url'], 'itag', default=None) - if itag in itags: - itag += '-hls' - if itag in itags: - continue - if itag: - f['format_id'] = itag - itags.append(itag) - f['quality'] = guess_quality(f) - yield f + if process_manifest_format(f, 'hls', self._search_regex( + r'/itag/(\d+)', f['url'], 'itag', default=None)): + yield f dash_manifest_url = get_dash and sd.get('dashManifestUrl') if dash_manifest_url: for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False): - itag = f['format_id'] - if itag in itags: - itag += '-dash' - if itag in itags: - continue - if itag: - f['format_id'] = itag - itags.append(itag) - 
f['quality'] = guess_quality(f) - filesize = int_or_none(self._search_regex( - r'/clen/(\d+)', f.get('fragment_base_url') - or f['url'], 'file size', default=None)) - if filesize: - f['filesize'] = filesize - yield f + if process_manifest_format(f, 'dash', f['format_id']): + f['filesize'] = int_or_none(self._search_regex( + r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) + yield f + + def _extract_storyboard(self, player_responses, duration): + spec = get_first( + player_responses, ('storyboards', 'playerStoryboardSpecRenderer', 'spec'), default='').split('|')[::-1] + if not spec: + return + base_url = spec.pop() + L = len(spec) - 1 + for i, args in enumerate(spec): + args = args.split('#') + counts = list(map(int_or_none, args[:5])) + if len(args) != 8 or not all(counts): + self.report_warning(f'Malformed storyboard {i}: {"#".join(args)}{bug_reports_message()}') + continue + width, height, frame_count, cols, rows = counts + N, sigh = args[6:] + + url = base_url.replace('$L', str(L - i)).replace('$N', N) + f'&sigh={sigh}' + fragment_count = frame_count / (cols * rows) + fragment_duration = duration / fragment_count + yield { + 'format_id': f'sb{i}', + 'format_note': 'storyboard', + 'ext': 'mhtml', + 'protocol': 'mhtml', + 'acodec': 'none', + 'vcodec': 'none', + 'url': url, + 'width': width, + 'height': height, + 'fragments': [{ + 'path': url.replace('$M', str(j)), + 'duration': min(fragment_duration, duration - (j * fragment_duration)), + } for j in range(math.ceil(fragment_count))], + } def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -2547,8 +2658,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._get_requested_clients(url, smuggled_data), video_id, webpage, master_ytcfg) - get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) - playability_statuses = traverse_obj( player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[]) @@ 
-2574,49 +2683,48 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or search_meta(['og:title', 'twitter:title', 'title'])) video_description = get_first(video_details, 'shortDescription') - if not smuggled_data.get('force_singlefeed', False): - if not self.get_param('noplaylist'): - multifeed_metadata_list = get_first( - player_responses, - ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'), - expected_type=str) - if multifeed_metadata_list: - entries = [] - feed_ids = [] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs( - compat_urllib_parse_unquote_plus(feed)) - - def feed_entry(name): - return try_get( - feed_data, lambda x: x[name][0], compat_str) - - feed_id = feed_entry('id') - if not feed_id: - continue - feed_title = feed_entry('title') - title = video_title - if feed_title: - title += ' (%s)' % feed_title - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%swatch?v=%s' % (base_url, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': title, - }) - feed_ids.append(feed_id) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result( - entries, video_id, video_title, video_description) - else: + multifeed_metadata_list = get_first( + player_responses, + ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'), + expected_type=str) + if multifeed_metadata_list and not smuggled_data.get('force_singlefeed'): + if self.get_param('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + else: + entries = [] + feed_ids = [] + for feed in multifeed_metadata_list.split(','): + # Unquote should take place before split on comma (,) since textual + # 
fields may contain comma as well (see + # https://github.com/ytdl-org/youtube-dl/issues/8536) + feed_data = compat_parse_qs( + compat_urllib_parse_unquote_plus(feed)) + + def feed_entry(name): + return try_get( + feed_data, lambda x: x[name][0], compat_str) + + feed_id = feed_entry('id') + if not feed_id: + continue + feed_title = feed_entry('title') + title = video_title + if feed_title: + title += ' (%s)' % feed_title + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%swatch?v=%s' % (base_url, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': title, + }) + feed_ids.append(feed_id) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result( + entries, video_id, video_title, video_description) live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails')) is_live = get_first(video_details, 'isLive') @@ -2645,16 +2753,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if reason: self.raise_no_formats(reason, expected=True) - for f in formats: - if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']: # throttled - f['source_preference'] = -10 - # TODO: this method is not reliable - f['format_note'] = format_field(f, 'format_note', '%s ') + '(maybe throttled)' - - # Source is given priority since formats that throttle are given lower source_preference - # When throttling issue is fully fixed, remove this - self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang')) - keywords = get_first(video_details, 'keywords', expected_type=list) or [] if not keywords and webpage: keywords = [ @@ -2742,6 +2840,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not duration and live_endtime and live_starttime: duration = live_endtime - live_starttime + formats.extend(self._extract_storyboard(player_responses, duration)) + + # Source is given priority 
since formats that throttle are given lower source_preference + # When throttling issue is fully fixed, remove this + self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang', 'proto')) + info = { 'id': video_id, 'title': self._live_title(video_title) if is_live else video_title, @@ -3014,494 +3118,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return info -class YoutubeTabIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube Tabs' - _VALID_URL = r'''(?x) - https?:// - (?:\w+\.)? - (?: - youtube(?:kids)?\.com| - invidio\.us - )/ - (?: - (?P<channel_type>channel|c|user|browse)/| - (?P<not_channel> - feed/|hashtag/| - (?:playlist|watch)\?.*?\blist= - )| - (?!(?:%s)\b) # Direct URLs - ) - (?P<id>[^/?\#&]+) - ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES - IE_NAME = 'youtube:tab' - - _TESTS = [{ - 'note': 'playlists, multipage', - 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', - 'playlist_mincount': 94, - 'info_dict': { - 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', - 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - 'uploader': 'Игорь Клейнер', - 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', - }, - }, { - 'note': 'playlists, multipage, different order', - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 94, - 'info_dict': { - 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', - 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'uploader': 'Игорь Клейнер', - }, - }, { - 'note': 'playlists, series', - 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'UCYO_jab_esuFRV4b17AJtAw', - 'title': '3Blue1Brown - Playlists', - 'description': 'md5:e1384e8a133307dd10edee76e875d62f', - 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', - 'uploader': '3Blue1Brown', - }, - }, { - 'note': 
'playlists, singlepage', - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', - 'title': 'ThirstForScience - Playlists', - 'description': 'md5:609399d937ea957b0f53cbffb747a14c', - 'uploader': 'ThirstForScience', - 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', - } - }, { - 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', - 'only_matching': True, - }, { - 'note': 'basic, single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', - }, - 'playlist_count': 1, - }, { - 'note': 'empty playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'title': 'youtube-dl empty playlist', - }, - 'playlist_count': 0, - }, { - 'note': 'Home tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Home', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 2, - }, { - 'note': 'Videos tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Videos', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 975, - }, { - 'note': 'Videos tab, sorted by popular', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', - 'info_dict': { - 'id': 
'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Videos', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 199, - }, { - 'note': 'Playlists tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Playlists', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 17, - }, { - 'note': 'Community tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Community', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 18, - }, { - 'note': 'Channels tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Channels', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 12, - }, { - 'note': 'Search tab', - 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', - 'playlist_mincount': 40, - 'info_dict': { - 'id': 'UCYO_jab_esuFRV4b17AJtAw', - 'title': '3Blue1Brown - Search - linear algebra', - 'description': 'md5:e1384e8a133307dd10edee76e875d62f', - 'uploader': '3Blue1Brown', - 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', - }, - }, { - 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 
'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', - 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'info_dict': { - 'title': '29C3: Not my department', - 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'uploader': 'Christiaan008', - 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', - 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', - }, - 'playlist_count': 96, - }, { - 'note': 'Large playlist', - 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', - 'info_dict': { - 'title': 'Uploads from Cauchemar', - 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - 'uploader': 'Cauchemar', - 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', - }, - 'playlist_mincount': 1123, - }, { - 'note': 'even larger playlist, 8832 videos', - 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', - 'only_matching': True, - }, { - 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', - 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', - 'info_dict': { - 'title': 'Uploads from Interstellar Movie', - 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - 'uploader': 'Interstellar Movie', - 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', - }, - 'playlist_mincount': 21, - }, { - 'note': 'Playlist with "show unavailable videos" button', - 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', - 'info_dict': { - 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', - 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', - 'uploader': 'Phim Siêu Nhân Nhật Bản', - 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', - }, - 'playlist_mincount': 200, - }, { - 'note': 'Playlist with unavailable videos in page 7', - 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w', - 'info_dict': { - 'title': 'Uploads from BlankTV', - 'id': 'UU8l9frL61Yl5KFOl87nIm2w', - 'uploader': 'BlankTV', - 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w', - }, - 'playlist_mincount': 1000, - }, { - 'note': 
'https://github.com/ytdl-org/youtube-dl/issues/21844', - 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'info_dict': { - 'title': 'Data Analysis with Dr Mike Pound', - 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', - 'uploader': 'Computerphile', - 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', - }, - 'playlist_mincount': 11, - }, { - 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'only_matching': True, - }, { - 'note': 'Playlist URL that does not actually serve a playlist', - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - 'add_ie': [YoutubeIE.ie_key()], - }, { - 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', - 'info_dict': { - 'id': '3yImotZU3tw', # This will keep changing - 'ext': 'mp4', - 'title': compat_str, - 'uploader': 'Sky News', - 'uploader_id': 'skynews', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', - 'upload_date': r're:\d{8}', - 'description': compat_str, - 'categories': ['News & Politics'], - 'tags': list, - 
'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '], - }, { - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'note': 'A channel that is not live. 
Should raise error', - 'url': 'https://www.youtube.com/user/numberphile/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/trending', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/library', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/history', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/subscriptions', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/watch_later', - 'only_matching': True, - }, { - 'note': 'Recommended - redirects to home page.', - 'url': 'https://www.youtube.com/feed/recommended', - 'only_matching': True, - }, { - 'note': 'inline playlist with not always working continuations', - 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/course', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/zsecurity', - 'only_matching': True, - }, { - 'url': 'http://www.youtube.com/NASAgovVideo/videos', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/hashtag/cctv9', - 'info_dict': { - 'id': 'cctv9', - 'title': '#cctv9', - }, - 'playlist_mincount': 350, - }, { - 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', - 'only_matching': True, - }, { - 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist', - 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'only_matching': True - }, { - 'note': '/browse/ should redirect to /channel/', - 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', - 'only_matching': True - }, { - 'note': 'VLPL, should redirect to 
playlist?list=PL...', - 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'info_dict': { - 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'uploader': 'NoCopyrightSounds', - 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', - 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', - 'title': 'NCS Releases', - }, - 'playlist_mincount': 166, - }, { - 'note': 'Topic, should redirect to playlist?list=UU...', - 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', - 'info_dict': { - 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', - 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', - 'title': 'Uploads from Royalty Free Music - Topic', - 'uploader': 'Royalty Free Music - Topic', - }, - 'expected_warnings': [ - 'A channel/user page was given', - 'The URL does not have a videos tab', - ], - 'playlist_mincount': 101, - }, { - 'note': 'Topic without a UU playlist', - 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', - 'info_dict': { - 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', - 'title': 'UCtFRv9O2AHqOZjjynzrv-xg', - }, - 'expected_warnings': [ - 'A channel/user page was given', - 'The URL does not have a videos tab', - 'Falling back to channel URL', - ], - 'playlist_mincount': 9, - }, { - 'note': 'Youtube music Album', - 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE', - 'info_dict': { - 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0', - 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', - }, - 'playlist_count': 50, - }, { - 'note': 'unlisted single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'info_dict': { - 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', - 'uploader': 'colethedj', - 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'title': 'yt-dlp unlisted playlist test', - 'availability': 'unlisted' - }, - 'playlist_count': 1, - }, { - 'note': 'API Fallback: Recommended - redirects to home page. 
Requires visitorData', - 'url': 'https://www.youtube.com/feed/recommended', - 'info_dict': { - 'id': 'recommended', - 'title': 'recommended', - }, - 'playlist_mincount': 50, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}} - }, - }, { - 'note': 'API Fallback: /videos tab, sorted by oldest first', - 'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid', - 'info_dict': { - 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', - 'title': 'Cody\'sLab - Videos', - 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa', - 'uploader': 'Cody\'sLab', - 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', - }, - 'playlist_mincount': 650, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}} - }, - }, { - 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', - 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', - 'info_dict': { - 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', - 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', - 'title': 'Uploads from Royalty Free Music - Topic', - 'uploader': 'Royalty Free Music - Topic', - }, - 'expected_warnings': [ - 'A channel/user page was given', - 'The URL does not have a videos tab', - ], - 'playlist_mincount': 101, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}} - }, - }] - - @classmethod - def suitable(cls, url): - return False if YoutubeIE.suitable(url) else super( - YoutubeTabIE, cls).suitable(url) +class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( @@ -3684,49 +3301,53 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if entry: yield entry ''' - def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): - - def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds - contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] - 
for content in contents: - if not isinstance(content, dict): - continue - is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - renderer = content.get('richItemRenderer') - if renderer: - for entry in self._rich_entries(renderer): - yield entry - continuation_list[0] = self._extract_continuation(parent_renderer) + def _extract_entries(self, parent_renderer, continuation_list): + # continuation_list is modified in-place with continuation_list = [continuation_token] + continuation_list[:] = [None] + contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: + renderer = content.get('richItemRenderer') + if renderer: + for entry in self._rich_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(parent_renderer) + continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): - continue - known_renderers = { - 'playlistVideoListRenderer': self._playlist_entries, - 'gridRenderer': self._grid_entries, - 'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'), - 'backstagePostThreadRenderer': self._post_thread_entries, - 'videoRenderer': lambda x: [self._video_entry(x)], - } - for key, renderer in isr_content.items(): - if key not in known_renderers: - continue - for entry in known_renderers[key](renderer): - if entry: - yield entry - continuation_list[0] = self._extract_continuation(renderer) - break - - if not continuation_list[0]: - continuation_list[0] = self._extract_continuation(is_renderer) + known_renderers = { + 
'playlistVideoListRenderer': self._playlist_entries, + 'gridRenderer': self._grid_entries, + 'shelfRenderer': lambda x: self._shelf_entries(x), + 'backstagePostThreadRenderer': self._post_thread_entries, + 'videoRenderer': lambda x: [self._video_entry(x)], + 'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}), + 'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}), + } + for key, renderer in isr_content.items(): + if key not in known_renderers: + continue + for entry in known_renderers[key](renderer): + if entry: + yield entry + continuation_list[0] = self._extract_continuation(renderer) + break if not continuation_list[0]: - continuation_list[0] = self._extract_continuation(parent_renderer) + continuation_list[0] = self._extract_continuation(is_renderer) + + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(parent_renderer) - continuation_list = [None] # Python 2 does not support nonlocal + def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): + continuation_list = [None] + extract_entries = lambda x: self._extract_entries(x, continuation_list) tab_content = try_get(tab, lambda x: x['content'], dict) if not tab_content: return @@ -4118,6 +3739,519 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): entry['url'] = smuggle_url(entry['url'], data) yield entry + _SEARCH_PARAMS = None + + def _search_results(self, query, params=NO_DEFAULT): + data = {'query': query} + if params is NO_DEFAULT: + params = self._SEARCH_PARAMS + if params: + data['params'] = params + continuation_list = [None] + for page_num in itertools.count(1): + data.update(continuation_list[0] or {}) + search = self._extract_response( + item_id='query "%s" page %s' % (query, page_num), ep='search', query=data, + check_get_keys=('contents', 'onResponseReceivedCommands')) + slr_contents = try_get( + search, + (lambda x: 
x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], + lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), + list) + yield from self._extract_entries({'contents': slr_contents}, continuation_list) + if not continuation_list[0]: + break + + +class YoutubeTabIE(YoutubeTabBaseInfoExtractor): + IE_DESC = 'YouTube Tabs' + _VALID_URL = r'''(?x: + https?:// + (?:\w+\.)? + (?: + youtube(?:kids)?\.com| + %(invidious)s + )/ + (?: + (?P<channel_type>channel|c|user|browse)/| + (?P<not_channel> + feed/|hashtag/| + (?:playlist|watch)\?.*?\blist= + )| + (?!(?:%(reserved_names)s)\b) # Direct URLs + ) + (?P<id>[^/?\#&]+) + )''' % { + 'reserved_names': YoutubeBaseInfoExtractor._RESERVED_NAMES, + 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), + } + IE_NAME = 'youtube:tab' + + _TESTS = [{ + 'note': 'playlists, multipage', + 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader': 'Игорь Клейнер', + 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', + }, + }, { + 'note': 'playlists, multipage, different order', + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'uploader': 'Игорь Клейнер', + }, + }, { + 'note': 'playlists, series', + 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Playlists', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader_id': 
'UCYO_jab_esuFRV4b17AJtAw', + 'uploader': '3Blue1Brown', + }, + }, { + 'note': 'playlists, singlepage', + 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'title': 'ThirstForScience - Playlists', + 'description': 'md5:609399d937ea957b0f53cbffb747a14c', + 'uploader': 'ThirstForScience', + 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + } + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, + }, { + 'note': 'basic, single video playlist', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'info_dict': { + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', + }, + 'playlist_count': 1, + }, { + 'note': 'empty playlist', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'info_dict': { + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', + }, + 'playlist_count': 0, + }, { + 'note': 'Home tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Home', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 2, + }, { + 'note': 'Videos tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 975, + }, { + 'note': 'Videos tab, sorted by popular', + 'url': 
'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 199, + }, { + 'note': 'Playlists tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Playlists', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 17, + }, { + 'note': 'Community tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Community', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 18, + }, { + 'note': 'Channels tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Channels', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 12, + }, { + 'note': 'Search tab', + 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', + 'playlist_mincount': 40, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Search - linear algebra', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader': '3Blue1Brown', + 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + }, + }, { + 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + 
}, { + 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', + 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'info_dict': { + 'title': '29C3: Not my department', + 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'uploader': 'Christiaan008', + 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', + 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', + }, + 'playlist_count': 96, + }, { + 'note': 'Large playlist', + 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', + 'info_dict': { + 'title': 'Uploads from Cauchemar', + 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', + 'uploader': 'Cauchemar', + 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + }, + 'playlist_mincount': 1123, + }, { + 'note': 'even larger playlist, 8832 videos', + 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', + 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', + 'info_dict': { + 'title': 'Uploads from Interstellar Movie', + 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', + 'uploader': 'Interstellar Movie', + 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', + }, + 'playlist_mincount': 21, + }, { + 'note': 'Playlist with "show unavailable videos" button', + 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', + 'info_dict': { + 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', + 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', + 'uploader': 'Phim Siêu Nhân Nhật Bản', + 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', + }, + 'playlist_mincount': 200, + }, { + 'note': 'Playlist with unavailable videos in page 7', + 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w', + 'info_dict': { + 'title': 'Uploads from BlankTV', + 'id': 'UU8l9frL61Yl5KFOl87nIm2w', + 
'uploader': 'BlankTV', + 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w', + }, + 'playlist_mincount': 1000, + }, { + 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'info_dict': { + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', + 'uploader': 'Computerphile', + 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'only_matching': True, + }, { + 'note': 'Playlist URL that does not actually serve a playlist', + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is not available.', + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', + 'info_dict': { + 'id': '3yImotZU3tw', # This will keep changing + 'ext': 'mp4', + 'title': compat_str, + 'uploader': 'Sky News', + 'uploader_id': 'skynews', + 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/user/skynews', + 'upload_date': r're:\d{8}', + 'description': compat_str, + 'categories': ['News & Politics'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '], + }, { + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, { + 'note': 'A channel that is not live. 
Should raise error', + 'url': 'https://www.youtube.com/user/numberphile/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/trending', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/library', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/history', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/subscriptions', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/watch_later', + 'only_matching': True, + }, { + 'note': 'Recommended - redirects to home page.', + 'url': 'https://www.youtube.com/feed/recommended', + 'only_matching': True, + }, { + 'note': 'inline playlist with not always working continuations', + 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/course', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/zsecurity', + 'only_matching': True, + }, { + 'url': 'http://www.youtube.com/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/TheYoungTurks/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/hashtag/cctv9', + 'info_dict': { + 'id': 'cctv9', + 'title': '#cctv9', + }, + 'playlist_mincount': 350, + }, { + 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', + 'only_matching': True, + }, { + 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist', + 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'only_matching': True + }, { + 'note': '/browse/ should redirect to /channel/', + 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', + 'only_matching': True + }, { + 'note': 'VLPL, should redirect to playlist?list=PL...', + 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 
'info_dict': { + 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'uploader': 'NoCopyrightSounds', + 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', + 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'title': 'NCS Releases', + }, + 'playlist_mincount': 166, + }, { + 'note': 'Topic, should redirect to playlist?list=UU...', + 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', + 'info_dict': { + 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', + 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', + 'title': 'Uploads from Royalty Free Music - Topic', + 'uploader': 'Royalty Free Music - Topic', + }, + 'expected_warnings': [ + 'A channel/user page was given', + 'The URL does not have a videos tab', + ], + 'playlist_mincount': 101, + }, { + 'note': 'Topic without a UU playlist', + 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', + 'info_dict': { + 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', + 'title': 'UCtFRv9O2AHqOZjjynzrv-xg', + }, + 'expected_warnings': [ + 'A channel/user page was given', + 'The URL does not have a videos tab', + 'Falling back to channel URL', + ], + 'playlist_mincount': 9, + }, { + 'note': 'Youtube music Album', + 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE', + 'info_dict': { + 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0', + 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', + }, + 'playlist_count': 50, + }, { + 'note': 'unlisted single video playlist', + 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'info_dict': { + 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', + 'uploader': 'colethedj', + 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'title': 'yt-dlp unlisted playlist test', + 'availability': 'unlisted' + }, + 'playlist_count': 1, + }, { + 'note': 'API Fallback: Recommended - redirects to home page. 
Requires visitorData', + 'url': 'https://www.youtube.com/feed/recommended', + 'info_dict': { + 'id': 'recommended', + 'title': 'recommended', + }, + 'playlist_mincount': 50, + 'params': { + 'skip_download': True, + 'extractor_args': {'youtubetab': {'skip': ['webpage']}} + }, + }, { + 'note': 'API Fallback: /videos tab, sorted by oldest first', + 'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid', + 'info_dict': { + 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', + 'title': 'Cody\'sLab - Videos', + 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa', + 'uploader': 'Cody\'sLab', + 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', + }, + 'playlist_mincount': 650, + 'params': { + 'skip_download': True, + 'extractor_args': {'youtubetab': {'skip': ['webpage']}} + }, + }, { + 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', + 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', + 'info_dict': { + 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', + 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', + 'title': 'Uploads from Royalty Free Music - Topic', + 'uploader': 'Royalty Free Music - Topic', + }, + 'expected_warnings': [ + 'A channel/user page was given', + 'The URL does not have a videos tab', + ], + 'playlist_mincount': 101, + 'params': { + 'skip_download': True, + 'extractor_args': {'youtubetab': {'skip': ['webpage']}} + }, + }] + + @classmethod + def suitable(cls, url): + return False if YoutubeIE.suitable(url) else super( + YoutubeTabIE, cls).suitable(url) + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) if self.is_music_url(url): @@ -4250,12 +4384,15 @@ class YoutubePlaylistIE(InfoExtractor): (?: (?: youtube(?:kids)?\.com| - invidio\.us + %(invidious)s ) /.*?\?.*?\blist= )? 
(?P<id>%(playlist_id)s) - )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + )''' % { + 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, + 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), + } IE_NAME = 'youtube:playlist' _TESTS = [{ 'note': 'issue #673', @@ -4377,7 +4514,7 @@ class YoutubeYtUserIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) return self.url_result( - 'https://www.youtube.com/user/%s' % user_id, + 'https://www.youtube.com/user/%s/videos' % user_id, ie=YoutubeTabIE.ie_key(), video_id=user_id) @@ -4400,77 +4537,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): ie=YoutubeTabIE.ie_key()) -class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE): - IE_DESC = 'YouTube searches' +class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): + IE_DESC = 'YouTube search' IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' - _SEARCH_PARAMS = None + _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only _TESTS = [] - def _search_results(self, query): - data = {'query': query} - if self._SEARCH_PARAMS: - data['params'] = self._SEARCH_PARAMS - continuation = {} - for page_num in itertools.count(1): - data.update(continuation) - search = self._extract_response( - item_id='query "%s" page %s' % (query, page_num), ep='search', query=data, - check_get_keys=('contents', 'onResponseReceivedCommands') - ) - if not search: - break - slr_contents = try_get( - search, - (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], - lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), - list) - if not slr_contents: - break - - # Youtube sometimes adds promoted content to searches, - # changing the index location of videos and token. - # So we search through all entries till we find them. 
- continuation = None - for slr_content in slr_contents: - if not continuation: - continuation = self._extract_continuation({'contents': [slr_content]}) - isr_contents = try_get( - slr_content, - lambda x: x['itemSectionRenderer']['contents'], - list) - if not isr_contents: - continue - for content in isr_contents: - if not isinstance(content, dict): - continue - video = content.get('videoRenderer') - if not isinstance(video, dict): - continue - video_id = video.get('videoId') - if not video_id: - continue - - yield self._extract_video(video) - - if not continuation: - break - - -class YoutubeSearchDateIE(YoutubeSearchIE): +class YoutubeSearchDateIE(SearchInfoExtractor, YoutubeTabBaseInfoExtractor): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 'ytsearchdate' - IE_DESC = 'YouTube searches, newest videos first' - _SEARCH_PARAMS = 'CAI%3D' + IE_DESC = 'YouTube search, newest videos first' + _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date -class YoutubeSearchURLIE(YoutubeSearchIE): +class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): IE_DESC = 'YouTube search URLs with sorting and filter support' IE_NAME = YoutubeSearchIE.IE_NAME + '_url' - _SEARCH_KEY = None _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' - # _MAX_RESULTS = 100 _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -4479,19 +4564,22 @@ class YoutubeSearchURLIE(YoutubeSearchIE): 'title': 'youtube-dl test video', } }, { + 'url': 'https://www.youtube.com/results?search_query=python&sp=EgIQAg%253D%253D', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'python', + 'title': 'python', + } + + }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True, }] - @classmethod - def _make_valid_url(cls): - return cls._VALID_URL - def _real_extract(self, url): qs = parse_qs(url) query = (qs.get('search_query') or 
qs.get('q'))[0] - self._SEARCH_PARAMS = qs.get('sp', ('',))[0] - return self._get_n_results(query, self._MAX_RESULTS) + return self.playlist_result(self._search_results(query, qs.get('sp', (None,))[0]), query, query) class YoutubeFeedsInfoExtractor(YoutubeTabIE): diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index a13d12436..98d15604d 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -12,6 +12,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + join_nonempty, try_get, url_or_none, urlencode_postdata, @@ -156,15 +157,9 @@ class ZattooPlatformBaseIE(InfoExtractor): watch_url = url_or_none(watch.get('url')) if not watch_url: continue - format_id_list = [stream_type] - maxrate = watch.get('maxrate') - if maxrate: - format_id_list.append(compat_str(maxrate)) audio_channel = watch.get('audio_channel') - if audio_channel: - format_id_list.append(compat_str(audio_channel)) preference = 1 if audio_channel == 'A' else None - format_id = '-'.join(format_id_list) + format_id = join_nonempty(stream_type, watch.get('maxrate'), audio_channel) if stream_type in ('dash', 'dash_widevine', 'dash_playready'): this_formats = self._extract_mpd_formats( watch_url, video_id, mpd_id=format_id, fatal=False) diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index 8c279c5ab..df236c050 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -9,12 +9,12 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + join_nonempty, merge_dicts, NO_DEFAULT, orderedSet, parse_codecs, qualities, - str_or_none, try_get, unified_timestamp, update_url_query, @@ -70,11 +70,11 @@ class ZDFBaseIE(InfoExtractor): f = {'vcodec': data[0], 'acodec': data[1]} f.update({ 'url': format_url, - 'format_id': '-'.join(filter(str_or_none, ('http', meta.get('type'), meta.get('quality')))), + 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')), }) new_formats = [f] 
formats.extend(merge_dicts(f, { - 'format_note': ', '.join(filter(None, (meta.get('quality'), meta.get('class')))), + 'format_note': join_nonempty('quality', 'class', from_dict=meta, delim=', '), 'language': meta.get('language'), 'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1, 'quality': qualities(self._QUALITIES)(meta.get('quality')), |