diff options
28 files changed, 1074 insertions, 248 deletions
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 99c280ffc..816c40329 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2396,7 +2396,7 @@ class YoutubeDL(object): sanitize_string_field(info_dict, 'id') sanitize_numeric_fields(info_dict) if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None): - self.report_warning('"duration" field is negative, there is an error in extractor') + self.report_warning('"duration" field is negative, there is an error in extractor') if 'playlist' not in info_dict: # It isn't part of a playlist diff --git a/yt_dlp/extractor/aliexpress.py b/yt_dlp/extractor/aliexpress.py index 6f241e683..9722fe9ac 100644 --- a/yt_dlp/extractor/aliexpress.py +++ b/yt_dlp/extractor/aliexpress.py @@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor): 'id': '2800002704436634', 'ext': 'mp4', 'title': 'CASIMA7.22', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'uploader': 'CASIMA Official Store', 'timestamp': 1500717600, 'upload_date': '20170722', diff --git a/yt_dlp/extractor/alsace20tv.py b/yt_dlp/extractor/alsace20tv.py new file mode 100644 index 000000000..4aae6fe74 --- /dev/null +++ b/yt_dlp/extractor/alsace20tv.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + get_element_by_class, + int_or_none, + unified_strdate, + url_or_none, +) + + +class Alsace20TVBaseIE(InfoExtractor): + def _extract_video(self, video_id, url=None): + info = self._download_json( + 'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ), + video_id) or {} + title = info.get('titre') + + formats = [] + for res, fmt_url in (info.get('files') or {}).items(): + formats.extend( + self._extract_smil_formats(fmt_url, video_id, fatal=False) + if '/smil:_' in fmt_url + else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False)) + self._sort_formats(formats) + + webpage = (url and self._download_webpage(url, video_id, fatal=False)) or '' + thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage)) + upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None) + upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': clean_html(get_element_by_class('wysiwyg', webpage)), + 'upload_date': upload_date, + 'thumbnail': thumbnail, + 'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None), + 'view_count': int_or_none(info.get('nb_vues')), + } + + +class Alsace20TVIE(Alsace20TVBaseIE): + _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P<id>[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'duration': 1073, + 'view_count': int, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id, url) + + +class Alsace20TVEmbedIE(Alsace20TVBaseIE): + _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P<id>[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id) diff --git a/yt_dlp/extractor/ant1newsgr.py b/yt_dlp/extractor/ant1newsgr.py new file mode 100644 index 000000000..7d70e0427 --- /dev/null +++ b/yt_dlp/extractor/ant1newsgr.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + ExtractorError, + determine_ext, + scale_thumbnails_to_max_format_width, + unescapeHTML, +) + + +class Ant1NewsGrBaseIE(InfoExtractor): + def _download_and_extract_api_data(self, video_id, netloc, cid=None): + url = f'{self.http_scheme()}//{netloc}{self._API_PATH}' + info = self._download_json(url, video_id, query={'cid': cid or video_id}) + try: + source = info['url'] + except KeyError: + raise ExtractorError('no source found for %s' % video_id) + formats, subs = (self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4') + if determine_ext(source) == 'm3u8' else ([{'url': source}], {})) + self._sort_formats(formats) + thumbnails = scale_thumbnails_to_max_format_width( + formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+') + return { + 'id': video_id, + 'title': info.get('title'), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subs, + } + + +class Ant1NewsGrWatchIE(Ant1NewsGrBaseIE): + IE_NAME = 'ant1newsgr:watch' + IE_DESC = 'ant1news.gr videos' + _VALID_URL = r'https?://(?P<netloc>(?:www\.)?ant1news\.gr)/watch/(?P<id>\d+)/' + _API_PATH = '/templates/data/player' + + _TESTS = [{ + 'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45', + 'md5': '95925e6b32106754235f2417e0d2dfab', + 'info_dict': { + 'id': '1506168', + 'ext': 'mp4', + 'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a', + 'description': 'md5:18665af715a6dcfeac1d6153a44f16b0', + 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/26d46bf6-8158-4f02-b197-7096c714b2de.jpg', + }, + }] + + def _real_extract(self, url): + video_id, netloc = self._match_valid_url(url).group('id', 'netloc') + webpage = self._download_webpage(url, video_id) + info = self._download_and_extract_api_data(video_id, netloc) + info['description'] = self._og_search_description(webpage) + return info + + +class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE): + IE_NAME = 'ant1newsgr:article' + IE_DESC = 'ant1news.gr articles' + _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/' + + _TESTS = [{ + 'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron', + 'md5': '294f18331bb516539d72d85a82887dcc', + 'info_dict': { + 'id': '_xvg/m_cmbatw=', + 'ext': 'mp4', + 'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411', + 'timestamp': 1603092840, + 'upload_date': '20201019', + 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg', + }, + }, { + 'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn', + 'info_dict': { + 'id': '620286', + 'title': 'md5:91fe569e952e4d146485740ae927662b', + }, + 'playlist_mincount': 2, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle') + embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) + if not embed_urls: + raise ExtractorError('no videos found for %s' % video_id, expected=True) + return self.url_result_or_playlist_from_matches( + embed_urls, video_id, info['title'], ie=Ant1NewsGrEmbedIE.ie_key(), + video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')}) + + +class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): + IE_NAME = 'ant1newsgr:embed' + IE_DESC = 'ant1news.gr embedded videos' + _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player' + _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)' + _API_PATH = '/news/templates/data/jsonPlayer' + + _TESTS = [{ + 'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377', + 'md5': 'dfc58c3a11a5a9aad2ba316ed447def3', + 'info_dict': { + 'id': '3f_li_c_az_jw_y_u=', + 'ext': 'mp4', + 'title': 'md5:a30c93332455f53e1e84ae0724f0adf7', + 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/bbe31201-3f09-4a4e-87f5-8ad2159fffe2.jpg', + }, + }] + + @classmethod + def _extract_urls(cls, webpage): + _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' + _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)' + for mobj in re.finditer(_EMBED_RE, webpage): + url = unescapeHTML(mobj.group('url')) + if not cls.suitable(url): + continue + yield url + + def _real_extract(self, url): + video_id = self._match_id(url) + + canonical_url = self._request_webpage( + HEADRequest(url), video_id, + note='Resolve canonical player URL', + errnote='Could not resolve canonical player URL').geturl() + _, netloc, _, _, query, _ = urllib.parse.urlparse(canonical_url) + cid = urllib.parse.parse_qs(query)['cid'][0] + + return self._download_and_extract_api_data(video_id, netloc, cid=cid) diff --git a/yt_dlp/extractor/applepodcasts.py b/yt_dlp/extractor/applepodcasts.py index 6a74de758..9139ff777 100644 --- a/yt_dlp/extractor/applepodcasts.py +++ b/yt_dlp/extractor/applepodcasts.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + clean_html, clean_podcast_url, + get_element_by_class, int_or_none, parse_iso8601, try_get, @@ -14,16 +16,17 @@ class ApplePodcastsIE(InfoExtractor): _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)' _TESTS = [{ 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', - 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'md5': '41dc31cd650143e530d9423b6b5a344f', 'info_dict': { 'id': '1000482637777', 'ext': 'mp3', 'title': '207 - Whitney Webb Returns', - 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6', 'upload_date': '20200705', - 'timestamp': 1593921600, - 'duration': 6425, + 'timestamp': 1593932400, + 'duration': 6454, 'series': 'The Tim Dillon Show', + 'thumbnail': 're:.+[.](png|jpe?g|webp)', } }, { 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', @@ -39,24 +42,47 @@ class ApplePodcastsIE(InfoExtractor): def _real_extract(self, url): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) - ember_data = self._parse_json(self._search_regex( - r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', - webpage, 'ember data'), episode_id) - ember_data = ember_data.get(episode_id) or ember_data - episode = ember_data['data']['attributes'] + episode_data = {} + ember_data = {} + # new page type 2021-11 + amp_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<', + webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {} + amp_data = try_get(amp_data, + lambda a: self._parse_json( + next(a[x] for x in iter(a) if episode_id in x), + episode_id), + dict) or {} + amp_data = amp_data.get('d') or [] + episode_data = try_get( + amp_data, + lambda a: next(x for x in a + if x['type'] == 'podcast-episodes' and x['id'] == episode_id), + dict) + if not episode_data: + # try pre 2021-11 page type: TODO: consider deleting if no longer used + ember_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) or {} + ember_data = ember_data.get(episode_id) or ember_data + episode_data = try_get(ember_data, lambda x: x['data'], dict) + episode = episode_data['attributes'] description = episode.get('description') or {} series = None - for inc in (ember_data.get('included') or []): + for inc in (amp_data or ember_data.get('included') or []): if inc.get('type') == 'media/podcast': series = try_get(inc, lambda x: x['attributes']['name']) + series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) return { 'id': episode_id, - 'title': episode['name'], + 'title': episode.get('name'), 'url': clean_podcast_url(episode['assetUrl']), 'description': description.get('standard') or description.get('short'), 'timestamp': parse_iso8601(episode.get('releaseDateTime')), 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), 'series': series, + 'thumbnail': self._og_search_thumbnail(webpage), + 'vcodec': 'none', } diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index 296b169d2..a7ffdc24c 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -12,6 +12,7 @@ from ..utils import ( int_or_none, parse_qs, qualities, + strip_or_none, try_get, unified_strdate, url_or_none, @@ -253,3 +254,44 @@ class ArteTVPlaylistIE(ArteTVBaseIE): title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') return self.playlist_result(entries, playlist_id, title, description) + + +class ArteTVCategoryIE(ArteTVBaseIE): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/politics-and-society/', + 'info_dict': { + 'id': 'politics-and-society', + 'title': 'Politics and society', + 'description': 'Investigative documentary series, geopolitical analysis, and international commentary', + }, + 'playlist_mincount': 13, + }, + ] + + @classmethod + def suitable(cls, url): + return ( + not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) + and super(ArteTVCategoryIE, cls).suitable(url)) + + def _real_extract(self, url): + lang, playlist_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, playlist_id) + + items = [] + for video in re.finditer( + r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang, + webpage): + video = video.group('url') + if video == url: + continue + if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): + items.append(video) + + title = (self._og_search_title(webpage, default=None) + or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', default=None)) + title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + + return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, + description=self._og_search_description(webpage, default=None)) diff --git a/yt_dlp/extractor/audiomack.py b/yt_dlp/extractor/audiomack.py index 31fb859ae..19775cf0f 100644 --- a/yt_dlp/extractor/audiomack.py +++ b/yt_dlp/extractor/audiomack.py @@ -29,6 +29,7 @@ class AudiomackIE(InfoExtractor): } }, # audiomack wrapper around soundcloud song + # Needs new test URL. { 'add_ie': ['Soundcloud'], 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 199a3f8e2..b664a7007 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -11,6 +11,7 @@ from ..compat import ( compat_etree_Element, compat_HTTPError, compat_str, + compat_urllib_error, compat_urlparse, ) from ..utils import ( @@ -38,7 +39,7 @@ from ..utils import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' + _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -394,9 +395,17 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_mpd_formats( href, programme_id, mpd_id=format_id, fatal=False)) elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + # TODO: let expected_status be passed into _extract_xxx_formats() instead + try: + fmts = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + except ExtractorError as e: + if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + and e.exc_info[1].code in (403, 404)): + raise + fmts = [] + formats.extend(fmts) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) @@ -785,20 +794,32 @@ class BBCIE(BBCCoUkIE): 'upload_date': '20150725', }, }, { + # video with window.__INITIAL_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/av/world-europe-59468682', + 'info_dict': { + 'id': 'p0b71qth', + 'ext': 'mp4', + 'title': 'Why France is making this woman a national hero', + 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1638230731, + 'upload_date': '20211130', + }, + }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, }, { + # bbcthreeConfig 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', 'info_dict': { 'id': 'p06556y7', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + 'title': 'Things Not To Say to people that live on council estates', + 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.", + 'duration': 360, + 'thumbnail': r're:https?://.+/.+\.jpg', }, - 'params': { - 'skip_download': True, - } }, { # window.__PRELOADED_STATE__ 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', @@ -1171,9 +1192,16 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) - initial_data = self._parse_json(self._parse_json(self._search_regex( - r'window\.__INITIAL_DATA__\s*=\s*("{.+?}");', webpage, - 'preload state', default='"{}"'), playlist_id, fatal=False), playlist_id, fatal=False) + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, + 'quoted preload state', default=None) + if initial_data is None: + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, + 'preload state', default={}) + else: + initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) + initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: def parse_media(media): if not media: @@ -1214,7 +1242,10 @@ class BBCIE(BBCCoUkIE): if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in (try_get(resp, lambda x: x['data']['content']['model']['blocks'], list) or []): + for block in (try_get(resp, + (lambda x: x['data']['blocks'], + lambda x: x['data']['content']['model']['blocks'],), + list) or []): if block.get('type') != 'media': continue parse_media(block.get('model')) diff --git a/yt_dlp/extractor/bigo.py b/yt_dlp/extractor/bigo.py index 6e38ecc1d..ddf76ac55 100644 --- a/yt_dlp/extractor/bigo.py +++ b/yt_dlp/extractor/bigo.py @@ -34,9 +34,11 @@ class BigoIE(InfoExtractor): 'https://bigo.tv/studio/getInternalStudioInfo', user_id, data=urlencode_postdata({'siteId': user_id})) + if not isinstance(info_raw, dict): + raise ExtractorError('Received invalid JSON data') if info_raw.get('code'): raise ExtractorError( - f'{info_raw["msg"]} (code {info_raw["code"]})', expected=True) + 'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True) info = info_raw.get('data') or {} if not info.get('alive'): @@ -44,7 +46,7 @@ class BigoIE(InfoExtractor): return { 'id': info.get('roomId') or user_id, - 'title': info.get('roomTopic'), + 'title': info.get('roomTopic') or info.get('nick_name') or user_id, 'formats': [{ 'url': info.get('hls_src'), 'ext': 'mp4', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index dbf5ef8d4..f86e7cb3e 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1140,8 +1140,8 @@ class InfoExtractor(object): 'url': url, } - def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, **kwargs): - urls = (self.url_result(self._proto_relative_url(m), ie) + def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs): + urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches) if getter else matches)) return self.playlist_result(urls, playlist_id, playlist_title, **kwargs) diff --git a/yt_dlp/extractor/cpac.py b/yt_dlp/extractor/cpac.py new file mode 100644 index 000000000..22741152c --- /dev/null +++ b/yt_dlp/extractor/cpac.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, + unified_timestamp, + update_url_query, + urljoin, +) + +# compat_range +try: + if callable(xrange): + range = xrange +except (NameError, TypeError): + pass + + +class CPACIE(InfoExtractor): + IE_NAME = 'cpac' + _VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?P<fr>l-)?episode\?id=(?P<id>[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})' + _TEST = { + # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909', + 'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'md5': 'e46ad699caafd7aa6024279f2614e8fa', + 'info_dict': { + 'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'ext': 'mp4', + 'upload_date': '20220215', + 'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022', + 'description': 'md5:466a206abd21f3a6f776cdef290c23fb', + 'timestamp': 1644901200, + }, + 'params': { + 'format': 'bestvideo', + 'hls_prefer_native': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if '/l-episode?' in url else 'en' + + content = self._download_json( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id, + video_id) + video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str) + formats = [] + if video_url: + content = content['page'] + title = str_or_none(content['details']['title_%s_t' % (url_lang, )]) + formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4') + for fmt in formats: + # prefer language to match URL + fmt_lang = fmt.get('language') + if fmt_lang == url_lang: + fmt['language_preference'] = 10 + elif not fmt_lang: + fmt['language_preference'] = -1 + else: + fmt['language_preference'] = -10 + + self._sort_formats(formats) + + category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) + + def is_live(v_type): + return (v_type == 'live') if v_type is not None else None + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))), + 'timestamp': unified_timestamp(content['details'].get('liveDateTime')), + 'category': [category] if category else None, + 'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))), + 'is_live': is_live(content['details'].get('type')), + } + + +class CPACPlaylistIE(InfoExtractor): + IE_NAME = 'cpac:playlist' + _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?P<fr>emission|rechercher))\?(?:[^&]+&)*?(?P<id>(?:id=\d+|programId=\d+|key=[^&]+))' + + _TESTS = [{ + 'url': 'https://www.cpac.ca/program?id=6', + 'info_dict': { + 'id': 'id=6', + 'title': 'Headline Politics', + 'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc', + 'info_dict': { + 'id': 'key=hudson', + 'title': 'hudson', + }, + 'playlist_count': 22, + }, { + 'url': 'https://www.cpac.ca/search?programId=50', + 'info_dict': { + 'id': 'programId=50', + 'title': '50', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.cpac.ca/emission?id=6', + 'only_matching': True, + }, { + 'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en' + pl_type, list_type = ('program', 'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult') + api_url = ( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s' + % (pl_type, video_id, )) + content = self._download_json(api_url, video_id) + entries = [] + total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1) + for page in range(1, total_pages + 1): + if page > 1: + api_url = update_url_query(api_url, {'page': '%d' % (page, ), }) + content = self._download_json( + api_url, video_id, + note='Downloading continuation - %d' % (page, ), + fatal=False) + + for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []: + episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )])) + if episode_url: + entries.append(episode_url) + + return self.playlist_result( + (self.url_result(entry) for entry in entries), + playlist_id=video_id, + playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1], + playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]), + ) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 5ef1901e4..5448acf01 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -68,6 +68,10 @@ from .anvato import AnvatoIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) from .apa import APAIE from .aparat import AparatIE from .appleconnect import AppleConnectIE @@ -91,6 +95,7 @@ from .arte import ( ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, + ArteTVCategoryIE, ) from .arnes import ArnesIE from .asiancrush import ( @@ -306,6 +311,10 @@ from .commonprotocols import ( from .condenast import CondeNastIE from .contv import CONtvIE from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) from .cozytv import CozyTVIE from .cracked import CrackedIE from .crackle import CrackleIE @@ -1392,6 +1401,11 @@ from .megatvcom import ( MegaTVComIE, MegaTVComEmbedIE, ) +from .ant1newsgr import ( + Ant1NewsGrWatchIE, + Ant1NewsGrArticleIE, + Ant1NewsGrEmbedIE, +) from .rutv import RUTVIE from .ruutu import RuutuIE from .ruv import ( diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index d975e4bdb..0ddd050ff 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -103,6 +103,7 @@ from .videopress import VideoPressIE from .rutube import RutubeIE from .glomex import GlomexEmbedIE from .megatvcom import MegaTVComEmbedIE +from .ant1newsgr import Ant1NewsGrEmbedIE from .limelight import LimelightBaseIE from .anvato import AnvatoIE from .washingtonpost import WashingtonPostIE @@ -3544,6 +3545,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key()) + # Look for ant1news.gr embeds + ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) + if ant1newsgr_urls: + return self.playlist_from_matches( + ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key()) + # Look for WashingtonPost embeds wapo_urls = WashingtonPostIE._extract_urls(webpage) if wapo_urls: diff --git a/yt_dlp/extractor/nuvid.py b/yt_dlp/extractor/nuvid.py index 7487824f9..84fb97d6a 100644 --- a/yt_dlp/extractor/nuvid.py +++ b/yt_dlp/extractor/nuvid.py @@ -1,11 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import ( parse_duration, int_or_none, - try_get, + strip_or_none, + traverse_obj, + url_or_none, ) @@ -20,14 +23,30 @@ class NuvidIE(InfoExtractor): 'title': 'italian babe', 'duration': 321.0, 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', } }, { 'url': 'https://m.nuvid.com/video/6523263', + 'md5': 'ebd22ce8e47e1d9a4d0756a15c67da52', 'info_dict': { 'id': '6523263', 'ext': 'mp4', - 'age_limit': 18, 'title': 'Slut brunette college student anal dorm', + 'duration': 421.0, + 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', + 'thumbnails': list, + } + }, { + 'url': 'http://m.nuvid.com/video/6415801/', + 'md5': '638d5ececb138d5753593f751ae3f697', + 'info_dict': { + 'id': '6415801', + 'ext': 'mp4', + 'title': 'My best friend wanted to fuck my wife for a long time', + 'duration': 1882, + 'age_limit': 18, + 'thumbnail': r're:https?://.+\.jpg', } }] @@ -46,6 +65,16 @@ class NuvidIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', }) + webpage = self._download_webpage( + 'http://m.nuvid.com/video/%s' % (video_id, ), + video_id, 'Downloading video page', fatal=False) or '' + + title = strip_or_none(video_data.get('title') or self._html_search_regex( + (r'''<span\s[^>]*?\btitle\s*=\s*(?P<q>"|'|\b)(?P<title>[^"]+)(?P=q)\s*>''', + r'''<div\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)thumb-holder video(?P=q)>\s*<h5\b[^>]*>(?P<title>[^<]+)</h5''', + r'''<span\s[^>]*?\bclass\s*=\s*(?P<q>"|'|\b)title_thumb(?P=q)>(?P<title>[^<]+)</span'''), + webpage, 'title', group='title')) + formats = [{ 'url': source, 'format_id': qualities.get(quality), @@ -55,19 +84,19 @@ class NuvidIE(InfoExtractor): self._check_formats(formats, video_id) self._sort_formats(formats) - title = video_data.get('title') - thumbnail_base_url = try_get(video_data, lambda x: x['thumbs']['url']) - thumbnail_extension = try_get(video_data, lambda x: x['thumbs']['extension']) - thumbnail_id = self._search_regex( - r'/media/videos/tmb/6523263/preview/(/d+)' + thumbnail_extension, video_data.get('poster', ''), 'thumbnail id', default=19) - thumbnail = f'{thumbnail_base_url}player/{thumbnail_id}{thumbnail_extension}' - duration = parse_duration(video_data.get('duration') or video_data.get('duration_format')) + duration = parse_duration(traverse_obj(video_data, 'duration', 'duration_format')) + thumbnails = [ + {'url': thumb_url} for thumb_url in re.findall( + r'<div\s+class\s*=\s*"video-tmb-wrap"\s*>\s*<img\s+src\s*=\s*"([^"]+)"\s*/>', webpage) + if url_or_none(thumb_url)] + if url_or_none(video_data.get('poster')): + thumbnails.append({'url': video_data['poster'], 'preference': 1}) return { 'id': video_id, 'formats': formats, 'title': title, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'duration': duration, 'age_limit': 18, } diff --git a/yt_dlp/extractor/rutv.py b/yt_dlp/extractor/rutv.py index 3de86b232..66ac32deb 100644 --- a/yt_dlp/extractor/rutv.py +++ b/yt_dlp/extractor/rutv.py @@ -6,7 +6,8 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, - int_or_none + int_or_none, + str_to_int ) @@ -179,7 +180,7 @@ class RUTVIE(InfoExtractor): 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', 'rtmp_live': True, 'ext': 'flv', - 'vbr': int(quality), + 'vbr': str_to_int(quality), 'quality': preference, } elif transport == 'm3u8': diff --git a/yt_dlp/extractor/streamcz.py b/yt_dlp/extractor/streamcz.py index 0191c77de..4cb9923e2 100644 --- a/yt_dlp/extractor/streamcz.py +++ b/yt_dlp/extractor/streamcz.py @@ -22,6 +22,20 @@ class StreamCZIE(InfoExtractor): 'title': 'Bůh', 'display_id': 'buh', 'description': 'md5:8f5f09b9b7bc67df910486cdd88f7165', + 'duration': 1369.6, + 'view_count': int, + } + }, { + 'url': 'https://www.stream.cz/kdo-to-mluvi/kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna-64087937', + 'md5': '41fd358000086a1ccdb068c77809b158', + 'info_dict': { + 'id': '64087937', + 'ext': 'mp4', + 'title': 'Kdo to mluví? Velké odhalení přináší nový pořad už od 25. srpna', + 'display_id': 'kdo-to-mluvi-velke-odhaleni-prinasi-novy-porad-uz-od-25-srpna', + 'description': 'md5:97a811000a6460266029d6c1c2ebcd59', + 'duration': 50.2, + 'view_count': int, } }, { 'url': 'https://www.stream.cz/tajemno/znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili-64147267', @@ -31,7 +45,9 @@ class StreamCZIE(InfoExtractor): 'ext': 'mp4', 'title': 'Zničehonic jim skrz střechu prolítnul záhadný předmět. Badatelé vše objasnili', 'display_id': 'znicehonic-jim-skrz-strechu-prolitnul-zahadny-predmet-badatele-vse-objasnili', - 'description': 'md5:1dcb5e010eb697dedc5942f76c5b3744', + 'description': 'md5:4b8ada6718d34bb011c4e04ca4bc19bf', + 'duration': 442.84, + 'view_count': int, } }] diff --git a/yt_dlp/extractor/tele5.py b/yt_dlp/extractor/tele5.py index 0d9cf75ca..c7beee153 100644 --- a/yt_dlp/extractor/tele5.py +++ b/yt_dlp/extractor/tele5.py @@ -1,19 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from .jwplatform import JWPlatformIE -from .nexx import NexxIE +from .dplay import DPlayIE +from ..compat import compat_urlparse from ..utils import ( - NO_DEFAULT, - parse_qs, - smuggle_url, + ExtractorError, + extract_attributes, ) -class Tele5IE(InfoExtractor): +class Tele5IE(DPlayIE): _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' _GEO_COUNTRIES = ['DE'] _TESTS = [{ @@ -28,6 +24,7 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'No longer available: "404 Seite nicht gefunden"', }, { # jwplatform, nexx unavailable 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/', @@ -42,7 +39,20 @@ class Tele5IE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [JWPlatformIE.ie_key()], + 'skip': 'No longer available, redirects to Filme page', + }, { + 'url': 'https://tele5.de/mediathek/angel-of-mine/', + 'info_dict': { + 'id': '1252360', + 'ext': 'mp4', + 'upload_date': '20220109', + 'timestamp': 1641762000, + 'title': 'Angel of Mine', + 'description': 'md5:a72546a175e1286eb3251843a52d1ad7', + }, + 'params': { + 'format': 'bestvideo', + }, }, { 'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191', 'only_matching': True, @@ -64,45 +74,18 @@ class Tele5IE(InfoExtractor): }] def _real_extract(self, url): - qs = parse_qs(url) - video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0] - - NEXX_ID_RE = r'\d{6,}' - JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}' - - def nexx_result(nexx_id): - return self.url_result( - 'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id, - ie=NexxIE.ie_key(), video_id=nexx_id) - - nexx_id = jwplatform_id = None - - if video_id: - if re.match(NEXX_ID_RE, video_id): - return nexx_result(video_id) - elif re.match(JWPLATFORM_ID_RE, video_id): - jwplatform_id = video_id - - if not nexx_id: - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def extract_id(pattern, name, default=NO_DEFAULT): - return self._html_search_regex( - (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern, - r'\s+id\s*=\s*["\']player_(%s)' % pattern, - r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name, - default=default) - - nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None) - if nexx_id: - return nexx_result(nexx_id) - - if not jwplatform_id: - jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id') - - return self.url_result( - smuggle_url( - 'jwplatform:%s' % jwplatform_id, - {'geo_countries': self._GEO_COUNTRIES}), - ie=JWPlatformIE.ie_key(), video_id=jwplatform_id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player_element = self._search_regex(r'(<hyoga-player\b[^>]+?>)', webpage, 'video player') + player_info = extract_attributes(player_element) + asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', )) + endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname + source_type = player_info.get('sourcetype') + if source_type: + endpoint = '%s-%s' % (source_type, endpoint) + try: + return self._get_disco_api_info(url, asset_id, endpoint, realm, country) + except ExtractorError as e: + if getattr(e, 'message', '') == 'Missing deviceId in context': + self.report_drm(video_id) + raise diff --git a/yt_dlp/extractor/tumblr.py b/yt_dlp/extractor/tumblr.py index a9ad2e513..a3e0e15f2 100644 --- a/yt_dlp/extractor/tumblr.py +++ b/yt_dlp/extractor/tumblr.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + traverse_obj, urlencode_postdata ) @@ -14,32 +15,131 @@ class TumblrIE(InfoExtractor): _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' _NETRC_MACHINE = 'tumblr' _LOGIN_URL = 'https://www.tumblr.com/login' + _OAUTH_URL = 'https://www.tumblr.com/api/v2/oauth2/token' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', 'info_dict': { 'id': '54196191430', 'ext': 'mp4', - 'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...', + 'title': 'md5:dfac39636969fe6bf1caa2d50405f069', 'description': 'md5:390ab77358960235b6937ab3b8528956', + 'uploader_id': 'tatianamaslanydaily', + 'uploader_url': 'https://tatianamaslanydaily.tumblr.com/', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 127, + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': ['Orphan Black', 'Tatiana Maslany', 'Interview', 'Video', 'OB S1 DVD Extras'], } }, { + 'note': 'multiple formats', 'url': 'https://maskofthedragon.tumblr.com/post/626907179849564160/mona-talking-in-english', 'md5': 'f43ff8a8861712b6cf0e0c2bd84cfc68', 'info_dict': { 'id': '626907179849564160', 'ext': 'mp4', - 'title': 'Me roast is buggered!, Mona\xa0“talking” in\xa0“english”', + 'title': 'Mona\xa0“talking” in\xa0“english”', 'description': 'md5:082a3a621530cb786ad2b7592a6d9e2c', + 'uploader_id': 'maskofthedragon', + 'uploader_url': 'https://maskofthedragon.tumblr.com/', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 7, + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': 'count:19', }, 'params': { 'format': 'hd', }, }, { + 'note': 'non-iframe video (with related posts)', + 'url': 'https://shieldfoss.tumblr.com/post/675519763813908480', + 'md5': '12bdb75661ef443bffe5a4dac1dbf118', + 'info_dict': { + 'id': '675519763813908480', + 'ext': 'mp4', + 'title': 'Shieldfoss', + 'uploader_id': 'nerviovago', + 'uploader_url': 'https://nerviovago.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': [], + } + }, { + 'note': 'dashboard only (original post)', + 'url': 'https://jujanon.tumblr.com/post/159704441298/my-baby-eating', + 'md5': '029f7c91ab386701b211e3d494d2d95e', + 'info_dict': { + 'id': '159704441298', + 'ext': 'mp4', + 'title': 'md5:ba79365861101f4911452728d2950561', + 'description': 'md5:773738196cea76b6996ec71e285bdabc', + 'uploader_id': 'jujanon', + 'uploader_url': 'https://jujanon.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': ['crabs', 'my video', 'my pets'], + } + }, { + 'note': 'dashboard only (reblog)', + 'url': 'https://bartlebyshop.tumblr.com/post/180294460076/duality-of-bird', + 'md5': '04334e7cadb1af680d162912559f51a5', + 'info_dict': { + 'id': '180294460076', + 'ext': 'mp4', + 'title': 'duality of bird', + 'description': 'duality of bird', + 'uploader_id': 'todaysbird', + 'uploader_url': 'https://todaysbird.tumblr.com/', + 'thumbnail': r're:^https?://.*\.jpg', + 'like_count': int, + 'repost_count': int, + 'age_limit': 0, + 'tags': [], + } + }, { + 'note': 'dashboard only (external)', + 'url': 'https://afloweroutofstone.tumblr.com/post/675661759168823296/the-blues-remembers-everything-the-country-forgot', + 'info_dict': { + 'id': 'q67_fd7b8SU', + 'ext': 'mp4', + 'title': 'The Blues Remembers Everything the Country Forgot', + 'alt_title': 'The Blues Remembers Everything the Country Forgot', + 'description': 'md5:1a6b4097e451216835a24c1023707c79', + 'release_date': '20201224', + 'creator': 'md5:c2239ba15430e87c3b971ba450773272', + 'uploader': 'Moor Mother - Topic', + 'upload_date': '20201223', + 'uploader_id': 'UCxrMtFBRkFvQJ_vVM4il08w', + 'uploader_url': 'http://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w', + 'thumbnail': r're:^https?://i.ytimg.com/.*', + 'channel': 'Moor Mother - Topic', + 'channel_id': 'UCxrMtFBRkFvQJ_vVM4il08w', + 'channel_url': 'https://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w', + 'channel_follower_count': int, + 'duration': 181, + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'categories': ['Music'], + 'tags': 'count:7', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'availability': 'public', + 'track': 'The Blues Remembers Everything the Country Forgot', + 'artist': 'md5:c2239ba15430e87c3b971ba450773272', + 'album': 'Brass', + 'release_year': 2020, + }, + 'add_ie': ['Youtube'], + }, { 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching', 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab', 'info_dict': { @@ -55,16 +155,47 @@ class TumblrIE(InfoExtractor): # 'add_ie': ['Vidme'], 'skip': 'dead embedded video host' }, { + 'url': 'https://prozdvoices.tumblr.com/post/673201091169681408/what-recording-voice-acting-sounds-like', + 'md5': 'a0063fc8110e6c9afe44065b4ea68177', + 'info_dict': { + 'id': 'eomhW5MLGWA', + 'ext': 'mp4', + 'title': 'what recording voice acting sounds like', + 'description': 'md5:1da3faa22d0e0b1d8b50216c284ee798', + 'uploader': 'ProZD', + 'upload_date': '20220112', + 'uploader_id': 'ProZD', + 'uploader_url': 'http://www.youtube.com/user/ProZD', + 'thumbnail': r're:^https?://i.ytimg.com/.*', + 'channel': 'ProZD', + 'channel_id': 'UC6MFZAOHXlKK1FI7V0XQVeA', + 'channel_url': 'https://www.youtube.com/channel/UC6MFZAOHXlKK1FI7V0XQVeA', + 'channel_follower_count': int, + 'duration': 20, + 'view_count': int, + 'like_count': int, + 'age_limit': 0, + 'categories': ['Film & Animation'], + 'tags': [], + 'live_status': 'not_live', + 'playable_in_embed': True, + 'availability': 'public', + }, + 'add_ie': ['Youtube'], + }, { 'url': 'https://dominustempori.tumblr.com/post/673572712813297664/youtubes-all-right-for-some-pretty-cool', - 'md5': '5e45724c70b748f64f5a1731ac72c84a', + 'md5': '203e9eb8077e3f45bfaeb4c86c1467b8', 'info_dict': { 'id': '87816359', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'Harold Ramis', + 'description': 'md5:be8e68cbf56ce0785c77f0c6c6dfaf2c', 'uploader': 'Resolution Productions Group', 'uploader_id': 'resolutionproductions', 'uploader_url': 'https://vimeo.com/resolutionproductions', + 'upload_date': '20140227', 'thumbnail': r're:^https?://i.vimeocdn.com/video/.*', + 'timestamp': 1393523719, 'duration': 291, }, 'add_ie': ['Vimeo'], @@ -107,116 +238,163 @@ class TumblrIE(InfoExtractor): 'add_ie': ['Instagram'], }] + _providers = { + 'instagram': 'Instagram', + 'vimeo': 'Vimeo', + 'vine': 'Vine', + 'youtube': 'Youtube', + } + + _ACCESS_TOKEN = None + def _real_initialize(self): + self.get_access_token() self._login() + def get_access_token(self): + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page', fatal=False) + if login_page: + self._ACCESS_TOKEN = self._search_regex( + r'"API_TOKEN":\s*"(\w+)"', login_page, 'API access token', fatal=False) + if not self._ACCESS_TOKEN: + self.report_warning('Failed to get access token; metadata will be missing and some videos may not work') + def _login(self): username, password = self._get_login_info() - if username is None: + if not username: return - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - login_form = self._hidden_inputs(login_page) - login_form.update({ - 'user[email]': username, - 'user[password]': password - }) - - response, urlh = self._download_webpage_handle( - self._LOGIN_URL, None, 'Logging in', - data=urlencode_postdata(login_form), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': self._LOGIN_URL, - }) - - # Successful login - if '/dashboard' in urlh.geturl(): + if not self._ACCESS_TOKEN: return - login_errors = self._parse_json( - self._search_regex( - r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response, - 'login errors', default='[]'), - None, fatal=False) - if login_errors: - raise ExtractorError( - 'Unable to login: %s' % login_errors[0], expected=True) - - self.report_warning('Login has probably failed') + self._download_json( + self._OAUTH_URL, None, 'Logging in', + data=urlencode_postdata({ + 'password': password, + 'grant_type': 'password', + 'username': username, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Authorization': f'Bearer {self._ACCESS_TOKEN}', + }, + errnote='Login failed', fatal=False) def _real_extract(self, url): - m_url = self._match_valid_url(url) - video_id = m_url.group('id') - blog = m_url.group('blog_name') + blog, video_id = self._match_valid_url(url).groups() - url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) + url = f'http://{blog}.tumblr.com/post/{video_id}/' webpage, urlh = self._download_webpage_handle(url, video_id) redirect_url = urlh.geturl() - if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): - raise ExtractorError( - 'This Tumblr may contain sensitive media. ' - 'Disable safe mode in your account settings ' - 'at https://www.tumblr.com/settings/account#safe_mode', - expected=True) + api_only = bool(self._search_regex( + r'(tumblr.com|^)/(safe-mode|login_required|blog/view)', + redirect_url, 'redirect', default=None)) + + if api_only and not self._ACCESS_TOKEN: + raise ExtractorError('Cannot get data for dashboard-only post without access token') + + post_json = {} + if self._ACCESS_TOKEN: + post_json = traverse_obj( + self._download_json( + f'https://www.tumblr.com/api/v2/blog/{blog}/posts/{video_id}/permalink', + video_id, headers={'Authorization': f'Bearer {self._ACCESS_TOKEN}'}, fatal=False), + ('response', 'timeline', 'elements', 0)) or {} + content_json = traverse_obj(post_json, ('trail', 0, 'content'), ('content')) or [] + video_json = next( + (item for item in content_json if item.get('type') == 'video'), {}) + media_json = video_json.get('media') or {} + if api_only and not media_json.get('url') and not video_json.get('url'): + raise ExtractorError('Failed to find video data for dashboard-only post') + + if not media_json.get('url') and video_json.get('url'): + # external video host + return self.url_result( + video_json['url'], + self._providers.get(video_json.get('provider'), 'Generic')) + + video_url = self._og_search_video_url(webpage, default=None) + duration = None + formats = [] + + # iframes can supply duration and sometimes additional formats, so check for one iframe_url = self._search_regex( - r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', + fr'src=\'(https?://www\.tumblr\.com/video/{blog}/{video_id}/[^\']+)\'', webpage, 'iframe url', default=None) - if iframe_url is None: + if iframe_url: + iframe = self._download_webpage( + iframe_url, video_id, 'Downloading iframe page', + headers={'Referer': redirect_url}) + + options = self._parse_json( + self._search_regex( + r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe, + 'hd video url', default='', group='options'), + video_id, fatal=False) + if options: + duration = int_or_none(options.get('duration')) + + hd_url = options.get('hdUrl') + if hd_url: + # there are multiple formats; extract them + # ignore other sources of width/height data as they may be wrong + sources = [] + sd_url = self._search_regex( + r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe, + 'sd video url', default=None, group='url') + if sd_url: + sources.append((sd_url, 'sd')) + sources.append((hd_url, 'hd')) + + formats = [{ + 'url': video_url, + 'format_id': format_id, + 'height': int_or_none(self._search_regex( + r'_(\d+)\.\w+$', video_url, 'height', default=None)), + 'quality': quality, + } for quality, (video_url, format_id) in enumerate(sources)] + + if not media_json.get('url') and not video_url and not iframe_url: + # external video host (but we weren't able to figure it out from the api) iframe_url = self._search_regex( r'src=["\'](https?://safe\.txmblr\.com/svc/embed/inline/[^"\']+)["\']', webpage, 'embed iframe url', default=None) return self.url_result(iframe_url or redirect_url, 'Generic') - iframe = self._download_webpage( - iframe_url, video_id, 'Downloading iframe page', - headers={'Referer': redirect_url}) - - duration = None - sources = [] - - sd_url = self._search_regex( - r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe, - 'sd video url', default=None, group='url') - if sd_url: - sources.append((sd_url, 'sd')) - - options = self._parse_json( - self._search_regex( - r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe, - 'hd video url', default='', group='options'), - video_id, fatal=False) - if options: - duration = int_or_none(options.get('duration')) - hd_url = options.get('hdUrl') - if hd_url: - sources.append((hd_url, 'hd')) - - formats = [{ - 'url': video_url, - 'ext': 'mp4', - 'format_id': format_id, - 'height': int_or_none(self._search_regex( - r'/(\d{3,4})$', video_url, 'height', default=None)), - 'quality': quality, - } for quality, (video_url, format_id) in enumerate(sources)] - + formats = formats or [{ + 'url': media_json.get('url') or video_url, + 'width': int_or_none( + media_json.get('width') or self._og_search_property('video:width', webpage, default=None)), + 'height': int_or_none( + media_json.get('height') or self._og_search_property('video:height', webpage, default=None)), + }] self._sort_formats(formats) - # The only place where you can get a title, it's not complete, - # but searching in other places doesn't work for all videos - video_title = self._html_search_regex( - r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', - webpage, 'title') + # the url we're extracting from might be an original post or it might be a reblog. + # if it's a reblog, og:description will be the reblogger's comment, not the uploader's. + # content_json is always the op, so if it exists but has no text, there's no description + if content_json: + description = '\n\n'.join(( + item.get('text') for item in content_json if item.get('type') == 'text')) or None + else: + description = self._og_search_description(webpage, default=None) + uploader_id = traverse_obj(post_json, 'reblogged_root_name', 'blog_name') return { 'id': video_id, - 'title': video_title, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'title': post_json.get('summary') or (blog if api_only else self._html_search_regex( + r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', webpage, 'title')), + 'description': description, + 'thumbnail': (traverse_obj(video_json, ('poster', 0, 'url')) + or self._og_search_thumbnail(webpage, default=None)), + 'uploader_id': uploader_id, + 'uploader_url': f'https://{uploader_id}.tumblr.com/' if uploader_id else None, 'duration': duration, + 'like_count': post_json.get('like_count'), + 'repost_count': post_json.get('reblog_count'), + 'age_limit': {True: 18, False: 0}.get(post_json.get('is_nsfw')), + 'tags': post_json.get('tags'), 'formats': formats, } diff --git a/yt_dlp/extractor/tv2dk.py b/yt_dlp/extractor/tv2dk.py index 8bd5fd640..ec5cbdf03 100644 --- a/yt_dlp/extractor/tv2dk.py +++ b/yt_dlp/extractor/tv2dk.py @@ -41,8 +41,16 @@ class TV2DKIE(InfoExtractor): 'duration': 1347, 'view_count': int, }, - 'params': { - 'skip_download': True, + 'add_ie': ['Kaltura'], + }, { + 'url': 'https://www.tv2lorry.dk/gadekamp/gadekamp-6-hoejhuse-i-koebenhavn', + 'info_dict': { + 'id': '1_7iwll9n0', + 'ext': 'mp4', + 'upload_date': '20211027', + 'title': 'Gadekamp #6 - Højhuse i København', + 'uploader_id': 'tv2lorry', + 'timestamp': 1635345229, }, 'add_ie': ['Kaltura'], }, { @@ -91,11 +99,14 @@ class TV2DKIE(InfoExtractor): add_entry(partner_id, kaltura_id) if not entries: kaltura_id = self._search_regex( - r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + (r'entry_id\s*:\s*["\']([0-9a-z_]+)', + r'\\u002FentryId\\u002F(\w+)\\u002F'), webpage, 'kaltura id') partner_id = self._search_regex( (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, 'partner id') add_entry(partner_id, kaltura_id) + if len(entries) == 1: + return entries[0] return self.playlist_result(entries) diff --git a/yt_dlp/extractor/tvopengr.py b/yt_dlp/extractor/tvopengr.py index 667f6660f..a11cdc6b0 100644 --- a/yt_dlp/extractor/tvopengr.py +++ b/yt_dlp/extractor/tvopengr.py @@ -7,7 +7,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, get_elements_text_and_html_by_attribute, - merge_dicts, + scale_thumbnails_to_max_format_width, unescapeHTML, ) @@ -78,21 +78,6 @@ class TVOpenGrWatchIE(TVOpenGrBaseIE): self._sort_formats(formats) return formats, subs - @staticmethod - def _scale_thumbnails_to_max_width(formats, thumbnails, url_width_re): - _keys = ('width', 'height') - max_dimensions = max( - [tuple(format.get(k) or 0 for k in _keys) for format in formats], - default=(0, 0)) - if not max_dimensions[0]: - return thumbnails - return [ - merge_dicts( - {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])}, - dict(zip(_keys, max_dimensions)), thumbnail) - for thumbnail in thumbnails - ] - def _real_extract(self, url): netloc, video_id, display_id = self._match_valid_url(url).group('netloc', 'id', 'slug') if netloc.find('tvopen.gr') == -1: @@ -102,7 +87,7 @@ class TVOpenGrWatchIE(TVOpenGrBaseIE): info['formats'], info['subtitles'] = self._extract_formats_and_subs( self._download_json(self._API_ENDPOINT, video_id, query={'cid': video_id}), video_id) - info['thumbnails'] = self._scale_thumbnails_to_max_width( + info['thumbnails'] = scale_thumbnails_to_max_format_width( info['formats'], info['thumbnails'], r'(?<=/imgHandler/)\d+') description, _html = next(get_elements_text_and_html_by_attribute('class', 'description', webpage)) if description and _html.startswith('<span '): diff --git a/yt_dlp/extractor/uol.py b/yt_dlp/extractor/uol.py index 4a2a97fa4..1baee0b10 100644 --- a/yt_dlp/extractor/uol.py +++ b/yt_dlp/extractor/uol.py @@ -95,7 +95,6 @@ class UOLIE(InfoExtractor): if v: query[k] = v f_url = update_url_query(f_url, query) - format_id = format_id if format_id == 'HLS': m3u8_formats = self._extract_m3u8_formats( f_url, media_id, 'mp4', 'm3u8_native', diff --git a/yt_dlp/extractor/urplay.py b/yt_dlp/extractor/urplay.py index 753ffa49c..eb2ab26e1 100644 --- a/yt_dlp/extractor/urplay.py +++ b/yt_dlp/extractor/urplay.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( dict_get, + ExtractorError, int_or_none, + ISO639Utils, + parse_age_limit, + try_get, unified_timestamp, ) @@ -23,9 +27,10 @@ class URPlayIE(InfoExtractor): 'upload_date': '20171214', 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', 'duration': 2269, - 'categories': ['Kultur & historia'], + 'categories': ['Vetenskap & teknik'], 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'], 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', + 'age_limit': 15, }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -50,11 +55,16 @@ class URPlayIE(InfoExtractor): video_id = self._match_id(url) url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - vid = int(video_id) - accessible_episodes = self._parse_json(self._html_search_regex( - r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['accessibleEpisodes'] - urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) + urplayer_data = self._search_nextjs_data(webpage, video_id, fatal=False) or {} + if urplayer_data: + urplayer_data = try_get(urplayer_data, lambda x: x['props']['pageProps']['program'], dict) + if not urplayer_data: + raise ExtractorError('Unable to parse __NEXT_DATA__') + else: + accessible_episodes = self._parse_json(self._html_search_regex( + r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'] + urplayer_data = next(e for e in accessible_episodes if e.get('id') == int_or_none(video_id)) episode = urplayer_data['title'] host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] @@ -72,11 +82,28 @@ class URPlayIE(InfoExtractor): self._sort_formats(formats) subtitles = {} - subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location") - if subs: - subtitles.setdefault('Svenska', []).append({ - 'url': subs, - }) + + def parse_lang_code(code): + "3-character language code or None (utils candidate)" + if code is None: + return + lang = code.lower() + if not ISO639Utils.long2short(lang): + lang = ISO639Utils.short2long(lang) + return lang or None + + for k, v in (urplayer_data['streamingInfo'].get('sweComplete') or {}).items(): + if (k in ('sd', 'hd') or not isinstance(v, dict)): + continue + lang, sttl_url = (v.get(kk) for kk in ('language', 'location', )) + if not sttl_url: + continue + lang = parse_lang_code(lang) + if not lang: + continue + sttl = subtitles.get(lang) or [] + sttl.append({'ext': k, 'url': sttl_url, }) + subtitles[lang] = sttl image = urplayer_data.get('image') or {} thumbnails = [] @@ -98,7 +125,6 @@ class URPlayIE(InfoExtractor): return { 'id': video_id, - 'subtitles': subtitles, 'title': '%s : %s' % (series_title, episode) if series_title else episode, 'description': urplayer_data.get('description'), 'thumbnails': thumbnails, @@ -111,4 +137,7 @@ class URPlayIE(InfoExtractor): 'season': series.get('label'), 'episode': episode, 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), + 'age_limit': parse_age_limit(min(try_get(a, lambda x: x['from'], int) or 0 + for a in urplayer_data.get('ageRanges', []))), + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/videa.py b/yt_dlp/extractor/videa.py index 512ade7af..90d705092 100644 --- a/yt_dlp/extractor/videa.py +++ b/yt_dlp/extractor/videa.py @@ -111,7 +111,6 @@ class VideaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_page = self._download_webpage(url, video_id) if 'videa.hu/player' in url: @@ -146,7 +145,7 @@ class VideaIE(InfoExtractor): compat_b64decode(b64_info), key), video_id) video = xpath_element(info, './video', 'video') - if not video: + if video is None: raise ExtractorError(xpath_element( info, './error', fatal=True), expected=True) sources = xpath_element( @@ -163,9 +162,9 @@ class VideaIE(InfoExtractor): source_exp = source.get('exp') if not (source_url and source_name): continue - hash_value = None - if hash_values: - hash_value = xpath_text(hash_values, 'hash_value_' + source_name) + hash_value = ( + xpath_text(hash_values, 'hash_value_' + source_name) + if hash_values is not None else None) if hash_value and source_exp: source_url = update_url_query(source_url, { 'md5': hash_value, diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 77ffb4bfb..458a751fe 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -636,6 +636,24 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/392479337/a52724358e', 'only_matching': True, }, + { + # similar, but all numeric: ID must be 581039021, not 9603038895 + # issue #29690 + 'url': 'https://vimeo.com/581039021/9603038895', + 'info_dict': { + 'id': '581039021', + # these have to be provided but we don't care + 'ext': 'mp4', + 'timestamp': 1627621014, + 'title': 're:.+', + 'uploader_id': 're:.+', + 'uploader': 're:.+', + 'upload_date': r're:\d+', + }, + 'params': { + 'skip_download': True, + }, + } # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header ] diff --git a/yt_dlp/extractor/wdr.py b/yt_dlp/extractor/wdr.py index 45bfe5f3a..ef58a66c3 100644 --- a/yt_dlp/extractor/wdr.py +++ b/yt_dlp/extractor/wdr.py @@ -10,6 +10,7 @@ from ..compat import ( ) from ..utils import ( determine_ext, + dict_get, ExtractorError, js_to_json, strip_jsonp, @@ -22,13 +23,14 @@ from ..utils import ( class WDRIE(InfoExtractor): + __API_URL_TPL = '//deviceids-medp.wdr.de/ondemand/%s/%s' _VALID_URL = r'''(?x)https?:// (?:deviceids-medp\.wdr\.de/ondemand/\d+/| kinder\.wdr\.de/(?!mediathek/)[^#?]+-) (?P<id>\d+)\.(?:js|assetjsonp) ''' _GEO_COUNTRIES = ['DE'] - _TEST = { + _TESTS = [{ 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', 'info_dict': { 'id': 'mdb-1557833', @@ -36,11 +38,19 @@ class WDRIE(InfoExtractor): 'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe', 'upload_date': '20180112', }, - } + }] + + def _asset_url(self, wdr_id): + id_len = max(len(wdr_id), 5) + return ''.join(('https:', self.__API_URL_TPL % (wdr_id[:id_len - 4], wdr_id, ), '.js')) def _real_extract(self, url): video_id = self._match_id(url) + if url.startswith('wdr:'): + video_id = url[4:] + url = self._asset_url(video_id) + metadata = self._download_json( url, video_id, transform_source=strip_jsonp) @@ -126,10 +136,10 @@ class WDRIE(InfoExtractor): } -class WDRPageIE(InfoExtractor): - _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' +class WDRPageIE(WDRIE): + _MAUS_REGEX = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/)*?(?P<maus_id>[^/?#.]+)(?:/?|/index\.php5|\.php5)$' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html' - _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _MAUS_REGEX _TESTS = [ { @@ -170,11 +180,11 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { - 'id': 'mdb-1406149', + 'id': 'mdb-2296252', 'ext': 'mp4', - 'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': r're:^WDR Fernsehen im Livestream (?:\(nur in Deutschland erreichbar\) )?[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'alt_title': 'WDR Fernsehen Live', - 'upload_date': '20150101', + 'upload_date': '20201112', 'is_live': True, }, 'params': { @@ -183,7 +193,7 @@ class WDRPageIE(InfoExtractor): }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 7, + 'playlist_mincount': 6, 'info_dict': { 'id': 'aktuelle-stunde-120', }, @@ -191,10 +201,10 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', 'info_dict': { - 'id': 'mdb-1552552', + 'id': 'mdb-2627637', 'ext': 'mp4', 'upload_date': 're:^[0-9]{8}$', - 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', + 'title': 're:^Die Sendung (?:mit der Maus )?vom [0-9.]{10}$', }, 'skip': 'The id changes from week to week because of the new episode' }, @@ -207,6 +217,7 @@ class WDRPageIE(InfoExtractor): 'upload_date': '20130919', 'title': 'Sachgeschichte - Achterbahn ', }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', @@ -232,6 +243,7 @@ class WDRPageIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', @@ -245,7 +257,7 @@ class WDRPageIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) - display_id = mobj.group('display_id') + display_id = dict_get(mobj.groupdict(), ('display_id', 'maus_id'), 'wdrmaus') webpage = self._download_webpage(url, display_id) entries = [] @@ -271,6 +283,14 @@ class WDRPageIE(InfoExtractor): jsonp_url = try_get( media_link_obj, lambda x: x['mediaObj']['url'], compat_str) if jsonp_url: + # metadata, or player JS with ['ref'] giving WDR id, or just media, perhaps + clip_id = media_link_obj['mediaObj'].get('ref') + if jsonp_url.endswith('.assetjsonp'): + asset = self._download_json( + jsonp_url, display_id, fatal=False, transform_source=strip_jsonp) + clip_id = try_get(asset, lambda x: x['trackerData']['trackerClipId'], compat_str) + if clip_id: + jsonp_url = self._asset_url(clip_id[4:]) entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key())) # Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html) @@ -290,16 +310,14 @@ class WDRPageIE(InfoExtractor): class WDRElefantIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)wdrmaus\.de/elefantenseite/#(?P<id>.+)' _TEST = { - 'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015', + 'url': 'http://www.wdrmaus.de/elefantenseite/#elefantenkino_wippe', + # adaptive stream: unstable file MD5 'info_dict': { - 'title': 'Folge Oster-Spezial 2015', - 'id': 'mdb-1088195', + 'title': 'Wippe', + 'id': 'mdb-1198320', 'ext': 'mp4', 'age_limit': None, - 'upload_date': '20150406' - }, - 'params': { - 'skip_download': True, + 'upload_date': '20071003' }, } @@ -334,6 +352,7 @@ class WDRMobileIE(InfoExtractor): /[0-9]+/[0-9]+/ (?P<id>[0-9]+)_(?P<title>[0-9]+)''' IE_NAME = 'wdr:mobile' + _WORKING = False # no such domain _TEST = { 'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4', 'info_dict': { diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4e812af99..ee0277fd7 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3410,11 +3410,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if caption_track.get('kind') != 'asr': trans_code += f'-{lang_code}' trans_name += format_field(lang_name, template=' from %s') - process_language( - automatic_captions, base_url, trans_code, trans_name, {'tlang': trans_code}) + # Add an "-orig" label to the original language so that it can be distinguished. + # The subs are returned without "-orig" as well for compatibility if lang_code == f'a-{trans_code}': process_language( - automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {'tlang': trans_code}) + automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {}) + # Setting tlang=lang returns damaged subtitles. + # Not using lang_code == f'a-{trans_code}' here for future-proofing + orig_lang = parse_qs(base_url).get('lang', [None])[-1] + process_language(automatic_captions, base_url, trans_code, trans_name, + {} if orig_lang == trans_code else {'tlang': trans_code}) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index 0aa5184f7..5f4d26622 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -136,6 +136,34 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' _TESTS = [{ + # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613948400, + 'upload_date': '20210221', + }, + 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', + }, { + # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html + 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', + 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + 'info_dict': { + 'id': '141007_ab18_10wochensommer_film', + 'ext': 'mp4', + 'title': 'Ab 18! - 10 Wochen Sommer', + 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', + 'duration': 2660, + 'timestamp': 1608604200, + 'upload_date': '20201222', + }, + 'skip': 'No longer available: "Diese Seite wurde leider nicht gefunden"', + }, { 'url': 'https://www.zdf.de/nachrichten/heute-journal/heute-journal-vom-30-12-2021-100.html', 'info_dict': { 'id': '211230_sendung_hjo', @@ -195,13 +223,16 @@ class ZDFIE(ZDFBaseIE): 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', 'only_matching': True, }, { - # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html - 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', - 'only_matching': True - }, { - # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html - 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', - 'only_matching': True + 'url': 'https://www.zdf.de/arte/todliche-flucht/page-video-artede-toedliche-flucht-16-100.html', + 'info_dict': { + 'id': 'video_artede_083871-001-A', + 'ext': 'mp4', + 'title': 'Tödliche Flucht (1/6)', + 'description': 'md5:e34f96a9a5f8abd839ccfcebad3d5315', + 'duration': 3193.0, + 'timestamp': 1641355200, + 'upload_date': '20220105', + }, }] def _extract_entry(self, url, player, content, video_id): diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index be0c69d8f..87463c999 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5271,6 +5271,28 @@ def join_nonempty(*values, delim='-', from_dict=None): return delim.join(map(str, filter(None, values))) +def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re): + """ + Find the largest format dimensions in terms of video width and, for each thumbnail: + * Modify the URL: Match the width with the provided regex and replace with the former width + * Update dimensions + + This function is useful with video services that scale the provided thumbnails on demand + """ + _keys = ('width', 'height') + max_dimensions = max( + [tuple(format.get(k) or 0 for k in _keys) for format in formats], + default=(0, 0)) + if not max_dimensions[0]: + return thumbnails + return [ + merge_dicts( + {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])}, + dict(zip(_keys, max_dimensions)), thumbnail) + for thumbnail in thumbnails + ] + + def parse_http_range(range): """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """ if not range: |