diff options
Diffstat (limited to 'youtube_dl/extractor/ard.py')
-rw-r--r-- | youtube_dl/extractor/ard.py | 282 |
1 files changed, 130 insertions, 152 deletions
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 5b7b2dd6d..8adae4644 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import json import re from .common import InfoExtractor @@ -23,28 +22,66 @@ from ..utils import ( from ..compat import compat_etree_fromstring -class ARDMediathekBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['DE'] +class ARDMediathekIE(InfoExtractor): + IE_NAME = 'ARD:mediathek' + _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + + _TESTS = [{ + # available till 26.07.2022 + 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', + 'info_dict': { + 'id': '44726822', + 'ext': 'mp4', + 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', + 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', + 'duration': 1740, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', + 'only_matching': True, + }, { + # audio + 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', + 'only_matching': True, + }, { + 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', + 'only_matching': True, + }, { + # audio + 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', + 'only_matching': True, + }, { + 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) def _extract_media_info(self, media_info_url, webpage, video_id): media_info = self._download_json( media_info_url, video_id, 'Downloading media JSON') - return self._parse_media_info(media_info, video_id, '"fsk"' in webpage) - def _parse_media_info(self, media_info, video_id, fsk): formats = self._extract_formats(media_info, video_id) if not formats: - if fsk: + if '"fsk"' in webpage: raise ExtractorError( 'This video is only available after 20:00', expected=True) elif media_info.get('_geoblocked'): - self.raise_geo_restricted( - 'This video is not available due to geoblocking', - countries=self._GEO_COUNTRIES) + raise ExtractorError('This video is not available due to geo restriction', expected=True) self._sort_formats(formats) + duration = int_or_none(media_info.get('_duration')) + thumbnail = media_info.get('_previewImage') + is_live = media_info.get('_isLive') is True + subtitles = {} subtitle_url = media_info.get('_subtitleUrl') if subtitle_url: @@ -55,9 +92,9 @@ class ARDMediathekBaseIE(InfoExtractor): return { 'id': video_id, - 'duration': int_or_none(media_info.get('_duration')), - 'thumbnail': media_info.get('_previewImage'), - 'is_live': media_info.get('_isLive') is True, + 'duration': duration, + 'thumbnail': thumbnail, + 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, } @@ -86,11 +123,11 @@ class ARDMediathekBaseIE(InfoExtractor): update_url_query(stream_url, { 'hdcore': '3.1.1', 'plugin': 'aasp-3.1.1.69.124' - }), video_id, f4m_id='hds', fatal=False)) + }), + video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: if server and server.startswith('rtmp'): f = { @@ -103,9 +140,7 @@ class ARDMediathekBaseIE(InfoExtractor): 'url': stream_url, 'format_id': 'a%s-%s-%s' % (num, ext, quality) } - m = re.search( - r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', - stream_url) + m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url) if m: f.update({ 'width': int(m.group('width')), @@ -116,48 +151,6 @@ class ARDMediathekBaseIE(InfoExtractor): formats.append(f) return formats - -class ARDMediathekIE(ARDMediathekBaseIE): - IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' - - _TESTS = [{ - # available till 26.07.2022 - 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', - 'info_dict': { - 'id': '44726822', - 'ext': 'mp4', - 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', - 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', - 'duration': 1740, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, { - 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', - 'only_matching': True, - }, { - # audio - 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', - 'only_matching': True, - }, { - 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', - 'only_matching': True, - }, { - # audio - 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', - 'only_matching': True, - }, { - 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', - 'only_matching': True, - }] - - @classmethod - def suitable(cls, url): - return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) - def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) @@ -249,7 +242,7 @@ class ARDMediathekIE(ARDMediathekBaseIE): class ARDIE(InfoExtractor): - _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html' + _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html' _TESTS = [{ # available till 14.02.2019 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', @@ -264,9 +257,6 @@ class ARDIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html', - 'only_matching': True, - }, { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 'only_matching': True, }] @@ -312,32 +302,22 @@ class ARDIE(InfoExtractor): } -class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)' +class ARDBetaMediathekIE(InfoExtractor): + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?' _TESTS = [{ - 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', + 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', + 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', 'info_dict': { 'display_id': 'die-robuste-roswita', - 'id': '70153354', - 'title': 'Die robuste Roswita', + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', + 'title': 'Tatort: Die robuste Roswita', 'description': r're:^Der Mord.*trüber ist als die Ilm.', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', - 'timestamp': 1577047500, - 'upload_date': '20191222', + 'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', + 'upload_date': '20180826', 'ext': 'mp4', }, }, { - 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'only_matching': True, - }, { - 'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/', - 'only_matching': True, - }, { - 'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/', - 'only_matching': True, - }, { 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', 'only_matching': True, }, { @@ -348,75 +328,73 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id - - player_page = self._download_json( - 'https://api.ardmediathek.de/public-gateway', - display_id, data=json.dumps({ - 'query': '''{ - playerPage(client:"%s", clipId: "%s") { - blockedByFsk - broadcastedOn - maturityContentRating - mediaCollection { - _duration - _geoblocked - _isLive - _mediaArray { - _mediaStreamArray { - _quality - _server - _stream - } - } - _previewImage - _subtitleUrl - _type - } - show { - title - } - synopsis - title - tracking { - atiCustomVars { - contentId - } - } - } -}''' % (mobj.group('client'), video_id), - }).encode(), headers={ - 'Content-Type': 'application/json' - })['data']['playerPage'] - title = player_page['title'] - content_id = str_or_none(try_get( - player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) - media_collection = player_page.get('mediaCollection') or {} - if not media_collection and content_id: - media_collection = self._download_json( - 'https://www.ardmediathek.de/play/media/' + content_id, - content_id, fatal=False) or {} - info = self._parse_media_info( - media_collection, content_id or video_id, - player_page.get('blockedByFsk')) - age_limit = None - description = player_page.get('synopsis') - maturity_content_rating = player_page.get('maturityContentRating') - if maturity_content_rating: - age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) - if not age_limit and description: - age_limit = int_or_none(self._search_regex( - r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) - info.update({ - 'age_limit': age_limit, + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') + data = self._parse_json(data_json, display_id) + + res = { + 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': description, - 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), - 'series': try_get(player_page, lambda x: x['show']['title']), + } + formats = [] + subtitles = {} + geoblocked = False + for widget in data.values(): + if widget.get('_geoblocked') is True: + geoblocked = True + if '_duration' in widget: + res['duration'] = int_or_none(widget['_duration']) + if 'clipTitle' in widget: + res['title'] = widget['clipTitle'] + if '_previewImage' in widget: + res['thumbnail'] = widget['_previewImage'] + if 'broadcastedOn' in widget: + res['timestamp'] = unified_timestamp(widget['broadcastedOn']) + if 'synopsis' in widget: + res['description'] = widget['synopsis'] + subtitle_url = url_or_none(widget.get('_subtitleUrl')) + if subtitle_url: + subtitles.setdefault('de', []).append({ + 'ext': 'ttml', + 'url': subtitle_url, + }) + if '_quality' in widget: + format_url = url_or_none(try_get( + widget, lambda x: x['_stream']['json'][0])) + if not format_url: + continue + ext = determine_ext(format_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + '?hdcore=3.11.0', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', + fatal=False)) + else: + # HTTP formats are not available when geoblocked is True, + # other formats are fine though + if geoblocked: + continue + quality = str_or_none(widget.get('_quality')) + formats.append({ + 'format_id': ('http-' + quality) if quality else 'http', + 'url': format_url, + 'preference': 10, # Plain HTTP, that's nice + }) + + if not formats and geoblocked: + self.raise_geo_restricted( + msg='This video is not available due to geoblocking', + countries=['DE']) + + self._sort_formats(formats) + res.update({ + 'subtitles': subtitles, + 'formats': formats, }) - return info + + return res |