diff options
Diffstat (limited to 'hypervideo_dl/extractor/zdf.py')
-rw-r--r-- | hypervideo_dl/extractor/zdf.py | 150 |
1 files changed, 72 insertions, 78 deletions
diff --git a/hypervideo_dl/extractor/zdf.py b/hypervideo_dl/extractor/zdf.py index 5f4d266..fca426a 100644 --- a/hypervideo_dl/extractor/zdf.py +++ b/hypervideo_dl/extractor/zdf.py @@ -1,18 +1,16 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + NO_DEFAULT, + ExtractorError, determine_ext, + extract_attributes, float_or_none, int_or_none, join_nonempty, merge_dicts, - NO_DEFAULT, - orderedSet, parse_codecs, qualities, traverse_obj, @@ -72,6 +70,7 @@ class ZDFBaseIE(InfoExtractor): f.update({ 'url': format_url, 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')), + 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)) }) new_formats = [f] formats.extend(merge_dicts(f, { @@ -111,7 +110,6 @@ class ZDFBaseIE(InfoExtractor): 'class': track.get('class'), 'language': track.get('language'), }) - self._sort_formats(formats, ('hasaud', 'res', 'quality', 'language_preference')) duration = float_or_none(try_get( ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) @@ -122,6 +120,7 @@ class ZDFBaseIE(InfoExtractor): 'duration': duration, 'formats': formats, 'subtitles': self._extract_subtitles(ptmd), + '_format_sort_fields': ('tbr', 'res', 'quality', 'language_preference'), } def _extract_player(self, webpage, video_id, fatal=True): @@ -190,7 +189,7 @@ class ZDFIE(ZDFBaseIE): }, }, { 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', - 'md5': '3d6f1049e9682178a11c54b91f3dd065', + 'md5': '1b93bdec7d02fc0b703c5e7687461628', 'info_dict': { 'ext': 'mp4', 'id': 'video_funk_1770473', @@ -233,23 +232,34 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1641355200, 'upload_date': '20220105', }, + 'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"' + }, { + 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', + 'info_dict': { + 'id': '191205_1800_sendung_sok8', + 'ext': 'mp4', + 'title': 'Das Geld anderer Leute', + 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', + 'duration': 2581.0, + 'timestamp': 1654790700, + 'upload_date': '20220609', + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350', + }, }] def _extract_entry(self, url, player, content, video_id): title = content.get('title') or content['teaserHeadline'] t = content['mainVideoContent']['http://zdf.de/rels/target'] - - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') - + ptmd_path = traverse_obj(t, ( + (('streams', 'default'), None), + ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template') + ), get_all=False) if not ptmd_path: - ptmd_path = traverse_obj( - t, ('streams', 'default', 'http://zdf.de/rels/streams/ptmd-template'), - 'http://zdf.de/rels/streams/ptmd-template').replace( - '{playerId}', 'ngplayer_2_4') + raise ExtractorError('Could not extract ptmd_path') info = self._extract_ptmd( - urljoin(url, ptmd_path), video_id, player['apiToken'], url) + urljoin(url, ptmd_path.replace('{playerId}', 'ngplayer_2_4')), video_id, player['apiToken'], url) thumbnails = [] layouts = try_get( @@ -298,16 +308,16 @@ class ZDFIE(ZDFBaseIE): 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, video_id) - document = video['document'] - - title = document['titel'] - content_id = document['basename'] - formats = [] - format_urls = set() - for f in document['formitaeten']: - self._extract_format(content_id, formats, format_urls, f) - self._sort_formats(formats) + formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) + document = formitaeten and video['document'] + if formitaeten: + title = document['titel'] + content_id = document['basename'] + + format_urls = set() + for f in formitaeten or []: + self._extract_format(content_id, formats, format_urls, f) thumbnails = [] teaser_bild = document.get('teaserBild') @@ -353,9 +363,9 @@ class ZDFChannelIE(ZDFBaseIE): 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { 'id': 'das-aktuelle-sportstudio', - 'title': 'das aktuelle sportstudio | ZDF', + 'title': 'das aktuelle sportstudio', }, - 'playlist_mincount': 23, + 'playlist_mincount': 18, }, { 'url': 'https://www.zdf.de/dokumentation/planet-e', 'info_dict': { @@ -364,6 +374,14 @@ class ZDFChannelIE(ZDFBaseIE): }, 'playlist_mincount': 50, }, { + 'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest', + 'info_dict': { + 'id': 'aktenzeichen-xy-ungeloest', + 'title': 'Aktenzeichen XY... ungelöst', + 'entries': "lambda x: not any('xy580-fall1-kindermoerder-gesucht-100' in e['url'] for e in x)", + }, + 'playlist_mincount': 2, + }, { 'url': 'https://www.zdf.de/filme/taunuskrimi/', 'only_matching': True, }] @@ -372,60 +390,36 @@ class ZDFChannelIE(ZDFBaseIE): def suitable(cls, url): return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url) + def _og_search_title(self, webpage, fatal=False): + title = super(ZDFChannelIE, self)._og_search_title(webpage, fatal=fatal) + return re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', title or '')[0] or None + def _real_extract(self, url): channel_id = self._match_id(url) webpage = self._download_webpage(url, channel_id) - entries = [ - self.url_result(item_url, ie=ZDFIE.ie_key()) - for item_url in orderedSet(re.findall( - r'data-plusbar-url=["\'](http.+?\.html)', webpage))] - - return self.playlist_result( - entries, channel_id, self._og_search_title(webpage, fatal=False)) - - r""" - player = self._extract_player(webpage, channel_id) - - channel_id = self._search_regex( - r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage, - 'channel id', group='id') - - channel = self._call_api( - 'https://api.zdf.de/content/documents/%s.json' % channel_id, - player, url, channel_id) - - items = [] - for module in channel['module']: - for teaser in try_get(module, lambda x: x['teaser'], list) or []: - t = try_get( - teaser, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - items.extend(try_get( - t, - lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - items.extend(try_get( - module, - lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - - entries = [] - entry_urls = set() - for item in items: - t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - sharing_url = t.get('http://zdf.de/rels/sharing-url') - if not sharing_url or not isinstance(sharing_url, compat_str): - continue - if sharing_url in entry_urls: - continue - entry_urls.add(sharing_url) - entries.append(self.url_result( - sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id'))) - - return self.playlist_result(entries, channel_id, channel.get('title')) - """ + matches = re.finditer( + r'''<div\b[^>]*?\sdata-plusbar-id\s*=\s*(["'])(?P<p_id>[\w-]+)\1[^>]*?\sdata-plusbar-url=\1(?P<url>%s)\1''' % ZDFIE._VALID_URL, + webpage) + + if self._downloader.params.get('noplaylist', False): + entry = next( + (self.url_result(m.group('url'), ie=ZDFIE.ie_key()) for m in matches), + None) + self.to_screen('Downloading just the main video because of --no-playlist') + if entry: + return entry + else: + self.to_screen('Downloading playlist %s - add --no-playlist to download just the main video' % (channel_id, )) + + def check_video(m): + v_ref = self._search_regex( + r'''(<a\b[^>]*?\shref\s*=[^>]+?\sdata-target-id\s*=\s*(["'])%s\2[^>]*>)''' % (m.group('p_id'), ), + webpage, 'check id', default='') + v_ref = extract_attributes(v_ref) + return v_ref.get('data-target-video-type') != 'novideo' + + return self.playlist_from_matches( + (m.group('url') for m in matches if check_video(m)), + channel_id, self._og_search_title(webpage, fatal=False)) |