aboutsummaryrefslogtreecommitdiffstats
path: root/hypervideo_dl/extractor/ard.py
diff options
context:
space:
mode:
Diffstat (limited to 'hypervideo_dl/extractor/ard.py')
-rw-r--r--hypervideo_dl/extractor/ard.py76
1 files changed, 58 insertions, 18 deletions
diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py
index 048d30f..7ea339b 100644
--- a/hypervideo_dl/extractor/ard.py
+++ b/hypervideo_dl/extractor/ard.py
@@ -280,7 +280,7 @@ class ARDMediathekIE(ARDMediathekBaseIE):
info.update({
'id': video_id,
- 'title': self._live_title(title) if info.get('is_live') else title,
+ 'title': title,
'description': description,
'thumbnail': thumbnail,
})
@@ -376,9 +376,24 @@ class ARDIE(InfoExtractor):
formats.append(f)
self._sort_formats(formats)
+ _SUB_FORMATS = (
+ ('./dataTimedText', 'ttml'),
+ ('./dataTimedTextNoOffset', 'ttml'),
+ ('./dataTimedTextVtt', 'vtt'),
+ )
+
+ subtitles = {}
+ for subsel, subext in _SUB_FORMATS:
+ for node in video_node.findall(subsel):
+ subtitles.setdefault('de', []).append({
+ 'url': node.attrib['url'],
+ 'ext': subext,
+ })
+
return {
'id': xpath_text(video_node, './videoId', default=display_id),
'formats': formats,
+ 'subtitles': subtitles,
'display_id': display_id,
'title': video_node.find('./title').text,
'duration': parse_duration(video_node.find('./duration').text),
@@ -388,7 +403,14 @@ class ARDIE(InfoExtractor):
class ARDBetaMediathekIE(ARDMediathekBaseIE):
- _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
+ _VALID_URL = r'''(?x)https://
+ (?:(?:beta|www)\.)?ardmediathek\.de/
+ (?:(?P<client>[^/]+)/)?
+ (?:player|live|video|(?P<playlist>sendung|sammlung))/
+ (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
+ (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
+ (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
+
_TESTS = [{
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
@@ -403,6 +425,25 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'upload_date': '20200805',
'ext': 'mp4',
},
+ 'skip': 'Error',
+ }, {
+ 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
+ 'md5': 'f1837e563323b8a642a8ddeff0131f51',
+ 'info_dict': {
+ 'id': '10049223',
+ 'ext': 'mp4',
+ 'title': 'tagesschau, 20:00 Uhr',
+ 'timestamp': 1636398000,
+ 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
+ 'upload_date': '20211108',
+ },
+ }, {
+ 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1',
+ 'playlist_count': 6,
+ 'info_dict': {
+ 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw',
+ 'title': 'beforeigners/beforeigners/staffel-1',
+ },
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
'only_matching': True,
@@ -426,6 +467,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
# playlist of type 'sammlung'
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
+ 'only_matching': True,
}]
def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
@@ -522,23 +569,16 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
break
pageNumber = pageNumber + 1
- return self.playlist_result(entries, playlist_title=display_id)
+ return self.playlist_result(entries, playlist_id, playlist_title=display_id)
def _real_extract(self, url):
- mobj = self._match_valid_url(url)
- video_id = mobj.group('video_id')
- display_id = mobj.group('display_id')
- if display_id:
- display_id = display_id.rstrip('/')
- if not display_id:
- display_id = video_id
-
- if mobj.group('mode') in ('sendung', 'sammlung'):
- # this is a playlist-URL
- return self._ARD_extract_playlist(
- url, video_id, display_id,
- mobj.group('client'),
- mobj.group('mode'))
+ video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
+ 'id', 'display_id', 'playlist', 'client', 'season')
+ display_id, client = display_id or video_id, client or 'ard'
+
+ if playlist_type:
+ # TODO: Extract only specified season
+ return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
player_page = self._download_json(
'https://api.ardmediathek.de/public-gateway',
@@ -574,7 +614,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
}
}
}
-}''' % (mobj.group('client'), video_id),
+}''' % (client, video_id),
}).encode(), headers={
'Content-Type': 'application/json'
})['data']['playerPage']