about | summary | refs | log | tree | commit | diff | stats
path: root/yt_dlp/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'yt_dlp/extractor')
-rw-r--r-- yt_dlp/extractor/adobepass.py 26
-rw-r--r-- yt_dlp/extractor/banbye.py 153
-rw-r--r-- yt_dlp/extractor/bilibili.py 67
-rw-r--r-- yt_dlp/extractor/common.py 12
-rw-r--r-- yt_dlp/extractor/ellentube.py 3
-rw-r--r-- yt_dlp/extractor/extractors.py 20
-rw-r--r-- yt_dlp/extractor/fc2.py 1
-rw-r--r-- yt_dlp/extractor/generic.py 22
-rw-r--r-- yt_dlp/extractor/go.py 8
-rw-r--r-- yt_dlp/extractor/itprotv.py 141
-rw-r--r-- yt_dlp/extractor/lastfm.py 129
-rw-r--r-- yt_dlp/extractor/rai.py 5
-rw-r--r-- yt_dlp/extractor/tver.py 19
-rw-r--r-- yt_dlp/extractor/viki.py 2
-rw-r--r-- yt_dlp/extractor/vimeo.py 100
-rw-r--r-- yt_dlp/extractor/viu.py 219
-rw-r--r-- yt_dlp/extractor/wasdtv.py 161
-rw-r--r-- yt_dlp/extractor/youtube.py 89
18 files changed, 977 insertions, 200 deletions
diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py
index f0eba8844..5d98301b8 100644
--- a/yt_dlp/extractor/adobepass.py
+++ b/yt_dlp/extractor/adobepass.py
@@ -1650,21 +1650,27 @@ class AdobePassIE(InfoExtractor):
hidden_data = self._hidden_inputs(first_bookend_page)
hidden_data['history_val'] = 1
- provider_login_redirect_page = self._download_webpage(
+ provider_login_redirect_page_res = self._download_webpage_handle(
urlh.geturl(), video_id, 'Sending First Bookend',
query=hidden_data)
- provider_tryauth_url = self._html_search_regex(
- r'url:\s*[\'"]([^\'"]+)', provider_login_redirect_page, 'ajaxurl')
+ provider_login_redirect_page, urlh = provider_login_redirect_page_res
- provider_tryauth_page = self._download_webpage(
- provider_tryauth_url, video_id, 'Submitting TryAuth',
- query=hidden_data)
+ # Some website partners seem to not have the extra ajaxurl redirect step, so we check if we already
+ # have the login prompt or not
+ if 'id="password" type="password" name="password"' in provider_login_redirect_page:
+ provider_login_page_res = provider_login_redirect_page_res
+ else:
+ provider_tryauth_url = self._html_search_regex(
+ r'url:\s*[\'"]([^\'"]+)', provider_login_redirect_page, 'ajaxurl')
+ provider_tryauth_page = self._download_webpage(
+ provider_tryauth_url, video_id, 'Submitting TryAuth',
+ query=hidden_data)
- provider_login_page_res = self._download_webpage_handle(
- f'https://authorize.suddenlink.net/saml/module.php/authSynacor/login.php?AuthState={provider_tryauth_page}',
- video_id, 'Getting Login Page',
- query=hidden_data)
+ provider_login_page_res = self._download_webpage_handle(
+ f'https://authorize.suddenlink.net/saml/module.php/authSynacor/login.php?AuthState={provider_tryauth_page}',
+ video_id, 'Getting Login Page',
+ query=hidden_data)
provider_association_redirect, urlh = post_form(
provider_login_page_res, 'Logging in', {
diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py
new file mode 100644
index 000000000..3d4d36ec3
--- /dev/null
+++ b/yt_dlp/extractor/banbye.py
@@ -0,0 +1,153 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import math
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_urlparse,
+ compat_parse_qs,
+)
+from ..utils import (
+ format_field,
+ InAdvancePagedList,
+ traverse_obj,
+ unified_timestamp,
+)
+
+
+class BanByeBaseIE(InfoExtractor):
+    """Shared endpoints and playlist helpers for the BanBye extractors."""
+
+    _API_BASE = 'https://api.banbye.com'
+    _CDN_BASE = 'https://cdn.banbye.com'
+    _VIDEO_BASE = 'https://banbye.com/watch'
+
+    @staticmethod
+    def _extract_playlist_id(url, param='playlist'):
+        """Return the first value of query parameter `param` in `url`, or None if it is absent."""
+        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        values = query.get(param, [None])
+        return values[0]
+
+    def _extract_playlist(self, playlist_id):
+        """Download the playlist JSON and return a playlist result with one URL entry per video id."""
+        playlist = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id)
+        entries = []
+        for video_id in playlist['videoIds']:
+            entries.append(self.url_result(f'{self._VIDEO_BASE}/{video_id}', BanByeIE))
+        return self.playlist_result(entries, playlist_id, playlist.get('name'))
+
+
+class BanByeIE(BanByeBaseIE):
+    """Extractor for a single BanBye video, or its playlist when `playlistId` is in the URL."""
+
+    # The hostname dot is escaped: an unescaped '.' matches any character,
+    # so look-alike hosts such as "banbyeXcom" would otherwise be accepted.
+    _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?watch/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T',
+        'md5': '2f4ea15c5ca259a73d909b2cfd558eb5',
+        'info_dict': {
+            'id': 'v_ytfmvkVYLE8T',
+            'ext': 'mp4',
+            'title': 'md5:5ec098f88a0d796f987648de6322ba0f',
+            'description': 'md5:4d94836e73396bc18ef1fa0f43e5a63a',
+            'uploader': 'wRealu24',
+            'channel_id': 'ch_wrealu24',
+            'channel_url': 'https://banbye.com/channel/ch_wrealu24',
+            'timestamp': 1647604800,
+            'upload_date': '20220318',
+            'duration': 1931,
+            'thumbnail': r're:https?://.*\.webp',
+            'tags': 'count:5',
+            'like_count': int,
+            'dislike_count': int,
+            'view_count': int,
+            'comment_count': int,
+        },
+    }, {
+        'url': 'https://banbye.com/watch/v_2JjQtqjKUE_F?playlistId=p_Ld82N6gBw_OJ',
+        'info_dict': {
+            'title': 'Krzysztof Karoń',
+            'id': 'p_Ld82N6gBw_OJ',
+        },
+        'playlist_count': 9,
+    }]
+
+    def _real_extract(self, url):
+        """Return the playlist result when one is selected, otherwise the video's info dict."""
+        video_id = self._match_id(url)
+        playlist_id = self._extract_playlist_id(url, 'playlistId')
+
+        if self._yes_playlist(playlist_id, video_id):
+            return self._extract_playlist(playlist_id)
+
+        data = self._download_json(f'{self._API_BASE}/videos/{video_id}', video_id)
+        # Thumbnail URLs are built for a fixed list of sizes on the CDN
+        thumbnails = [{
+            'id': f'{quality}p',
+            'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp',
+        } for quality in [48, 96, 144, 240, 512, 1080]]
+        # One direct MP4 URL per entry in the API's 'quality' list
+        formats = [{
+            'format_id': f'http-{quality}p',
+            'quality': quality,
+            'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4',
+        } for quality in data['quality']]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': data.get('title'),
+            'description': data.get('desc'),
+            'uploader': traverse_obj(data, ('channel', 'name')),
+            'channel_id': data.get('channelId'),
+            'channel_url': format_field(data, 'channelId', 'https://banbye.com/channel/%s'),
+            'timestamp': unified_timestamp(data.get('publishedAt')),
+            'duration': data.get('duration'),
+            'tags': data.get('tags'),
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'like_count': data.get('likes'),
+            'dislike_count': data.get('dislikes'),
+            'view_count': data.get('views'),
+            'comment_count': data.get('commentCount'),
+        }
+
+
+class BanByeChannelIE(BanByeBaseIE):
+    """Extractor for a BanBye channel, or a playlist when `playlist` is in the URL."""
+
+    # The hostname dot is escaped: an unescaped '.' matches any character,
+    # so look-alike hosts such as "banbyeXcom" would otherwise be accepted.
+    _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?channel/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://banbye.com/channel/ch_wrealu24',
+        'info_dict': {
+            'title': 'wRealu24',
+            'id': 'ch_wrealu24',
+            'description': 'md5:da54e48416b74dfdde20a04867c0c2f6',
+        },
+        'playlist_mincount': 791,
+    }, {
+        'url': 'https://banbye.com/channel/ch_wrealu24?playlist=p_Ld82N6gBw_OJ',
+        'info_dict': {
+            'title': 'Krzysztof Karoń',
+            'id': 'p_Ld82N6gBw_OJ',
+        },
+        'playlist_count': 9,
+    }]
+    # Number of videos requested per API page
+    _PAGE_SIZE = 100
+
+    def _real_extract(self, url):
+        """Return the requested playlist, or a paged playlist of all the channel's videos."""
+        channel_id = self._match_id(url)
+        playlist_id = self._extract_playlist_id(url)
+
+        if playlist_id:
+            return self._extract_playlist(playlist_id)
+
+        def page_func(page_num):
+            # page_num is zero-based; the API takes an item offset rather than a page index
+            data = self._download_json(f'{self._API_BASE}/videos', channel_id, query={
+                'channelId': channel_id,
+                'sort': 'new',
+                'limit': self._PAGE_SIZE,
+                'offset': page_num * self._PAGE_SIZE,
+            }, note=f'Downloading page {page_num+1}')
+            return [
+                self.url_result(f"{self._VIDEO_BASE}/{video['_id']}", BanByeIE)
+                for video in data['items']
+            ]
+
+        channel_data = self._download_json(f'{self._API_BASE}/channels/{channel_id}', channel_id)
+        # The page count is computed up front from the channel's reported video count
+        entries = InAdvancePagedList(
+            page_func,
+            math.ceil(channel_data['videoCount'] / self._PAGE_SIZE),
+            self._PAGE_SIZE)
+
+        return self.playlist_result(
+            entries, channel_id, channel_data.get('name'), channel_data.get('description'))
diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py
index b4eb20642..dd1ff512e 100644
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -15,6 +15,7 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ filter_dict,
int_or_none,
float_or_none,
mimetype2ext,
@@ -755,15 +756,21 @@ class BiliIntlBaseIE(InfoExtractor):
for i, line in enumerate(json['body']) if line.get('content'))
return data
- def _get_subtitles(self, ep_id):
- sub_json = self._call_api(f'/web/v2/subtitle?episode_id={ep_id}&platform=web', ep_id)
+ def _get_subtitles(self, *, ep_id=None, aid=None):
+ sub_json = self._call_api(
+ '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list',
+ errnote='Unable to download subtitles list', query=filter_dict({
+ 'platform': 'web',
+ 'episode_id': ep_id,
+ 'aid': aid,
+ }))
subtitles = {}
for sub in sub_json.get('subtitles') or []:
sub_url = sub.get('url')
if not sub_url:
continue
sub_data = self._download_json(
- sub_url, ep_id, errnote='Unable to download subtitles', fatal=False,
+ sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False,
note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '')
if not sub_data:
continue
@@ -773,9 +780,14 @@ class BiliIntlBaseIE(InfoExtractor):
})
return subtitles
- def _get_formats(self, ep_id):
- video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id,
- note='Downloading video formats', errnote='Unable to download video formats')
+ def _get_formats(self, *, ep_id=None, aid=None):
+ video_json = self._call_api(
+ '/web/playurl', ep_id or aid, note='Downloading video formats',
+ errnote='Unable to download video formats', query=filter_dict({
+ 'platform': 'web',
+ 'ep_id': ep_id,
+ 'aid': aid,
+ }))
video_json = video_json['playurl']
formats = []
for vid in video_json.get('video') or []:
@@ -809,15 +821,15 @@ class BiliIntlBaseIE(InfoExtractor):
self._sort_formats(formats)
return formats
- def _extract_ep_info(self, episode_data, ep_id):
+ def _extract_video_info(self, video_data, *, ep_id=None, aid=None):
return {
- 'id': ep_id,
- 'title': episode_data.get('title_display') or episode_data['title'],
- 'thumbnail': episode_data.get('cover'),
+ 'id': ep_id or aid,
+ 'title': video_data.get('title_display') or video_data.get('title'),
+ 'thumbnail': video_data.get('cover'),
'episode_number': int_or_none(self._search_regex(
- r'^E(\d+)(?:$| - )', episode_data.get('title_display'), 'episode number', default=None)),
- 'formats': self._get_formats(ep_id),
- 'subtitles': self._get_subtitles(ep_id),
+ r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)),
+ 'formats': self._get_formats(ep_id=ep_id, aid=aid),
+ 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid),
'extractor_key': BiliIntlIE.ie_key(),
}
@@ -854,7 +866,7 @@ class BiliIntlBaseIE(InfoExtractor):
class BiliIntlIE(BiliIntlBaseIE):
- _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))'
_TESTS = [{
# Bstation page
'url': 'https://www.bilibili.tv/en/play/34613/341736',
@@ -889,24 +901,35 @@ class BiliIntlIE(BiliIntlBaseIE):
}, {
'url': 'https://www.biliintl.com/en/play/34613/341736',
'only_matching': True,
+ }, {
+ # User-generated content (as opposed to a series licensed from a studio)
+ 'url': 'https://bilibili.tv/en/video/2019955076',
+ 'only_matching': True,
+ }, {
+ # No language in URL
+ 'url': 'https://www.bilibili.tv/video/2019955076',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- season_id, video_id = self._match_valid_url(url).groups()
+ season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
+ video_id = ep_id or aid
webpage = self._download_webpage(url, video_id)
# Bstation layout
initial_data = self._parse_json(self._search_regex(
- r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
+ r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage,
'preload state', default='{}'), video_id, fatal=False) or {}
- episode_data = traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
+ video_data = (
+ traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
+ or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {})
- if not episode_data:
+ if season_id and not video_data:
# Non-Bstation layout, read through episode list
season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
- episode_data = next(
+ video_data = next(
episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict)
- if str(episode.get('episode_id')) == video_id)
- return self._extract_ep_info(episode_data, video_id)
+ if str(episode.get('episode_id')) == ep_id)
+ return self._extract_video_info(video_data, ep_id=ep_id, aid=aid)
class BiliIntlSeriesIE(BiliIntlBaseIE):
@@ -934,7 +957,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):
series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):
episode_id = str(episode.get('episode_id'))
- yield self._extract_ep_info(episode, episode_id)
+ yield self._extract_video_info(episode, ep_id=episode_id)
def _real_extract(self, url):
series_id = self._match_id(url)
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index f3ae3fd4c..d0e57da23 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -49,6 +49,7 @@ from ..utils import (
error_to_compat_str,
extract_attributes,
ExtractorError,
+ filter_dict,
fix_xml_ampersands,
float_or_none,
format_field,
@@ -248,14 +249,14 @@ class InfoExtractor(object):
license: License name the video is licensed under.
creator: The creator of the video.
timestamp: UNIX timestamp of the moment the video was uploaded
- upload_date: Video upload date (YYYYMMDD).
+ upload_date: Video upload date in UTC (YYYYMMDD).
If not explicitly set, calculated from timestamp
release_timestamp: UNIX timestamp of the moment the video was released.
If it is not clear whether to use timestamp or this, use the former
- release_date: The date (YYYYMMDD) when the video was released.
+ release_date: The date (YYYYMMDD) when the video was released in UTC.
If not explicitly set, calculated from release_timestamp
modified_timestamp: UNIX timestamp of the moment the video was last modified.
- modified_date: The date (YYYYMMDD) when the video was last modified.
+ modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
If not explicitly set, calculated from modified_timestamp
uploader_id: Nickname or id of the video uploader.
uploader_url: Full URL to a personal webpage of the video uploader.
@@ -469,6 +470,7 @@ class InfoExtractor(object):
_GEO_IP_BLOCKS = None
_WORKING = True
_NETRC_MACHINE = None
+ IE_DESC = None
_LOGIN_HINTS = {
'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials',
@@ -1033,7 +1035,7 @@ class InfoExtractor(object):
if transform_source:
json_string = transform_source(json_string)
try:
- return json.loads(json_string)
+ return json.loads(json_string, strict=False)
except ValueError as ve:
errmsg = '%s: Failed to parse JSON ' % video_id
if fatal:
@@ -1587,7 +1589,7 @@ class InfoExtractor(object):
break
traverse_json_ld(json_ld)
- return dict((k, v) for k, v in info.items() if v is not None)
+ return filter_dict(info)
def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
return self._parse_json(
diff --git a/yt_dlp/extractor/ellentube.py b/yt_dlp/extractor/ellentube.py
index 544473274..d451bc048 100644
--- a/yt_dlp/extractor/ellentube.py
+++ b/yt_dlp/extractor/ellentube.py
@@ -26,7 +26,7 @@ class EllenTubeBaseIE(InfoExtractor):
duration = None
for entry in data.get('media'):
if entry.get('id') == 'm3u8':
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
entry['url'], video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='hls')
duration = int_or_none(entry.get('duration'))
@@ -48,6 +48,7 @@ class EllenTubeBaseIE(InfoExtractor):
'view_count': get_insight('view'),
'like_count': get_insight('like'),
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 4eda27cdc..e5ae12a7d 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -122,6 +122,10 @@ from .awaan import (
)
from .azmedien import AZMedienIE
from .baidu import BaiduVideoIE
+from .banbye import (
+ BanByeIE,
+ BanByeChannelIE,
+)
from .bandaichannel import BandaiChannelIE
from .bandcamp import (
BandcampIE,
@@ -674,6 +678,12 @@ from .iqiyi import (
IqIE,
IqAlbumIE
)
+
+from .itprotv import (
+ ITProTVIE,
+ ITProTVCourseIE
+)
+
from .itv import (
ITVIE,
ITVBTCCIE,
@@ -731,6 +741,11 @@ from .laola1tv import (
EHFTVIE,
ITTFIE,
)
+from .lastfm import (
+ LastFMIE,
+ LastFMPlaylistIE,
+ LastFMUserIE,
+)
from .lbry import (
LBRYIE,
LBRYChannelIE,
@@ -1962,6 +1977,11 @@ from .washingtonpost import (
WashingtonPostIE,
WashingtonPostArticleIE,
)
+from .wasdtv import (
+ WASDTVStreamIE,
+ WASDTVRecordIE,
+ WASDTVClipIE,
+)
from .wat import WatIE
from .watchbox import WatchBoxIE
from .watchindianporn import WatchIndianPornIE
diff --git a/yt_dlp/extractor/fc2.py b/yt_dlp/extractor/fc2.py
index 7fc6b0e3d..54a83aa16 100644
--- a/yt_dlp/extractor/fc2.py
+++ b/yt_dlp/extractor/fc2.py
@@ -212,7 +212,6 @@ class FC2LiveIE(InfoExtractor):
'Accept': '*/*',
'User-Agent': std_headers['User-Agent'],
})
- ws.__enter__()
self.write_debug('[debug] Sending HLS server request')
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 97e34808f..4a2e30158 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -17,6 +17,7 @@ from ..compat import (
)
from ..utils import (
determine_ext,
+ dict_get,
ExtractorError,
float_or_none,
HEADRequest,
@@ -31,6 +32,7 @@ from ..utils import (
parse_resolution,
sanitized_Request,
smuggle_url,
+ str_or_none,
unescapeHTML,
unified_timestamp,
unsmuggle_url,
@@ -3778,11 +3780,12 @@ class GenericIE(InfoExtractor):
# Video.js embed
mobj = re.search(
- r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
+ r'(?s)\bvideojs\s*\(.+?([a-zA-Z0-9_$]+)\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
webpage)
if mobj is not None:
+ varname = mobj.group(1)
sources = self._parse_json(
- mobj.group(1), video_id, transform_source=js_to_json,
+ mobj.group(2), video_id, transform_source=js_to_json,
fatal=False) or []
if not isinstance(sources, list):
sources = [sources]
@@ -3819,6 +3822,21 @@ class GenericIE(InfoExtractor):
'Referer': full_response.geturl(),
},
})
+ # https://docs.videojs.com/player#addRemoteTextTrack
+ # https://html.spec.whatwg.org/multipage/media.html#htmltrackelement
+ for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
+ sub = self._parse_json(
+ sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
+ src = str_or_none(sub.get('src'))
+ if not src:
+ continue
+ subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
+ 'url': compat_urlparse.urljoin(url, src),
+ 'name': sub.get('label'),
+ 'http_headers': {
+ 'Referer': full_response.geturl(),
+ },
+ })
if formats or subtitles:
self.report_detected('video.js embed')
self._sort_formats(formats)
diff --git a/yt_dlp/extractor/go.py b/yt_dlp/extractor/go.py
index 2ccc6df21..f92e16600 100644
--- a/yt_dlp/extractor/go.py
+++ b/yt_dlp/extractor/go.py
@@ -217,6 +217,7 @@ class GoIE(AdobePassIE):
title = video_data['title']
formats = []
+ subtitles = {}
for asset in video_data.get('assets', {}).get('asset', []):
asset_url = asset.get('value')
if not asset_url:
@@ -256,8 +257,10 @@ class GoIE(AdobePassIE):
error_message = ', '.join([error['message'] for error in errors])
raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
asset_url += '?' + entitlement['uplynkData']['sessionKey']
- formats.extend(self._extract_m3u8_formats(
- asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False))
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
else:
f = {
'format_id': format_id,
@@ -281,7 +284,6 @@ class GoIE(AdobePassIE):
formats.append(f)
self._sort_formats(formats)
- subtitles = {}
for cc in video_data.get('closedcaption', {}).get('src', []):
cc_url = cc.get('value')
if not cc_url:
diff --git a/yt_dlp/extractor/itprotv.py b/yt_dlp/extractor/itprotv.py
new file mode 100644
index 000000000..64cb4e69a
--- /dev/null
+++ b/yt_dlp/extractor/itprotv.py
@@ -0,0 +1,141 @@
+# coding: utf-8
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+ int_or_none,
+ str_or_none,
+ traverse_obj,
+ urljoin
+)
+
+
+class ITProTVBaseIE(InfoExtractor):
+    """Shared API and login helpers for the ITProTV extractors."""
+
+    # Path templates (relative to the consumer-web API root) keyed by resource kind;
+    # '{}' is filled with the course or episode URL slug
+    _ENDPOINTS = {
+        'course': 'course?url={}&brand=00002560-0000-3fa9-0000-1d61000035f3',
+        'episode': 'brand/00002560-0000-3fa9-0000-1d61000035f3/episode?url={}'
+    }
+
+    def _call_api(self, ep, item_id, webpage):
+        """Download the `ep` endpoint for `item_id` and return the value under the `ep` key.
+
+        The request is authorized with the bearer token found in `webpage`.
+        """
+        return self._download_json(
+            f'https://api.itpro.tv/api/urza/v3/consumer-web/{self._ENDPOINTS[ep].format(item_id)}',
+            item_id, note=f'Fetching {ep} data API',
+            headers={'Authorization': f'Bearer {self._fetch_jwt(webpage)}'})[ep]
+
+    def _fetch_jwt(self, webpage):
+        """Return the three-part dotted token stored as "passedToken" in `webpage`."""
+        return self._search_regex(r'{"passedToken":"([\w-]+\.[\w-]+\.[\w-]+)",', webpage, 'jwt')
+
+    def _check_if_logged_in(self, webpage):
+        """Raise a login-required error if `webpage` starts with a null `member` object."""
+        # NOTE(review): re.match only matches at the very start of `webpage`, so this check
+        # cannot fire unless the page begins with '{'. Presumably re.search was intended;
+        # confirm against a logged-out page before changing it.
+        if re.match(r'{\s*member\s*:\s*null', webpage):
+            self.raise_login_required()
+
+
+class ITProTVIE(ITProTVBaseIE):
+    """Extractor for a single ITProTV course episode."""
+
+    # Hostname dots are escaped so they only match a literal '.'
+    _VALID_URL = r'https://app\.itpro\.tv/course/(?P<course>[\w-]+)/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://app.itpro.tv/course/guided-tour/introductionitprotv',
+        'md5': 'bca4a28c2667fd1a63052e71a94bb88c',
+        'info_dict': {
+            'id': 'introductionitprotv',
+            'ext': 'mp4',
+            'title': 'An Introduction to ITProTV 101',
+            'thumbnail': 'https://itprotv-image-bucket.s3.amazonaws.com/getting-started/itprotv-101-introduction-PGM.11_39_56_02.Still001.png',
+            'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e',
+            'duration': 269,
+            'series': 'ITProTV 101',
+            'series_id': 'guided-tour',
+            'availability': 'needs_auth',
+            'chapter': 'ITProTV 101',
+            'chapter_number': 1,
+            'chapter_id': '5dbb3de426b46c0010b5d1b6'
+        },
+    },
+        {
+        'url': 'https://app.itpro.tv/course/beyond-tech/job-interview-tips',
+        'md5': '101a299b98c47ccf4c67f9f0951defa8',
+        'info_dict': {
+            'id': 'job-interview-tips',
+            'ext': 'mp4',
+            'title': 'Job Interview Tips',
+            'thumbnail': 'https://s3.amazonaws.com:443/production-itprotv-thumbnails/2f370bf5-294d-4bbe-ab80-c0b5781630ea.png',
+            'description': 'md5:30d8ba483febdf89ec85623aad3c3cb6',
+            'duration': 267,
+            'series': 'Beyond Tech',
+            'series_id': 'beyond-tech',
+            'availability': 'needs_auth',
+            'chapter': 'Job Development',
+            'chapter_number': 2,
+            'chapter_id': '5f7c78d424330c000edf04d9'
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the episode, enriched with course and chapter metadata."""
+        episode_id, course_name = self._match_valid_url(url).group('id', 'course')
+        webpage = self._download_webpage(url, episode_id)
+        self._check_if_logged_in(webpage)
+        course = self._call_api('course', course_name, webpage)
+        episode = self._call_api('episode', episode_id, webpage)
+
+        # Find the (1-based) course topic this episode belongs to. The fallback must be a
+        # 2-tuple: unpacking a bare {} raises ValueError when no topic matches.
+        chapter_number, chapter = next((
+            (i, topic) for i, topic in enumerate(course.get('topics') or [], 1)
+            if traverse_obj(topic, 'id') == episode.get('topic')), (None, {}))
+
+        return {
+            'id': episode_id,
+            'title': episode.get('title'),
+            'description': episode.get('description'),
+            'thumbnail': episode.get('thumbnail'),
+            # One format per height for which the episode carries a 'jwVideo<height>Embed' URL
+            'formats': [
+                {'url': episode[f'jwVideo{h}Embed'], 'height': h}
+                for h in (320, 480, 720, 1080) if episode.get(f'jwVideo{h}Embed')
+            ],
+            'duration': int_or_none(episode.get('length')),
+            'series': course.get('name'),
+            'series_id': course.get('url'),
+            'chapter': str_or_none(chapter.get('title')),
+            'chapter_number': chapter_number,
+            'chapter_id': str_or_none(chapter.get('id')),
+            # Captions are passed inline as data rather than as a URL
+            'subtitles': {
+                'en': [{'ext': 'vtt', 'data': episode['enCaptionData']}]
+            } if episode.get('enCaptionData') else None,
+        }
+
+
+class ITProTVCourseIE(ITProTVBaseIE):
+    """Extractor for an ITProTV course, returned as a playlist of its episodes."""
+
+    # Hostname dots are escaped so they only match a literal '.'
+    _VALID_URL = r'https?://app\.itpro\.tv/course/(?P<id>[\w-]+)/?(?:$|[#?])'
+    _TESTS = [
+        {
+            'url': 'https://app.itpro.tv/course/guided-tour',
+            'info_dict': {
+                'id': 'guided-tour',
+                'description': 'md5:b175c2c3061ce35a4dd33865b2c1da4e',
+                'title': 'ITProTV 101',
+            },
+            'playlist_count': 6
+        },
+        {
+            'url': 'https://app.itpro.tv/course/beyond-tech',
+            'info_dict': {
+                'id': 'beyond-tech',
+                'description': 'md5:44cd99855e7f81a15ce1269bd0621fed',
+                'title': 'Beyond Tech'
+            },
+            'playlist_count': 15
+        },
+    ]
+
+    def _real_extract(self, url):
+        """Return a playlist result with one url_transparent entry per course episode."""
+        course_id = self._match_id(url)
+        webpage = self._download_webpage(url, course_id)
+        self._check_if_logged_in(webpage)
+        course = self._call_api('course', course_id, webpage)
+
+        # Use a root-relative path: a path relative to `url` would resolve to
+        # /course/<id>/<id>/<episode> when the course URL ends with a slash,
+        # which _VALID_URL allows.
+        entries = [self.url_result(
+            urljoin(url, f'/course/{course_id}/{episode["url"]}'), ITProTVIE,
+            episode['url'], episode.get('title'), url_transparent=True)
+            for episode in course['episodes']]
+
+        return self.playlist_result(
+            entries, course_id, course.get('name'), course.get('description'))
diff --git a/yt_dlp/extractor/lastfm.py b/yt_dlp/extractor/lastfm.py
new file mode 100644
index 000000000..5215717e8
--- /dev/null
+++ b/yt_dlp/extractor/lastfm.py
@@ -0,0 +1,129 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none, format_field
+
+
+class LastFMPlaylistBaseIE(InfoExtractor):
+    """Base for Last.fm listing pages whose tracks link to YouTube players."""
+
+    def _entries(self, url, playlist_id):
+        """Yield a YouTube url_result for each distinct player URL on every listing page.
+
+        Paging starts at the `page=` value found in `url` (default 1) and runs to the last
+        page number found in the pagination markup, or covers only the start page if none is found.
+        """
+        webpage = self._download_webpage(url, playlist_id)
+        start_page_number = int_or_none(self._search_regex(
+            r'\bpage=(\d+)', url, 'page', default=None)) or 1
+        last_page_number = int_or_none(self._search_regex(
+            r'>(\d+)</a>[^<]*</li>[^<]*<li[^>]+class="pagination-next', webpage, 'last_page', default=None))
+
+        for page_number in range(start_page_number, (last_page_number or start_page_number) + 1):
+            webpage = self._download_webpage(
+                url, playlist_id,
+                note='Downloading page %d%s' % (page_number, format_field(last_page_number, template=' of %d')),
+                query={'page': page_number})
+            # dict.fromkeys drops duplicates while keeping the order in which the URLs
+            # appear on the page; a set would yield them in arbitrary order.
+            player_urls = dict.fromkeys(re.findall(r'data-youtube-url="([^"]+)"', webpage))
+            page_entries = [self.url_result(player_url, 'Youtube') for player_url in player_urls]
+
+            yield from page_entries
+
+    def _real_extract(self, url):
+        """Return a playlist result whose entries are produced lazily by `_entries`."""
+        playlist_id = self._match_id(url)
+        return self.playlist_result(self._entries(url, playlist_id), playlist_id)
+
+
+class LastFMPlaylistIE(LastFMPlaylistBaseIE):
+    """Matches Last.fm /music/<name> and /tag/<name> listing pages, with at most one further path segment."""
+
+    # The id is the path segment right after 'music' or 'tag'
+    _VALID_URL = r'https?://(?:www\.)?last\.fm/(music|tag)/(?P<id>[^/]+)(?:/[^/]+)?/?(?:[?#]|$)'
+    _TESTS = [{
+        'url': 'https://www.last.fm/music/Oasis/(What%27s+the+Story)+Morning+Glory%3F',
+        'info_dict': {
+            'id': 'Oasis',
+        },
+        'playlist_count': 11,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis?top_tracks_date_preset=ALL#top-tracks',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/+tracks',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/+tracks?page=2',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/+tracks?date_preset=LAST_90_DAYS#top-tracks',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/tag/rock',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/tag/rock/tracks',
+        'only_matching': True,
+    }]
+
+
+class LastFMUserIE(LastFMPlaylistBaseIE):
+    """Matches Last.fm user playlist URLs; the id is the segment after 'playlists/'."""
+
+    _VALID_URL = r'https?://(?:www\.)?last\.fm/user/[^/]+/playlists/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'https://www.last.fm/user/mehq/playlists/12319471',
+        'info_dict': {
+            'id': '12319471',
+        },
+        'playlist_count': 30,
+    }]
+
+
+class LastFMIE(InfoExtractor):
+    """Extractor for a single Last.fm track page; hands off to the linked YouTube player URL."""
+
+    # Requires two path segments after 'music' before the id segment
+    # (e.g. /music/<artist>/_/<track>)
+    _VALID_URL = r'https?://(?:www\.)?last\.fm/music(?:/[^/]+){2}/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'https://www.last.fm/music/Oasis/_/Wonderwall',
+        'md5': '9c4a70c2e84c03d54fe24229b9e13b7b',
+        'info_dict': {
+            'id': '6hzrDeceEKc',
+            'ext': 'mp4',
+            'title': 'Oasis - Wonderwall (Official Video)',
+            'thumbnail': r're:^https?://i.ytimg.com/.*\.jpg$',
+            'description': 'md5:0848669853c10687cc28e88b5756738f',
+            'uploader': 'Oasis',
+            'uploader_id': 'oasisinetofficial',
+            'upload_date': '20080207',
+            'album': '(What\'s The Story) Morning Glory? (Remastered)',
+            'track': 'Wonderwall (Remastered)',
+            'channel_id': 'UCUDVBtnOQi4c7E8jebpjc9Q',
+            'view_count': int,
+            'live_status': 'not_live',
+            'channel_url': 'https://www.youtube.com/channel/UCUDVBtnOQi4c7E8jebpjc9Q',
+            'tags': 'count:39',
+            'creator': 'Oasis',
+            'uploader_url': 're:^https?://www.youtube.com/user/oasisinetofficial',
+            'duration': 279,
+            'alt_title': 'Wonderwall (Remastered)',
+            'age_limit': 0,
+            'channel': 'Oasis',
+            'channel_follower_count': int,
+            'categories': ['Music'],
+            'availability': 'public',
+            'like_count': int,
+            'playable_in_embed': True,
+            'artist': 'Oasis',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'https://www.last.fm/music/Oasis/_/Don%27t+Look+Back+In+Anger+-+Remastered/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.last.fm/music/Guns+N%27+Roses/_/Sweet+Child+o%27+Mine',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return a url_result for the player link found in the page's 'header-new-playlink' element."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        # The href must directly follow the class attribute (separated only by whitespace)
+        player_url = self._search_regex(r'(?s)class="header-new-playlink"\s+href="([^"]+)"', webpage, 'player_url')
+        return self.url_result(player_url, 'Youtube')
diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py
index 34f127285..9d243b2be 100644
--- a/yt_dlp/extractor/rai.py
+++ b/yt_dlp/extractor/rai.py
@@ -11,6 +11,7 @@ from ..compat import (
from ..utils import (
determine_ext,
ExtractorError,
+ filter_dict,
find_xpath_attr,
fix_xml_ampersands,
GeoRestrictedError,
@@ -110,11 +111,11 @@ class RaiBaseIE(InfoExtractor):
if not audio_only:
formats.extend(self._create_http_urls(relinker_url, formats))
- return dict((k, v) for k, v in {
+ return filter_dict({
'is_live': is_live,
'duration': duration,
'formats': formats,
- }.items() if v is not None)
+ })
def _create_http_urls(self, relinker_url, fmts):
_RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?'
diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py
index b8ac41483..9ff3136e2 100644
--- a/yt_dlp/extractor/tver.py
+++ b/yt_dlp/extractor/tver.py
@@ -14,7 +14,7 @@ from ..utils import (
class TVerIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))'
+ _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>corner|episode|feature|lp|tokyo2020/video)/(?P<id>[fc]?\d+)'
# videos are only available for 7 days
_TESTS = [{
'url': 'https://tver.jp/corner/f0062178',
@@ -29,6 +29,15 @@ class TVerIE(InfoExtractor):
# subtitle = ' '
'url': 'https://tver.jp/corner/f0068870',
'only_matching': True,
+ }, {
+ 'url': 'https://tver.jp/lp/f0009694',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tver.jp/lp/c0000239',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tver.jp/tokyo2020/video/6264525510001',
+ 'only_matching': True,
}]
_TOKEN = None
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
@@ -39,9 +48,11 @@ class TVerIE(InfoExtractor):
def _real_extract(self, url):
path, video_id = self._match_valid_url(url).groups()
- api_response = self._download_json(
- 'https://api.tver.jp/v4/' + path, video_id,
- query={'token': self._TOKEN})
+ if path == 'lp':
+ webpage = self._download_webpage(url, video_id)
+ redirect_path = self._search_regex(r'to_href="([^"]+)', webpage, 'redirect path')
+ path, video_id = self._match_valid_url(f'https://tver.jp{redirect_path}').groups()
+ api_response = self._download_json(f'https://api.tver.jp/v4/{path}/{video_id}', video_id, query={'token': self._TOKEN})
p_id = traverse_obj(api_response, ('main', 'publisher_id'))
if not p_id:
error_msg, expected = traverse_obj(api_response, ('episode', 0, 'textbar', 0, ('text', 'longer')), get_all=False), True
diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py
index 8234ba7df..8a930798d 100644
--- a/yt_dlp/extractor/viki.py
+++ b/yt_dlp/extractor/viki.py
@@ -261,7 +261,7 @@ class VikiIE(VikiBaseIE):
mpd_content = self._download_webpage(mpd_url, video_id, note='Downloading initial MPD manifest')
mpd_url = self._search_regex(
r'(?mi)<BaseURL>(http.+.mpd)', mpd_content, 'new manifest', default=mpd_url)
- if 'mpdhd_high' not in mpd_url:
+ if 'mpdhd_high' not in mpd_url and 'sig=' not in mpd_url:
# Modify the URL to get 1080p
mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high')
formats = self._extract_mpd_formats(mpd_url, video_id)
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py
index 051cf1b17..972fb480b 100644
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -327,7 +327,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '56015672',
'ext': 'mp4',
- 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc",
'description': 'md5:2d3305bad981a06ff79f027f19865021',
'timestamp': 1355990239,
'upload_date': '20121220',
@@ -340,6 +340,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'params': {
'format': 'best[protocol=https]',
},
+ 'skip': 'No longer available'
},
{
'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
@@ -357,6 +358,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
'upload_date': '20130610',
'timestamp': 1370893156,
'license': 'by',
+ 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960',
+ 'view_count': int,
+ 'comment_count': int,
+ 'like_count': int,
},
'params': {
'format': 'best[protocol=https]',
@@ -364,7 +369,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
{
'url': 'http://player.vimeo.com/video/54469442',
- 'md5': '619b811a4417aa4abe78dc653becf511',
+ 'md5': 'b3e7f4d2cbb53bd7dc3bb6ff4ed5cfbd',
'note': 'Videos that embed the url in the player page',
'info_dict': {
'id': '54469442',
@@ -375,6 +380,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'businessofsoftware',
'duration': 3610,
'description': None,
+ 'thumbnail': 'https://i.vimeocdn.com/video/376682406-f34043e7b766af6bef2af81366eacd6724f3fc3173179a11a97a1e26587c9529-d_1280',
},
'params': {
'format': 'best[protocol=https]',
@@ -395,6 +401,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
+ 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960',
+ 'view_count': int,
+ 'comment_count': int,
+ 'like_count': int,
},
'params': {
'format': 'best[protocol=https]',
@@ -417,6 +427,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
'timestamp': 1380339469,
'upload_date': '20130928',
'duration': 187,
+ 'thumbnail': 'https://i.vimeocdn.com/video/450239872-a05512d9b1e55d707a7c04365c10980f327b06d966351bc403a5d5d65c95e572-d_1280',
+ 'view_count': int,
+ 'comment_count': int,
+ 'like_count': int,
},
'params': {'format': 'http-1080p'},
},
@@ -425,7 +439,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'note': 'Video with subtitles',
'info_dict': {
'id': '76979871',
- 'ext': 'mp4',
+ 'ext': 'mov',
'title': 'The New Vimeo Player (You Know, For Videos)',
'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
'timestamp': 1381846109,
@@ -454,6 +468,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader': 'Tulio Gonçalves',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user28849593',
'uploader_id': 'user28849593',
+ 'duration': 118,
+ 'thumbnail': 'https://i.vimeocdn.com/video/478636036-c18440305ef3df9decfb6bf207a61fe39d2d17fa462a96f6f2d93d30492b037d-d_1280',
},
},
{
@@ -470,6 +486,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
'timestamp': 1324343742,
'upload_date': '20111220',
'description': 'md5:ae23671e82d05415868f7ad1aec21147',
+ 'duration': 60,
+ 'comment_count': int,
+ 'view_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/231174622-dd07f015e9221ff529d451e1cc31c982b5d87bfafa48c4189b1da72824ee289a-d_1280',
+ 'like_count': int,
},
},
{
@@ -485,6 +506,9 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader': 'Framework Studio',
'description': 'md5:f2edc61af3ea7a5592681ddbb683db73',
'upload_date': '20200225',
+ 'duration': 176,
+ 'thumbnail': 'https://i.vimeocdn.com/video/859377297-836494a4ef775e9d4edbace83937d9ad34dc846c688c0c419c0e87f7ab06c4b3-d_1280',
+ 'uploader_url': 'https://vimeo.com/frameworkla',
},
},
{
@@ -503,6 +527,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
'timestamp': 1250886430,
'upload_date': '20090821',
'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
+ 'duration': 321,
+ 'comment_count': int,
+ 'view_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/22728298-bfc22146f930de7cf497821c7b0b9f168099201ecca39b00b6bd31fcedfca7a6-d_1280',
+ 'like_count': int,
},
'params': {
'skip_download': True,
@@ -535,10 +564,17 @@ class VimeoIE(VimeoBaseInfoExtractor):
'id': '68375962',
'ext': 'mp4',
'title': 'youtube-dl password protected test video',
+ 'timestamp': 1371200155,
+ 'upload_date': '20130614',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
+ 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
+ 'thumbnail': 'https://i.vimeocdn.com/video/440665496-b2c5aee2b61089442c794f64113a8e8f7d5763c3e6b3ebfaf696ae6413f8b1f4-d_960',
+ 'view_count': int,
+ 'comment_count': int,
+ 'like_count': int,
},
'params': {
'format': 'best[protocol=https]',
@@ -568,12 +604,18 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '119195465',
'ext': 'mp4',
- 'title': 'youtube-dl test video \'ä"BaW_jenozKc',
+ 'title': "youtube-dl test video '' ä↭𝕐-BaW jenozKc",
'uploader': 'Philipp Hagemeister',
'uploader_id': 'user20132939',
'description': 'md5:fa7b6c6d8db0bdc353893df2f111855b',
'upload_date': '20150209',
'timestamp': 1423518307,
+ 'thumbnail': 'https://i.vimeocdn.com/video/default_1280',
+ 'duration': 10,
+ 'like_count': int,
+ 'uploader_url': 'https://vimeo.com/user20132939',
+ 'view_count': int,
+ 'comment_count': int,
},
'params': {
'format': 'best[protocol=https]',
@@ -596,6 +638,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
'title': 'Harrisville New Hampshire',
'timestamp': 1459259666,
'upload_date': '20160329',
+ 'release_timestamp': 1459259666,
+ 'license': 'by-nc',
+ 'duration': 159,
+ 'comment_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/562802436-585eeb13b5020c6ac0f171a2234067938098f84737787df05ff0d767f6d54ee9-d_1280',
+ 'like_count': int,
+ 'uploader_url': 'https://vimeo.com/aliniamedia',
+ 'release_date': '20160329',
},
'params': {'skip_download': True},
},
@@ -627,6 +677,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
'title': 'The Shoes - Submarine Feat. Blaine Harrison',
'uploader_id': 'karimhd',
'description': 'md5:8e2eea76de4504c2e8020a9bcfa1e843',
+ 'channel_id': 'staffpicks',
+ 'duration': 336,
+ 'comment_count': int,
+ 'view_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/541243181-b593db36a16db2f0096f655da3f5a4dc46b8766d77b0f440df937ecb0c418347-d_1280',
+ 'like_count': int,
+ 'uploader_url': 'https://vimeo.com/karimhd',
+ 'channel_url': 'https://vimeo.com/channels/staffpicks',
},
'params': {'skip_download': 'm3u8'},
},
@@ -641,13 +699,19 @@ class VimeoIE(VimeoBaseInfoExtractor):
'url': 'https://vimeo.com/581039021/9603038895',
'info_dict': {
'id': '581039021',
- # these have to be provided but we don't care
'ext': 'mp4',
'timestamp': 1627621014,
- 'title': 're:.+',
- 'uploader_id': 're:.+',
- 'uploader': 're:.+',
- 'upload_date': r're:\d+',
+ 'release_timestamp': 1627621014,
+ 'duration': 976,
+ 'comment_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/1202249320-4ddb2c30398c0dc0ee059172d1bd5ea481ad12f0e0e3ad01d2266f56c744b015-d_1280',
+ 'like_count': int,
+ 'uploader_url': 'https://vimeo.com/txwestcapital',
+ 'release_date': '20210730',
+ 'uploader': 'Christopher Inks',
+ 'title': 'Thursday, July 29, 2021 BMA Evening Video Update',
+ 'uploader_id': 'txwestcapital',
+ 'upload_date': '20210730',
},
'params': {
'skip_download': True,
@@ -961,9 +1025,15 @@ class VimeoOndemandIE(VimeoIE):
'uploader': 'גם סרטים',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms',
'uploader_id': 'gumfilms',
- 'description': 'md5:4c027c965e439de4baab621e48b60791',
+ 'description': 'md5:aeeba3dbd4d04b0fa98a4fdc9c639998',
'upload_date': '20140906',
'timestamp': 1410032453,
+ 'thumbnail': 'https://i.vimeocdn.com/video/488238335-d7bf151c364cff8d467f1b73784668fe60aae28a54573a35d53a1210ae283bd8-d_1280',
+ 'comment_count': int,
+ 'license': 'https://creativecommons.org/licenses/by-nc-nd/3.0/',
+ 'duration': 53,
+ 'view_count': int,
+ 'like_count': int,
},
'params': {
'format': 'best[protocol=https]',
@@ -982,6 +1052,11 @@ class VimeoOndemandIE(VimeoIE):
'description': 'md5:c3c46a90529612c8279fb6af803fc0df',
'upload_date': '20150502',
'timestamp': 1430586422,
+ 'duration': 121,
+ 'comment_count': int,
+ 'view_count': int,
+ 'thumbnail': 'https://i.vimeocdn.com/video/517077723-7066ae1d9a79d3eb361334fb5d58ec13c8f04b52f8dd5eadfbd6fb0bcf11f613-d_1280',
+ 'like_count': int,
},
'params': {
'skip_download': True,
@@ -1011,7 +1086,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
'id': 'tributes',
'title': 'Vimeo Tributes',
},
- 'playlist_mincount': 25,
+ 'playlist_mincount': 22,
}]
_BASE_URL_TEMPL = 'https://vimeo.com/channels/%s'
@@ -1196,6 +1271,9 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
'uploader': 'Richard Hardwick',
'uploader_id': 'user21297594',
'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks",
+ 'duration': 304,
+ 'thumbnail': 'https://i.vimeocdn.com/video/450115033-43303819d9ebe24c2630352e18b7056d25197d09b3ae901abdac4c4f1d68de71-d_1280',
+ 'uploader_url': 'https://vimeo.com/user21297594',
},
}, {
'note': 'video player needs Referer',
diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py
index b633df95d..b0a1fca68 100644
--- a/yt_dlp/extractor/viu.py
+++ b/yt_dlp/extractor/viu.py
@@ -1,55 +1,32 @@
# coding: utf-8
from __future__ import unicode_literals
-import json
import re
+import json
+import uuid
+import random
+import urllib.parse
from .common import InfoExtractor
-from ..compat import (
- compat_kwargs,
- compat_str,
- compat_urlparse,
- compat_urllib_request,
-)
+from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
+ strip_or_none,
try_get,
smuggle_url,
unsmuggle_url,
+ url_or_none,
)
class ViuBaseIE(InfoExtractor):
- def _real_initialize(self):
- viu_auth_res = self._request_webpage(
- 'https://www.viu.com/api/apps/v2/authenticate', None,
- 'Requesting Viu auth', query={
- 'acct': 'test',
- 'appid': 'viu_desktop',
- 'fmt': 'json',
- 'iid': 'guest',
- 'languageid': 'default',
- 'platform': 'desktop',
- 'userid': 'guest',
- 'useridtype': 'guest',
- 'ver': '1.0'
- }, headers=self.geo_verification_headers())
- self._auth_token = viu_auth_res.info()['X-VIU-AUTH']
-
- def _call_api(self, path, *args, **kwargs):
- headers = self.geo_verification_headers()
- headers.update({
- 'X-VIU-AUTH': self._auth_token
- })
- headers.update(kwargs.get('headers', {}))
- kwargs['headers'] = headers
+ def _call_api(self, path, *args, headers=None, **kwargs):
+ # Query the Viu web API at `path` and return the 'response' object of the JSON reply.
+ # Caller-supplied headers are layered over the geo-verification headers; the
+ # remaining arguments are forwarded to _download_json unchanged.
+ # Raises ExtractorError (expected) unless the reported status is 'success'.
+ # `headers` defaults to None rather than a shared mutable dict.
response = self._download_json(
- 'https://www.viu.com/api/' + path, *args,
- **compat_kwargs(kwargs))['response']
+ f'https://www.viu.com/api/{path}', *args, **kwargs,
+ headers={**self.geo_verification_headers(), **(headers or {})})['response']
if response.get('status') != 'success':
- raise ExtractorError('%s said: %s' % (
- self.IE_NAME, response['message']), expected=True)
+ raise ExtractorError(f'{self.IE_NAME} said: {response["message"]}', expected=True)
return response
@@ -101,6 +78,7 @@ class ViuIE(ViuBaseIE):
tdirforwhole = video_data.get('tdirforwhole')
# #EXT-X-BYTERANGE is not supported by native hls downloader
# and ffmpeg (#10955)
+ # FIXME: It is supported in yt-dlp
# hls_file = video_data.get('hlsfile')
hls_file = video_data.get('jwhlsfile')
if url_path and tdirforwhole and hls_file:
@@ -110,10 +88,9 @@ class ViuIE(ViuBaseIE):
# r'(/hlsc_)[a-z]+(\d+\.m3u8)',
# r'\1whe\2', video_data['href'])
m3u8_url = video_data['href']
- formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
self._sort_formats(formats)
- subtitles = {}
for key, value in video_data.items():
mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key)
if not mobj:
@@ -227,42 +204,63 @@ class ViuOTTIE(InfoExtractor):
'zh-cn': 2,
'en-us': 3,
}
- _user_info = None
+
+ _user_token = None
+ _auth_codes = {}
def _detect_error(self, response):
+ # Return the 'data' payload of an API response (an empty dict when absent),
+ # raising ExtractorError (expected) when the response reports a positive status code.
- code = response.get('status', {}).get('code')
- if code > 0:
+ code = try_get(response, lambda x: x['status']['code'])
+ if code and code > 0:
message = try_get(response, lambda x: x['status']['message'])
- raise ExtractorError('%s said: %s (%s)' % (
- self.IE_NAME, message, code), expected=True)
- return response['data']
-
- def _raise_login_required(self):
- raise ExtractorError(
- 'This video requires login. '
- 'Specify --username and --password or --netrc (machine: %s) '
- 'to provide account credentials.' % self._NETRC_MACHINE,
- expected=True)
+ raise ExtractorError(f'{self.IE_NAME} said: {message} ({code})', expected=True)
+ # Callers that download with fatal=False may pass a non-dict after a failed
+ # request; treat it as a response without data instead of failing on .get()
+ return (response.get('data') if isinstance(response, dict) else None) or {}
def _login(self, country_code, video_id):
+ # Return the cached account identity token, logging in first when none is cached.
+ # Returns None when no username is configured.
+ # Expects self._auth_codes[country_code] to already hold a bearer token.
- if not self._user_info:
+ if self._user_token is None:
username, password = self._get_login_info()
- if username is None or password is None:
+ if username is None:
return
+ headers = {
+ 'Authorization': f'Bearer {self._auth_codes[country_code]}',
+ 'Content-Type': 'application/json'
+ }
+ data = self._download_json(
+ 'https://api-gateway-global.viu.com/api/account/validate',
+ video_id, 'Validating email address', headers=headers,
+ data=json.dumps({
+ 'principal': username,
+ 'provider': 'email'
+ }).encode())
+ if not data.get('exists'):
+ # A wrong address is a user error, so flag it as expected like the other errors here
+ raise ExtractorError('Invalid email address', expected=True)
data = self._download_json(
- compat_urllib_request.Request(
- 'https://www.viu.com/ott/%s/index.php' % country_code, method='POST'),
- video_id, 'Logging in', errnote=False, fatal=False,
- query={'r': 'user/login'},
+ 'https://api-gateway-global.viu.com/api/auth/login',
+ video_id, 'Logging in', headers=headers,
data=json.dumps({
- 'username': username,
+ 'email': username,
'password': password,
- 'platform_flag_label': 'web',
+ 'provider': 'email',
}).encode())
- self._user_info = self._detect_error(data)['user']
-
- return self._user_info
+ self._detect_error(data)
+ self._user_token = data.get('identity')
+ # need to update with valid user's token else will throw an error again;
+ # keep the current bearer token if the login reply carries none
+ self._auth_codes[country_code] = data.get('token') or self._auth_codes[country_code]
+ return self._user_token
+
+    def _get_token(self, country_code, video_id):
+        """Request a bearer token for *country_code* from the Viu API gateway."""
+        # Ten random decimal digits; the URL below appends a literal '000' to them
+        digits = [random.choice('0123456789') for _ in range(10)]
+        rand = ''.join(digits)
+        payload = {
+            'countryCode': country_code.upper(),
+            'platform': 'browser',
+            'platformFlagLabel': 'web',
+            'language': 'en',
+            'uuid': str(uuid.uuid4()),
+            'carrierId': '0'
+        }
+        token_response = self._download_json(
+            f'https://api-gateway-global.viu.com/api/auth/token?v={rand}000', video_id,
+            note='Getting bearer token', headers={'Content-Type': 'application/json'},
+            data=json.dumps(payload).encode('utf-8'))
+        return token_response['token']
def _real_extract(self, url):
url, idata = unsmuggle_url(url, {})
@@ -279,16 +277,16 @@ class ViuOTTIE(InfoExtractor):
query['area_id'] = area_id
product_data = self._download_json(
- 'http://www.viu.com/ott/%s/index.php' % country_code, video_id,
+ f'http://www.viu.com/ott/{country_code}/index.php', video_id,
'Downloading video info', query=query)['data']
video_data = product_data.get('current_product')
if not video_data:
- raise ExtractorError('This video is not available in your region.', expected=True)
+ self.raise_geo_restricted()
series_id = video_data.get('series_id')
if self._yes_playlist(series_id, video_id, idata):
- series = product_data.get('series', {})
+ series = product_data.get('series') or {}
product = series.get('product')
if product:
entries = []
@@ -296,14 +294,10 @@ class ViuOTTIE(InfoExtractor):
item_id = entry.get('product_id')
if not item_id:
continue
- item_id = compat_str(item_id)
entries.append(self.url_result(
- smuggle_url(
- 'http://www.viu.com/ott/%s/%s/vod/%s/' % (country_code, lang_code, item_id),
- {'force_noplaylist': True}), # prevent infinite recursion
- 'ViuOTT',
- item_id,
- entry.get('synopsis', '').strip()))
+ smuggle_url(f'http://www.viu.com/ott/{country_code}/{lang_code}/vod/{item_id}/',
+ {'force_noplaylist': True}),
+ ViuOTTIE, str(item_id), entry.get('synopsis', '').strip()))
return self.playlist_result(entries, series_id, series.get('name'), series.get('description'))
@@ -312,69 +306,65 @@ class ViuOTTIE(InfoExtractor):
'ccs_product_id': video_data['ccs_product_id'],
'language_flag_id': self._LANGUAGE_FLAG.get(lang_code.lower()) or '3',
}
- headers = {
- 'Referer': url,
- 'Origin': url,
- }
- try:
+
+ def download_playback():
stream_data = self._download_json(
- 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
- video_id, 'Downloading stream info', query=query, headers=headers)
- stream_data = self._detect_error(stream_data)['stream']
- except (ExtractorError, KeyError):
- stream_data = None
- if video_data.get('user_level', 0) > 0:
- user = self._login(country_code, video_id)
- if user:
- query['identity'] = user['identity']
- stream_data = self._download_json(
- 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
- video_id, 'Downloading stream info', query=query, headers=headers)
- stream_data = self._detect_error(stream_data).get('stream')
- else:
- # preview is limited to 3min for non-members
- # try to bypass the duration limit
- duration_limit = True
- query['duration'] = '180'
- stream_data = self._download_json(
- 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
- video_id, 'Downloading stream info', query=query, headers=headers)
- try:
- stream_data = self._detect_error(stream_data)['stream']
- except (ExtractorError, KeyError): # if still not working, give up
- self._raise_login_required()
+ 'https://api-gateway-global.viu.com/api/playback/distribute',
+ video_id=video_id, query=query, fatal=False, note='Downloading stream info',
+ headers={
+ 'Authorization': f'Bearer {self._auth_codes[country_code]}',
+ 'Referer': url,
+ 'Origin': url
+ })
+ return self._detect_error(stream_data).get('stream')
+
+ if not self._auth_codes.get(country_code):
+ self._auth_codes[country_code] = self._get_token(country_code, video_id)
+ stream_data = None
+ try:
+ stream_data = download_playback()
+ except (ExtractorError, KeyError):
+ token = self._login(country_code, video_id)
+ if token is not None:
+ query['identity'] = token
+ else:
+ # preview is limited to 3min for non-members. But we can try to bypass it
+ duration_limit, query['duration'] = True, '180'
+ try:
+ stream_data = download_playback()
+ except (ExtractorError, KeyError):
+ if token is not None:
+ raise
+ self.raise_login_required(method='password')
if not stream_data:
raise ExtractorError('Cannot get stream info', expected=True)
- stream_sizes = stream_data.get('size', {})
formats = []
- for vid_format, stream_url in stream_data.get('url', {}).items():
- height = int_or_none(self._search_regex(
- r's(\d+)p', vid_format, 'height', default=None))
+ for vid_format, stream_url in (stream_data.get('url') or {}).items():
+ height = int(self._search_regex(r's(\d+)p', vid_format, 'height', default=None))
# bypass preview duration limit
if duration_limit:
- stream_url = compat_urlparse.urlparse(stream_url)
- query = dict(compat_urlparse.parse_qsl(stream_url.query, keep_blank_values=True))
- time_duration = int_or_none(video_data.get('time_duration'))
+ stream_url = urllib.parse.urlparse(stream_url)
query.update({
- 'duration': time_duration if time_duration > 0 else '9999999',
+ 'duration': video_data.get('time_duration') or '9999999',
'duration_start': '0',
})
- stream_url = stream_url._replace(query=compat_urlparse.urlencode(query)).geturl()
+ stream_url = stream_url._replace(query=urllib.parse.urlencode(dict(
+ urllib.parse.parse_qsl(stream_url.query, keep_blank_values=True)))).geturl()
formats.append({
'format_id': vid_format,
'url': stream_url,
'height': height,
'ext': 'mp4',
- 'filesize': int_or_none(stream_sizes.get(vid_format))
+ 'filesize': try_get(stream_data, lambda x: x['size'][vid_format], int)
})
self._sort_formats(formats)
subtitles = {}
- for sub in video_data.get('subtitle', []):
+ for sub in video_data.get('subtitle') or []:
sub_url = sub.get('url')
if not sub_url:
continue
@@ -383,17 +373,16 @@ class ViuOTTIE(InfoExtractor):
'ext': 'srt',
})
- title = video_data['synopsis'].strip()
-
+ title = strip_or_none(video_data.get('synopsis'))
return {
'id': video_id,
'title': title,
'description': video_data.get('description'),
- 'series': product_data.get('series', {}).get('name'),
+ 'series': try_get(product_data, lambda x: x['series']['name']),
'episode': title,
'episode_number': int_or_none(video_data.get('number')),
'duration': int_or_none(stream_data.get('duration')),
- 'thumbnail': video_data.get('cover_image_url'),
+ 'thumbnail': url_or_none(video_data.get('cover_image_url')),
'formats': formats,
'subtitles': subtitles,
}
diff --git a/yt_dlp/extractor/wasdtv.py b/yt_dlp/extractor/wasdtv.py
new file mode 100644
index 000000000..38c10dc62
--- /dev/null
+++ b/yt_dlp/extractor/wasdtv.py
@@ -0,0 +1,161 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+ traverse_obj,
+ try_get,
+)
+
+
+class WASDTVBaseIE(InfoExtractor):
+    """Common API helper and extraction flow for the wasd.tv extractors.
+
+    Subclasses provide _get_container() and _get_media_url().
+    """
+
+    def _fetch(self, path, video_id, description, query=None):
+        """Download JSON from the wasd.tv API and return its 'result' member.
+
+        *description* only feeds the progress and error notes.
+        Raises ExtractorError (expected) when the reply has an 'error' member.
+        """
+        response = self._download_json(
+            f'https://wasd.tv/api/{path}', video_id, query=query or {},
+            note=f'Downloading {description} metadata',
+            errnote=f'Unable to download {description} metadata')
+        error = response.get('error')
+        if error:
+            raise ExtractorError(f'{self.IE_NAME} returned error: {error}', expected=True)
+        return response.get('result')
+
+    def _extract_thumbnails(self, thumbnails_dict):
+        """Build a thumbnails list from the small/medium/large preview URLs."""
+        # `or []`: a missing dict, or one without any of the sizes, must give an
+        # empty list instead of handing None to enumerate()
+        urls = traverse_obj(thumbnails_dict, (('small', 'medium', 'large'),)) or []
+        return [{
+            'url': url,
+            'preference': index,
+        } for index, url in enumerate(urls) if url]
+
+    def _real_extract(self, url):
+        """Resolve the media container for *url* and return its info dict."""
+        container = self._get_container(url)
+        stream = traverse_obj(container, ('media_container_streams', 0))
+        media = try_get(stream, lambda x: x['stream_media'][0])
+        # media_meta is dereferenced below and inside _get_media_url(), so a
+        # missing value is reported the same way as missing media
+        media_meta = media.get('media_meta') if media else None
+        if not media_meta:
+            raise ExtractorError('Can not extract media data.', expected=True)
+        media_url, is_live = self._get_media_url(media_meta)
+        video_id = media.get('media_id') or container.get('media_container_id')
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(media_url, video_id, 'mp4')
+        self._sort_formats(formats)
+        return {
+            'id': str(video_id),
+            # The page is only downloaded when the API supplies no container name
+            'title': container.get('media_container_name') or self._og_search_title(self._download_webpage(url, video_id)),
+            'description': container.get('media_container_description'),
+            'thumbnails': self._extract_thumbnails(media_meta.get('media_preview_images')),
+            'timestamp': parse_iso8601(container.get('created_at')),
+            'view_count': int_or_none(stream.get('stream_current_viewers' if is_live else 'stream_total_viewers')),
+            'is_live': is_live,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _get_container(self, url):
+        """Return the media container dict for *url*; implemented by subclasses."""
+        raise NotImplementedError('Subclass for get media container')
+
+    def _get_media_url(self, media_meta):
+        """Return a (media_url, is_live) pair; implemented by subclasses."""
+        raise NotImplementedError('Subclass for get media url')
+
+
+class WASDTVStreamIE(WASDTVBaseIE):
+    """Extractor for a wasd.tv channel page (URL of the form wasd.tv/<nickname>)."""
+    IE_NAME = 'wasdtv:stream'
+    _VALID_URL = r'https?://wasd\.tv/(?P<id>[^/#?]+)$'
+    _TESTS = [{
+        'url': 'https://wasd.tv/24_7',
+        'info_dict': {
+            'id': '559738',
+            'ext': 'mp4',
+            'title': 'Live 24/7 Music',
+            'description': '24&#x2F;7 Music',
+            'timestamp': int,
+            'upload_date': r're:^\d{8}$',
+            'is_live': True,
+            'view_count': int,
+        },
+    }]
+
+    def _get_container(self, url):
+        """Return the first running media container of the channel named in *url*."""
+        nickname = self._match_id(url)
+        channel_info = self._fetch(
+            f'channels/nicknames/{nickname}', video_id=nickname, description='channel')
+        channel_id = channel_info.get('channel_id')
+        running_query = {
+            'channel_id': channel_id,
+            'media_container_type': 'SINGLE',
+            'media_container_status': 'RUNNING',
+        }
+        running = self._fetch(
+            'v2/media-containers', channel_id, 'running media containers',
+            query=running_query)
+        if running:
+            return running[0]
+        # No running container was returned for this channel
+        raise ExtractorError(f'{nickname} is offline', expected=True)
+
+    def _get_media_url(self, media_meta):
+        """Return the stream URL, always flagged as live."""
+        live_url = media_meta['media_url']
+        return live_url, True
+
+
+class WASDTVRecordIE(WASDTVBaseIE):
+    """Extractor for wasd.tv record URLs (…/videos?record=<id>)."""
+    IE_NAME = 'wasdtv:record'
+    _VALID_URL = r'https?://wasd\.tv/[^/#?]+/videos\?record=(?P<id>\d+)$'
+    _TESTS = [{
+        'url': 'https://wasd.tv/spacemita/videos?record=907755',
+        'md5': 'c9899dd85be4cc997816ff9f9ca516ce',
+        'info_dict': {
+            'id': '906825',
+            'ext': 'mp4',
+            'title': 'Музыкальный',
+            'description': 'md5:f510388d929ff60ae61d4c3cab3137cc',
+            'timestamp': 1645812079,
+            'upload_date': '20220225',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'is_live': False,
+            'view_count': int,
+        },
+    }]
+
+    def _get_container(self, url):
+        """Fetch the media container whose id is given in the URL's record parameter."""
+        record_id = self._match_id(url)
+        api_path = f'v2/media-containers/{record_id}'
+        return self._fetch(api_path, record_id, 'media container')
+
+    def _get_media_url(self, media_meta):
+        """Return (archive URL, False) when an archive exists, else (media URL, True)."""
+        archive_url = media_meta.get('media_archive_url')
+        if not archive_url:
+            # No archive URL: fall back to the media URL and flag it as live
+            return media_meta['media_url'], True
+        return archive_url, False
+
+
+class WASDTVClipIE(WASDTVBaseIE):
+    """Extractor for wasd.tv clip URLs (…/clips?clip=<id>)."""
+    IE_NAME = 'wasdtv:clip'
+    _VALID_URL = r'https?://wasd\.tv/[^/#?]+/clips\?clip=(?P<id>\d+)$'
+    _TESTS = [{
+        'url': 'https://wasd.tv/spacemita/clips?clip=26804',
+        'md5': '818885e720143d7a4e776ff66fcff148',
+        'info_dict': {
+            'id': '26804',
+            'ext': 'mp4',
+            'title': 'Пуш флексит на голове стримера',
+            'timestamp': 1646682908,
+            'upload_date': '20220307',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'view_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Fetch the clip's metadata from the API and return its info dict."""
+        clip_id = self._match_id(url)
+        clip = self._fetch(f'v2/clips/{clip_id}', video_id=clip_id, description='clip')
+        clip_data = clip.get('clip_data')
+        playlist_url = clip_data.get('url')
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            playlist_url, video_id=clip_id, ext='mp4')
+        self._sort_formats(formats)
+
+        title = clip.get('clip_title')
+        if not title:
+            # The API gave no title: take it from the clip page's OpenGraph data
+            webpage = self._download_webpage(url, clip_id, fatal=False)
+            title = self._og_search_title(webpage)
+
+        return {
+            'id': clip_id,
+            'title': title,
+            'thumbnails': self._extract_thumbnails(clip_data.get('preview')),
+            'timestamp': parse_iso8601(clip.get('created_at')),
+            'view_count': int_or_none(clip.get('clip_views_count')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index d74d5b0e9..19b4985f6 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -217,15 +217,35 @@ INNERTUBE_CLIENTS = {
}
},
'INNERTUBE_CONTEXT_CLIENT_NAME': 2
- }
+ },
+ # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option)
+ # See: https://github.com/zerodytrash/YouTube-Internal-Clients
+ 'tv_embedded': {
+ 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ 'INNERTUBE_CONTEXT': {
+ 'client': {
+ 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER',
+ 'clientVersion': '2.0',
+ },
+ },
+ 'INNERTUBE_CONTEXT_CLIENT_NAME': 85
+ },
}
+def _split_innertube_client(client_name):
+    """Split a client name into (client, base client, variant).
+
+    'variant.base' names (split at the last dot) give (variant, base, variant).
+    Other names are split at the first underscore and give
+    (client_name, base, variant), with variant None when there is no underscore.
+    """
+    dotted_variant, dot, dotted_base = client_name.rpartition('.')
+    if dot:
+        return dotted_variant, dotted_base, dotted_variant
+    base, underscore, variant = client_name.partition('_')
+    return client_name, base, variant if underscore else None
+
+
def build_innertube_clients():
THIRD_PARTY = {
- 'embedUrl': 'https://google.com', # Can be any valid URL
+ 'embedUrl': 'https://www.youtube.com/', # Can be any valid URL
}
- BASE_CLIENTS = ('android', 'web', 'ios', 'mweb')
+ BASE_CLIENTS = ('android', 'web', 'tv', 'ios', 'mweb')
priority = qualities(BASE_CLIENTS[::-1])
for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
@@ -234,15 +254,15 @@ def build_innertube_clients():
ytcfg.setdefault('REQUIRE_JS_PLAYER', True)
ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en')
- base_client, *variant = client.split('_')
+ _, base_client, variant = _split_innertube_client(client)
ytcfg['priority'] = 10 * priority(base_client)
if not variant:
- INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg)
- agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
- agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
- agegate_ytcfg['priority'] -= 1
- elif variant == ['embedded']:
+ INNERTUBE_CLIENTS[f'{client}_embedscreen'] = embedscreen = copy.deepcopy(ytcfg)
+ embedscreen['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
+ embedscreen['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
+ embedscreen['priority'] -= 3
+ elif variant == 'embedded':
ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
ytcfg['priority'] -= 2
else:
@@ -807,6 +827,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
description = self._get_text(renderer, 'descriptionSnippet')
duration = parse_duration(self._get_text(
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
+ if duration is None:
+ duration = parse_duration(self._search_regex(
+ r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$',
+ traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str),
+ video_id, default=None, group='duration'))
+
view_count = self._get_count(renderer, 'viewCountText')
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
@@ -818,12 +844,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str)
badges = self._extract_badges(renderer)
thumbnails = self._extract_thumbnails(renderer, 'thumbnail')
+ navigation_url = urljoin('https://www.youtube.com/', traverse_obj(
+ renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str))
+ url = f'https://www.youtube.com/watch?v={video_id}'
+ if overlay_style == 'SHORTS' or (navigation_url and '/shorts/' in navigation_url):
+ url = f'https://www.youtube.com/shorts/{video_id}'
return {
'_type': 'url',
'ie_key': YoutubeIE.ie_key(),
'id': video_id,
- 'url': f'https://www.youtube.com/watch?v={video_id}',
+ 'url': url,
'title': title,
'description': description,
'duration': duration,
@@ -2940,13 +2971,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
video_id, 'initial player response')
- original_clients = clients
+ all_clients = set(clients)
clients = clients[::-1]
prs = []
- def append_client(client_name):
- if client_name in INNERTUBE_CLIENTS and client_name not in original_clients:
- clients.append(client_name)
+ def append_client(*client_names):
+ """ Append the first client name that exists but not already used """
+ for client_name in client_names:
+ actual_client = _split_innertube_client(client_name)[0]
+ if actual_client in INNERTUBE_CLIENTS:
+ if actual_client not in all_clients:
+ clients.append(client_name)
+ all_clients.add(actual_client)
+ return
# Android player_response does not have microFormats which are needed for
# extraction of some data. So we return the initial_pr with formats
@@ -2961,7 +2998,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
tried_iframe_fallback = False
player_url = None
while clients:
- client = clients.pop()
+ client, base_client, variant = _split_innertube_client(clients.pop())
player_ytcfg = master_ytcfg if client == 'web' else {}
if 'configs' not in self._configuration_arg('player_skip'):
player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg
@@ -2989,10 +3026,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
prs.append(pr)
# creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in
- if client.endswith('_agegate') and self._is_unplayable(pr) and self.is_authenticated:
- append_client(client.replace('_agegate', '_creator'))
+ if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated:
+ append_client(f'{base_client}_creator')
elif self._is_agegated(pr):
- append_client(f'{client}_agegate')
+ if variant == 'tv_embedded':
+ append_client(f'{base_client}_embedded')
+ elif not variant:
+ append_client(f'tv_embedded.{base_client}', f'{base_client}_embedded')
if last_error:
if not len(prs):
@@ -3013,7 +3053,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
for fmt in streaming_formats:
- if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
+ if fmt.get('targetDurationSec'):
continue
itag = str_or_none(fmt.get('itag'))
@@ -3095,6 +3135,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'fps': int_or_none(fmt.get('fps')) or None,
'height': height,
'quality': q(quality),
+ 'has_drm': bool(fmt.get('drmFamilies')),
'tbr': tbr,
'url': fmt_url,
'width': int_or_none(fmt.get('width')),
@@ -3468,6 +3509,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
subtitles, automatic_captions = {}, {}
for lang_code, caption_track in captions.items():
base_url = caption_track.get('baseUrl')
+ orig_lang = parse_qs(base_url).get('lang', [None])[-1]
if not base_url:
continue
lang_name = self._get_text(caption_track, 'name', max_runs=1)
@@ -3481,19 +3523,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
for trans_code, trans_name in translation_languages.items():
if not trans_code:
continue
+ orig_trans_code = trans_code
if caption_track.get('kind') != 'asr':
+ if 'translated_subs' in self._configuration_arg('skip'):
+ continue
trans_code += f'-{lang_code}'
trans_name += format_field(lang_name, template=' from %s')
# Add an "-orig" label to the original language so that it can be distinguished.
# The subs are returned without "-orig" as well for compatibility
- if lang_code == f'a-{trans_code}':
+ if lang_code == f'a-{orig_trans_code}':
process_language(
automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {})
# Setting tlang=lang returns damaged subtitles.
- # Not using lang_code == f'a-{trans_code}' here for future-proofing
- orig_lang = parse_qs(base_url).get('lang', [None])[-1]
process_language(automatic_captions, base_url, trans_code, trans_name,
- {} if orig_lang == trans_code else {'tlang': trans_code})
+ {} if orig_lang == orig_trans_code else {'tlang': trans_code})
info['automatic_captions'] = automatic_captions
info['subtitles'] = subtitles