Diffstat (limited to 'yt_dlp/extractor')
 yt_dlp/extractor/abc.py            |  64
 yt_dlp/extractor/audiomack.py      |  34
 yt_dlp/extractor/common.py         |  20
 yt_dlp/extractor/extractors.py     |   7
 yt_dlp/extractor/facebook.py       |  41
 yt_dlp/extractor/generic.py        |   4
 yt_dlp/extractor/gronkh.py         |   5
 yt_dlp/extractor/hse.py            |  95
 yt_dlp/extractor/ondemandkorea.py  |   6
 yt_dlp/extractor/plutotv.py        |   7
 yt_dlp/extractor/sendtonews.py     |   2
 yt_dlp/extractor/soundcloud.py     |  53
 yt_dlp/extractor/youtube.py        | 386
 yt_dlp/extractor/zee5.py           |   8
14 files changed, 594 insertions(+), 138 deletions(-)
diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py
index e3369306c..354453a27 100644
--- a/yt_dlp/extractor/abc.py
+++ b/yt_dlp/extractor/abc.py
@@ -8,6 +8,7 @@ import time
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ dict_get,
ExtractorError,
js_to_json,
int_or_none,
@@ -253,3 +254,66 @@ class ABCIViewIE(InfoExtractor):
'subtitles': subtitles,
'is_live': is_live,
}
+
+
+class ABCIViewShowSeriesIE(InfoExtractor):
+ IE_NAME = 'abc.net.au:iview:showseries'
+ _VALID_URL = r'https?://iview\.abc\.net\.au/show/(?P<id>[^/]+)(?:/series/\d+)?$'
+ _GEO_COUNTRIES = ['AU']
+
+ _TESTS = [{
+ 'url': 'https://iview.abc.net.au/show/upper-middle-bogan',
+ 'info_dict': {
+ 'id': '124870-1',
+ 'title': 'Series 1',
+ 'description': 'md5:93119346c24a7c322d446d8eece430ff',
+ 'series': 'Upper Middle Bogan',
+ 'season': 'Series 1',
+ 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$'
+ },
+ 'playlist_count': 8,
+ }, {
+ 'url': 'https://iview.abc.net.au/show/upper-middle-bogan',
+ 'info_dict': {
+ 'id': 'CO1108V001S00',
+ 'ext': 'mp4',
+ 'title': 'Series 1 Ep 1 I\'m A Swan',
+ 'description': 'md5:7b676758c1de11a30b79b4d301e8da93',
+ 'series': 'Upper Middle Bogan',
+ 'uploader_id': 'abc1',
+ 'upload_date': '20210630',
+ 'timestamp': 1625036400,
+ },
+ 'params': {
+ 'noplaylist': True,
+ 'skip_download': 'm3u8',
+ },
+ }]
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ webpage = self._download_webpage(url, show_id)
+ webpage_data = self._search_regex(
+ r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;',
+ webpage, 'initial state')
+ video_data = self._parse_json(
+ unescapeHTML(webpage_data).encode('utf-8').decode('unicode_escape'), show_id)
+ video_data = video_data['route']['pageData']['_embedded']
+
+ if self.get_param('noplaylist') and 'highlightVideo' in video_data:
+ self.to_screen('Downloading just the highlight video because of --no-playlist')
+ return self.url_result(video_data['highlightVideo']['shareUrl'], ie=ABCIViewIE.ie_key())
+
+ self.to_screen(f'Downloading playlist {show_id} - add --no-playlist to just download the highlight video')
+ series = video_data['selectedSeries']
+ return {
+ '_type': 'playlist',
+ 'entries': [self.url_result(episode['shareUrl'])
+ for episode in series['_embedded']['videoEpisodes']],
+ 'id': series.get('id'),
+ 'title': dict_get(series, ('title', 'displaySubtitle')),
+ 'description': series.get('description'),
+ 'series': dict_get(series, ('showTitle', 'displayTitle')),
+ 'season': dict_get(series, ('title', 'displaySubtitle')),
+ 'thumbnail': series.get('thumbnail'),
+ }
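
Note on the decode chain above: the iview page stores its state as a JS string literal, so the extractor HTML-unescapes it and then undoes the backslash escapes before parsing JSON. A minimal stdlib-only sketch of the same chain, run against a hypothetical page snippet (the real payload is far larger):

    import html
    import json
    import re

    # Hypothetical stand-in for a real iview page.
    webpage = r'<script>window.__INITIAL_STATE__ = "{\"route\":{\"pageData\":{\"_embedded\":{}}}}";</script>'

    raw = re.search(
        r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;', webpage).group(1)
    # html.unescape + unicode_escape mirror the extractor's
    # unescapeHTML(...).encode('utf-8').decode('unicode_escape')
    data = json.loads(html.unescape(raw).encode('utf-8').decode('unicode_escape'))
    print(data['route']['pageData']['_embedded'])  # -> {}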
diff --git a/yt_dlp/extractor/audiomack.py b/yt_dlp/extractor/audiomack.py
index cc7771354..31fb859ae 100644
--- a/yt_dlp/extractor/audiomack.py
+++ b/yt_dlp/extractor/audiomack.py
@@ -14,7 +14,7 @@ from ..utils import (
class AudiomackIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)'
+ _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P<id>[\w/-]+)'
IE_NAME = 'audiomack'
_TESTS = [
# hosted on audiomack
@@ -39,15 +39,16 @@ class AudiomackIE(InfoExtractor):
'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]',
'uploader': 'ILOVEMAKONNEN',
'upload_date': '20160414',
- }
+ },
+ 'skip': 'Song has been removed from the site',
},
]
def _real_extract(self, url):
- # URLs end with [uploader name]/[uploader title]
+ # URLs end with [uploader name]/song/[uploader title]
# this title is whatever the user types in, and is rarely
# the proper song title. Real metadata is in the api response
- album_url_tag = self._match_id(url)
+ album_url_tag = self._match_id(url).replace('/song/', '/')
# Request the extended version of the api for extra fields like artist and title
api_response = self._download_json(
@@ -73,13 +74,13 @@ class AudiomackIE(InfoExtractor):
class AudiomackAlbumIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)'
+ _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P<id>[\w/-]+)'
IE_NAME = 'audiomack:album'
_TESTS = [
# Standard album playlist
{
'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape',
- 'playlist_count': 15,
+ 'playlist_count': 11,
'info_dict':
{
'id': '812251',
@@ -95,24 +96,27 @@ class AudiomackAlbumIE(InfoExtractor):
},
'playlist': [{
'info_dict': {
- 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)',
- 'id': '837577',
+ 'title': 'PPP (Pistol P Project) - 8. Real (prod by SYK SENSE )',
+ 'id': '837576',
+ 'ext': 'mp3',
+ 'uploader': 'Lil Herb a.k.a. G Herbo',
+ }
+ }, {
+ 'info_dict': {
+ 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)',
+ 'id': '837580',
'ext': 'mp3',
'uploader': 'Lil Herb a.k.a. G Herbo',
}
}],
- 'params': {
- 'playliststart': 9,
- 'playlistend': 9,
- }
}
]
def _real_extract(self, url):
- # URLs end with [uploader name]/[uploader title]
+ # URLs end with [uploader name]/album/[uploader title]
# this title is whatever the user types in, and is rarely
# the proper song title. Real metadata is in the api response
- album_url_tag = self._match_id(url)
+ album_url_tag = self._match_id(url).replace('/album/', '/')
result = {'_type': 'playlist', 'entries': []}
# There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata
# Therefore we don't know how many songs the album has and must infi-loop until failure
@@ -134,7 +138,7 @@ class AudiomackAlbumIE(InfoExtractor):
# Pull out the album metadata and add to result (if it exists)
for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]:
if apikey in api_response and resultkey not in result:
- result[resultkey] = api_response[apikey]
+ result[resultkey] = compat_str(api_response[apikey])
song_id = url_basename(api_response['url']).rpartition('.')[0]
result['entries'].append({
'id': compat_str(api_response.get('id', song_id)),
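
For reference, the reworked _VALID_URL accepts both path shapes (a leading song/ segment, or one embedded later in the path via the lookahead), and both normalize to the same API tag. A quick sketch with the pattern copied from the diff:

    import re

    VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P<id>[\w/-]+)'

    for url in ('https://audiomack.com/song/ilovemakonnen/black-mamba-freestyle',
                'https://audiomack.com/ilovemakonnen/song/black-mamba-freestyle'):
        tag = re.match(VALID_URL, url).group('id').replace('/song/', '/')
        print(tag)  # both: ilovemakonnen/black-mamba-freestyle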
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index ebf2e3cea..9abbaf04f 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -163,9 +163,8 @@ class InfoExtractor(object):
* filesize_approx An estimate for the number of bytes
* player_url SWF Player URL (used for rtmpdump).
* protocol The protocol that will be used for the actual
- download, lower-case.
- "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
- "m3u8", "m3u8_native" or "http_dash_segments".
+ download, lower-case. One of "http", "https" or
+ one of the protocols defined in downloader.PROTOCOL_MAP
* fragment_base_url
Base URL for fragments. Each fragment's path
value (if present) will be relative to
@@ -181,6 +180,8 @@ class InfoExtractor(object):
fragment_base_url
* "duration" (optional, int or float)
* "filesize" (optional, int)
+ * is_from_start Is a live format that can be downloaded
+ from the start. Boolean
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field, regardless of all other values.
@@ -1451,8 +1452,13 @@ class InfoExtractor(object):
})
extract_interaction_statistic(e)
- for e in json_ld:
- if '@context' in e:
+ def traverse_json_ld(json_ld, at_top_level=True):
+ for e in json_ld:
+ if at_top_level and '@context' not in e:
+ continue
+ if at_top_level and set(e.keys()) == {'@context', '@graph'}:
+ traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
+ break
item_type = e.get('@type')
if expected_type is not None and expected_type != item_type:
continue
@@ -1488,7 +1494,7 @@ class InfoExtractor(object):
info.update({
'timestamp': parse_iso8601(e.get('datePublished')),
'title': unescapeHTML(e.get('headline')),
- 'description': unescapeHTML(e.get('articleBody')),
+ 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
})
elif item_type == 'VideoObject':
extract_video_object(e)
@@ -1503,6 +1509,8 @@ class InfoExtractor(object):
continue
else:
break
+ traverse_json_ld(json_ld)
+
return dict((k, v) for k, v in info.items() if v is not None)
def _search_nextjs_data(self, webpage, video_id, **kw):
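
The traverse_json_ld helper added above exists to unwrap documents whose only top-level node is a {'@context', '@graph'} container, which previously made the whole document invisible to _json_ld. A condensed sketch of that recursion, recast here as a generator for illustration (the real method also coerces @graph to a list via variadic and then applies the per-type extraction):

    def traverse_json_ld(json_ld, at_top_level=True):
        # Yield every usable JSON-LD item, descending into @graph wrappers once.
        for e in json_ld:
            if at_top_level and '@context' not in e:
                continue
            if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                yield from traverse_json_ld(e['@graph'], at_top_level=False)
                break
            yield e

    doc = [{'@context': 'https://schema.org',
            '@graph': [{'@type': 'VideoObject', 'name': 'clip'}]}]
    print(list(traverse_json_ld(doc)))
    # -> [{'@type': 'VideoObject', 'name': 'clip'}]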
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index e4755b3d1..ee5ea533f 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .abc import (
ABCIE,
ABCIViewIE,
+ ABCIViewShowSeriesIE,
)
from .abcnews import (
AbcNewsIE,
@@ -434,6 +435,7 @@ from .eyedotv import EyedoTVIE
from .facebook import (
FacebookIE,
FacebookPluginsVideoIE,
+ FacebookRedirectURLIE,
)
from .fancode import (
FancodeVodIE,
@@ -563,6 +565,10 @@ from .hrti import (
HRTiIE,
HRTiPlaylistIE,
)
+from .hse import (
+ HSEShowIE,
+ HSEProductIE,
+)
from .huajiao import HuajiaoIE
from .huffpost import HuffPostIE
from .hungama import (
@@ -1357,6 +1363,7 @@ from .soundcloud import (
SoundcloudEmbedIE,
SoundcloudIE,
SoundcloudSetIE,
+ SoundcloudRelatedIE,
SoundcloudUserIE,
SoundcloudTrackStationIE,
SoundcloudPlaylistIE,
diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index 44d3dc0d7..6dbcd690d 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -23,9 +23,11 @@ from ..utils import (
merge_dicts,
network_exceptions,
parse_count,
+ parse_qs,
qualities,
sanitized_Request,
try_get,
+ url_or_none,
urlencode_postdata,
urljoin,
)
@@ -746,3 +748,42 @@ class FacebookPluginsVideoIE(InfoExtractor):
return self.url_result(
compat_urllib_parse_unquote(self._match_id(url)),
FacebookIE.ie_key())
+
+
+class FacebookRedirectURLIE(InfoExtractor):
+ IE_DESC = False # Do not list
+ _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/flx/warn[/?]'
+ _TESTS = [{
+ 'url': 'https://www.facebook.com/flx/warn/?h=TAQHsoToz&u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&s=1',
+ 'info_dict': {
+ 'id': 'pO8h3EaFRdo',
+ 'ext': 'mp4',
+ 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set',
+ 'description': 'md5:2d713ccbb45b686a1888397b2c77ca6b',
+ 'channel_id': 'UCGBpxWJr9FNOcFYA5GkKrMg',
+ 'playable_in_embed': True,
+ 'categories': ['Music'],
+ 'channel': 'Boiler Room',
+ 'uploader_id': 'brtvofficial',
+ 'uploader': 'Boiler Room',
+ 'tags': 'count:11',
+ 'duration': 3332,
+ 'live_status': 'not_live',
+ 'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg',
+ 'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg',
+ 'availability': 'public',
+ 'uploader_url': 'http://www.youtube.com/user/brtvofficial',
+ 'upload_date': '20150917',
+ 'age_limit': 0,
+ 'view_count': int,
+ 'like_count': int,
+ },
+ 'add_ie': ['Youtube'],
+ 'params': {'skip_download': 'Youtube'},
+ }]
+
+ def _real_extract(self, url):
+ redirect_url = url_or_none(parse_qs(url).get('u', [None])[-1])
+ if not redirect_url:
+ raise ExtractorError('Invalid facebook redirect URL', expected=True)
+ return self.url_result(redirect_url)
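
The redirect extractor boils down to reading the u query parameter of the interstitial page and handing it back to the generic resolver. An equivalent stdlib-only sketch (yt-dlp's parse_qs accepts a full URL, which is why the extractor itself skips the urlparse step):

    from urllib.parse import parse_qs, urlparse

    url = ('https://www.facebook.com/flx/warn/?h=TAQHsoToz'
           '&u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&s=1')
    redirect = parse_qs(urlparse(url).query).get('u', [None])[-1]
    print(redirect)  # https://www.youtube.com/watch?v=pO8h3EaFRdo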
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 51557f0f1..1ec0ce986 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -3653,6 +3653,10 @@ class GenericIE(InfoExtractor):
json_ld = self._search_json_ld(webpage, video_id, default={})
if json_ld.get('url'):
self.report_detected('JSON LD')
+ if determine_ext(json_ld.get('url')) == 'm3u8':
+ json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles(
+ json_ld['url'], video_id, 'mp4')
+ json_ld.pop('url')
return merge_dicts(json_ld, info_dict)
def check_video(vurl):
diff --git a/yt_dlp/extractor/gronkh.py b/yt_dlp/extractor/gronkh.py
index 58cd59511..c9f1dd256 100644
--- a/yt_dlp/extractor/gronkh.py
+++ b/yt_dlp/extractor/gronkh.py
@@ -6,7 +6,7 @@ from ..utils import unified_strdate
class GronkhIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/stream/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?stream/(?P<id>\d+)'
_TESTS = [{
'url': 'https://gronkh.tv/stream/536',
@@ -19,6 +19,9 @@ class GronkhIE(InfoExtractor):
'upload_date': '20211001'
},
'params': {'skip_download': True}
+ }, {
+ 'url': 'https://gronkh.tv/watch/stream/546',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/yt_dlp/extractor/hse.py b/yt_dlp/extractor/hse.py
new file mode 100644
index 000000000..9144ff8dc
--- /dev/null
+++ b/yt_dlp/extractor/hse.py
@@ -0,0 +1,95 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ traverse_obj,
+ unified_timestamp,
+)
+
+
+class HSEShowBaseInfoExtractor(InfoExtractor):
+ _GEO_COUNTRIES = ['DE']
+
+ def _extract_redux_data(self, url, video_id):
+ webpage = self._download_webpage(url, video_id)
+ redux = self._html_search_regex(
+ r'window\.__REDUX_DATA__\s*=\s*({.*});?', webpage, 'redux data')
+ return self._parse_json(redux.replace('\n', ''), video_id)
+
+ def _extract_formats_and_subtitles(self, sources, video_id):
+ if not sources:
+ raise ExtractorError('No video found', expected=True, video_id=video_id)
+ formats, subtitles = [], {}
+ for src in sources:
+ if src['mimetype'] != 'application/x-mpegURL':
+ continue
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, ext='mp4')
+ formats.extend(fmts)
+ subtitles = self._merge_subtitles(subtitles, subs)
+ self._sort_formats(formats)
+ return formats, subtitles
+
+
+class HSEShowIE(HSEShowBaseInfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/c/tv-shows/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.hse.de/dpl/c/tv-shows/505350',
+ 'info_dict': {
+ 'id': '505350',
+ 'ext': 'mp4',
+ 'title': 'Pfeffinger Mode & Accessoires',
+ 'timestamp': 1638810000,
+ 'upload_date': '20211206',
+ 'channel': 'HSE24',
+ 'uploader': 'Arina Pirayesh'
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ json_data = self._extract_redux_data(url, video_id)
+ formats, subtitles = self._extract_formats_and_subtitles(
+ traverse_obj(json_data, ('tvShowPage', 'tvShowVideo', 'sources')), video_id)
+
+ show = traverse_obj(json_data, ('tvShowPage', 'tvShow')) or {}
+ return {
+ 'id': video_id,
+ 'title': show.get('title') or video_id,
+ 'formats': formats,
+ 'timestamp': unified_timestamp(f'{show.get("date")} {show.get("hour")}:00'),
+ 'thumbnail': traverse_obj(json_data, ('tvShowVideo', 'poster')),
+ 'channel': self._search_regex(
+ r'tvShow \| ([A-Z0-9]+)_', show.get('actionFieldText') or '', video_id, fatal=False),
+ 'uploader': show.get('presenter'),
+ 'subtitles': subtitles,
+ }
+
+
+class HSEProductIE(HSEShowBaseInfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/p/product/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.hse.de/dpl/p/product/408630',
+ 'info_dict': {
+ 'id': '408630',
+ 'ext': 'mp4',
+ 'title': 'Hose im Ponte-Mix',
+ 'uploader': 'Judith Williams'
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ json_data = self._extract_redux_data(url, video_id)
+ video = traverse_obj(json_data, ('productContent', 'productContent', 'videos', 0)) or {}
+ formats, subtitles = self._extract_formats_and_subtitles(video.get('sources'), video_id)
+
+ return {
+ 'id': video_id,
+ 'title': traverse_obj(json_data, ('productDetail', 'product', 'name', 'short')) or video_id,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnail': video.get('poster'),
+ 'uploader': traverse_obj(json_data, ('productDetail', 'product', 'brand', 'brandName')),
+ }
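
Both HSE extractors share one pipeline: grab window.__REDUX_DATA__, strip newlines, parse JSON, then walk the tree for HLS sources. A self-contained sketch of that walk using plain dict access in place of traverse_obj (the page snippet is hypothetical):

    import json
    import re

    webpage = ('<script>window.__REDUX_DATA__ = {"tvShowPage": {"tvShowVideo": '
               '{"sources": [{"mimetype": "application/x-mpegURL", '
               '"url": "https://example.com/master.m3u8"}]}}};</script>')

    redux = re.search(r'window\.__REDUX_DATA__\s*=\s*({.*});?', webpage).group(1)
    data = json.loads(redux.replace('\n', ''))

    sources = ((data.get('tvShowPage') or {}).get('tvShowVideo') or {}).get('sources')
    hls = [s['url'] for s in sources or [] if s.get('mimetype') == 'application/x-mpegURL']
    print(hls)  # ['https://example.com/master.m3u8']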
diff --git a/yt_dlp/extractor/ondemandkorea.py b/yt_dlp/extractor/ondemandkorea.py
index cc3c587bc..e933ea2cc 100644
--- a/yt_dlp/extractor/ondemandkorea.py
+++ b/yt_dlp/extractor/ondemandkorea.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@@ -71,8 +73,8 @@ class OnDemandKoreaIE(InfoExtractor):
jw_config = self._parse_json(
self._search_regex(
- r'(?s)odkPlayer\.init.*?(?P<options>{[^;]+}).*?;',
- webpage, 'jw config', group='options'),
+ r'playlist\s*=\s*\[(?P<options>.+)];?$',
+ webpage, 'jw config', flags=re.MULTILINE, group='options'),
video_id, transform_source=js_to_json)
info = self._parse_jwplayer_data(
jw_config, video_id, require_title=False, m3u8_id='hls',
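
The replacement pattern keys on the page's playlist = [...] JS assignment instead of the old odkPlayer.init call, and re.MULTILINE lets $ anchor at the end of that source line. Sketch against a hypothetical one-line excerpt:

    import re

    webpage = 'playlist = [{"file": "https://example.com/master.m3u8"}];'
    options = re.search(
        r'playlist\s*=\s*\[(?P<options>.+)];?$', webpage, flags=re.MULTILINE
    ).group('options')
    print(options)  # {"file": "https://example.com/master.m3u8"}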
diff --git a/yt_dlp/extractor/plutotv.py b/yt_dlp/extractor/plutotv.py
index 0cf82466a..26aff1af5 100644
--- a/yt_dlp/extractor/plutotv.py
+++ b/yt_dlp/extractor/plutotv.py
@@ -20,11 +20,11 @@ from ..utils import (
class PlutoTVIE(InfoExtractor):
_VALID_URL = r'''(?x)
- https?://(?:www\.)?pluto\.tv(?:/en)?/on-demand
+ https?://(?:www\.)?pluto\.tv(?:/[^/]+)?/on-demand
/(?P<video_type>movies|series)
/(?P<series_or_movie_slug>[^/]+)
(?:
- /seasons?/(?P<season_no>\d+)
+ (?:/seasons?/(?P<season_no>\d+))?
(?:/episode/(?P<episode_slug>[^/]+))?
)?
/?(?:$|[#?])'''
@@ -84,6 +84,9 @@ class PlutoTVIE(InfoExtractor):
}, {
'url': 'https://pluto.tv/en/on-demand/series/manhunters-fugitive-task-force/seasons/1/episode/third-times-the-charm-1-1',
'only_matching': True,
+ }, {
+ 'url': 'https://pluto.tv/it/on-demand/series/csi-vegas/episode/legacy-2021-1-1',
+ 'only_matching': True,
}
]
diff --git a/yt_dlp/extractor/sendtonews.py b/yt_dlp/extractor/sendtonews.py
index bc38a0f1e..858547b54 100644
--- a/yt_dlp/extractor/sendtonews.py
+++ b/yt_dlp/extractor/sendtonews.py
@@ -80,7 +80,7 @@ class SendtoNewsIE(InfoExtractor):
'format_id': '%s-%d' % (determine_protocol(f), tbr),
'tbr': tbr,
})
- # 'tbr' was explicitly set to be prefered over 'height' originally,
+ # 'tbr' was explicitly set to be preferred over 'height' originally,
# So this is being kept unless someone can confirm this is unnecessary
self._sort_formats(info_dict['formats'], ('tbr', 'res'))
diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
index d5cbe70ea..f251e5599 100644
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -214,8 +214,9 @@ class SoundcloudIE(SoundcloudBaseIE):
(?!stations/track)
(?P<uploader>[\w\d-]+)/
(?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
- (?P<title>[\w\d-]+)/?
- (?P<token>[^?]+?)?(?:[?].*)?$)
+ (?P<title>[\w\d-]+)
+ (?:/(?P<token>(?!(?:albums|sets|recommended))[^?]+?))?
+ (?:[?].*)?$)
|(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
(?:/?\?secret_token=(?P<secret_token>[^&]+))?)
)
@@ -827,6 +828,54 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
track_id, 'Track station: %s' % track['title'])
+class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE):
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<slug>[\w\d-]+/[\w\d-]+)/(?P<relation>albums|sets|recommended)'
+ IE_NAME = 'soundcloud:related'
+ _TESTS = [{
+ 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/recommended',
+ 'info_dict': {
+ 'id': '1084577272',
+ 'title': 'Sexapil - Pingers 5 (Recommended)',
+ },
+ 'playlist_mincount': 50,
+ }, {
+ 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/albums',
+ 'info_dict': {
+ 'id': '1084577272',
+ 'title': 'Sexapil - Pingers 5 (Albums)',
+ },
+ 'playlist_mincount': 1,
+ }, {
+ 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/sets',
+ 'info_dict': {
+ 'id': '1084577272',
+ 'title': 'Sexapil - Pingers 5 (Sets)',
+ },
+ 'playlist_mincount': 4,
+ }]
+
+ _BASE_URL_MAP = {
+ 'albums': 'tracks/%s/albums',
+ 'sets': 'tracks/%s/playlists_without_albums',
+ 'recommended': 'tracks/%s/related',
+ }
+
+ def _real_extract(self, url):
+ slug, relation = self._match_valid_url(url).group('slug', 'relation')
+
+ track = self._download_json(
+ self._resolv_url(self._BASE_URL + slug),
+ slug, 'Downloading track info', headers=self._HEADERS)
+
+ if track.get('errors'):
+ raise ExtractorError(f'{self.IE_NAME} said: %s' % ','.join(
+ str(err['error_message']) for err in track['errors']), expected=True)
+
+ return self._extract_playlist(
+ self._API_V2_BASE + self._BASE_URL_MAP[relation] % track['id'], str(track['id']),
+ '%s (%s)' % (track.get('title') or slug, relation.capitalize()))
+
+
class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
_VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
IE_NAME = 'soundcloud:playlist'
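
SoundcloudRelatedIE is mostly a dispatch table: the trailing path segment selects an API endpoint template, which is then formatted with the track id resolved from the slug. A sketch with the pattern and endpoint map copied from the diff (the track id is hard-coded here where the extractor would resolve it via the API):

    import re

    VALID_URL = (r'https?://(?:(?:www|m)\.)?soundcloud\.com/'
                 r'(?P<slug>[\w\d-]+/[\w\d-]+)/(?P<relation>albums|sets|recommended)')
    BASE_URL_MAP = {
        'albums': 'tracks/%s/albums',
        'sets': 'tracks/%s/playlists_without_albums',
        'recommended': 'tracks/%s/related',
    }

    slug, relation = re.match(
        VALID_URL, 'https://soundcloud.com/wajang/sexapil-pingers-5/recommended'
    ).group('slug', 'relation')
    track_id = 1084577272  # resolved from the slug in the real extractor
    print('https://api-v2.soundcloud.com/' + BASE_URL_MAP[relation] % track_id)
    # -> https://api-v2.soundcloud.com/tracks/1084577272/related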
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 8f64b6657..1f5009399 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -5,6 +5,7 @@ from __future__ import unicode_literals
import calendar
import copy
import datetime
+import functools
import hashlib
import itertools
import json
@@ -15,6 +16,7 @@ import re
import sys
import time
import traceback
+import threading
from .common import InfoExtractor, SearchInfoExtractor
from ..compat import (
@@ -55,6 +57,7 @@ from ..utils import (
smuggle_url,
str_or_none,
str_to_int,
+ strftime_or_none,
traverse_obj,
try_get,
unescapeHTML,
@@ -358,7 +361,20 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
consent_id = random.randint(100, 999)
self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
+ def _initialize_pref(self):
+ cookies = self._get_cookies('https://www.youtube.com/')
+ pref_cookie = cookies.get('PREF')
+ pref = {}
+ if pref_cookie:
+ try:
+ pref = dict(compat_urlparse.parse_qsl(pref_cookie.value))
+ except ValueError:
+ self.report_warning('Failed to parse user PREF cookie' + bug_reports_message())
+ pref.update({'hl': 'en'})
+ self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref))
+
def _real_initialize(self):
+ self._initialize_pref()
self._initialize_consent()
self._login()
@@ -391,23 +407,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client)
def _extract_context(self, ytcfg=None, default_client='web'):
- _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict)
- context = _get_context(ytcfg)
- if context:
- return context
-
- context = _get_context(self._get_default_ytcfg(default_client))
- if not ytcfg:
- return context
-
- # Recreate the client context (required)
- context['client'].update({
- 'clientVersion': self._extract_client_version(ytcfg, default_client),
- 'clientName': self._extract_client_name(ytcfg, default_client),
- })
- visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
- if visitor_data:
- context['client']['visitorData'] = visitor_data
+ context = get_first(
+ (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict)
+ # Enforce language for extraction
+ traverse_obj(context, 'client', expected_type=dict, default={})['hl'] = 'en'
return context
_SAPISID = None
@@ -664,6 +667,29 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if text:
return text
+ @staticmethod
+ def extract_relative_time(relative_time_text):
+ """
+ Extracts a relative time from string and converts to dt object
+ e.g. 'streamed 6 days ago', '5 seconds ago (edited)'
+ """
+ mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
+ if mobj:
+ try:
+ return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')), precision='auto')
+ except ValueError:
+ return None
+
+ def _extract_time_text(self, renderer, *path_list):
+ text = self._get_text(renderer, *path_list) or ''
+ dt = self.extract_relative_time(text)
+ timestamp = None
+ if isinstance(dt, datetime.datetime):
+ timestamp = calendar.timegm(dt.timetuple())
+ if text and timestamp is None:
+ self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True)
+ return timestamp, text
+
def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
default_client='web'):
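
extract_relative_time delegates to yt-dlp's datetime_from_str for the actual arithmetic. A simplified stdlib-only approximation of the same idea (it drops the month, year and microsecond units and the precision handling of the real helper):

    import datetime
    import re

    def extract_relative_time(text):
        mobj = re.search(
            r'(?P<time>\d+)\s*(?P<unit>second|minute|hour|day|week)s?\s*ago', text)
        if not mobj:
            return None
        seconds = {'second': 1, 'minute': 60, 'hour': 3600,
                   'day': 86400, 'week': 604800}[mobj.group('unit')]
        return (datetime.datetime.now(datetime.timezone.utc)
                - datetime.timedelta(seconds=int(mobj.group('time')) * seconds))

    print(extract_relative_time('streamed 6 days ago'))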
@@ -750,7 +776,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'view count', default=None))
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
-
+ channel_id = traverse_obj(
+ renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False)
+ timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText')
+ scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False))
+ overlay_style = traverse_obj(
+ renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str)
+ badges = self._extract_badges(renderer)
return {
'_type': 'url',
'ie_key': YoutubeIE.ie_key(),
@@ -761,6 +793,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'duration': duration,
'view_count': view_count,
'uploader': uploader,
+ 'channel_id': channel_id,
+ 'upload_date': strftime_or_none(timestamp, '%Y%m%d'),
+ 'live_status': ('is_upcoming' if scheduled_timestamp is not None
+ else 'was_live' if 'streamed' in time_text.lower()
+ else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges
+ else None),
+ 'release_timestamp': scheduled_timestamp,
+ 'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges)
}
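
The live_status expression added above is a compact conditional chain; written out as a plain function (an equivalent restatement, not extractor API), it reads:

    def live_status(scheduled_timestamp, time_text, overlay_style, badges):
        if scheduled_timestamp is not None:
            return 'is_upcoming'
        if 'streamed' in time_text.lower():
            return 'was_live'
        if overlay_style == 'LIVE' or 'live now' in badges:
            return 'is_live'
        return None

    print(live_status(None, 'Streamed 2 days ago', None, []))  # was_live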
@@ -1709,6 +1749,142 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._code_cache = {}
self._player_cache = {}
+ def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data):
+ EXPIRATION_DURATION = 18_000
+ lock = threading.Lock()
+
+ is_live = True
+ expiration_time = time.time() + EXPIRATION_DURATION
+ formats = [f for f in formats if f.get('is_from_start')]
+
+ def refetch_manifest(format_id):
+ nonlocal formats, expiration_time, is_live
+ if time.time() <= expiration_time:
+ return
+
+ _, _, prs, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url)
+ video_details = traverse_obj(
+ prs, (..., 'videoDetails'), expected_type=dict, default=[])
+ microformats = traverse_obj(
+ prs, (..., 'microformat', 'playerMicroformatRenderer'),
+ expected_type=dict, default=[])
+ _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url)
+ expiration_time = time.time() + EXPIRATION_DURATION
+
+ def mpd_feed(format_id):
+ """
+ @returns (manifest_url, manifest_stream_number, is_live) or None
+ """
+ with lock:
+ refetch_manifest(format_id)
+
+ f = next((f for f in formats if f['format_id'] == format_id), None)
+ if not f:
+ self.report_warning(
+ f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}')
+ return None
+ return f['manifest_url'], f['manifest_stream_number'], is_live
+
+ for f in formats:
+ f['protocol'] = 'http_dash_segments_generator'
+ f['fragments'] = functools.partial(
+ self._live_dash_fragments, f['format_id'], live_start_time, mpd_feed)
+
+ def _live_dash_fragments(self, format_id, live_start_time, mpd_feed, ctx):
+ FETCH_SPAN, MAX_DURATION = 5, 432000
+
+ mpd_url, stream_number, is_live = None, None, True
+
+ begin_index = 0
+ download_start_time = ctx.get('start') or time.time()
+
+ lack_early_segments = download_start_time - (live_start_time or download_start_time) > MAX_DURATION
+ if lack_early_segments:
+ self.report_warning(bug_reports_message(
+ 'Starting download from the last 120 hours of the live stream since '
+ 'YouTube does not have data before that. If you think this is wrong,'), only_once=True)
+ lack_early_segments = True
+
+ known_idx, no_fragment_score, last_segment_url = begin_index, 0, None
+ fragments, fragment_base_url = None, None
+
+ def _extract_sequence_from_mpd(refresh_sequence):
+ nonlocal mpd_url, stream_number, is_live, no_fragment_score, fragments, fragment_base_url
+ # Obtain from MPD's maximum seq value
+ old_mpd_url = mpd_url
+ mpd_url, stream_number, is_live = mpd_feed(format_id) or (mpd_url, stream_number, False)
+ if old_mpd_url == mpd_url and not refresh_sequence:
+ return True, last_seq
+ try:
+ fmts, _ = self._extract_mpd_formats_and_subtitles(
+ mpd_url, None, note=False, errnote=False, fatal=False)
+ except ExtractorError:
+ fmts = None
+ if not fmts:
+ no_fragment_score += 1
+ return False, last_seq
+ fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number)
+ fragments = fmt_info['fragments']
+ fragment_base_url = fmt_info['fragment_base_url']
+ assert fragment_base_url
+
+ _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1))
+ return True, _last_seq
+
+ while is_live:
+ fetch_time = time.time()
+ if no_fragment_score > 30:
+ return
+ if last_segment_url:
+ # Obtain from "X-Head-Seqnum" header value from each segment
+ try:
+ urlh = self._request_webpage(
+ last_segment_url, None, note=False, errnote=False, fatal=False)
+ except ExtractorError:
+ urlh = None
+ last_seq = try_get(urlh, lambda x: int_or_none(x.headers['X-Head-Seqnum']))
+ if last_seq is None:
+ no_fragment_score += 1
+ last_segment_url = None
+ continue
+ else:
+ should_retry, last_seq = _extract_sequence_from_mpd(True)
+ if not should_retry:
+ continue
+
+ if known_idx > last_seq:
+ last_segment_url = None
+ continue
+
+ last_seq += 1
+
+ if begin_index < 0 and known_idx < 0:
+ # a negative begin_index counts back from the live edge
+ known_idx = last_seq + begin_index
+ if lack_early_segments:
+ known_idx = max(known_idx, last_seq - int(MAX_DURATION // fragments[-1]['duration']))
+ try:
+ for idx in range(known_idx, last_seq):
+ # do not update the sequence here, or parts of the stream may be skipped
+ should_retry, _ = _extract_sequence_from_mpd(False)
+ if not should_retry:
+ # retry when it gets weird state
+ known_idx = idx - 1
+ raise ExtractorError('breaking out of outer loop')
+ last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx)
+ yield {
+ 'url': last_segment_url,
+ }
+ if known_idx == last_seq:
+ no_fragment_score += 5
+ else:
+ no_fragment_score = 0
+ known_idx = last_seq
+ except ExtractorError:
+ continue
+
+ time.sleep(max(0, FETCH_SPAN + fetch_time - time.time()))
+
def _extract_player_url(self, *ytcfgs, webpage=None):
player_url = traverse_obj(
ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'),
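
The key trick in _prepare_live_from_start_formats is that 'fragments' becomes a callable rather than a precomputed list: functools.partial binds the format id and stream start time, and the downloader later invokes it with its context dict to obtain a fragment generator. A toy illustration (the URL scheme and fragment count are made up):

    import functools
    import time

    def live_dash_fragments(format_id, start_time, ctx):
        # Stand-in for YoutubeIE._live_dash_fragments: keeps yielding
        # fragment dicts while the stream is live.
        for seq in range(3):
            yield {'url': f'https://example.invalid/{format_id}/sq/{seq}'}

    fmt = {'format_id': '299', 'protocol': 'http_dash_segments_generator'}
    fmt['fragments'] = functools.partial(
        live_dash_fragments, fmt['format_id'], time.time())
    for frag in fmt['fragments']({'start': None}):  # called by the downloader
        print(frag['url'])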
@@ -2064,19 +2240,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
regex), webpage, name, default='{}'), video_id, fatal=False)
- @staticmethod
- def parse_time_text(time_text):
- """
- Parse the comment time text
- time_text is in the format 'X units ago (edited)'
- """
- time_text_split = time_text.split(' ')
- if len(time_text_split) >= 3:
- try:
- return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
- except ValueError:
- return None
-
def _extract_comment(self, comment_renderer, parent=None):
comment_id = comment_renderer.get('commentId')
if not comment_id:
@@ -2085,10 +2248,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
text = self._get_text(comment_renderer, 'contentText')
# note: timestamp is an estimate calculated from the current time and time_text
- time_text = self._get_text(comment_renderer, 'publishedTimeText') or ''
- time_text_dt = self.parse_time_text(time_text)
- if isinstance(time_text_dt, datetime.datetime):
- timestamp = calendar.timegm(time_text_dt.timetuple())
+ timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText')
author = self._get_text(comment_renderer, 'authorText')
author_id = try_get(comment_renderer,
lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
@@ -2261,11 +2421,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
yield from self._comment_entries(renderer, ytcfg, video_id)
max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0])
- # Force English regardless of account setting to prevent parsing issues
- # See: https://github.com/yt-dlp/yt-dlp/issues/532
- ytcfg = copy.deepcopy(ytcfg)
- traverse_obj(
- ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en'
return itertools.islice(_real_comment_extract(contents), 0, max_comments)
@staticmethod
@@ -2531,11 +2686,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
dct['container'] = dct['ext'] + '_dash'
yield dct
+ live_from_start = is_live and self.get_param('live_from_start')
skip_manifests = self._configuration_arg('skip')
- get_dash = (
- (not is_live or self._configuration_arg('include_live_dash'))
- and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True))
- get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True)
+ if not self.get_param('youtube_include_hls_manifest', True):
+ skip_manifests.append('hls')
+ get_dash = 'dash' not in skip_manifests and (
+ not is_live or live_from_start or self._configuration_arg('include_live_dash'))
+ get_hls = not live_from_start and 'hls' not in skip_manifests
def process_manifest_format(f, proto, itag):
if itag in itags:
@@ -2566,6 +2723,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if process_manifest_format(f, 'dash', f['format_id']):
f['filesize'] = int_or_none(self._search_regex(
r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
+ if live_from_start:
+ f['is_from_start'] = True
+
yield f
def _extract_storyboard(self, player_responses, duration):
@@ -2603,12 +2763,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
} for j in range(math.ceil(fragment_count))],
}
- def _real_extract(self, url):
- url, smuggled_data = unsmuggle_url(url, {})
- video_id = self._match_id(url)
-
- base_url = self.http_scheme() + '//www.youtube.com/'
- webpage_url = base_url + 'watch?v=' + video_id
+ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url):
webpage = None
if 'webpage' not in self._configuration_arg('player_skip'):
webpage = self._download_webpage(
@@ -2620,6 +2775,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._get_requested_clients(url, smuggled_data),
video_id, webpage, master_ytcfg)
+ return webpage, master_ytcfg, player_responses, player_url
+
+ def _list_formats(self, video_id, microformats, video_details, player_responses, player_url):
+ live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
+ is_live = get_first(video_details, 'isLive')
+ if is_live is None:
+ is_live = get_first(live_broadcast_details, 'isLiveNow')
+
+ streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
+ formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
+
+ return live_broadcast_details, is_live, streaming_data, formats
+
+ def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+ video_id = self._match_id(url)
+
+ base_url = self.http_scheme() + '//www.youtube.com/'
+ webpage_url = base_url + 'watch?v=' + video_id
+
+ webpage, master_ytcfg, player_responses, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url)
+
playability_statuses = traverse_obj(
player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[])
@@ -2688,13 +2865,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self.playlist_result(
entries, video_id, video_title, video_description)
- live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
- is_live = get_first(video_details, 'isLive')
- if is_live is None:
- is_live = get_first(live_broadcast_details, 'isLiveNow')
-
- streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
- formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
+ live_broadcast_details, is_live, streaming_data, formats = self._list_formats(video_id, microformats, video_details, player_responses, player_url)
if not formats:
if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
@@ -2797,10 +2968,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
is_live = False
if is_upcoming is None and (live_content or is_live):
is_upcoming = False
- live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
- live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
- if not duration and live_endtime and live_starttime:
- duration = live_endtime - live_starttime
+ live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp'))
+ live_end_time = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp'))
+ if not duration and live_end_time and live_start_time:
+ duration = live_end_time - live_start_time
+
+ if is_live and self.get_param('live_from_start'):
+ self._prepare_live_from_start_formats(formats, video_id, live_start_time, url, webpage_url, smuggled_data)
formats.extend(self._extract_storyboard(player_responses, duration))
@@ -2843,7 +3017,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else None if is_live is None or is_upcoming is None
else live_content),
'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL
- 'release_timestamp': live_starttime,
+ 'release_timestamp': live_start_time,
}
pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
@@ -4223,7 +4397,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data)
return info_dict
- _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL)
+ _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$')
def __real_extract(self, url, smuggled_data):
item_id = self._match_id(url)
@@ -4232,36 +4406,33 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
compat_opts = self.get_param('compat_opts', [])
def get_mobj(url):
- mobj = self._url_re.match(url).groupdict()
+ mobj = self._URL_RE.match(url).groupdict()
mobj.update((k, '') for k, v in mobj.items() if v is None)
return mobj
- mobj = get_mobj(url)
+ mobj, redirect_warning = get_mobj(url), None
# Youtube returns incomplete data if tabname is not lower case
pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel']
if is_channel:
if smuggled_data.get('is_music_url'):
- if item_id[:2] == 'VL':
- # Youtube music VL channels have an equivalent playlist
+ if item_id[:2] == 'VL': # Youtube music VL channels have an equivalent playlist
item_id = item_id[2:]
- pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False
- elif item_id[:2] == 'MP':
- # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist
+ pre, tab, post, is_channel = f'https://www.youtube.com/playlist?list={item_id}', '', '', False
+ elif item_id[:2] == 'MP': # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist
mdata = self._extract_tab_endpoint(
- 'https://music.youtube.com/channel/%s' % item_id, item_id, default_client='web_music')
- murl = traverse_obj(
- mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), get_all=False, expected_type=compat_str)
+ f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music')
+ murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'),
+ get_all=False, expected_type=compat_str)
if not murl:
- raise ExtractorError('Failed to resolve album to playlist.')
+ raise ExtractorError('Failed to resolve album to playlist')
return self.url_result(murl, ie=YoutubeTabIE.ie_key())
- elif mobj['channel_type'] == 'browse':
- # Youtube music /browse/ should be changed to /channel/
- pre = 'https://www.youtube.com/channel/%s' % item_id
+ elif mobj['channel_type'] == 'browse': # Youtube music /browse/ should be changed to /channel/
+ pre = f'https://www.youtube.com/channel/{item_id}'
+
if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
# Home URLs should redirect to /videos/
- self.report_warning(
- 'A channel/user page was given. All the channel\'s videos will be downloaded. '
- 'To download only the videos in the home page, add a "/featured" to the URL')
+ redirect_warning = ('A channel/user page was given. All the channel\'s videos will be downloaded. '
+ 'To download only the videos in the home page, add a "/featured" to the URL')
tab = '/videos'
url = ''.join((pre, tab, post))
@@ -4269,28 +4440,27 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
# Handle both video/playlist URLs
qs = parse_qs(url)
- video_id = qs.get('v', [None])[0]
- playlist_id = qs.get('list', [None])[0]
+ video_id, playlist_id = [qs.get(key, [None])[0] for key in ('v', 'list')]
if not video_id and mobj['not_channel'].startswith('watch'):
if not playlist_id:
# If there is neither video or playlist ids, youtube redirects to home page, which is undesirable
raise ExtractorError('Unable to recognize tab page')
# Common mistake: https://www.youtube.com/watch?list=playlist_id
- self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id)
- url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
+ self.report_warning(f'A video URL was given without video ID. Trying to download playlist {playlist_id}')
+ url = f'https://www.youtube.com/playlist?list={playlist_id}'
mobj = get_mobj(url)
if video_id and playlist_id:
if self.get_param('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
- return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id)
- self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id))
+ self.to_screen(f'Downloading just video {video_id} because of --no-playlist')
+ return self.url_result(f'https://www.youtube.com/watch?v={video_id}',
+ ie=YoutubeIE.ie_key(), video_id=video_id)
+ self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}')
data, ytcfg = self._extract_data(url, item_id)
- tabs = try_get(
- data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
+ tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list)
if tabs:
selected_tab = self._extract_selected_tab(tabs)
tab_name = selected_tab.get('title', '')
@@ -4299,41 +4469,45 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
# Live tab should have redirected to the video
raise ExtractorError('The channel is not currently live', expected=True)
if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
+ redirect_warning = f'The URL does not have a {mobj["tab"][1:]} tab'
if not mobj['not_channel'] and item_id[:2] == 'UC':
# Topic channels don't have /videos. Use the equivalent playlist instead
- self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
- pl_id = 'UU%s' % item_id[2:]
- pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
+ pl_id = f'UU{item_id[2:]}'
+ pl_url = f'https://www.youtube.com/playlist?list={pl_id}'
try:
- data, ytcfg, item_id, url = *self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True), pl_id, pl_url
+ data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True)
except ExtractorError:
- self.report_warning('The playlist gave error. Falling back to channel URL')
- else:
- self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))
+ redirect_warning += ' and the playlist redirect gave error'
+ else:
+ item_id, url, tab_name = pl_id, pl_url, mobj['tab'][1:]
+ redirect_warning += f'. Redirecting to playlist {pl_id} instead'
+ if tab_name.lower() != mobj['tab'][1:]:
+ redirect_warning += f'. {tab_name} tab is being downloaded instead'
- self.write_debug('Final URL: %s' % url)
+ if redirect_warning:
+ self.report_warning(redirect_warning)
+ self.write_debug(f'Final URL: {url}')
# YouTube sometimes provides a button to reload playlist with unavailable videos.
if 'no-youtube-unavailable-videos' not in compat_opts:
data = self._reload_with_unavailable_videos(item_id, data, ytcfg) or data
self._extract_and_report_alerts(data, only_once=True)
- tabs = try_get(
- data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
+ tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list)
if tabs:
return self._extract_from_tabs(item_id, ytcfg, data, tabs)
- playlist = try_get(
- data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
+ playlist = traverse_obj(
+ data, ('contents', 'twoColumnWatchNextResults', 'playlist', 'playlist'), expected_type=dict)
if playlist:
return self._extract_from_playlist(item_id, url, data, playlist, ytcfg)
- video_id = try_get(
- data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
- compat_str) or video_id
+ video_id = traverse_obj(
+ data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) or video_id
if video_id:
if mobj['tab'] != '/live': # live tab is expected to redirect to video
- self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
- return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id)
+ self.report_warning(f'Unable to recognize playlist. Downloading just video {video_id}')
+ return self.url_result(f'https://www.youtube.com/watch?v={video_id}',
+ ie=YoutubeIE.ie_key(), video_id=video_id)
raise ExtractorError('Unable to recognize tab page')
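
Finally, the _initialize_pref and _extract_context changes pin hl=en at both the cookie and the Innertube-context level, so localized strings (such as the relative times parsed above) stay machine-readable. The cookie half is plain query-string surgery:

    from urllib.parse import parse_qsl, urlencode

    # Hypothetical existing PREF value; the extractor reads it from the cookie jar.
    pref_cookie = 'f6=40000000&tz=Europe.Berlin'
    pref = dict(parse_qsl(pref_cookie))
    pref.update({'hl': 'en'})  # force English so text parsing stays stable
    print(urlencode(pref))  # f6=40000000&tz=Europe.Berlin&hl=en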
diff --git a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py
index 462bc4efe..5a5eebd30 100644
--- a/yt_dlp/extractor/zee5.py
+++ b/yt_dlp/extractor/zee5.py
@@ -177,7 +177,7 @@ class Zee5SeriesIE(InfoExtractor):
https?://(?:www\.)?zee5\.com/(?:[^#?]+/)?
(?:tvshows|kids|zee5originals)(?:/[^#/?]+){2}/
)
- (?P<id>[^#/?]+)/?(?:$|[?#])
+ (?P<id>[^#/?]+)(?:/episodes)?/?(?:$|[?#])
'''
_TESTS = [{
'url': 'https://www.zee5.com/kids/kids-shows/krishna-balram/0-6-1871',
@@ -209,8 +209,10 @@ class Zee5SeriesIE(InfoExtractor):
'info_dict': {
'id': '0-6-270',
},
- }
- ]
+ }, {
+ 'url': 'https://www.zee5.com/tvshows/details/chala-hawa-yeu-dya-ladies-zindabaad/0-6-2943/episodes',
+ 'only_matching': True,
+ }]
def _entries(self, show_id):
access_token_request = self._download_json(