Diffstat (limited to 'yt_dlp/extractor')
-rw-r--r-- | yt_dlp/extractor/abc.py           |  64
-rw-r--r-- | yt_dlp/extractor/audiomack.py     |  34
-rw-r--r-- | yt_dlp/extractor/common.py        |  20
-rw-r--r-- | yt_dlp/extractor/extractors.py    |   7
-rw-r--r-- | yt_dlp/extractor/facebook.py      |  41
-rw-r--r-- | yt_dlp/extractor/generic.py       |   4
-rw-r--r-- | yt_dlp/extractor/gronkh.py        |   5
-rw-r--r-- | yt_dlp/extractor/hse.py           |  95
-rw-r--r-- | yt_dlp/extractor/ondemandkorea.py |   6
-rw-r--r-- | yt_dlp/extractor/plutotv.py       |   7
-rw-r--r-- | yt_dlp/extractor/sendtonews.py    |   2
-rw-r--r-- | yt_dlp/extractor/soundcloud.py    |  53
-rw-r--r-- | yt_dlp/extractor/youtube.py       | 386
-rw-r--r-- | yt_dlp/extractor/zee5.py          |   8
14 files changed, 594 insertions, 138 deletions
diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index e3369306c..354453a27 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -8,6 +8,7 @@ import time from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + dict_get, ExtractorError, js_to_json, int_or_none, @@ -253,3 +254,66 @@ class ABCIViewIE(InfoExtractor): 'subtitles': subtitles, 'is_live': is_live, } + + +class ABCIViewShowSeriesIE(InfoExtractor): + IE_NAME = 'abc.net.au:iview:showseries' + _VALID_URL = r'https?://iview\.abc\.net\.au/show/(?P<id>[^/]+)(?:/series/\d+)?$' + _GEO_COUNTRIES = ['AU'] + + _TESTS = [{ + 'url': 'https://iview.abc.net.au/show/upper-middle-bogan', + 'info_dict': { + 'id': '124870-1', + 'title': 'Series 1', + 'description': 'md5:93119346c24a7c322d446d8eece430ff', + 'series': 'Upper Middle Bogan', + 'season': 'Series 1', + 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$' + }, + 'playlist_count': 8, + }, { + 'url': 'https://iview.abc.net.au/show/upper-middle-bogan', + 'info_dict': { + 'id': 'CO1108V001S00', + 'ext': 'mp4', + 'title': 'Series 1 Ep 1 I\'m A Swan', + 'description': 'md5:7b676758c1de11a30b79b4d301e8da93', + 'series': 'Upper Middle Bogan', + 'uploader_id': 'abc1', + 'upload_date': '20210630', + 'timestamp': 1625036400, + }, + 'params': { + 'noplaylist': True, + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) + webpage_data = self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;', + webpage, 'initial state') + video_data = self._parse_json( + unescapeHTML(webpage_data).encode('utf-8').decode('unicode_escape'), show_id) + video_data = video_data['route']['pageData']['_embedded'] + + if self.get_param('noplaylist') and 'highlightVideo' in video_data: + self.to_screen('Downloading just the highlight video because of --no-playlist') + return self.url_result(video_data['highlightVideo']['shareUrl'], ie=ABCIViewIE.ie_key()) + + self.to_screen(f'Downloading playlist {show_id} - add --no-playlist to just download the highlight video') + series = video_data['selectedSeries'] + return { + '_type': 'playlist', + 'entries': [self.url_result(episode['shareUrl']) + for episode in series['_embedded']['videoEpisodes']], + 'id': series.get('id'), + 'title': dict_get(series, ('title', 'displaySubtitle')), + 'description': series.get('description'), + 'series': dict_get(series, ('showTitle', 'displayTitle')), + 'season': dict_get(series, ('title', 'displaySubtitle')), + 'thumbnail': series.get('thumbnail'), + } diff --git a/yt_dlp/extractor/audiomack.py b/yt_dlp/extractor/audiomack.py index cc7771354..31fb859ae 100644 --- a/yt_dlp/extractor/audiomack.py +++ b/yt_dlp/extractor/audiomack.py @@ -14,7 +14,7 @@ from ..utils import ( class AudiomackIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P<id>[\w/-]+)' IE_NAME = 'audiomack' _TESTS = [ # hosted on audiomack @@ -39,15 +39,16 @@ class AudiomackIE(InfoExtractor): 'title': 'Black Mamba Freestyle [Prod. 
By Danny Wolf]', 'uploader': 'ILOVEMAKONNEN', 'upload_date': '20160414', - } + }, + 'skip': 'Song has been removed from the site', }, ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/song/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/song/', '/') # Request the extended version of the api for extra fields like artist and title api_response = self._download_json( @@ -73,13 +74,13 @@ class AudiomackIE(InfoExtractor): class AudiomackAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P<id>[\w/-]+)' IE_NAME = 'audiomack:album' _TESTS = [ # Standard album playlist { 'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape', - 'playlist_count': 15, + 'playlist_count': 11, 'info_dict': { 'id': '812251', @@ -95,24 +96,27 @@ class AudiomackAlbumIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)', - 'id': '837577', + 'title': 'PPP (Pistol P Project) - 8. Real (prod by SYK SENSE )', + 'id': '837576', + 'ext': 'mp3', + 'uploader': 'Lil Herb a.k.a. G Herbo', + } + }, { + 'info_dict': { + 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)', + 'id': '837580', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. G Herbo', } }], - 'params': { - 'playliststart': 9, - 'playlistend': 9, - } } ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/album/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/album/', '/') result = {'_type': 'playlist', 'entries': []} # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata # Therefore we don't know how many songs the album has and must infi-loop until failure @@ -134,7 +138,7 @@ class AudiomackAlbumIE(InfoExtractor): # Pull out the album metadata and add to result (if it exists) for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: if apikey in api_response and resultkey not in result: - result[resultkey] = api_response[apikey] + result[resultkey] = compat_str(api_response[apikey]) song_id = url_basename(api_response['url']).rpartition('.')[0] result['entries'].append({ 'id': compat_str(api_response.get('id', song_id)), diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ebf2e3cea..9abbaf04f 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -163,9 +163,8 @@ class InfoExtractor(object): * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual - download, lower-case. - "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe", - "m3u8", "m3u8_native" or "http_dash_segments". + download, lower-case. One of "http", "https" or + one of the protocols defined in downloader.PROTOCOL_MAP * fragment_base_url Base URL for fragments. 
Each fragment's path value (if present) will be relative to @@ -181,6 +180,8 @@ class InfoExtractor(object): fragment_base_url * "duration" (optional, int or float) * "filesize" (optional, int) + * is_from_start Is a live format that can be downloaded + from the start. Boolean * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. @@ -1451,8 +1452,13 @@ class InfoExtractor(object): }) extract_interaction_statistic(e) - for e in json_ld: - if '@context' in e: + def traverse_json_ld(json_ld, at_top_level=True): + for e in json_ld: + if at_top_level and '@context' not in e: + continue + if at_top_level and set(e.keys()) == {'@context', '@graph'}: + traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) + break item_type = e.get('@type') if expected_type is not None and expected_type != item_type: continue @@ -1488,7 +1494,7 @@ class InfoExtractor(object): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), - 'description': unescapeHTML(e.get('articleBody')), + 'description': unescapeHTML(e.get('articleBody') or e.get('description')), }) elif item_type == 'VideoObject': extract_video_object(e) @@ -1503,6 +1509,8 @@ class InfoExtractor(object): continue else: break + traverse_json_ld(json_ld) + return dict((k, v) for k, v in info.items() if v is not None) def _search_nextjs_data(self, webpage, video_id, **kw): diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index e4755b3d1..ee5ea533f 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .abc import ( ABCIE, ABCIViewIE, + ABCIViewShowSeriesIE, ) from .abcnews import ( AbcNewsIE, @@ -434,6 +435,7 @@ from .eyedotv import EyedoTVIE from .facebook import ( FacebookIE, FacebookPluginsVideoIE, + FacebookRedirectURLIE, ) from .fancode import ( FancodeVodIE, @@ -563,6 +565,10 @@ from .hrti import ( HRTiIE, HRTiPlaylistIE, ) +from .hse import ( + HSEShowIE, + HSEProductIE, +) from .huajiao import HuajiaoIE from .huffpost import HuffPostIE from .hungama import ( @@ -1357,6 +1363,7 @@ from .soundcloud import ( SoundcloudEmbedIE, SoundcloudIE, SoundcloudSetIE, + SoundcloudRelatedIE, SoundcloudUserIE, SoundcloudTrackStationIE, SoundcloudPlaylistIE, diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 44d3dc0d7..6dbcd690d 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -23,9 +23,11 @@ from ..utils import ( merge_dicts, network_exceptions, parse_count, + parse_qs, qualities, sanitized_Request, try_get, + url_or_none, urlencode_postdata, urljoin, ) @@ -746,3 +748,42 @@ class FacebookPluginsVideoIE(InfoExtractor): return self.url_result( compat_urllib_parse_unquote(self._match_id(url)), FacebookIE.ie_key()) + + +class FacebookRedirectURLIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/flx/warn[/?]' + _TESTS = [{ + 'url': 'https://www.facebook.com/flx/warn/?h=TAQHsoToz&u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&s=1', + 'info_dict': { + 'id': 'pO8h3EaFRdo', + 'ext': 'mp4', + 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', + 'description': 'md5:2d713ccbb45b686a1888397b2c77ca6b', + 'channel_id': 'UCGBpxWJr9FNOcFYA5GkKrMg', + 'playable_in_embed': True, + 'categories': ['Music'], + 'channel': 'Boiler Room', + 'uploader_id': 'brtvofficial', 
+ 'uploader': 'Boiler Room', + 'tags': 'count:11', + 'duration': 3332, + 'live_status': 'not_live', + 'thumbnail': 'https://i.ytimg.com/vi/pO8h3EaFRdo/maxresdefault.jpg', + 'channel_url': 'https://www.youtube.com/channel/UCGBpxWJr9FNOcFYA5GkKrMg', + 'availability': 'public', + 'uploader_url': 'http://www.youtube.com/user/brtvofficial', + 'upload_date': '20150917', + 'age_limit': 0, + 'view_count': int, + 'like_count': int, + }, + 'add_ie': ['Youtube'], + 'params': {'skip_download': 'Youtube'}, + }] + + def _real_extract(self, url): + redirect_url = url_or_none(parse_qs(url).get('u', [None])[-1]) + if not redirect_url: + raise ExtractorError('Invalid facebook redirect URL', expected=True) + return self.url_result(redirect_url) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 51557f0f1..1ec0ce986 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -3653,6 +3653,10 @@ class GenericIE(InfoExtractor): json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url'): self.report_detected('JSON LD') + if determine_ext(json_ld.get('url')) == 'm3u8': + json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles( + json_ld['url'], video_id, 'mp4') + json_ld.pop('url') return merge_dicts(json_ld, info_dict) def check_video(vurl): diff --git a/yt_dlp/extractor/gronkh.py b/yt_dlp/extractor/gronkh.py index 58cd59511..c9f1dd256 100644 --- a/yt_dlp/extractor/gronkh.py +++ b/yt_dlp/extractor/gronkh.py @@ -6,7 +6,7 @@ from ..utils import unified_strdate class GronkhIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/stream/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?stream/(?P<id>\d+)' _TESTS = [{ 'url': 'https://gronkh.tv/stream/536', @@ -19,6 +19,9 @@ class GronkhIE(InfoExtractor): 'upload_date': '20211001' }, 'params': {'skip_download': True} + }, { + 'url': 'https://gronkh.tv/watch/stream/546', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/hse.py b/yt_dlp/extractor/hse.py new file mode 100644 index 000000000..9144ff8dc --- /dev/null +++ b/yt_dlp/extractor/hse.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unified_timestamp, +) + + +class HSEShowBaseInfoExtractor(InfoExtractor): + _GEO_COUNTRIES = ['DE'] + + def _extract_redux_data(self, url, video_id): + webpage = self._download_webpage(url, video_id) + redux = self._html_search_regex( + r'window\.__REDUX_DATA__\s*=\s*({.*});?', webpage, 'redux data') + return self._parse_json(redux.replace('\n', ''), video_id) + + def _extract_formats_and_subtitles(self, sources, video_id): + if not sources: + raise ExtractorError('No video found', expected=True, video_id=video_id) + formats, subtitles = [], {} + for src in sources: + if src['mimetype'] != 'application/x-mpegURL': + continue + fmts, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, ext='mp4') + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + self._sort_formats(formats) + return formats, subtitles + + +class HSEShowIE(HSEShowBaseInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/c/tv-shows/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.hse.de/dpl/c/tv-shows/505350', + 'info_dict': { + 'id': '505350', + 'ext': 'mp4', + 'title': 'Pfeffinger Mode & Accessoires', + 'timestamp': 1638810000, + 'upload_date': '20211206', + 'channel': 'HSE24', + 'uploader': 'Arina Pirayesh' + }, + 
'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._extract_redux_data(url, video_id) + formats, subtitles = self._extract_formats_and_subtitles( + traverse_obj(json_data, ('tvShowPage', 'tvShowVideo', 'sources')), video_id) + + show = traverse_obj(json_data, ('tvShowPage', 'tvShow')) or {} + return { + 'id': video_id, + 'title': show.get('title') or video_id, + 'formats': formats, + 'timestamp': unified_timestamp(f'{show.get("date")} {show.get("hour")}:00'), + 'thumbnail': traverse_obj(json_data, ('tvShowVideo', 'poster')), + 'channel': self._search_regex( + r'tvShow \| ([A-Z0-9]+)_', show.get('actionFieldText') or '', video_id, fatal=False), + 'uploader': show.get('presenter'), + 'subtitles': subtitles, + } + + +class HSEProductIE(HSEShowBaseInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hse\.de/dpl/p/product/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.hse.de/dpl/p/product/408630', + 'info_dict': { + 'id': '408630', + 'ext': 'mp4', + 'title': 'Hose im Ponte-Mix', + 'uploader': 'Judith Williams' + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._extract_redux_data(url, video_id) + video = traverse_obj(json_data, ('productContent', 'productContent', 'videos', 0)) or {} + formats, subtitles = self._extract_formats_and_subtitles(video.get('sources'), video_id) + + return { + 'id': video_id, + 'title': traverse_obj(json_data, ('productDetail', 'product', 'name', 'short')) or video_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': video.get('poster'), + 'uploader': traverse_obj(json_data, ('productDetail', 'product', 'brand', 'brandName')), + } diff --git a/yt_dlp/extractor/ondemandkorea.py b/yt_dlp/extractor/ondemandkorea.py index cc3c587bc..e933ea2cc 100644 --- a/yt_dlp/extractor/ondemandkorea.py +++ b/yt_dlp/extractor/ondemandkorea.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -71,8 +73,8 @@ class OnDemandKoreaIE(InfoExtractor): jw_config = self._parse_json( self._search_regex( - r'(?s)odkPlayer\.init.*?(?P<options>{[^;]+}).*?;', - webpage, 'jw config', group='options'), + r'playlist\s*=\s*\[(?P<options>.+)];?$', + webpage, 'jw config', flags=re.MULTILINE, group='options'), video_id, transform_source=js_to_json) info = self._parse_jwplayer_data( jw_config, video_id, require_title=False, m3u8_id='hls', diff --git a/yt_dlp/extractor/plutotv.py b/yt_dlp/extractor/plutotv.py index 0cf82466a..26aff1af5 100644 --- a/yt_dlp/extractor/plutotv.py +++ b/yt_dlp/extractor/plutotv.py @@ -20,11 +20,11 @@ from ..utils import ( class PlutoTVIE(InfoExtractor): _VALID_URL = r'''(?x) - https?://(?:www\.)?pluto\.tv(?:/en)?/on-demand + https?://(?:www\.)?pluto\.tv(?:/[^/]+)?/on-demand /(?P<video_type>movies|series) /(?P<series_or_movie_slug>[^/]+) (?: - /seasons?/(?P<season_no>\d+) + (?:/seasons?/(?P<season_no>\d+))? (?:/episode/(?P<episode_slug>[^/]+))? )? 
/?(?:$|[#?])''' @@ -84,6 +84,9 @@ class PlutoTVIE(InfoExtractor): }, { 'url': 'https://pluto.tv/en/on-demand/series/manhunters-fugitive-task-force/seasons/1/episode/third-times-the-charm-1-1', 'only_matching': True, + }, { + 'url': 'https://pluto.tv/it/on-demand/series/csi-vegas/episode/legacy-2021-1-1', + 'only_matching': True, } ] diff --git a/yt_dlp/extractor/sendtonews.py b/yt_dlp/extractor/sendtonews.py index bc38a0f1e..858547b54 100644 --- a/yt_dlp/extractor/sendtonews.py +++ b/yt_dlp/extractor/sendtonews.py @@ -80,7 +80,7 @@ class SendtoNewsIE(InfoExtractor): 'format_id': '%s-%d' % (determine_protocol(f), tbr), 'tbr': tbr, }) - # 'tbr' was explicitly set to be prefered over 'height' originally, + # 'tbr' was explicitly set to be preferred over 'height' originally, # So this is being kept unless someone can confirm this is unnecessary self._sort_formats(info_dict['formats'], ('tbr', 'res')) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index d5cbe70ea..f251e5599 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -214,8 +214,9 @@ class SoundcloudIE(SoundcloudBaseIE): (?!stations/track) (?P<uploader>[\w\d-]+)/ (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) - (?P<title>[\w\d-]+)/? - (?P<token>[^?]+?)?(?:[?].*)?$) + (?P<title>[\w\d-]+) + (?:/(?P<token>(?!(?:albums|sets|recommended))[^?]+?))? + (?:[?].*)?$) |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+) (?:/?\?secret_token=(?P<secret_token>[^&]+))?) ) @@ -827,6 +828,54 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): track_id, 'Track station: %s' % track['title']) +class SoundcloudRelatedIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<slug>[\w\d-]+/[\w\d-]+)/(?P<relation>albums|sets|recommended)' + IE_NAME = 'soundcloud:related' + _TESTS = [{ + 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/recommended', + 'info_dict': { + 'id': '1084577272', + 'title': 'Sexapil - Pingers 5 (Recommended)', + }, + 'playlist_mincount': 50, + }, { + 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/albums', + 'info_dict': { + 'id': '1084577272', + 'title': 'Sexapil - Pingers 5 (Albums)', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://soundcloud.com/wajang/sexapil-pingers-5/sets', + 'info_dict': { + 'id': '1084577272', + 'title': 'Sexapil - Pingers 5 (Sets)', + }, + 'playlist_mincount': 4, + }] + + _BASE_URL_MAP = { + 'albums': 'tracks/%s/albums', + 'sets': 'tracks/%s/playlists_without_albums', + 'recommended': 'tracks/%s/related', + } + + def _real_extract(self, url): + slug, relation = self._match_valid_url(url).group('slug', 'relation') + + track = self._download_json( + self._resolv_url(self._BASE_URL + slug), + slug, 'Downloading track info', headers=self._HEADERS) + + if track.get('errors'): + raise ExtractorError(f'{self.IE_NAME} said: %s' % ','.join( + str(err['error_message']) for err in track['errors']), expected=True) + + return self._extract_playlist( + self._API_V2_BASE + self._BASE_URL_MAP[relation] % track['id'], str(track['id']), + '%s (%s)' % (track.get('title') or slug, relation.capitalize())) + + class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' IE_NAME = 'soundcloud:playlist' diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 8f64b6657..1f5009399 100644 --- a/yt_dlp/extractor/youtube.py +++ 
b/yt_dlp/extractor/youtube.py @@ -5,6 +5,7 @@ from __future__ import unicode_literals import calendar import copy import datetime +import functools import hashlib import itertools import json @@ -15,6 +16,7 @@ import re import sys import time import traceback +import threading from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -55,6 +57,7 @@ from ..utils import ( smuggle_url, str_or_none, str_to_int, + strftime_or_none, traverse_obj, try_get, unescapeHTML, @@ -358,7 +361,20 @@ class YoutubeBaseInfoExtractor(InfoExtractor): consent_id = random.randint(100, 999) self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _initialize_pref(self): + cookies = self._get_cookies('https://www.youtube.com/') + pref_cookie = cookies.get('PREF') + pref = {} + if pref_cookie: + try: + pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + except ValueError: + self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) + pref.update({'hl': 'en'}) + self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + def _real_initialize(self): + self._initialize_pref() self._initialize_consent() self._login() @@ -391,23 +407,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) def _extract_context(self, ytcfg=None, default_client='web'): - _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict) - context = _get_context(ytcfg) - if context: - return context - - context = _get_context(self._get_default_ytcfg(default_client)) - if not ytcfg: - return context - - # Recreate the client context (required) - context['client'].update({ - 'clientVersion': self._extract_client_version(ytcfg, default_client), - 'clientName': self._extract_client_name(ytcfg, default_client), - }) - visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str) - if visitor_data: - context['client']['visitorData'] = visitor_data + context = get_first( + (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) + # Enforce language for extraction + traverse_obj(context, 'client', expected_type=dict, default={})['hl'] = 'en' return context _SAPISID = None @@ -664,6 +667,29 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if text: return text + @staticmethod + def extract_relative_time(relative_time_text): + """ + Extracts a relative time from string and converts to dt object + e.g. 
'streamed 6 days ago', '5 seconds ago (edited)' + """ + mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text) + if mobj: + try: + return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')), precision='auto') + except ValueError: + return None + + def _extract_time_text(self, renderer, *path_list): + text = self._get_text(renderer, *path_list) or '' + dt = self.extract_relative_time(text) + timestamp = None + if isinstance(dt, datetime.datetime): + timestamp = calendar.timegm(dt.timetuple()) + if text and timestamp is None: + self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True) + return timestamp, text + def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, default_client='web'): @@ -750,7 +776,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'view count', default=None)) uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') - + channel_id = traverse_obj( + renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False) + timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText') + scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) + overlay_style = traverse_obj( + renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) + badges = self._extract_badges(renderer) return { '_type': 'url', 'ie_key': YoutubeIE.ie_key(), @@ -761,6 +793,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'duration': duration, 'view_count': view_count, 'uploader': uploader, + 'channel_id': channel_id, + 'upload_date': strftime_or_none(timestamp, '%Y%m%d'), + 'live_status': ('is_upcoming' if scheduled_timestamp is not None + else 'was_live' if 'streamed' in time_text.lower() + else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges + else None), + 'release_timestamp': scheduled_timestamp, + 'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges) } @@ -1709,6 +1749,142 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._code_cache = {} self._player_cache = {} + def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data): + EXPIRATION_DURATION = 18_000 + lock = threading.Lock() + + is_live = True + expiration_time = time.time() + EXPIRATION_DURATION + formats = [f for f in formats if f.get('is_from_start')] + + def refetch_manifest(format_id): + nonlocal formats, expiration_time, is_live + if time.time() <= expiration_time: + return + + _, _, prs, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url) + video_details = traverse_obj( + prs, (..., 'videoDetails'), expected_type=dict, default=[]) + microformats = traverse_obj( + prs, (..., 'microformat', 'playerMicroformatRenderer'), + expected_type=dict, default=[]) + _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url) + expiration_time = time.time() + EXPIRATION_DURATION + + def mpd_feed(format_id): + """ + @returns (manifest_url, manifest_stream_number, is_live) or None + """ + with lock: + refetch_manifest(format_id) + + f = next((f for f in formats if f['format_id'] == 
format_id), None) + if not f: + self.report_warning( + f'Cannot find refreshed manifest for format {format_id}{bug_reports_message()}') + return None + return f['manifest_url'], f['manifest_stream_number'], is_live + + for f in formats: + f['protocol'] = 'http_dash_segments_generator' + f['fragments'] = functools.partial( + self._live_dash_fragments, f['format_id'], live_start_time, mpd_feed) + + def _live_dash_fragments(self, format_id, live_start_time, mpd_feed, ctx): + FETCH_SPAN, MAX_DURATION = 5, 432000 + + mpd_url, stream_number, is_live = None, None, True + + begin_index = 0 + download_start_time = ctx.get('start') or time.time() + + lack_early_segments = download_start_time - (live_start_time or download_start_time) > MAX_DURATION + if lack_early_segments: + self.report_warning(bug_reports_message( + 'Starting download from the last 120 hours of the live stream since ' + 'YouTube does not have data before that. If you think this is wrong,'), only_once=True) + lack_early_segments = True + + known_idx, no_fragment_score, last_segment_url = begin_index, 0, None + fragments, fragment_base_url = None, None + + def _extract_sequence_from_mpd(refresh_sequence): + nonlocal mpd_url, stream_number, is_live, no_fragment_score, fragments, fragment_base_url + # Obtain from MPD's maximum seq value + old_mpd_url = mpd_url + mpd_url, stream_number, is_live = mpd_feed(format_id) or (mpd_url, stream_number, False) + if old_mpd_url == mpd_url and not refresh_sequence: + return True, last_seq + try: + fmts, _ = self._extract_mpd_formats_and_subtitles( + mpd_url, None, note=False, errnote=False, fatal=False) + except ExtractorError: + fmts = None + if not fmts: + no_fragment_score += 1 + return False, last_seq + fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number) + fragments = fmt_info['fragments'] + fragment_base_url = fmt_info['fragment_base_url'] + assert fragment_base_url + + _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1)) + return True, _last_seq + + while is_live: + fetch_time = time.time() + if no_fragment_score > 30: + return + if last_segment_url: + # Obtain from "X-Head-Seqnum" header value from each segment + try: + urlh = self._request_webpage( + last_segment_url, None, note=False, errnote=False, fatal=False) + except ExtractorError: + urlh = None + last_seq = try_get(urlh, lambda x: int_or_none(x.headers['X-Head-Seqnum'])) + if last_seq is None: + no_fragment_score += 1 + last_segment_url = None + continue + else: + should_retry, last_seq = _extract_sequence_from_mpd(True) + if not should_retry: + continue + + if known_idx > last_seq: + last_segment_url = None + continue + + last_seq += 1 + + if begin_index < 0 and known_idx < 0: + # skip from the start when it's negative value + known_idx = last_seq + begin_index + if lack_early_segments: + known_idx = max(known_idx, last_seq - int(MAX_DURATION // fragments[-1]['duration'])) + try: + for idx in range(known_idx, last_seq): + # do not update sequence here or you'll get skipped some part of it + should_retry, _ = _extract_sequence_from_mpd(False) + if not should_retry: + # retry when it gets weird state + known_idx = idx - 1 + raise ExtractorError('breaking out of outer loop') + last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx) + yield { + 'url': last_segment_url, + } + if known_idx == last_seq: + no_fragment_score += 5 + else: + no_fragment_score = 0 + known_idx = last_seq + except ExtractorError: + continue + + time.sleep(max(0, FETCH_SPAN + fetch_time - time.time())) + 
def _extract_player_url(self, *ytcfgs, webpage=None): player_url = traverse_obj( ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'), @@ -2064,19 +2240,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), regex), webpage, name, default='{}'), video_id, fatal=False) - @staticmethod - def parse_time_text(time_text): - """ - Parse the comment time text - time_text is in the format 'X units ago (edited)' - """ - time_text_split = time_text.split(' ') - if len(time_text_split) >= 3: - try: - return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto') - except ValueError: - return None - def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') if not comment_id: @@ -2085,10 +2248,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): text = self._get_text(comment_renderer, 'contentText') # note: timestamp is an estimate calculated from the current time and time_text - time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' - time_text_dt = self.parse_time_text(time_text) - if isinstance(time_text_dt, datetime.datetime): - timestamp = calendar.timegm(time_text_dt.timetuple()) + timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText') author = self._get_text(comment_renderer, 'authorText') author_id = try_get(comment_renderer, lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) @@ -2261,11 +2421,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): yield from self._comment_entries(renderer, ytcfg, video_id) max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) - # Force English regardless of account setting to prevent parsing issues - # See: https://github.com/yt-dlp/yt-dlp/issues/532 - ytcfg = copy.deepcopy(ytcfg) - traverse_obj( - ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en' return itertools.islice(_real_comment_extract(contents), 0, max_comments) @staticmethod @@ -2531,11 +2686,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): dct['container'] = dct['ext'] + '_dash' yield dct + live_from_start = is_live and self.get_param('live_from_start') skip_manifests = self._configuration_arg('skip') - get_dash = ( - (not is_live or self._configuration_arg('include_live_dash')) - and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)) - get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True) + if not self.get_param('youtube_include_hls_manifest', True): + skip_manifests.append('hls') + get_dash = 'dash' not in skip_manifests and ( + not is_live or live_from_start or self._configuration_arg('include_live_dash')) + get_hls = not live_from_start and 'hls' not in skip_manifests def process_manifest_format(f, proto, itag): if itag in itags: @@ -2566,6 +2723,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if process_manifest_format(f, 'dash', f['format_id']): f['filesize'] = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) + if live_from_start: + f['is_from_start'] = True + yield f def _extract_storyboard(self, player_responses, duration): @@ -2603,12 +2763,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } for j in range(math.ceil(fragment_count))], } - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - video_id = self._match_id(url) - - base_url = self.http_scheme() + 
'//www.youtube.com/' - webpage_url = base_url + 'watch?v=' + video_id + def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): webpage = self._download_webpage( @@ -2620,6 +2775,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._get_requested_clients(url, smuggled_data), video_id, webpage, master_ytcfg) + return webpage, master_ytcfg, player_responses, player_url + + def _list_formats(self, video_id, microformats, video_details, player_responses, player_url): + live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails')) + is_live = get_first(video_details, 'isLive') + if is_live is None: + is_live = get_first(live_broadcast_details, 'isLiveNow') + + streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) + formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live)) + + return live_broadcast_details, is_live, streaming_data, formats + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + video_id = self._match_id(url) + + base_url = self.http_scheme() + '//www.youtube.com/' + webpage_url = base_url + 'watch?v=' + video_id + + webpage, master_ytcfg, player_responses, player_url = self._download_player_responses(url, smuggled_data, video_id, webpage_url) + playability_statuses = traverse_obj( player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[]) @@ -2688,13 +2865,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return self.playlist_result( entries, video_id, video_title, video_description) - live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails')) - is_live = get_first(video_details, 'isLive') - if is_live is None: - is_live = get_first(live_broadcast_details, 'isLiveNow') - - streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) - formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live)) + live_broadcast_details, is_live, streaming_data, formats = self._list_formats(video_id, microformats, video_details, player_responses, player_url) if not formats: if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')): @@ -2797,10 +2968,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): is_live = False if is_upcoming is None and (live_content or is_live): is_upcoming = False - live_starttime = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp')) - live_endtime = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp')) - if not duration and live_endtime and live_starttime: - duration = live_endtime - live_starttime + live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp')) + live_end_time = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp')) + if not duration and live_end_time and live_start_time: + duration = live_end_time - live_start_time + + if is_live and self.get_param('live_from_start'): + self._prepare_live_from_start_formats(formats, video_id, live_start_time, url, webpage_url, smuggled_data) formats.extend(self._extract_storyboard(player_responses, duration)) @@ -2843,7 +3017,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else None if is_live is None or is_upcoming is None else live_content), 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL - 'release_timestamp': live_starttime, + 'release_timestamp': live_start_time, } pctr = 
traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) @@ -4223,7 +4397,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): info_dict['entries'] = self._smuggle_data(info_dict['entries'], smuggled_data) return info_dict - _url_re = re.compile(r'(?P<pre>%s)(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$' % _VALID_URL) + _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(channel_type)(?P<tab>/\w+))?(?P<post>.*)$') def __real_extract(self, url, smuggled_data): item_id = self._match_id(url) @@ -4232,36 +4406,33 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): compat_opts = self.get_param('compat_opts', []) def get_mobj(url): - mobj = self._url_re.match(url).groupdict() + mobj = self._URL_RE.match(url).groupdict() mobj.update((k, '') for k, v in mobj.items() if v is None) return mobj - mobj = get_mobj(url) + mobj, redirect_warning = get_mobj(url), None # Youtube returns incomplete data if tabname is not lower case pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel'] if is_channel: if smuggled_data.get('is_music_url'): - if item_id[:2] == 'VL': - # Youtube music VL channels have an equivalent playlist + if item_id[:2] == 'VL': # Youtube music VL channels have an equivalent playlist item_id = item_id[2:] - pre, tab, post, is_channel = 'https://www.youtube.com/playlist?list=%s' % item_id, '', '', False - elif item_id[:2] == 'MP': - # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist + pre, tab, post, is_channel = f'https://www.youtube.com/playlist?list={item_id}', '', '', False + elif item_id[:2] == 'MP': # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist mdata = self._extract_tab_endpoint( - 'https://music.youtube.com/channel/%s' % item_id, item_id, default_client='web_music') - murl = traverse_obj( - mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), get_all=False, expected_type=compat_str) + f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music') + murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), + get_all=False, expected_type=compat_str) if not murl: - raise ExtractorError('Failed to resolve album to playlist.') + raise ExtractorError('Failed to resolve album to playlist') return self.url_result(murl, ie=YoutubeTabIE.ie_key()) - elif mobj['channel_type'] == 'browse': - # Youtube music /browse/ should be changed to /channel/ - pre = 'https://www.youtube.com/channel/%s' % item_id + elif mobj['channel_type'] == 'browse': # Youtube music /browse/ should be changed to /channel/ + pre = f'https://www.youtube.com/channel/{item_id}' + if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts: # Home URLs should redirect to /videos/ - self.report_warning( - 'A channel/user page was given. All the channel\'s videos will be downloaded. ' - 'To download only the videos in the home page, add a "/featured" to the URL') + redirect_warning = ('A channel/user page was given. All the channel\'s videos will be downloaded. 
' + 'To download only the videos in the home page, add a "/featured" to the URL') tab = '/videos' url = ''.join((pre, tab, post)) @@ -4269,28 +4440,27 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): # Handle both video/playlist URLs qs = parse_qs(url) - video_id = qs.get('v', [None])[0] - playlist_id = qs.get('list', [None])[0] + video_id, playlist_id = [qs.get(key, [None])[0] for key in ('v', 'list')] if not video_id and mobj['not_channel'].startswith('watch'): if not playlist_id: # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable raise ExtractorError('Unable to recognize tab page') # Common mistake: https://www.youtube.com/watch?list=playlist_id - self.report_warning('A video URL was given without video ID. Trying to download playlist %s' % playlist_id) - url = 'https://www.youtube.com/playlist?list=%s' % playlist_id + self.report_warning(f'A video URL was given without video ID. Trying to download playlist {playlist_id}') + url = f'https://www.youtube.com/playlist?list={playlist_id}' mobj = get_mobj(url) if video_id and playlist_id: if self.get_param('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id) - self.to_screen('Downloading playlist %s; add --no-playlist to just download video %s' % (playlist_id, video_id)) + self.to_screen(f'Downloading just video {video_id} because of --no-playlist') + return self.url_result(f'https://www.youtube.com/watch?v={video_id}', + ie=YoutubeIE.ie_key(), video_id=video_id) + self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}') data, ytcfg = self._extract_data(url, item_id) - tabs = try_get( - data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list) if tabs: selected_tab = self._extract_selected_tab(tabs) tab_name = selected_tab.get('title', '') @@ -4299,41 +4469,45 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): # Live tab should have redirected to the video raise ExtractorError('The channel is not currently live', expected=True) if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]: + redirect_warning = f'The URL does not have a {mobj["tab"][1:]} tab' if not mobj['not_channel'] and item_id[:2] == 'UC': # Topic channels don't have /videos. Use the equivalent playlist instead - self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:])) - pl_id = 'UU%s' % item_id[2:] - pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post']) + pl_id = f'UU{item_id[2:]}' + pl_url = f'https://www.youtube.com/playlist?list={pl_id}' try: - data, ytcfg, item_id, url = *self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True), pl_id, pl_url + data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True) except ExtractorError: - self.report_warning('The playlist gave error. Falling back to channel URL') - else: - self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name)) + redirect_warning += ' and the playlist redirect gave error' + else: + item_id, url, tab_name = pl_id, pl_url, mobj['tab'][1:] + redirect_warning += f'. 
Redirecting to playlist {pl_id} instead' + if tab_name.lower() != mobj['tab'][1:]: + redirect_warning += f'. {tab_name} tab is being downloaded instead' - self.write_debug('Final URL: %s' % url) + if redirect_warning: + self.report_warning(redirect_warning) + self.write_debug(f'Final URL: {url}') # YouTube sometimes provides a button to reload playlist with unavailable videos. if 'no-youtube-unavailable-videos' not in compat_opts: data = self._reload_with_unavailable_videos(item_id, data, ytcfg) or data self._extract_and_report_alerts(data, only_once=True) - tabs = try_get( - data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) + tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list) if tabs: return self._extract_from_tabs(item_id, ytcfg, data, tabs) - playlist = try_get( - data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) + playlist = traverse_obj( + data, ('contents', 'twoColumnWatchNextResults', 'playlist', 'playlist'), expected_type=dict) if playlist: return self._extract_from_playlist(item_id, url, data, playlist, ytcfg) - video_id = try_get( - data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'], - compat_str) or video_id + video_id = traverse_obj( + data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) or video_id if video_id: if mobj['tab'] != '/live': # live tab is expected to redirect to video - self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id) - return self.url_result(f'https://www.youtube.com/watch?v={video_id}', ie=YoutubeIE.ie_key(), video_id=video_id) + self.report_warning(f'Unable to recognize playlist. Downloading just video {video_id}') + return self.url_result(f'https://www.youtube.com/watch?v={video_id}', + ie=YoutubeIE.ie_key(), video_id=video_id) raise ExtractorError('Unable to recognize tab page') diff --git a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py index 462bc4efe..5a5eebd30 100644 --- a/yt_dlp/extractor/zee5.py +++ b/yt_dlp/extractor/zee5.py @@ -177,7 +177,7 @@ class Zee5SeriesIE(InfoExtractor): https?://(?:www\.)?zee5\.com/(?:[^#?]+/)? (?:tvshows|kids|zee5originals)(?:/[^#/?]+){2}/ ) - (?P<id>[^#/?]+)/?(?:$|[?#]) + (?P<id>[^#/?]+)(?:/episodes)?/?(?:$|[?#]) ''' _TESTS = [{ 'url': 'https://www.zee5.com/kids/kids-shows/krishna-balram/0-6-1871', @@ -209,8 +209,10 @@ class Zee5SeriesIE(InfoExtractor): 'info_dict': { 'id': '0-6-270', }, - } - ] + }, { + 'url': 'https://www.zee5.com/tvshows/details/chala-hawa-yeu-dya-ladies-zindabaad/0-6-2943/episodes', + 'only_matching': True, + }] def _entries(self, show_id): access_token_request = self._download_json( |