From af4944d84b857f285bcf70b33edcbef5ad400a31 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 30 Mar 2022 12:22:36 +0530 Subject: Fix bug in 8a7f68d0b12d0f4910a15b59a3ec090bbf83b6f2 Closes #3241 --- yt_dlp/postprocessor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 0b18e8774..5216acbfb 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -95,7 +95,7 @@ class FFmpegPostProcessor(PostProcessor): def get_ffmpeg_version(path, prog): if path in self._version_cache: - self._versions[path], self._features = self._version_cache[path], self._features_cache.get(path, {}) + self._versions[prog], self._features = self._version_cache[path], self._features_cache.get(path, {}) return out = _get_exe_version_output(path, ['-bsfs'], to_screen=self.write_debug) ver = detect_exe_version(out) if out else False -- cgit v1.2.3 From 48e15bb6b11ce437d18687e068852a8bf2cf0b6c Mon Sep 17 00:00:00 2001 From: Ha Tien Loi Date: Wed, 30 Mar 2022 17:04:00 +0700 Subject: [dailymotion] Support `geo.dailymotion.com` (#3230) Closes #3229 Authored by: hatienl0i261299 --- yt_dlp/extractor/dailymotion.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py index 95589d53a..9cb56185b 100644 --- a/yt_dlp/extractor/dailymotion.py +++ b/yt_dlp/extractor/dailymotion.py @@ -94,10 +94,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor): _VALID_URL = r'''(?ix) https?:// (?: - (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)| + (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player\.html\?)?video|swf)| (?:www\.)?lequipe\.fr/video ) - /(?P<id>[^/?_]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? + [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? 
''' IE_NAME = 'dailymotion' _TESTS = [{ @@ -115,6 +115,25 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader_id': 'x1xm8ri', 'age_limit': 0, }, + }, { + 'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true', + 'md5': 'e2f9717c6604773f963f069ca53a07f8', + 'info_dict': { + 'id': 'x89eyek', + 'ext': 'mp4', + 'title': "En quête d'esprit du 27/03/2022", + 'description': 'md5:66542b9f4df2eb23f314fc097488e553', + 'duration': 2756, + 'timestamp': 1648383669, + 'upload_date': '20220327', + 'uploader': 'CNEWS', + 'uploader_id': 'x24vth', + 'age_limit': 0, + 'view_count': int, + 'like_count': int, + 'tags': ['en_quete_d_esprit'], + 'thumbnail': 'https://s2.dmcdn.net/v/Tncwi1YGKdvFbDuDY/x1080', + } }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', 'md5': '2137c41a8e78554bb09225b8eb322406', -- cgit v1.2.3 From ab2579bb45ccdb82d40dbb75f48721d97df88270 Mon Sep 17 00:00:00 2001 From: Daniel <61970262+rozari0@users.noreply.github.com> Date: Wed, 30 Mar 2022 16:54:35 +0600 Subject: [xnxx] Add `xnxx3.com` (#3188) Authored by: rozari0 --- yt_dlp/extractor/xnxx.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/xnxx.py b/yt_dlp/extractor/xnxx.py index dd4fb54d4..27f991627 100644 --- a/yt_dlp/extractor/xnxx.py +++ b/yt_dlp/extractor/xnxx.py @@ -13,7 +13,7 @@ from ..utils import ( class XNXXIE(InfoExtractor): - _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/' + _VALID_URL = r'https?://(?:video|www)\.xnxx3?\.com/video-?(?P<id>[0-9a-z]+)/' _TESTS = [{ 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', 'md5': '7583e96c15c0f21e9da3453d9920fbba', @@ -32,6 +32,9 @@ class XNXXIE(InfoExtractor): }, { 'url': 'http://www.xnxx.com/video-55awb78/', 'only_matching': True, + }, { + 'url': 'http://www.xnxx3.com/video-55awb78/', + 'only_matching': True, }] def _real_extract(self, url): -- cgit v1.2.3 From 180c81509f6bec740df2957aa3d8aebc4e27b601 Mon Sep 17 00:00:00 2001 From: Felix S Date: Wed, 30 Mar 2022 11:31:25 +0000 Subject: [docs] Add an `.editorconfig` file (#3220) Authored by: fstirlitz --- .editorconfig | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..40c19fa66 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,8 @@ +root = true + +[**.py] +charset = utf-8 +indent_size = 4 +indent_style = space +trim_trailing_whitespace = true +insert_final_newline = true -- cgit v1.2.3 From 5d0aeac0e9137e0cc038d44f00d19f1f9181c883 Mon Sep 17 00:00:00 2001 From: MrRawes Date: Wed, 30 Mar 2022 12:35:06 +0100 Subject: [docs] Clarify the exact `BSD` license of dependencies (#3197) Authored by: MrRawes --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ab729fa4c..a75441e35 100644 --- a/README.md +++ b/README.md @@ -265,15 +265,15 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly * [**ffmpeg** and **ffprobe**](https://www.ffmpeg.org) - Required for [merging separate video and audio files](#format-selection) as well as for various [post-processing](#post-processing-options) tasks. License [depends on the build](https://www.ffmpeg.org/legal.html) * [**mutagen**](https://github.com/quodlibet/mutagen)\* - For embedding thumbnail in certain formats. 
Licensed under [GPLv2+](https://github.com/quodlibet/mutagen/blob/master/COPYING) -* [**pycryptodomex**](https://github.com/Legrandin/pycryptodome)\* - For decrypting AES-128 HLS streams and various other data. Licensed under [BSD2](https://github.com/Legrandin/pycryptodome/blob/master/LICENSE.rst) -* [**websockets**](https://github.com/aaugustin/websockets)\* - For downloading over websocket. Licensed under [BSD3](https://github.com/aaugustin/websockets/blob/main/LICENSE) -* [**secretstorage**](https://github.com/mitya57/secretstorage)\* - For accessing the Gnome keyring while decrypting cookies of Chromium-based browsers on Linux. Licensed under [BSD](https://github.com/mitya57/secretstorage/blob/master/LICENSE) +* [**pycryptodomex**](https://github.com/Legrandin/pycryptodome)\* - For decrypting AES-128 HLS streams and various other data. Licensed under [BSD-2-Clause](https://github.com/Legrandin/pycryptodome/blob/master/LICENSE.rst) +* [**websockets**](https://github.com/aaugustin/websockets)\* - For downloading over websocket. Licensed under [BSD-3-Clause](https://github.com/aaugustin/websockets/blob/main/LICENSE) +* [**secretstorage**](https://github.com/mitya57/secretstorage)\* - For accessing the Gnome keyring while decrypting cookies of Chromium-based browsers on Linux. Licensed under [BSD-3-Clause](https://github.com/mitya57/secretstorage/blob/master/LICENSE) * [**brotli**](https://github.com/google/brotli)\* or [**brotlicffi**](https://github.com/python-hyper/brotlicffi) - [Brotli](https://en.wikipedia.org/wiki/Brotli) content encoding support. Both licensed under MIT [1](https://github.com/google/brotli/blob/master/LICENSE) [2](https://github.com/python-hyper/brotlicffi/blob/master/LICENSE) * [**certifi**](https://github.com/certifi/python-certifi)\* - Provides Mozilla's root certificate bundle. Licensed under [MPLv2](https://github.com/certifi/python-certifi/blob/master/LICENSE) * [**AtomicParsley**](https://github.com/wez/atomicparsley) - For embedding thumbnail in mp4/m4a if mutagen/ffmpeg cannot. Licensed under [GPLv2+](https://github.com/wez/atomicparsley/blob/master/COPYING) * [**rtmpdump**](http://rtmpdump.mplayerhq.hu) - For downloading `rtmp` streams. ffmpeg will be used as a fallback. Licensed under [GPLv2+](http://rtmpdump.mplayerhq.hu) * [**mplayer**](http://mplayerhq.hu/design7/info.html) or [**mpv**](https://mpv.io) - For downloading `rstp` streams. ffmpeg will be used as a fallback. Licensed under [GPLv2+](https://github.com/mpv-player/mpv/blob/master/Copyright) -* [**phantomjs**](https://github.com/ariya/phantomjs) - Used in extractors where javascript needs to be run. Licensed under [BSD3](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) +* [**phantomjs**](https://github.com/ariya/phantomjs) - Used in extractors where javascript needs to be run. Licensed under [BSD-3-Clause](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) * [**sponskrub**](https://github.com/faissaloo/SponSkrub) - For using the now **deprecated** [sponskrub options](#sponskrub-options). 
Licensed under [GPLv3+](https://github.com/faissaloo/SponSkrub/blob/master/LICENCE.md) * Any external downloader that you want to use with `--downloader` -- cgit v1.2.3 From 11078c6d571673a0f09e21933f4ad1e6fcc35456 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Wed, 30 Mar 2022 18:19:22 +0530 Subject: [crunchyroll] Fix inheritance https://github.com/yt-dlp/yt-dlp/pull/2955#issuecomment-1083060465 --- yt_dlp/extractor/crunchyroll.py | 4 ++-- yt_dlp/extractor/vrv.py | 52 +++++++++++++++++++++++------------------ 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index bf1bf8c1c..bb4ae12f5 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -9,7 +9,7 @@ import zlib from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor -from .vrv import VRVIE +from .vrv import VRVBaseIE from ..compat import ( compat_b64decode, compat_etree_Element, @@ -100,7 +100,7 @@ class CrunchyrollBaseIE(InfoExtractor): parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) -class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): +class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): IE_NAME = 'crunchyroll' _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py index 10e6be7ed..00e1006c4 100644 --- a/yt_dlp/extractor/vrv.py +++ b/yt_dlp/extractor/vrv.py @@ -85,7 +85,30 @@ class VRVBaseIE(InfoExtractor): 'resource_key': resource_key, })['__links__']['cms_resource']['href'] - def _initialize_pre_login(self): + def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): + if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): + return [] + format_id = join_nonempty( + stream_format, + audio_lang and 'audio-%s' % audio_lang, + hardsub_lang and 'hardsub-%s' % hardsub_lang) + if 'hls' in stream_format: + adaptive_formats = self._extract_m3u8_formats( + url, video_id, 'mp4', m3u8_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + elif stream_format == 'dash': + adaptive_formats = self._extract_mpd_formats( + url, video_id, mpd_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + if audio_lang: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_lang + return adaptive_formats + + def _set_api_params(self): webpage = self._download_webpage( 'https://vrv.co/', None, headers=self.geo_verification_headers()) self._API_PARAMS = self._parse_json(self._search_regex( @@ -133,28 +156,8 @@ class VRVIE(VRVBaseIE): self._TOKEN = token_credentials['oauth_token'] self._TOKEN_SECRET = token_credentials['oauth_token_secret'] - def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): - if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): - return [] - format_id = join_nonempty( - stream_format, - audio_lang and 'audio-%s' % audio_lang, - hardsub_lang and 'hardsub-%s' % hardsub_lang) - if 'hls' in stream_format: - adaptive_formats = self._extract_m3u8_formats( - url, video_id, 'mp4', m3u8_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - elif stream_format == 'dash': - adaptive_formats = self._extract_mpd_formats( - url, video_id, mpd_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - if audio_lang: - for f in 
adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = audio_lang - return adaptive_formats + def _initialize_pre_login(self): + return self._set_api_params() def _real_extract(self, url): video_id = self._match_id(url) @@ -249,6 +252,9 @@ class VRVSeriesIE(VRVBaseIE): 'playlist_mincount': 11, } + def _initialize_pre_login(self): + return self._set_api_params() + def _real_extract(self, url): series_id = self._match_id(url) -- cgit v1.2.3 From c418e6b5a6aa483b801c29cf5ada4263e33a9a3e Mon Sep 17 00:00:00 2001 From: zackmark29 <62680932+zackmark29@users.noreply.github.com> Date: Thu, 31 Mar 2022 10:47:58 +0800 Subject: [viu] Fix bypass for preview (#3247) Authored by: zackmark29 --- yt_dlp/extractor/viu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/viu.py b/yt_dlp/extractor/viu.py index b0a1fca68..ba627ca5b 100644 --- a/yt_dlp/extractor/viu.py +++ b/yt_dlp/extractor/viu.py @@ -329,7 +329,8 @@ class ViuOTTIE(InfoExtractor): if token is not None: query['identity'] = token else: - # preview is limited to 3min for non-members. But we can try to bypass it + # The content is Preview or for VIP only. + # We can try to bypass the duration which is limited to 3mins only duration_limit, query['duration'] = True, '180' try: stream_data = download_playback() @@ -346,13 +347,13 @@ class ViuOTTIE(InfoExtractor): # bypass preview duration limit if duration_limit: - stream_url = urllib.parse.urlparse(stream_url) + old_stream_url = urllib.parse.urlparse(stream_url) + query = dict(urllib.parse.parse_qsl(old_stream_url.query, keep_blank_values=True)) query.update({ 'duration': video_data.get('time_duration') or '9999999', 'duration_start': '0', }) - stream_url = stream_url._replace(query=urllib.parse.urlencode(dict( - urllib.parse.parse_qsl(stream_url.query, keep_blank_values=True)))).geturl() + stream_url = old_stream_url._replace(query=urllib.parse.urlencode(query)).geturl() formats.append({ 'format_id': vid_format, -- cgit v1.2.3 From bb5a7cb8ad9274c7388a54ef6a6ceae24dd892cc Mon Sep 17 00:00:00 2001 From: Bricio <216170+Bricio@users.noreply.github.com> Date: Thu, 31 Mar 2022 00:04:55 -0300 Subject: [Craftsy] Add extractor (#3208) Authored by: Bricio --- yt_dlp/extractor/craftsy.py | 71 ++++++++++++++++++++++++++++++++++++++++++ yt_dlp/extractor/extractors.py | 1 + 2 files changed, 72 insertions(+) create mode 100644 yt_dlp/extractor/craftsy.py diff --git a/yt_dlp/extractor/craftsy.py b/yt_dlp/extractor/craftsy.py new file mode 100644 index 000000000..ed2f4420e --- /dev/null +++ b/yt_dlp/extractor/craftsy.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + +from ..utils import ( + dict_get, + get_element_by_id, + js_to_json, + traverse_obj, +) + + +class CraftsyIE(InfoExtractor): + _VALID_URL = r'https?://www.craftsy.com/class/(?P<id>[a-z0-9_-]+)/' + _TESTS = [{ + 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/', + 'info_dict': { + 'id': 'the-midnight-quilt-show-season-5', + 'title': 'The Midnight Quilt Show Season 5', + 'description': 'md5:113eda818e985d1a566625fb2f833b7a', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.craftsy.com/class/sew-your-own-designer-handbag/', + 'info_dict': { + 'id': 'sew-your-own-designer-handbag', + 'title': 'Sew Your Own Designer Handbag', + 'description': 'md5:8270d0ef5427d3c895a27351aeaac276', + }, + 'playlist_mincount': 1, + }, { + 'url': 
'https://www.craftsy.com/class/all-access-estes-park-wool-market/', + 'info_dict': { + 'id': 'all-access-estes-park-wool-market', + 'title': 'All Access: Estes Park Wool Market', + 'description': 'md5:aded1bd8d38ae2fae4dae936c0ae01e7', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json(self._search_regex( + r'class_video_player_vars\s*=\s*({.*})\s*;', + get_element_by_id('vidstore-classes_class-video-player-js-extra', webpage), + 'video data'), video_id, transform_source=js_to_json) + + account_id = traverse_obj(video_data, ('video_player', 'bc_account_id')) + + entries = [] + class_preview = traverse_obj(video_data, ('video_player', 'class_preview')) + if class_preview: + v_id = class_preview.get('video_id') + entries.append(self.url_result( + f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={v_id}', + BrightcoveNewIE, v_id, class_preview.get('title'))) + + if dict_get(video_data, ('is_free', 'user_has_access')): + entries += [ + self.url_result( + f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={lesson["video_id"]}', + BrightcoveNewIE, lesson['video_id'], lesson.get('title')) + for lesson in video_data['lessons']] + + return self.playlist_result( + entries, video_id, video_data.get('class_title'), + self._html_search_meta(('og:description', 'description'), webpage, default=None)) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index e5ae12a7d..52279b985 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -322,6 +322,7 @@ from .cpac import ( from .cozytv import CozyTVIE from .cracked import CrackedIE from .crackle import CrackleIE +from .craftsy import CraftsyIE from .crooksandliars import CrooksAndLiarsIE from .crowdbunker import ( CrowdBunkerIE, -- cgit v1.2.3 From 504f789ad55f0581681171abc428c3094057cae1 Mon Sep 17 00:00:00 2001 From: Alexander Seiler Date: Thu, 31 Mar 2022 05:23:32 +0200 Subject: [AZMedien] Support `tv.telezueri.ch` (#3251) Authored by: goggle --- yt_dlp/extractor/azmedien.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/azmedien.py b/yt_dlp/extractor/azmedien.py index b3cabbf94..0168340b9 100644 --- a/yt_dlp/extractor/azmedien.py +++ b/yt_dlp/extractor/azmedien.py @@ -11,7 +11,7 @@ class AZMedienIE(InfoExtractor): IE_DESC = 'AZ Medien videos' _VALID_URL = r'''(?x) https?:// - (?:www\.)? + (?:www\.|tv\.)? 
(?P<host> telezueri\.ch| telebaern\.tv| telem1\.ch| wetterfernsehen\.ch| ) ''' _TESTS = [{ - 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', + 'url': 'https://tv.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', 'info_dict': { 'id': '1_anruz3wy', 'ext': 'mp4', 'uploader_id': 'TVOnline', 'upload_date': '20180930', 'timestamp': 1538328802, + 'view_count': int, + 'thumbnail': 'http://cfvod.kaltura.com/p/1719221/sp/171922100/thumbnail/entry_id/1_anruz3wy/version/100031', + 'duration': 1930 }, 'params': { 'skip_download': True, -- cgit v1.2.3 From f189faf1ce0e8c0d81bc7ec841718fe03b74ca34 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 31 Mar 2022 13:30:07 +0530 Subject: [BRMediathek] Fix VALID_URL Closes #2466 --- yt_dlp/extractor/br.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/br.py b/yt_dlp/extractor/br.py index 7169eceb6..0155827d8 100644 --- a/yt_dlp/extractor/br.py +++ b/yt_dlp/extractor/br.py @@ -175,7 +175,7 @@ class BRIE(InfoExtractor): class BRMediathekIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})' + _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek//?video/(?:[^/?&#]+?-)?(?P<id>av:[0-9a-f]{24})' _TESTS = [{ 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', 'md5': 'fdc3d485835966d1622587d08ba632ec', 'info_dict': { 'id': 'av:5a1e6a6e8fce6d001871cc8e', 'ext': 'mp4', 'title': 'Die Sendung vom 28.11.2017', 'description': 'md5:6000cdca5912ab2277e5b7339f201ccc', 'timestamp': 1511942766, 'upload_date': '20171129', } + }, { + 'url': 'https://www.br.de/mediathek//video/av:61b0db581aed360007558c12', + 'only_matching': True, }] def _real_extract(self, url): -- cgit v1.2.3 From c4f60dd7cdbf5282a8d1a8fa8dd4f6fd60acc034 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 31 Mar 2022 13:19:16 +0530 Subject: [utils] Add `try_call` --- CONTRIBUTING.md | 2 +- yt_dlp/downloader/http.py | 6 +++--- yt_dlp/extractor/mediasite.py | 11 ++++++----- yt_dlp/extractor/whowatch.py | 9 +++++---- yt_dlp/utils.py | 16 ++++++++++------ 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dbd6a84b2..1897f73e0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -643,7 +643,7 @@ Wrap all extracted numeric data into safe functions from [`yt_dlp/utils.py`](yt_ Use `url_or_none` for safe URL processing. -Use `try_get`, `dict_get` and `traverse_obj` for safe metadata extraction from parsed JSON. +Use `traverse_obj` and `try_call` (supersedes `dict_get` and `try_get`) for safe metadata extraction from parsed JSON. Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. 
diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 8e096b76b..cabf401a7 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -18,7 +18,7 @@ from ..utils import ( parse_http_range, sanitized_Request, ThrottledDownload, - try_get, + try_call, write_xattr, XAttrMetadataError, XAttrUnavailableError, @@ -120,12 +120,12 @@ class HttpFD(FileDownloader): else: range_end = None - if try_get(None, lambda _: range_start > range_end): + if try_call(lambda: range_start > range_end): ctx.resume_len = 0 ctx.open_mode = 'wb' raise RetryDownload(Exception(f'Conflicting range. (start={range_start} > end={range_end})')) - if try_get(None, lambda _: range_end >= ctx.content_len): + if try_call(lambda: range_end >= ctx.content_len): range_end = ctx.content_len - 1 request = sanitized_Request(url, request_data, headers) diff --git a/yt_dlp/extractor/mediasite.py b/yt_dlp/extractor/mediasite.py index ace86c2fd..fbf9223b2 100644 --- a/yt_dlp/extractor/mediasite.py +++ b/yt_dlp/extractor/mediasite.py @@ -14,6 +14,7 @@ from ..utils import ( float_or_none, mimetype2ext, str_or_none, + try_call, try_get, unescapeHTML, unsmuggle_url, @@ -145,11 +146,11 @@ class MediasiteIE(InfoExtractor): 'duration': slide['Time'] / 1000, }) - next_time = try_get(None, [ - lambda _: Stream['Slides'][i + 1]['Time'], - lambda _: duration, - lambda _: slide['Time'], - ], expected_type=(int, float)) + next_time = try_call( + lambda: Stream['Slides'][i + 1]['Time'], + lambda: duration, + lambda: slide['Time'], + expected_type=(int, float)) fragments.append({ 'path': fname_template.format(slide.get('Number', i + 1)), diff --git a/yt_dlp/extractor/whowatch.py b/yt_dlp/extractor/whowatch.py index f8bc2e73a..e4b610d00 100644 --- a/yt_dlp/extractor/whowatch.py +++ b/yt_dlp/extractor/whowatch.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, qualities, + try_call, try_get, ExtractorError, ) @@ -26,10 +27,10 @@ class WhoWatchIE(InfoExtractor): metadata = self._download_json('https://api.whowatch.tv/lives/%s' % video_id, video_id) live_data = self._download_json('https://api.whowatch.tv/lives/%s/play' % video_id, video_id) - title = try_get(None, ( - lambda x: live_data['share_info']['live_title'][1:-1], - lambda x: metadata['live']['title'], - ), compat_str) + title = try_call( + lambda: live_data['share_info']['live_title'][1:-1], + lambda: metadata['live']['title'], + expected_type=str) hls_url = live_data.get('hls_url') if not hls_url: diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 62a1800d4..22062f85f 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3096,15 +3096,19 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True): return d.get(key_or_keys, default) -def try_get(src, getter, expected_type=None): - for get in variadic(getter): +def try_call(*funcs, expected_type=None, args=[], kwargs={}): + for f in funcs: try: - v = get(src) - except (AttributeError, KeyError, TypeError, IndexError): + val = f(*args, **kwargs) + except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError): pass else: - if expected_type is None or isinstance(v, expected_type): - return v + if expected_type is None or isinstance(val, expected_type): + return val + + +def try_get(src, getter, expected_type=None): + return try_call(*variadic(getter), args=(src,), expected_type=expected_type) def filter_dict(dct, cndn=lambda _, v: v is not None): -- cgit v1.2.3 From e6f868a63c15f576152733a1508f474b5e5bd1ef Mon Sep 17 00:00:00 2001 From: pukkandan Date: 
Thu, 31 Mar 2022 13:25:50 +0530 Subject: [utils] `traverse_obj`: Allow filtering by value --- yt_dlp/extractor/funimation.py | 2 +- yt_dlp/extractor/iqiyi.py | 4 ++-- yt_dlp/utils.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py index 36a9c4772..6aa9bc9ce 100644 --- a/yt_dlp/extractor/funimation.py +++ b/yt_dlp/extractor/funimation.py @@ -333,7 +333,7 @@ class FunimationShowIE(FunimationBaseIE): 'https://prod-api-funimationnow.dadcdigital.com/api/funimation/episodes/?limit=99999&title_id=%s' % show_info.get('id'), display_id) - vod_items = traverse_obj(items_info, ('items', ..., re.compile('(?i)mostRecent[AS]vod').match, 'item')) + vod_items = traverse_obj(items_info, ('items', ..., lambda k, _: re.match(r'(?i)mostRecent[AS]vod', k), 'item')) return { '_type': 'playlist', diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index dc4667744..14877d405 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -626,8 +626,8 @@ class IqIE(InfoExtractor): note=f'Downloading format data for {self._BID_TAGS[bid]}', errnote='Unable to download format data', fatal=False), 'data', expected_type=dict) - video_format = next((video_format for video_format in traverse_obj( - format_data, ('program', 'video', ...), expected_type=dict, default=[]) if str(video_format['bid']) == bid), {}) + video_format = traverse_obj(format_data, ('program', 'video', lambda _, v: str(v['bid']) == bid), + expected_type=dict, default=[], get_all=False) or {} extracted_formats = [] if video_format.get('m3u8Url'): extracted_formats.extend(self._extract_m3u8_formats( diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 22062f85f..a2fa29afe 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5152,8 +5152,8 @@ def traverse_obj( @param path_list A list of paths which are checked one by one. Each path is a list of keys where each key is a string, a function, a tuple of strings/None or "...". - When a fuction is given, it takes the key as argument and - returns whether the key matches or not. When a tuple is given, + When a function is given, it takes the key and value as arguments + and returns whether the key matches or not. When a tuple is given, all the keys given in the tuple are traversed, and "..." 
traverses all the keys in the object "None" returns the object without traversal @param default Default value to return @param expected_type Only accept final value of this type (Can also be any callable) @param get_all Return all the values obtained from a path or only the first one @param casesense Whether to consider dictionary keys as case sensitive @param is_user_input Whether the keys are generated from user input. If True, strings are converted to int/slice if necessary @param traverse_string Whether to traverse inside strings. If True, any non-compatible object will also be converted into a string # TODO: Write tests ''' if not casesense: _lower = lambda k: (k.lower() if isinstance(k, str) else k) path_list = (map(_lower, variadic(path)) for path in path_list) def _traverse_obj(obj, path, _current_depth=0): nonlocal depth path = tuple(variadic(path)) for i, key in enumerate(path): if None in (key, obj): return obj if isinstance(key, (list, tuple)): obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key] key = ... if key is ...: obj = (obj.values() if isinstance(obj, dict) else obj if isinstance(obj, (list, tuple, LazyList)) else str(obj) if traverse_string else []) _current_depth += 1 depth = max(depth, _current_depth) return [_traverse_obj(v, path[i + 1:], _current_depth) for v in obj] elif callable(key): if isinstance(obj, (list, tuple, LazyList)): obj = enumerate(obj) elif isinstance(obj, dict): obj = obj.items() else: if not traverse_string: return None obj = str(obj) _current_depth += 1 depth = max(depth, _current_depth) return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))] elif isinstance(obj, dict) and not (is_user_input and key == ':'): obj = (obj.get(key) if casesense or (key in obj) else next((v for k, v in obj.items() if _lower(k) == key), None)) -- cgit v1.2.3 From 5d45484cc762861f8fe59fa42d499db5a284c2c7 Mon Sep 17 00:00:00 2001 From: "Lesmiscore (Naoya Ozaki)" Date: Fri, 1 Apr 2022 19:31:58 +0900 Subject: [niconico] Fix extraction of thumbnails and uploader (#3266) --- yt_dlp/extractor/niconico.py | 18 ++++++++++++++---- yt_dlp/utils.py | 7 +++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 74828f833..a5a1a01e0 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -25,7 +25,10 @@ from ..utils import ( parse_duration, parse_filesize, parse_iso8601, + parse_resolution, + qualities, remove_start, + str_or_none, traverse_obj, try_get, unescapeHTML, @@ -430,18 +433,25 @@ class NiconicoIE(InfoExtractor): # find in json (logged in) tags = traverse_obj(api_data, ('tag', 'items', ..., 'name')) + thumb_prefs = qualities(['url', 'middleUrl', 'largeUrl', 'player', 'ogp']) + return { 'id': video_id, '_api_data': api_data, 'title': get_video_info(('originalTitle', 'title')) or self._og_search_title(webpage, default=None), 'formats': formats, - 'thumbnail': get_video_info('thumbnail', 'url') or self._html_search_meta( - ('image', 'og:image'), webpage, 'thumbnail', default=None), + 'thumbnails': [{ + 'id': key, + 'url': url, + 'ext': 'jpg', + 'preference': thumb_prefs(key), + **parse_resolution(url, lenient=True), + } for key, url in (get_video_info('thumbnail') or {}).items() if url], 'description': clean_html(get_video_info('description')), - 'uploader': traverse_obj(api_data, ('owner', 'nickname')), + 'uploader': traverse_obj(api_data, ('owner', 'nickname'), ('channel', 'name'), ('community', 'name')), + 'uploader_id': str_or_none(traverse_obj(api_data, ('owner', 'id'), ('channel', 'id'), ('community', 'id'))), 'timestamp': parse_iso8601(get_video_info('registeredAt')) or parse_iso8601( self._html_search_meta('video:release_date', webpage, 'date published', default=None)), - 'uploader_id': traverse_obj(api_data, ('owner', 'id')), 'channel': traverse_obj(api_data, ('channel', 'name'), ('community', 'name')), 'channel_id': traverse_obj(api_data, ('channel', 'id'), ('community', 'id')), 'view_count': int_or_none(get_video_info('count', 'view')), diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index a2fa29afe..ce918750d 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2418,11 +2418,14 @@ def parse_count(s): return str_to_int(mobj.group(1)) -def parse_resolution(s): +def parse_resolution(s, *, lenient=False): if s is None: return {} - mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s) + if lenient: + mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s) + else: + mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s) if mobj: return { 'width': int(mobj.group('w')), -- cgit v1.2.3 From 4c268f9cb75edd0ca7b2e3737cfa5abd21ee653d Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sat, 2 Apr 2022 11:20:31 +0530 Subject: [Nebula] Fix bug in 
52efa4b31200119adaa8acf33e50b84fcb6948f0 --- yt_dlp/extractor/nebula.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index b77ef5f28..77f253519 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -86,7 +86,7 @@ class NebulaBaseIE(InfoExtractor): # if 401 or 403, attempt credential re-auth and retry if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403): self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') - self._login() + self._perform_login() return inner_call() else: raise -- cgit v1.2.3 From c085e4ec475eb17343d228d2749c8e2a1d998edf Mon Sep 17 00:00:00 2001 From: nixxo Date: Sat, 2 Apr 2022 07:57:56 +0200 Subject: [rai] Fix extraction of http formats (#3272) Closes #3270 Authored by: nixxo --- yt_dlp/extractor/rai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index 9d243b2be..6864129c6 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -118,7 +118,7 @@ class RaiBaseIE(InfoExtractor): }) def _create_http_urls(self, relinker_url, fmts): - _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\d+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' + _RELINKER_REG = r'https?://(?P<host>[^/]+?)/(?:i/)?(?P<extra>[^/]+?)/(?P<path>.+?)/(?P<id>\w+)(?:_(?P<quality>[\d\,]+))?(?:\.mp4|/playlist\.m3u8).+?' _MP4_TMPL = '%s&overrideUserAgentRule=mp4-%s' _QUALITY = { # tbr: w, h -- cgit v1.2.3 From c8e856a551730c289d9ef8f0674620753de6c5be Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Sat, 2 Apr 2022 19:07:13 +1300 Subject: [web.archive:youtube] Make CDX API requests non-fatal Partial fix for https://github.com/yt-dlp/yt-dlp/issues/3278 Authored-by: coletdjnz --- yt_dlp/extractor/archiveorg.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index 2a25c0713..b06ac74ae 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -457,7 +457,7 @@ class YoutubeWebArchiveIE(InfoExtractor): _OLDEST_CAPTURE_DATE = 20050214000000 _NEWEST_CAPTURE_DATE = 20500101000000 - def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note='Downloading CDX API JSON'): + def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note=None, fatal=False): # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md query = { 'url': url, @@ -468,7 +468,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'collapse': collapse or [], **(query or {}) } - res = self._download_json('https://web.archive.org/cdx/search/cdx', item_id, note, query=query) + res = self._download_json( + 'https://web.archive.org/cdx/search/cdx', item_id, + note or 'Downloading CDX API JSON', query=query, fatal=fatal) if isinstance(res, list) and len(res) >= 2: # format response to make it easier to use return list(dict(zip(res[0], v)) for v in res[1:]) -- cgit v1.2.3 From ad210f4fd460574436dc65d3c3cee041c905c46f Mon Sep 17 00:00:00 2001 From: coletdev Date: Sat, 2 Apr 2022 19:11:14 +1300 Subject: [youtube:search] Support hashtag entries (#3265) Authored-by: coletdjnz --- yt_dlp/extractor/youtube.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 
19b4985f6..4e6a80911 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3903,6 +3903,13 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): if video_id: return self._extract_video(video_renderer) + def _hashtag_tile_entry(self, hashtag_tile_renderer): + url = urljoin('https://youtube.com', traverse_obj( + hashtag_tile_renderer, ('onTapCommand', 'commandMetadata', 'webCommandMetadata', 'url'))) + if url: + return self.url_result( + url, ie=YoutubeTabIE.ie_key(), title=self._get_text(hashtag_tile_renderer, 'hashtag')) + def _post_thread_entries(self, post_thread_renderer): post_renderer = try_get( post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict) @@ -3991,6 +3998,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): 'videoRenderer': lambda x: [self._video_entry(x)], 'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}), 'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}), + 'hashtagTileRenderer': lambda x: [self._hashtag_tile_entry(x)] } for key, renderer in isr_content.items(): if key not in known_renderers: @@ -5520,7 +5528,17 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): 'id': 'python', 'title': 'python', } - + }, { + 'url': 'https://www.youtube.com/results?search_query=%23cats', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '#cats', + 'title': '#cats', + 'entries': [{ + 'url': r're:https://(www\.)?youtube\.com/hashtag/cats', + 'title': '#cats', + }], + }, }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True, -- cgit v1.2.3 From a17526e427fffcd38064a4657de4fa59cf5a9953 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 3 Apr 2022 19:01:03 +0530 Subject: [youtube:tab] Minor improvements (See desc) * Support shorts on channel homepage * Extract thumbnail of OLAK playlists --- yt_dlp/extractor/youtube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4e6a80911..485849ba9 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3777,7 +3777,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _extract_basic_item_renderer(item): # Modified from _extract_grid_item_renderer known_basic_renderers = ( - 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer' + 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'showRenderer', 'reelItemRenderer' ) for key, renderer in item.items(): if not isinstance(renderer, dict): @@ -3992,7 +3992,8 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): known_renderers = { 'playlistVideoListRenderer': self._playlist_entries, 'gridRenderer': self._grid_entries, - 'shelfRenderer': lambda x: self._shelf_entries(x), + 'reelShelfRenderer': self._grid_entries, + 'shelfRenderer': self._shelf_entries, 'musicResponsiveListItemRenderer': lambda x: [self._music_reponsive_list_entry(x)], 'backstagePostThreadRenderer': self._post_thread_entries, 'videoRenderer': lambda x: [self._video_entry(x)], @@ -4170,7 +4171,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): }) primary_thumbnails = self._extract_thumbnails( - primary_sidebar_renderer, ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')) + primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail')) if playlist_id is None: playlist_id = item_id -- cgit v1.2.3 From 
dc57e74a7fb9418ec403ff461eab3a47a350d7a3 Mon Sep 17 00:00:00 2001 From: aarubui Date: Sun, 3 Apr 2022 23:53:22 +1000 Subject: [tenplay] Improve extractor (#3280) Authored by: aarubui --- yt_dlp/extractor/tenplay.py | 42 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index 5b3222ecf..5c7b54531 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -7,6 +7,7 @@ import base64 from .common import InfoExtractor from ..utils import ( HEADRequest, + int_or_none, urlencode_postdata, ) @@ -15,6 +16,28 @@ class TenPlayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})' _NETRC_MACHINE = '10play' _TESTS = [{ + 'url': 'https://10play.com.au/neighbours/web-extras/season-39/nathan-borg-is-the-first-aussie-actor-with-a-cochlear-implant-to-join-neighbours/tpv210128qupwd', + 'info_dict': { + 'id': '6226844312001', + 'ext': 'mp4', + 'title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', + 'alt_title': 'Nathan Borg Is The First Aussie Actor With A Cochlear Implant To Join Neighbours', + 'description': 'md5:a02d0199c901c2dd4c796f1e7dd0de43', + 'duration': 186, + 'season': 39, + 'series': 'Neighbours', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': 'Channel 10', + 'age_limit': 15, + 'timestamp': 1611810000, + 'upload_date': '20210128', + 'uploader_id': '2199827728001', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Only available in Australia', + }, { 'url': 'https://10play.com.au/todd-sampsons-body-hack/episodes/season-4/episode-7/tpv200921kvngh', 'info_dict': { 'id': '6192880312001', @@ -62,12 +85,17 @@ class TenPlayIE(InfoExtractor): def _real_extract(self, url): content_id = self._match_id(url) - _token = self._get_bearer_token(content_id) data = self._download_json( 'https://10play.com.au/api/v1/videos/' + content_id, content_id) + headers = {} + + if data.get('memberGated') is True: + _token = self._get_bearer_token(content_id) + headers = {'Authorization': _token} + _video_url = self._download_json( data.get('playbackApiEndpoint'), content_id, 'Downloading video JSON', - headers={'Authorization': _token}).get('source') + headers=headers).get('source') m3u8_url = self._request_webpage(HEADRequest( _video_url), content_id).geturl() if '10play-not-in-oz' in m3u8_url: @@ -77,12 +105,16 @@ class TenPlayIE(InfoExtractor): return { 'formats': formats, + 'subtitles': {'en': [{'url': data.get('captionUrl')}]} if data.get('captionUrl') else None, 'id': data.get('altId') or content_id, - 'title': data.get('title'), + 'duration': data.get('duration'), + 'title': data.get('subtitle'), + 'alt_title': data.get('title'), 'description': data.get('description'), 'age_limit': self._AUS_AGES.get(data.get('classification')), - 'series': data.get('showName'), - 'season': data.get('showContentSeason'), + 'series': data.get('tvShow'), + 'season': int_or_none(data.get('season')), + 'episode_number': int_or_none(data.get('episode')), 'timestamp': data.get('published'), 'thumbnail': data.get('imageUrl'), 'uploader': 'Channel 10', -- cgit v1.2.3 From fbfde1c3e6b59c5ff94e2604f1502acdeb14f8f0 Mon Sep 17 00:00:00 2001 From: Fam0r Date: Sun, 3 Apr 2022 18:11:50 +0300 Subject: [elonet] Rewrite extractor (#3277) Closes #2911 Authored by: Fam0r, pukkandan --- yt_dlp/extractor/common.py | 4 +-- yt_dlp/extractor/elonet.py | 85 +++++++++++++++++----------------------------- 2 files changed, 34 
insertions(+), 55 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d0e57da23..af964c527 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1297,8 +1297,8 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' - % {'prop': re.escape(prop)}) + property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' - % {'prop': re.escape(prop)}) + % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r'<meta[^>]+?%s[^>]+?%s' return [ template % (property_re, content_re), diff --git a/yt_dlp/extractor/elonet.py b/yt_dlp/extractor/elonet.py index eefba4e24..9c6aea28e 100644 --- a/yt_dlp/extractor/elonet.py +++ b/yt_dlp/extractor/elonet.py @@ -1,30 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - base_url, - ExtractorError, - try_get, -) -from ..compat import compat_str +from ..utils import determine_ext class ElonetIE(InfoExtractor): _VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)' _TESTS = [{ - # m3u8 with subtitles 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867', - 'md5': '8efc954b96c543711707f87de757caea', 'info_dict': { 'id': '107867', 'ext': 'mp4', 'title': 'Valkoinen peura', - 'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...', - 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large', + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_107867.+', + 'description': 'md5:bded4201c9677fab10854884fe8f7312', }, + 'params': {'skip_download': 'dash'}, }, { # DASH with subtitles 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539', 'info_dict': { 'id': '116539', 'ext': 'mp4', 'title': 'Minulla on tiikeri', - 'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. 
Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...', - 'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr', - } + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_116539.+', + 'description': 'md5:5ab72b3fe76d3414e46cc8f277104419', + }, + 'params': {'skip_download': 'dash'}, + }, { + # Page with multiple videos, download the main one + 'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_117396', + 'info_dict': { + 'id': '117396', + 'ext': 'mp4', + 'title': 'Sampo', + 'thumbnail': r're:^https?://elonet\.finna\.fi/Cover/Show\?id=kavi\.elonet_elokuva_117396.+', + 'description': 'md5:ec69572a5b054d0ecafe8086b1fa96f7', + }, + 'params': {'skip_download': 'dash'}, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( + src = self._parse_json(self._search_regex( + r'<video[^>]+data-video-sources="([^"]+)"', webpage, 'json'), video_id)[0]['src'] + ext = determine_ext(src) - json_s = self._html_search_regex( r'data-video-sources="(.+?)"', webpage, 'json') - src = try_get( - self._parse_json(json_s, video_id), - lambda x: x[0]["src"], compat_str) - formats = [] - subtitles = {} - if re.search(r'\.m3u8\??', src): - res = self._download_webpage_handle( - # elonet servers have certificate problems - src.replace('https:', 'http:'), video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') - if res: - doc, urlh = res - url = urlh.geturl() - formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url) - for f in formats: - f['ext'] = 'mp4' - elif re.search(r'\.mpd\??', src): - res = self._download_xml_handle( - src, video_id, - note='Downloading MPD manifest', - errnote='Failed to download MPD manifest') - if res: - doc, urlh = res - url = base_url(urlh.geturl()) - formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url) + if ext == 'm3u8': + formats, subtitles = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False) + elif ext == 'mpd': + formats, subtitles = self._extract_mpd_formats_and_subtitles(src, video_id, fatal=False) else: - raise ExtractorError("Unknown streaming format") + formats, subtitles = [], {} + self.raise_no_formats(f'Unknown streaming format {ext}') + self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, 'subtitles': subtitles, } -- cgit v1.2.3 From 265e586d96bae2eb86a4f702ee2caef3b0cd78c3 Mon Sep 17 00:00:00 2001 From: "Lesmiscore (Naoya Ozaki)" Date: Mon, 4 Apr 2022 00:41:14 +0900 Subject: [openrec] Download archived livestreams (#3267) Authored by: Lesmiscore --- yt_dlp/extractor/openrec.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/openrec.py b/yt_dlp/extractor/openrec.py index b476c0986..5eb1cdbad 100644 --- a/yt_dlp/extractor/openrec.py +++ b/yt_dlp/extractor/openrec.py @@ -7,6 +7,7 @@ from ..utils import ( get_first, int_or_none, traverse_obj, + try_get, unified_strdate, unified_timestamp, ) @@ -18,6 +19,13 @@ class OpenRecBaseIE(InfoExtractor): return self._parse_json( self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id) + def _expand_media(self, video_id, media): + for name, m3u8_url in 
(media or {}).items(): + if not m3u8_url: + continue + yield from self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', m3u8_id=name) + def _extract_movie(self, webpage, video_id, name, is_live): window_stores = self._extract_pagestore(webpage, video_id) movie_stores = [ @@ -29,13 +37,21 @@ class OpenRecBaseIE(InfoExtractor): if not any(movie_stores): raise ExtractorError(f'Failed to extract {name} info') - m3u8_playlists = get_first(movie_stores, 'media') or {} - formats = [] - for name, m3u8_url in m3u8_playlists.items(): - if not m3u8_url: - continue - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', live=is_live, m3u8_id=name)) + formats = list(self._expand_media(video_id, get_first(movie_stores, 'media'))) + if not formats and is_live: + # archived livestreams + cookies = self._get_cookies('https://www.openrec.tv/') + detail = self._download_json( + f'https://apiv5.openrec.tv/api/v5/movies/{video_id}/detail', video_id, + headers={ + 'Origin': 'https://www.openrec.tv', + 'Referer': 'https://www.openrec.tv/', + 'access-token': try_get(cookies, lambda x: x.get('access_token').value), + 'uuid': try_get(cookies, lambda x: x.get('uuid').value), + }) + new_media = traverse_obj(detail, ('data', 'items', ..., 'media'), get_all=False) + formats = list(self._expand_media(video_id, new_media)) + is_live = False self._sort_formats(formats) -- cgit v1.2.3 From 12e022d074c2e5b240788a61452e5536fa51c151 Mon Sep 17 00:00:00 2001 From: Tim Schindler Date: Mon, 4 Apr 2022 09:20:14 +0200 Subject: [Cybrary] Add extractor (#3264) Authored by: aaearon --- yt_dlp/extractor/cybrary.py | 146 +++++++++++++++++++++++++++++++++++++++++ yt_dlp/extractor/extractors.py | 4 ++ 2 files changed, 150 insertions(+) create mode 100644 yt_dlp/extractor/cybrary.py diff --git a/yt_dlp/extractor/cybrary.py b/yt_dlp/extractor/cybrary.py new file mode 100644 index 000000000..c278f0fe0 --- /dev/null +++ b/yt_dlp/extractor/cybrary.py @@ -0,0 +1,146 @@ +# coding: utf-8 +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + smuggle_url, + str_or_none, + traverse_obj, + urlencode_postdata +) + + +class CybraryBaseIE(InfoExtractor): + _API_KEY = 'AIzaSyCX9ru6j70PX2My1Eq6Q1zoMAhuTdXlzSw' + _ENDPOINTS = { + 'course': 'https://app.cybrary.it/courses/api/catalog/browse/course/{}', + 'course_enrollment': 'https://app.cybrary.it/courses/api/catalog/{}/enrollment', + 'enrollment': 'https://app.cybrary.it/courses/api/enrollment/{}', + 'launch': 'https://app.cybrary.it/courses/api/catalog/{}/launch', + 'vimeo_oembed': 'https://vimeo.com/api/oembed.json?url=https://vimeo.com/{}', + } + _NETRC_MACHINE = 'cybrary' + _TOKEN = None + + def _perform_login(self, username, password): + CybraryBaseIE._TOKEN = self._download_json( + f'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword?key={self._API_KEY}', + None, data=urlencode_postdata({'email': username, 'password': password, 'returnSecureToken': True}), + note='Logging in')['idToken'] + + def _real_initialize(self): + if not self._TOKEN: + self.raise_login_required(method='password') + + def _call_api(self, endpoint, item_id): + return self._download_json( + self._ENDPOINTS[endpoint].format(item_id), item_id, + note=f'Downloading {endpoint} JSON metadata', + headers={'Authorization': f'Bearer {self._TOKEN}'}) + + def _get_vimeo_id(self, activity_id): + launch_api = self._call_api('launch', activity_id) + + if launch_api.get('url'): + return self._search_regex(r'https?://player\.vimeo\.com/video/(?P<vimeo_id>[0-9]+)', 
launch_api['url'], 'vimeo_id') + return traverse_obj(launch_api, ('vendor_data', 'content', ..., 'videoId'), get_all=False) + + +class CybraryIE(CybraryBaseIE): + _VALID_URL = r'https?://app.cybrary.it/immersive/(?P<enrollment>[0-9]+)/activity/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://app.cybrary.it/immersive/12487950/activity/63102', + 'md5': '9ae12d37e555cb2ed554223a71a701d0', + 'info_dict': { + 'id': '646609770', + 'ext': 'mp4', + 'title': 'Getting Started', + 'thumbnail': 'https://i.vimeocdn.com/video/1301817996-76a268f0c56cff18a5cecbbdc44131eb9dda0c80eb0b3a036_1280', + 'series_id': '63111', + 'uploader_url': 'https://vimeo.com/user30867300', + 'duration': 88, + 'uploader_id': 'user30867300', + 'series': 'Cybrary Orientation', + 'uploader': 'Cybrary', + 'chapter': 'Cybrary Orientation Series', + 'chapter_id': '63110' + }, + 'expected_warnings': ['No authenticators for vimeo'] + }, { + 'url': 'https://app.cybrary.it/immersive/12747143/activity/52686', + 'md5': '62f26547dccc59c44363e2a13d4ad08d', + 'info_dict': { + 'id': '445638073', + 'ext': 'mp4', + 'title': 'Azure Virtual Network IP Addressing', + 'thumbnail': 'https://i.vimeocdn.com/video/936667051-1647ace66c627d4a2382185e0dae8deb830309bfddd53f8b2367b2f91e92ed0e-d_1280', + 'series_id': '52733', + 'uploader_url': 'https://vimeo.com/user30867300', + 'duration': 426, + 'uploader_id': 'user30867300', + 'series': 'AZ-500: Microsoft Azure Security Technologies', + 'uploader': 'Cybrary', + 'chapter': 'Implement Network Security', + 'chapter_id': '52693' + }, + 'expected_warnings': ['No authenticators for vimeo'] + }] + + def _real_extract(self, url): + activity_id, enrollment_id = self._match_valid_url(url).group('id', 'enrollment') + course = self._call_api('enrollment', enrollment_id)['content'] + activity = traverse_obj(course, ('learning_modules', ..., 'activities', lambda _, v: int(activity_id) == v['id']), get_all=False) + + if activity.get('type') not in ['Video Activity', 'Lesson Activity']: + raise ExtractorError('The activity is not a video', expected=True) + + module = next((m for m in course.get('learning_modules') or [] + if int(activity_id) in traverse_obj(m, ('activities', ..., 'id') or [])), None) + + vimeo_id = self._get_vimeo_id(activity_id) + + return { + '_type': 'url_transparent', + 'series': traverse_obj(course, ('content_description', 'title')), + 'series_id': str_or_none(traverse_obj(course, ('content_description', 'id'))), + 'id': vimeo_id, + 'chapter': module.get('title'), + 'chapter_id': str_or_none(module.get('id')), + 'title': activity.get('title'), + 'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'http_headers': {'Referer': 'https://api.cybrary.it'}}) + } + + +class CybraryCourseIE(CybraryBaseIE): + _VALID_URL = r'https://app.cybrary.it/browse/course/(?P<id>[\w-]+)/?(?:$|[#?])' + _TESTS = [{ + 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies', + 'info_dict': { + 'id': 898, + 'title': 'AZ-500: Microsoft Azure Security Technologies', + 'description': 'md5:69549d379c0fc1dec92926d4e8b6fbd4' + }, + 'playlist_count': 59 + }, { + 'url': 'https://app.cybrary.it/browse/course/cybrary-orientation', + 'info_dict': { + 'id': 1245, + 'title': 'Cybrary Orientation', + 'description': 'md5:9e69ff66b32fe78744e0ad4babe2e88e' + }, + 'playlist_count': 4 + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + course = self._call_api('course', course_id) + enrollment_info = self._call_api('course_enrollment', course['id']) + + entries = [self.url_result( 
f'https://app.cybrary.it/immersive/{enrollment_info["id"]}/activity/{activity["id"]}') + for activity in traverse_obj(course, ('content_item', 'learning_modules', ..., 'activities', ...))] + + return self.playlist_result( + entries, + traverse_obj(course, ('content_item', 'id'), expected_type=str_or_none), + course.get('title'), course.get('short_description')) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 52279b985..457f4c2aa 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -345,6 +345,10 @@ from .curiositystream import ( CuriosityStreamSeriesIE, ) from .cwtv import CWTVIE +from .cybrary import ( + CybraryIE, + CybraryCourseIE +) from .daftsex import DaftsexIE from .dailymail import DailyMailIE from .dailymotion import ( -- cgit v1.2.3 From 18eac302a22a31b324c848dce997c34213a5199a Mon Sep 17 00:00:00 2001 From: Ha Tien Loi Date: Mon, 4 Apr 2022 14:29:35 +0700 Subject: [Imdb] Improve extractor (#3291) Closes #3283 Authored by: hatienl0i261299 --- yt_dlp/extractor/imdb.py | 64 ++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/yt_dlp/extractor/imdb.py b/yt_dlp/extractor/imdb.py index 24f1fde64..7eb66d821 100644 --- a/yt_dlp/extractor/imdb.py +++ b/yt_dlp/extractor/imdb.py @@ -7,9 +7,10 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + int_or_none, mimetype2ext, - parse_duration, qualities, + traverse_obj, try_get, url_or_none, ) @@ -28,6 +29,17 @@ class ImdbIE(InfoExtractor): 'title': 'No. 2', 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7', 'duration': 152, + 'thumbnail': r're:^https?://.+\.jpg', + } + }, { + 'url': 'https://www.imdb.com/video/vi3516832537', + 'info_dict': { + 'id': '3516832537', + 'ext': 'mp4', + 'title': 'Paul: U.S. 
Trailer #1', + 'description': 'md5:17fcc4fe11ec29b4399be9d4c5ef126c', + 'duration': 153, + 'thumbnail': r're:^https?://.+\.jpg', + } }, { 'url': 'http://www.imdb.com/video/_/vi2524815897', @@ -51,8 +63,13 @@ class ImdbIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - data = self._download_json( + webpage = self._download_webpage(f'https://www.imdb.com/video/vi{video_id}', video_id) + info = self._search_nextjs_data(webpage, video_id) + video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={}) + title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text')) + or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None) + or self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')) + data = video_info.get('playbackURLs') or try_get(self._download_json( 'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id, query={ 'key': base64.b64encode(json.dumps({ 'type': 'VIDEO_PLAYER', 'subType': 'FORCE_LEGACY', 'id': 'vi%s' % video_id, }).encode()).decode(), - })[0] - + }), lambda x: x[0]['videoLegacyEncodings']) quality = qualities(('SD', '480p', '720p', '1080p')) - formats = [] - for encoding in data['videoLegacyEncodings']: + formats, subtitles = [], {} + for encoding in data: if not encoding or not isinstance(encoding, dict): continue video_url = url_or_none(encoding.get('url')) if not video_url: continue ext = mimetype2ext(encoding.get( 'mimeType')) or determine_ext(video_url) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( video_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=1, m3u8_id='hls', fatal=False)) + preference=1, m3u8_id='hls', fatal=False) + subtitles = self._merge_subtitles(subtitles, subs) + formats.extend(fmts) continue - format_id = encoding.get('definition') + format_id = traverse_obj(encoding, ('displayName', 'value'), 'definition') formats.append({ 'format_id': format_id, 'url': video_url, 'ext': ext, 'quality': quality(format_id), }) self._sort_formats(formats) - webpage = self._download_webpage( - 'https://www.imdb.com/video/vi' + video_id, video_id) - video_metadata = self._parse_json(self._search_regex( - r'args\.push\(\s*({.+?})\s*\)\s*;', webpage, - 'video metadata'), video_id) - - video_info = video_metadata.get('VIDEO_INFO') - if video_info and isinstance(video_info, dict): - info = try_get( - video_info, lambda x: x[list(video_info.keys())[0]][0], dict) - else: - info = {} - - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage) or self._html_search_regex( - r'<title>(.+?)</title>', webpage, 'title', - default=None) or info['videoTitle'] - return { 'id': video_id, 'title': title, 'alt_title': info.get('videoSubTitle'), 'formats': formats, - 'description': info.get('videoDescription'), - 'thumbnail': url_or_none(try_get( - info, lambda x: x['videoSlate']['source'])), - 'duration': parse_duration(info.get('videoRuntime')), + 'description': try_get(video_info, lambda x: x['description']['value']), + 'thumbnail': url_or_none(try_get(video_info, lambda x: x['thumbnail']['url'])), + 'duration': int_or_none(try_get(video_info, lambda x: x['runtime']['value'])), + 'subtitles': subtitles, } -- cgit v1.2.3 From 5127e92a943b620a2f5c348e339facef0134fd9f Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 3 Apr 2022 20:17:59 +0530 Subject: Fix filepath sanitization in `--print-to-file` --- yt_dlp/YoutubeDL.py | 23 
++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 6a8e45b1a..4c43ac871 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1240,18 +1240,21 @@ class YoutubeDL(object): outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs) return self.escape_outtmpl(outtmpl) % info_dict - def _prepare_filename(self, info_dict, tmpl_type='default'): + def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None): + assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive' + if outtmpl is None: + outtmpl = self.outtmpl_dict.get(tmpl_type or 'default', self.outtmpl_dict['default']) try: - outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])) + outtmpl = self._outtmpl_expandpath(outtmpl) filename = self.evaluate_outtmpl(outtmpl, info_dict, True) if not filename: return None - if tmpl_type in ('default', 'temp'): + if tmpl_type in ('', 'temp'): final_ext, ext = self.params.get('final_ext'), info_dict.get('ext') if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'): filename = replace_extension(filename, ext, final_ext) - else: + elif tmpl_type: force_ext = OUTTMPL_TYPES[tmpl_type] if force_ext: filename = replace_extension(filename, force_ext, info_dict.get('ext')) @@ -1267,10 +1270,12 @@ class YoutubeDL(object): self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None - def prepare_filename(self, info_dict, dir_type='', warn=False): - """Generate the output filename.""" - - filename = self._prepare_filename(info_dict, dir_type or 'default') + def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False): + """Generate the output filename""" + if outtmpl: + assert not dir_type, 'outtmpl and dir_type are mutually exclusive' + dir_type = None + filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl) if not filename and dir_type not in ('', 'temp'): return '' @@ -2767,7 +2772,7 @@ class YoutubeDL(object): self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy)) for tmpl, file_tmpl in self.params['print_to_file'].get(key, []): - filename = self.evaluate_outtmpl(file_tmpl, info_dict) + filename = self.prepare_filename(info_dict, outtmpl=file_tmpl) tmpl = format_tmpl(tmpl) self.to_screen(f'[info] Writing {tmpl!r} to: {filename}') if self._ensure_dir_exists(filename): -- cgit v1.2.3 From 85e801a9dbc671f97af92aebea18170e6a384374 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 4 Apr 2022 14:56:02 +0530 Subject: Fallback to video-only format when selecting by extension Closes #3296 --- yt_dlp/YoutubeDL.py | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 4c43ac871..51d83bde0 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2188,7 +2188,7 @@ class YoutubeDL(object): yield merged_format else: - format_fallback, format_reverse, format_idx = False, True, 1 + format_fallback, seperate_fallback, format_reverse, format_idx = False, None, True, 1 mobj = re.match( r'(?Pbest|worst|b|w)(?Pvideo|audio|v|a)?(?P\*)?(?:\.(?P[1-9]\d*))?$', format_spec) @@ -2215,6 +2215,7 @@ class YoutubeDL(object): filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' elif format_spec in self._format_selection_exts['video']: filter_f = lambda f: 
f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none' + seperate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none' elif format_spec in self._format_selection_exts['storyboards']: filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none' else: @@ -2223,11 +2224,15 @@ class YoutubeDL(object): def selector_function(ctx): formats = list(ctx['formats']) matches = list(filter(filter_f, formats)) if filter_f is not None else formats - if format_fallback and ctx['incomplete_formats'] and not matches: - # for extractors with incomplete formats (audio only (soundcloud) - # or video only (imgur)) best/worst will fallback to - # best/worst {video,audio}-only format - matches = formats + if not matches: + if format_fallback and ctx['incomplete_formats']: + # for extractors with incomplete formats (audio only (soundcloud) + # or video only (imgur)) best/worst will fallback to + # best/worst {video,audio}-only format + matches = formats + elif seperate_fallback and not ctx['has_merged_format']: + # for compatibility with youtube-dl when there is no pre-merged format + matches = list(filter(seperate_fallback, formats)) matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1])) try: yield matches[format_idx - 1] @@ -2604,33 +2609,15 @@ class YoutubeDL(object): self.report_error(err, tb=False, is_error=False) continue - # While in format selection we may need to have an access to the original - # format set in order to calculate some metrics or do some processing. - # For now we need to be able to guess whether original formats provided - # by extractor are incomplete or not (i.e. whether extractor provides only - # video-only or audio-only formats) for proper formats selection for - # extractors with such incomplete formats (see - # https://github.com/ytdl-org/youtube-dl/pull/5556). - # Since formats may be filtered during format selection and may not match - # the original formats the results may be incorrect. Thus original formats - # or pre-calculated metrics should be passed to format selection routines - # as well. - # We will pass a context object containing all necessary additional data - # instead of just formats. - # This fixes incorrect format selection issue (see - # https://github.com/ytdl-org/youtube-dl/issues/10083). 
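Editorial aside, not part of the patch: the two flags that replace this comment can be exercised on a toy format list. The dicts below are hypothetical and carry only the `acodec`/`vcodec` keys the predicates actually inspect.

```python
formats = [
    {'format_id': '137', 'vcodec': 'avc1.640028', 'acodec': 'none'},  # video-only
    {'format_id': '140', 'vcodec': 'none', 'acodec': 'mp4a.40.2'},  # audio-only
]

# A format is "pre-merged" when it carries both an audio and a video codec
has_merged_format = any(
    'none' not in (f.get('acodec'), f.get('vcodec')) for f in formats)

# The list counts as incomplete only when every format is video-only,
# or every format is audio-only
incomplete_formats = (
    all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
    or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

print(has_merged_format, incomplete_formats)  # False False
```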
- incomplete_formats = ( - # All formats are video-only or - all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) - # all formats are audio-only - or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) - - ctx = { + formats_to_download = list(format_selector({ 'formats': formats, - 'incomplete_formats': incomplete_formats, - } - - formats_to_download = list(format_selector(ctx)) + 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), + 'incomplete_formats': ( + # All formats are video-only or + all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) + # all formats are audio-only + or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)), + })) if interactive_format_selection and not formats_to_download: self.report_error('Requested format is not available', tb=False, is_error=False) continue -- cgit v1.2.3 From 04f3fd2c8948621612d852f8f68ef549a484bfb6 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 4 Apr 2022 13:57:35 +0530 Subject: [cleanup] Use `_html_extract_title` --- CONTRIBUTING.md | 4 ++-- yt_dlp/extractor/adobeconnect.py | 2 +- yt_dlp/extractor/allocine.py | 6 ++---- yt_dlp/extractor/archiveorg.py | 3 +-- yt_dlp/extractor/asiancrush.py | 3 +-- yt_dlp/extractor/bbc.py | 5 ++--- yt_dlp/extractor/breitbart.py | 5 ++--- yt_dlp/extractor/callin.py | 2 +- yt_dlp/extractor/cbc.py | 6 +++--- yt_dlp/extractor/closertotruth.py | 3 +-- yt_dlp/extractor/common.py | 10 ++++------ yt_dlp/extractor/cspan.py | 2 +- yt_dlp/extractor/fivetv.py | 3 +-- yt_dlp/extractor/foxgay.py | 3 +-- yt_dlp/extractor/generic.py | 6 ++---- yt_dlp/extractor/glide.py | 4 +--- yt_dlp/extractor/hellporno.py | 3 +-- yt_dlp/extractor/huya.py | 3 +-- yt_dlp/extractor/imdb.py | 2 +- yt_dlp/extractor/infoq.py | 2 +- yt_dlp/extractor/iwara.py | 3 +-- yt_dlp/extractor/linkedin.py | 2 +- yt_dlp/extractor/miaopai.py | 3 +-- yt_dlp/extractor/mojvideo.py | 3 +-- yt_dlp/extractor/newgrounds.py | 6 ++---- yt_dlp/extractor/nhk.py | 4 +++- yt_dlp/extractor/playvid.py | 3 +-- yt_dlp/extractor/rule34video.py | 2 +- yt_dlp/extractor/senategov.py | 2 +- yt_dlp/extractor/sunporno.py | 3 +-- yt_dlp/extractor/thisav.py | 4 +--- yt_dlp/extractor/traileraddict.py | 3 +-- yt_dlp/extractor/varzesh3.py | 3 +-- yt_dlp/extractor/vshare.py | 3 +-- yt_dlp/extractor/vupload.py | 2 +- yt_dlp/extractor/weibo.py | 3 +-- yt_dlp/extractor/yahoo.py | 2 +- yt_dlp/extractor/youjizz.py | 3 +-- 38 files changed, 51 insertions(+), 80 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1897f73e0..ea1893d15 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -534,13 +534,13 @@ Extracting variables is acceptable for reducing code duplication and improving r Correct: ```python -title = self._html_search_regex(r'([^<]+)', webpage, 'title') +title = self._html_search_regex(r'

<h1>([^<]+)</h1>', webpage, 'title') ``` Incorrect: ```python -TITLE_RE = r'<title>([^<]+)</title>' +TITLE_RE = r'

<h1>([^<]+)</h1>
' # ...some lines of code... title = self._html_search_regex(TITLE_RE, webpage, 'title') ``` diff --git a/yt_dlp/extractor/adobeconnect.py b/yt_dlp/extractor/adobeconnect.py index e688dddcb..e2e6f93f3 100644 --- a/yt_dlp/extractor/adobeconnect.py +++ b/yt_dlp/extractor/adobeconnect.py @@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.+?)', webpage, 'title') + title = self._html_extract_title(webpage) qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1]) is_live = qs.get('isLive', ['false'])[0] == 'true' formats = [] diff --git a/yt_dlp/extractor/allocine.py b/yt_dlp/extractor/allocine.py index cd533acfc..403a277e9 100644 --- a/yt_dlp/extractor/allocine.py +++ b/yt_dlp/extractor/allocine.py @@ -7,6 +7,7 @@ from ..utils import ( int_or_none, qualities, remove_end, + strip_or_none, try_get, unified_timestamp, url_basename, @@ -102,10 +103,7 @@ class AllocineIE(InfoExtractor): video_id = display_id media_data = self._download_json( 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id) - title = remove_end( - self._html_search_regex( - r'(?s)(.+?)', webpage, 'title').strip(), - ' - AlloCiné') + title = remove_end(strip_or_none(self._html_extract_title(webpage), ' - AlloCiné')) for key, value in media_data['video'].items(): if not key.endswith('Path'): continue diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index b06ac74ae..2ab3c1beb 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -483,8 +483,7 @@ class YoutubeWebArchiveIE(InfoExtractor): regex), webpage, name, default='{}'), video_id, fatal=False) def _extract_webpage_title(self, webpage): - page_title = self._html_search_regex( - r'([^<]*)', webpage, 'title', default='') + page_title = self._html_extract_title(webpage, default='') # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. 
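An editorial aside on the comment above: the branding strip it describes can be checked in isolation with a made-up page title (hedged illustration, not part of the patch).

```python
import re

# Whichever alternative matches leaves the bare title in group 1 (prefix
# form) or group 2 (suffix form)
page_title = 'YouTube - BaW_jenozKc'
m = re.search(r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', page_title)
print(m.group(1) or m.group(2))  # BaW_jenozKc
```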
return self._html_search_regex( r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', diff --git a/yt_dlp/extractor/asiancrush.py b/yt_dlp/extractor/asiancrush.py index 75a632958..7f1940fca 100644 --- a/yt_dlp/extractor/asiancrush.py +++ b/yt_dlp/extractor/asiancrush.py @@ -181,8 +181,7 @@ class AsianCrushPlaylistIE(AsianCrushBaseIE): 'title', default=None) or self._og_search_title( webpage, default=None) or self._html_search_meta( 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'([^<]+)', webpage, 'title', fatal=False) + default=None) or self._html_extract_title(webpage) if title: title = re.sub(r'\s*\|\s*.+?$', '', title) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 823155730..29ad7ded7 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -906,9 +906,8 @@ class BBCIE(BBCCoUkIE): playlist_title = json_ld_info.get('title') if not playlist_title: - playlist_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(.+?)', webpage, 'playlist title', default=None) + playlist_title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'playlist title', default=None)) if playlist_title: playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py index f50f719dc..e029aa627 100644 --- a/yt_dlp/extractor/breitbart.py +++ b/yt_dlp/extractor/breitbart.py @@ -29,9 +29,8 @@ class BreitBartIE(InfoExtractor): self._sort_formats(formats) return { 'id': video_id, - 'title': self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(?s)(.*?)', webpage, 'video title'), + 'title': (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title')), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'age_limit': self._rta_search(webpage), diff --git a/yt_dlp/extractor/callin.py b/yt_dlp/extractor/callin.py index acf327ace..1f3b7cfff 100644 --- a/yt_dlp/extractor/callin.py +++ b/yt_dlp/extractor/callin.py @@ -54,7 +54,7 @@ class CallinIE(InfoExtractor): id = episode['id'] title = (episode.get('title') or self._og_search_title(webpage, fatal=False) - or self._html_search_regex('(.*?)', webpage, 'title')) + or self._html_extract_title(webpage)) url = episode['m3u8'] formats = self._extract_m3u8_formats(url, display_id, ext='ts') self._sort_formats(formats) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index ac1272f7b..fba8bf965 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -127,9 +127,9 @@ class CBCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - title = self._og_search_title(webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( - r'([^<]+)', webpage, 'title', fatal=False) + title = (self._og_search_title(webpage, default=None) + or self._html_search_meta('twitter:title', webpage, 'title', default=None) + or self._html_extract_title(webpage)) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] diff --git a/yt_dlp/extractor/closertotruth.py b/yt_dlp/extractor/closertotruth.py index 26243d52d..517e121e0 100644 --- a/yt_dlp/extractor/closertotruth.py +++ b/yt_dlp/extractor/closertotruth.py @@ 
-54,8 +54,7 @@ class CloserToTruthIE(InfoExtractor): r']+src=["\'].*?\b(?:partner_id|p)/(\d+)', webpage, 'kaltura partner_id') - title = self._search_regex( - r'(.+?)\s*\|\s*.+?', webpage, 'video title') + title = self._html_extract_title(webpage, 'video title') select = self._search_regex( r'(?s)]+id="select-version"[^>]*>(.+?)', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index af964c527..81688eb54 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1329,9 +1329,8 @@ class InfoExtractor(object): def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) - def _og_search_title(self, html, **kargs): - kargs.setdefault('fatal', False) - return self._og_search_property('title', html, **kargs) + def _og_search_title(self, html, *, fatal=False, **kargs): + return self._og_search_property('title', html, fatal=fatal, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): regexes = self._og_regexes('video') + self._og_regexes('video:url') @@ -1342,9 +1341,8 @@ class InfoExtractor(object): def _og_search_url(self, html, **kargs): return self._og_search_property('url', html, **kargs) - def _html_extract_title(self, html, name, **kwargs): - return self._html_search_regex( - r'(?s)(.*?)', html, name, **kwargs) + def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs): + return self._html_search_regex(r'(?s)([^<]+)', html, name, fatal=fatal, **kwargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): name = variadic(name) diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py index d29b58ba6..f51159bbe 100644 --- a/yt_dlp/extractor/cspan.py +++ b/yt_dlp/extractor/cspan.py @@ -278,7 +278,7 @@ class CSpanCongressIE(InfoExtractor): video_id, transform_source=js_to_json) title = (self._og_search_title(webpage, default=None) - or self._html_search_regex(r'(?s)(.*?)', webpage, 'video title')) + or self._html_extract_title(webpage, 'video title')) description = (self._og_search_description(webpage, default=None) or self._html_search_meta('description', webpage, 'description', default=None)) diff --git a/yt_dlp/extractor/fivetv.py b/yt_dlp/extractor/fivetv.py index be81fccb8..d6bebd19b 100644 --- a/yt_dlp/extractor/fivetv.py +++ b/yt_dlp/extractor/fivetv.py @@ -75,8 +75,7 @@ class FiveTVIE(InfoExtractor): r']+?href="([^"]+)"[^>]+?class="videoplayer"'], webpage, 'video url') - title = self._og_search_title(webpage, default=None) or self._search_regex( - r'([^<]+)', webpage, 'title') + title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) duration = int_or_none(self._og_search_property( 'video:duration', webpage, 'duration', default=None)) diff --git a/yt_dlp/extractor/foxgay.py b/yt_dlp/extractor/foxgay.py index 512a10645..1c53e0642 100644 --- a/yt_dlp/extractor/foxgay.py +++ b/yt_dlp/extractor/foxgay.py @@ -29,8 +29,7 @@ class FoxgayIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._html_search_regex( - r'([^<]+)', webpage, 'title'), ' - Foxgay.com') + title = remove_end(self._html_extract_title(webpage), ' - Foxgay.com') description = get_element_by_id('inf_tit', webpage) # The default user-agent with foxgay cookies leads to pages without videos diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 4a2e30158..65e803dd7 100644 --- a/yt_dlp/extractor/generic.py +++ 
b/yt_dlp/extractor/generic.py @@ -2873,10 +2873,8 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - video_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(?s)(.*?)', webpage, 'video title', - default='video') + video_title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title', default='video')) # Try to detect age limit automatically age_limit = self._rta_search(webpage) diff --git a/yt_dlp/extractor/glide.py b/yt_dlp/extractor/glide.py index d94dfbf09..12af859be 100644 --- a/yt_dlp/extractor/glide.py +++ b/yt_dlp/extractor/glide.py @@ -23,9 +23,7 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'(.+?)', webpage, - 'title', default=None) or self._og_search_title(webpage) + title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage) video_url = self._proto_relative_url(self._search_regex( r']+src=(["\'])(?P.+?)\1', webpage, 'video URL', default=None, diff --git a/yt_dlp/extractor/hellporno.py b/yt_dlp/extractor/hellporno.py index fae425103..92d32cdcc 100644 --- a/yt_dlp/extractor/hellporno.py +++ b/yt_dlp/extractor/hellporno.py @@ -38,8 +38,7 @@ class HellPornoIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - title = remove_end(self._html_search_regex( - r'([^<]+)', webpage, 'title'), ' - Hell Porno') + title = remove_end(self._html_extract_title(webpage), ' - Hell Porno') info = self._parse_html5_media_entries(url, webpage, display_id)[0] self._sort_formats(info['formats']) diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py index b81439682..4e96f22fa 100644 --- a/yt_dlp/extractor/huya.py +++ b/yt_dlp/extractor/huya.py @@ -66,8 +66,7 @@ class HuyaLiveIE(InfoExtractor): room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo']) if not room_info: raise ExtractorError('Can not extract the room info', expected=True) - title = room_info.get('roomName') or room_info.get('introduction') or self._html_search_regex( - r'([^<]+)', webpage, 'title') + title = room_info.get('roomName') or room_info.get('introduction') or self._html_extract_title(webpage) screen_type = room_info.get('screenType') live_source_type = room_info.get('liveSourceType') stream_info_list = stream_data['data'][0]['gameStreamInfoList'] diff --git a/yt_dlp/extractor/imdb.py b/yt_dlp/extractor/imdb.py index 7eb66d821..96cee2e2f 100644 --- a/yt_dlp/extractor/imdb.py +++ b/yt_dlp/extractor/imdb.py @@ -68,7 +68,7 @@ class ImdbIE(InfoExtractor): video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={}) title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text')) or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None) - or self._html_search_regex(r'(.+?)', webpage, 'title')) + or self._html_extract_title(webpage)) data = video_info.get('playbackURLs') or try_get(self._download_json( 'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id, query={ diff --git a/yt_dlp/extractor/infoq.py b/yt_dlp/extractor/infoq.py index 0a70a1fb4..347cc5154 100644 --- a/yt_dlp/extractor/infoq.py +++ b/yt_dlp/extractor/infoq.py @@ -115,7 +115,7 @@ class InfoQIE(BokeCCBaseIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'(.*?)', webpage, 'title') + 
video_title = self._html_extract_title(webpage) video_description = self._html_search_meta('description', webpage, 'description') if '/cn/' in url: diff --git a/yt_dlp/extractor/iwara.py b/yt_dlp/extractor/iwara.py index 254d98692..c0e01e352 100644 --- a/yt_dlp/extractor/iwara.py +++ b/yt_dlp/extractor/iwara.py @@ -76,8 +76,7 @@ class IwaraIE(InfoExtractor): 'age_limit': age_limit, } - title = remove_end(self._html_search_regex( - r'([^<]+)', webpage, 'title'), ' | Iwara') + title = remove_end(self._html_extract_title(webpage), ' | Iwara') thumbnail = self._html_search_regex( r'poster=[\'"]([^\'"]+)', webpage, 'thumbnail', default=None) diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index bf549e164..0f57bfa06 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -102,7 +102,7 @@ class LinkedInIE(LinkedInBaseIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'([^<]+)', webpage, 'title') + title = self._html_extract_title(webpage) description = clean_html(get_element_by_class('share-update-card__update-text', webpage)) like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage)) creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage))) diff --git a/yt_dlp/extractor/miaopai.py b/yt_dlp/extractor/miaopai.py index f9e35ac7f..cf0610bdf 100644 --- a/yt_dlp/extractor/miaopai.py +++ b/yt_dlp/extractor/miaopai.py @@ -24,8 +24,7 @@ class MiaoPaiIE(InfoExtractor): webpage = self._download_webpage( url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD}) - title = self._html_search_regex( - r'([^<]+)', webpage, 'title') + title = self._html_extract_title(webpage) thumbnail = self._html_search_regex( r']+class=(?P[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P[\'"])(?P[^\'"]+)(?P=q2)', webpage, 'thumbnail', fatal=False, group='url') diff --git a/yt_dlp/extractor/mojvideo.py b/yt_dlp/extractor/mojvideo.py index 0421f3f44..16d94052b 100644 --- a/yt_dlp/extractor/mojvideo.py +++ b/yt_dlp/extractor/mojvideo.py @@ -38,8 +38,7 @@ class MojvideoIE(InfoExtractor): r'([^<]*)', playerapi, 'error description', fatal=False) raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True) - title = self._html_search_regex( - r'([^<]+)', playerapi, 'title') + title = self._html_extract_title(playerapi) video_url = self._html_search_regex( r'([^<]+)', playerapi, 'video URL') thumbnail = self._html_search_regex( diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py index 1e1274ef0..6525a6d8a 100644 --- a/yt_dlp/extractor/newgrounds.py +++ b/yt_dlp/extractor/newgrounds.py @@ -106,8 +106,7 @@ class NewgroundsIE(InfoExtractor): uploader = None webpage = self._download_webpage(url, media_id) - title = self._html_search_regex( - r'(.+?)', webpage, 'title') + title = self._html_extract_title(webpage) media_url_string = self._search_regex( r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None) @@ -219,8 +218,7 @@ class NewgroundsPlaylistIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - title = self._search_regex( - r'([^>]+)', webpage, 'title', default=None) + title = self._html_extract_title(webpage, default=None) # cut left menu webpage = self._search_regex( diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 626c6379b..3b8efc3e6 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -309,7 +309,9 @@ class 
NhkForSchoolProgramListIE(InfoExtractor): webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id) - title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'

<h3>([^<]+?)とは?\s*</h3>
', webpage, 'title', fatal=False) + title = (self._og_search_title(webpage) + or self._html_extract_title(webpage) + or self._html_search_regex(r'

<h3>([^<]+?)とは?\s*</h3>
', webpage, 'title', fatal=False)) title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None description = self._html_search_regex( r'(?s)\s*

<p>[^<]+</p>
', diff --git a/yt_dlp/extractor/playvid.py b/yt_dlp/extractor/playvid.py index 4aef186ea..e1c406b6c 100644 --- a/yt_dlp/extractor/playvid.py +++ b/yt_dlp/extractor/playvid.py @@ -85,8 +85,7 @@ class PlayvidIE(InfoExtractor): # Extract title - should be in the flashvars; if not, look elsewhere if video_title is None: - video_title = self._html_search_regex( - r'(.*?)</title', webpage, 'title') + video_title = self._html_extract_title(webpage) return { 'id': video_id, diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py index 522d4ccd5..a602a9f33 100644 --- a/yt_dlp/extractor/rule34video.py +++ b/yt_dlp/extractor/rule34video.py @@ -49,7 +49,7 @@ class Rule34VideoIE(InfoExtractor): 'quality': quality, }) - title = self._html_search_regex(r'<title>([^<]+)', webpage, 'title') + title = self._html_extract_title(webpage) thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None) duration = self._html_search_regex(r'"icon-clock">\s+((?:\d+:?)+)', webpage, 'duration', default=None) diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py index 6f4240422..b295184a1 100644 --- a/yt_dlp/extractor/senategov.py +++ b/yt_dlp/extractor/senategov.py @@ -112,7 +112,7 @@ class SenateISVPIE(InfoExtractor): if smuggled_data.get('force_title'): title = smuggled_data['force_title'] else: - title = self._html_search_regex(r'([^<]+)', webpage, video_id) + title = self._html_extract_title(webpage) poster = qs.get('poster') thumbnail = poster[0] if poster else None diff --git a/yt_dlp/extractor/sunporno.py b/yt_dlp/extractor/sunporno.py index 68051169b..59b77bf92 100644 --- a/yt_dlp/extractor/sunporno.py +++ b/yt_dlp/extractor/sunporno.py @@ -36,8 +36,7 @@ class SunPornoIE(InfoExtractor): webpage = self._download_webpage( 'http://www.sunporno.com/videos/%s' % video_id, video_id) - title = self._html_search_regex( - r'([^<]+)', webpage, 'title') + title = self._html_extract_title(webpage) description = self._html_search_meta( 'description', webpage, 'description') thumbnail = self._html_search_regex( diff --git a/yt_dlp/extractor/thisav.py b/yt_dlp/extractor/thisav.py index 4af286e6d..6bb00b3ab 100644 --- a/yt_dlp/extractor/thisav.py +++ b/yt_dlp/extractor/thisav.py @@ -37,9 +37,7 @@ class ThisAVIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = remove_end(self._html_search_regex( - r'([^<]+)', webpage, 'title'), - ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') + title = remove_end(self._html_extract_title(webpage), ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站') video_url = self._html_search_regex( r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None) if video_url: diff --git a/yt_dlp/extractor/traileraddict.py b/yt_dlp/extractor/traileraddict.py index 10100fbcf..514f4793e 100644 --- a/yt_dlp/extractor/traileraddict.py +++ b/yt_dlp/extractor/traileraddict.py @@ -24,8 +24,7 @@ class TrailerAddictIE(InfoExtractor): name = mobj.group('movie') + '/' + mobj.group('trailer_name') webpage = self._download_webpage(url, name) - title = self._search_regex(r'(.+?)', - webpage, 'video title').replace(' - Trailer Addict', '') + title = self._html_extract_title(webpage, 'video title').replace(' - Trailer Addict', '') view_count_str = self._search_regex( r'([0-9,.]+)', webpage, 'view count', fatal=False) diff --git a/yt_dlp/extractor/varzesh3.py b/yt_dlp/extractor/varzesh3.py index 81313dc9d..32655b96d 100644 --- a/yt_dlp/extractor/varzesh3.py +++ b/yt_dlp/extractor/varzesh3.py @@ -42,8 +42,7 @@ 
class Varzesh3IE(InfoExtractor): video_url = self._search_regex( r'<source[^>]+src="([^"]+)"', webpage, 'video url') - title = remove_start(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ') + title = remove_start(self._html_extract_title(webpage), 'ویدیو ورزش 3 | ') description = self._html_search_regex( r'(?s)
<div class="matn">(.+?)</div>
', diff --git a/yt_dlp/extractor/vshare.py b/yt_dlp/extractor/vshare.py index c631ac1fa..b4874ac39 100644 --- a/yt_dlp/extractor/vshare.py +++ b/yt_dlp/extractor/vshare.py @@ -50,8 +50,7 @@ class VShareIE(InfoExtractor): 'https://vshare.io/v/%s/width-650/height-430/1' % video_id, video_id, headers={'Referer': url}) - title = self._html_search_regex( - r'([^<]+)', webpage, 'title') + title = self._html_extract_title(webpage) title = title.split(' - ')[0] error = self._html_search_regex( diff --git a/yt_dlp/extractor/vupload.py b/yt_dlp/extractor/vupload.py index 2229a6591..b561f63f7 100644 --- a/yt_dlp/extractor/vupload.py +++ b/yt_dlp/extractor/vupload.py @@ -28,7 +28,7 @@ class VuploadIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.+?)', webpage, 'title') + title = self._html_extract_title(webpage) video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json) formats = [] for source in video_json: diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py index 621df5b54..dafa2af3b 100644 --- a/yt_dlp/extractor/weibo.py +++ b/yt_dlp/extractor/weibo.py @@ -73,8 +73,7 @@ class WeiboIE(InfoExtractor): webpage = self._download_webpage( url, video_id, note='Revisiting webpage') - title = self._html_search_regex( - r'(.+?)', webpage, 'title') + title = self._html_extract_title(webpage) video_formats = compat_parse_qs(self._search_regex( r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) diff --git a/yt_dlp/extractor/yahoo.py b/yt_dlp/extractor/yahoo.py index 6cf3b1de2..20504de2c 100644 --- a/yt_dlp/extractor/yahoo.py +++ b/yt_dlp/extractor/yahoo.py @@ -533,7 +533,7 @@ class YahooJapanNewsIE(InfoExtractor): title = self._html_search_meta( ['og:title', 'twitter:title'], webpage, 'title', default=None - ) or self._html_search_regex('([^<]+)', webpage, 'title') + ) or self._html_extract_title(webpage) if display_id == host: # Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...) 
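Editorial aside between hunks: every hunk in this commit collapses a hand-rolled `<title>` regex into the shared helper. As a rough, self-contained sketch of what that helper boils down to (an assumed simplification; the real `_html_extract_title` delegates to `_html_search_regex`, which layers on the usual `fatal`/`default` handling):

```python
import re

def html_extract_title(html, fatal=False):
    # Text of the first <title> element; [^<]+ stops at any nested markup
    m = re.search(r'(?s)<title\b[^>]*>([^<]+)</title>', html)
    if m:
        return m.group(1).strip()
    if fatal:
        raise ValueError('Unable to extract title')
    return None

print(html_extract_title('<html><head><title>No. 2 - Site</title></head></html>'))
# prints: No. 2 - Site
```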
diff --git a/yt_dlp/extractor/youjizz.py b/yt_dlp/extractor/youjizz.py index 5f5fbf21c..111623ffe 100644 --- a/yt_dlp/extractor/youjizz.py +++ b/yt_dlp/extractor/youjizz.py @@ -36,8 +36,7 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'(.+?)', webpage, 'title') + title = self._html_extract_title(webpage) formats = [] -- cgit v1.2.3 From 5fa3c9a88f597625296981a4a26be723e65d4842 Mon Sep 17 00:00:00 2001 From: Ha Tien Loi Date: Mon, 4 Apr 2022 17:07:07 +0700 Subject: [TikTok] Fix URLs with user id (#3295) Closes #3243 Authored by: hatienl0i261299 --- yt_dlp/extractor/tiktok.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 56cc2dcc6..6f8c32882 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -263,7 +263,7 @@ class TikTokBaseIE(InfoExtractor): return { 'id': aweme_id, - 'title': aweme_detail['desc'], + 'title': aweme_detail.get('desc'), 'description': aweme_detail['desc'], 'view_count': int_or_none(stats_info.get('play_count')), 'like_count': int_or_none(stats_info.get('digg_count')), @@ -457,6 +457,30 @@ class TikTokIE(TikTokBaseIE): 'comment_count': int, }, 'expected_warnings': ['Video not available'] + }, { + # Video without title and description + 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694', + 'info_dict': { + 'id': '7059698374567611694', + 'ext': 'mp4', + 'title': 'N/A', + 'description': '', + 'uploader': 'pokemonlife22', + 'creator': 'Pokemon', + 'uploader_id': '6820838815978423302', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W', + 'track': 'original sound', + 'timestamp': 1643714123, + 'duration': 6, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'upload_date': '20220201', + 'artist': 'Pokemon', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + 'expected_warnings': ['Video not available'] }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', @@ -521,6 +545,15 @@ class TikTokUserIE(TikTokBaseIE): 'thumbnail': r're:https://.+_1080x1080\.webp' }, 'expected_warnings': ['Retrying'] + }, { + 'url': 'https://www.tiktok.com/@6820838815978423302', + 'playlist_mincount': 5, + 'info_dict': { + 'id': '6820838815978423302', + 'title': '6820838815978423302', + 'thumbnail': r're:https://.+_1080x1080\.webp' + }, + 'expected_warnings': ['Retrying'] }, { 'url': 'https://www.tiktok.com/@meme', 'playlist_mincount': 593, @@ -593,7 +626,7 @@ class TikTokUserIE(TikTokBaseIE): webpage = self._download_webpage(url, user_name, headers={ 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)' }) - user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID') + user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID', default=None) or user_name videos = LazyList(self._video_entries_api(webpage, user_id, user_name)) thumbnail = traverse_obj(videos, (0, 'author', 'avatar_larger', 'url_list', 0)) -- cgit v1.2.3 From f4d706a931bdf2534c23353b5843d3220efe6f89 Mon Sep 17 00:00:00 2001 From: Jeff Huffman Date: Tue, 5 Apr 2022 03:51:12 -0700 Subject: [crunchyroll:playlist] Implement beta API (#2955) Closes #3121, #2930 Authored by: tejing1 --- yt_dlp/extractor/crunchyroll.py | 200 
+++++++++++++++++++++++++++++++--------- 1 file changed, 155 insertions(+), 45 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index bb4ae12f5..7edb645f8 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -86,6 +86,22 @@ class CrunchyrollBaseIE(InfoExtractor): if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): raise ExtractorError('Login succeeded but did not set etp_rt cookie') + # Beta-specific, but needed for redirects + def _get_beta_embedded_json(self, webpage, display_id): + initial_state = self._parse_json(self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) + app_config = self._parse_json(self._search_regex( + r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) + return initial_state, app_config + + def _redirect_to_beta(self, webpage, iekey, video_id): + if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + raise ExtractorError('Received a beta page from non-beta url when not logged in.') + initial_state, app_config = self._get_beta_embedded_json(webpage, video_id) + url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname'] + self.to_screen(f'{video_id}: Redirected to beta site - {url}') + return self.url_result(f'{url}', iekey, video_id) + @staticmethod def _add_skip_wall(url): parsed_url = compat_urlparse.urlparse(url) @@ -406,6 +422,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage = self._download_webpage( self._add_skip_wall(webpage_url), video_id, headers=self.geo_verification_headers()) + if re.search(r'
<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id) note_m = self._html_search_regex( r'
<div class="showmedia-trailer-notice">(.+?)</div>
', webpage, 'trailer-notice', default='') @@ -670,6 +688,8 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): # https:// gives a 403, but http:// does not self._add_skip_wall(url).replace('https://', 'http://'), show_id, headers=self.geo_verification_headers()) + if re.search(r'
<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id) title = self._html_search_meta('name', webpage, default=None) episode_re = r'
<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { 'id': '696363', 'ext': 'mp4', 'timestamp': 1459610100, 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', 'uploader': 'Toei Animation', 'title': 'World Trigger Episode 73 – To the Future', 'upload_date': '20160402', + 'episode_number': 73, + 'series': 'World Trigger', + 'average_rating': 4.9, + 'episode': 'To the Future', + 'season': 'World Trigger', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg', + 'season_number': 1, }, 'params': {'skip_download': 'm3u8'}, 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn', + 'info_dict': { + 'id': '648781', + 'ext': 'mp4', + 'episode_number': 1, + 'timestamp': 1389173400, + 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'description': 'md5:5579d1a0355cc618558ba23d27067a62', + 'uploader': 'TBS', + 'episode': 'Wicked Lord Shingan... Reborn', + 'average_rating': 4.9, + 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... Reborn', + 'season_number': 2, + 'upload_date': '20140108', + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/', + 'only_matching': True, }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id') - webpage = self._download_webpage(url, display_id) - initial_state = self._parse_json( - self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), - display_id) - episode_data = initial_state['content']['byId'][internal_id] if not self._get_cookies(url).get('etp_rt'): - video_id = episode_data['external_id'].split('.')[1] - series_id = episode_data['episode_metadata']['series_slug_title'] - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', - CrunchyrollIE.ie_key(), video_id) - - app_config = self._parse_json( - self._search_regex(r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), - display_id) - client_id = app_config['cxApiParams']['accountAuthClientId'] - api_domain = app_config['cxApiParams']['apiDomain'] - basic_token = str(base64.b64encode(('%s:' % client_id).encode('ascii')), 'ascii') - auth_response = self._download_json( - f'{api_domain}/auth/v1/token', display_id, - note='Authenticating with cookie', - headers={ - 'Authorization': 'Basic ' + basic_token - }, data='grant_type=etp_rt_cookie'.encode('ascii')) - policy_response = self._download_json( - f'{api_domain}/index/v2', display_id, - note='Retrieving signed policy', - headers={ - 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] - }) - bucket = policy_response['cms']['bucket'] - params = { - 'Policy': policy_response['cms']['policy'], - 'Signature': policy_response['cms']['signature'], - 'Key-Pair-Id': policy_response['cms']['key_pair_id'] - } - locale = traverse_obj(initial_state, ('localization',
'locale')) - if locale: - params['locale'] = locale + return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + episode_response = self._download_json( f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, note='Retrieving episode metadata', @@ -827,9 +892,9 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): } -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): +class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:playlist:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { 'id': 'girl-friend-beta', 'title': 'Girl Friend BETA', }, 'playlist_mincount': 10, +}, { + 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--', + 'info_dict': { + 'id': 'love-chunibyo-other-delusions-heart-throb-', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -', + }, + 'playlist_mincount': 10, }, { 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', 'only_matching': True, }] def _real_extract(self, url): - lang, series_id = self._match_valid_url(url).group('lang', 'id') - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}', - CrunchyrollShowPlaylistIE.ie_key(), series_id) + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + + if not self._get_cookies(url).get('etp_rt'): + return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + + series_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, + note='Retrieving series metadata', query=params) + + seasons_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, + note='Retrieving season list', query=params) + + def entries(): + for season in seasons_response['items']: + episodes_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, + note=f'Retrieving episode list for {season.get("slug_title")}', query=params) + for episode in episodes_response['items']: + episode_id = episode['id'] + episode_display_id = episode['slug_title'] + yield { + '_type': 'url', + 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', + 'ie_key': CrunchyrollBetaIE.ie_key(), + 'id': episode_id, + 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), + 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), + 'duration': float_or_none(episode.get('duration_ms'), 1000), + 'series': episode.get('series_title'), + 'series_id': episode.get('series_id'), + 'season': episode.get('season_title'), + 'season_id': episode.get('season_id'), + 'season_number': episode.get('season_number'), + 'episode': episode.get('title'), + 'episode_number': episode.get('sequence_number') + } + + return self.playlist_result(entries(), internal_id, series_response.get('title')) -- cgit v1.2.3 From 0a8a7e68fabf6fc9387f270301e51225ac349b00 Mon Sep 17 00:00:00 2001 From: Teemu
Ikonen Date: Tue, 5 Apr 2022 15:15:47 +0300 Subject: [ruutu] Detect embeds (#3294) Authored by: tpikonen --- yt_dlp/extractor/generic.py | 26 +++++++++++++++++++++++++- yt_dlp/extractor/ruutu.py | 15 +++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 65e803dd7..2c503e581 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -149,6 +149,7 @@ from .blogger import BloggerIE from .mainstreaming import MainStreamingIE from .gfycat import GfycatIE from .panopto import PanoptoBaseIE +from .ruutu import RuutuIE class GenericIE(InfoExtractor): @@ -2511,7 +2512,24 @@ class GenericIE(InfoExtractor): 'id': 'insert-a-quiz-into-a-panopto-video' }, 'playlist_count': 1 - } + }, + { + # Ruutu embed + 'url': 'https://www.nelonen.fi/ohjelmat/madventures-suomi/2160731-riku-ja-tunna-lahtevat-peurajahtiin-tv-sta-tutun-biologin-kanssa---metsastysreissu-huipentuu-kasvissyojan-painajaiseen', + 'md5': 'a2513a98d3496099e6eced40f7e6a14b', + 'info_dict': { + 'id': '4044426', + 'ext': 'mp4', + 'title': 'Riku ja Tunna lähtevät peurajahtiin tv:stä tutun biologin kanssa – metsästysreissu huipentuu kasvissyöjän painajaiseen!', + 'thumbnail': r're:^https?://.+\.jpg$', + 'duration': 108, + 'series' : 'Madventures Suomi', + 'description': 'md5:aa55b44bd06a1e337a6f1d0b46507381', + 'categories': ['Matkailu', 'Elämäntyyli'], + 'age_limit': 0, + 'upload_date': '20220308', + }, + }, ] def report_following_redirect(self, new_url): @@ -3737,6 +3755,12 @@ class GenericIE(InfoExtractor): panopto_urls = PanoptoBaseIE._extract_urls(webpage) if panopto_urls: return self.playlist_from_matches(panopto_urls, video_id, video_title) + + # Look for Ruutu embeds + ruutu_url = RuutuIE._extract_url(webpage) + if ruutu_url: + return self.url_result(ruutu_url, RuutuIE) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/yt_dlp/extractor/ruutu.py b/yt_dlp/extractor/ruutu.py index d9cf39d71..5a30e3360 100644 --- a/yt_dlp/extractor/ruutu.py +++ b/yt_dlp/extractor/ruutu.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( @@ -8,6 +11,8 @@ from ..utils import ( ExtractorError, find_xpath_attr, int_or_none, + traverse_obj, + try_call, unified_strdate, url_or_none, xpath_attr, @@ -123,6 +128,16 @@ class RuutuIE(InfoExtractor): ] _API_BASE = 'https://gatling.nelonenmedia.fi' + @classmethod + def _extract_url(cls, webpage): + settings = try_call( + lambda: json.loads(re.search( + r'jQuery\.extend\(Drupal\.settings, ({.+?})\);', webpage).group(1), strict=False)) + video_id = traverse_obj(settings, ( + 'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value')) + if video_id: + return f'http://www.ruutu.fi/video/{video_id}' + def _real_extract(self, url): video_id = self._match_id(url) -- cgit v1.2.3 From a44ca5a470e09b5170fc9c3a46733f050fadbfae Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 27 Mar 2022 07:50:43 +0530 Subject: [cleanup] Misc fixes Closes https://github.com/yt-dlp/yt-dlp/pull/3213, Closes https://github.com/yt-dlp/yt-dlp/pull/3117 Related: https://github.com/yt-dlp/yt-dlp/issues/3146#issuecomment-1077323114, https://github.com/yt-dlp/yt-dlp/pull/3277#discussion_r841019671, https://github.com/yt-dlp/yt-dlp/commit/a825ffbffa0bea322e3ccb44c6f8e01d8d9572fb#commitcomment-68538986, 
https://github.com/yt-dlp/yt-dlp/issues/2360, https://github.com/yt-dlp/yt-dlp/commit/5fa3c9a88f597625296981a4a26be723e65d4842#r70393519, https://github.com/yt-dlp/yt-dlp/commit/5fa3c9a88f597625296981a4a26be723e65d4842#r70393254 --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 2 +- .github/ISSUE_TEMPLATE/2_site_support_request.yml | 2 +- .github/ISSUE_TEMPLATE/3_site_feature_request.yml | 2 +- .github/ISSUE_TEMPLATE/4_bug_report.yml | 2 +- .github/ISSUE_TEMPLATE/5_feature_request.yml | 23 +++++++- .github/ISSUE_TEMPLATE/6_question.yml | 2 +- .github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml | 2 +- .../ISSUE_TEMPLATE_tmpl/2_site_support_request.yml | 2 +- .../ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml | 2 +- .github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml | 2 +- .github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml | 23 +++++++- .github/ISSUE_TEMPLATE_tmpl/6_question.yml | 2 +- .gitignore | 3 ++ README.md | 61 ++++++++++++---------- yt_dlp/YoutubeDL.py | 9 ++-- yt_dlp/__init__.py | 2 +- yt_dlp/cookies.py | 11 ++-- yt_dlp/downloader/fragment.py | 24 ++++----- yt_dlp/downloader/http.py | 20 ++----- yt_dlp/extractor/bilibili.py | 6 +-- yt_dlp/extractor/canvas.py | 8 --- yt_dlp/extractor/common.py | 8 +-- yt_dlp/extractor/dropout.py | 4 +- yt_dlp/extractor/facebook.py | 6 ++- yt_dlp/extractor/generic.py | 6 +-- yt_dlp/extractor/limelight.py | 2 +- yt_dlp/extractor/niconico.py | 2 +- yt_dlp/extractor/tiktok.py | 11 ++-- yt_dlp/extractor/yandexvideo.py | 1 - yt_dlp/extractor/youtube.py | 15 ++++-- yt_dlp/options.py | 14 ++--- yt_dlp/postprocessor/ffmpeg.py | 13 +++-- yt_dlp/utils.py | 19 ++++--- 33 files changed, 184 insertions(+), 127 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index 4d9187143..c671a1910 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -11,7 +11,7 @@ body: options: - label: I'm reporting a broken site required: true - - label: I've verified that I'm running yt-dlp version **2022.03.08.1**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + - label: I've verified that I'm running yt-dlp version **2022.03.08.1** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are alive and playable in a browser required: true diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index cff73b555..5ff022a04 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -11,7 +11,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2022.03.08.1**. 
([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + - label: I've verified that I'm running yt-dlp version **2022.03.08.1** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are alive and playable in a browser required: true diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 44012044a..acdfeb038 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -11,7 +11,7 @@ body: options: - label: I'm reporting a site feature request required: true - - label: I've verified that I'm running yt-dlp version **2022.03.08.1**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + - label: I've verified that I'm running yt-dlp version **2022.03.08.1** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are alive and playable in a browser required: true diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index d93380725..a4a038fc8 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -11,7 +11,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2022.03.08.1**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + - label: I've verified that I'm running yt-dlp version **2022.03.08.1** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are alive and playable in a browser required: true diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index 51987d533..1bdafc441 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -13,7 +13,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2022.03.08.1**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + - label: I've verified that I'm running yt-dlp version **2022.03.08.1** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. DO NOT post duplicates required: true @@ -30,3 +30,24 @@ body: placeholder: WRITE DESCRIPTION HERE validations: required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + If your feature request involves an existing yt-dlp command, provide the complete verbose output of that command. + Add the `-vU` flag to **your** command line you run yt-dlp with (`yt-dlp -vU `), copy the WHOLE output and insert it below. 
+ It should look similar to this: + placeholder: | + [debug] Command-line config: ['-vU', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version 2021.12.01 (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (2021.12.01) + + render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 061158ed3..030d2cfe7 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -35,7 +35,7 @@ body: attributes: label: Verbose log description: | - If your question involes a yt-dlp command, provide the complete verbose output of that command. + If your question involves a yt-dlp command, provide the complete verbose output of that command. Add the `-vU` flag to **your** command line you run yt-dlp with (`yt-dlp -vU `), copy the WHOLE output and insert it below. It should look similar to this: placeholder: | diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml index fd6435ba6..422af9c72 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.yml @@ -11,7 +11,7 @@ body: options: - label: I'm reporting a broken site required: true - - label: I've verified that I'm running yt-dlp version **%(version)s**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are alive and playable in a browser required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml index f380c1331..fec50559a 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.yml @@ -11,7 +11,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **%(version)s**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are alive and playable in a browser required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml index 88b1f1217..266408c19 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.yml @@ -11,7 +11,7 @@ body: options: - label: I'm reporting a site feature request required: true - - label: I've verified that I'm running yt-dlp version **%(version)s**. 
([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are alive and playable in a browser required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml index 03a6ba551..8b49b6385 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.yml @@ -11,7 +11,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **%(version)s**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are alive and playable in a browser required: true diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml index eb5d3d634..1f33f09dc 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.yml @@ -13,7 +13,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **%(version)s**. ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) + - label: I've verified that I'm running yt-dlp version **%(version)s** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues including closed ones. DO NOT post duplicates required: true @@ -30,3 +30,24 @@ body: placeholder: WRITE DESCRIPTION HERE validations: required: true + - type: textarea + id: log + attributes: + label: Verbose log + description: | + If your feature request involves an existing yt-dlp command, provide the complete verbose output of that command. + Add the `-vU` flag to **your** command line you run yt-dlp with (`yt-dlp -vU `), copy the WHOLE output and insert it below. + It should look similar to this: + placeholder: | + [debug] Command-line config: ['-vU', 'http://www.youtube.com/watch?v=BaW_jenozKc'] + [debug] Portable config file: yt-dlp.conf + [debug] Portable config: ['-i'] + [debug] Encodings: locale cp1252, fs utf-8, stdout utf-8, stderr utf-8, pref cp1252 + [debug] yt-dlp version 2021.12.01 (exe) + [debug] Python version 3.8.8 (CPython 64bit) - Windows-10-10.0.19041-SP0 + [debug] exe versions: ffmpeg 3.0.1, ffprobe 3.0.1 + [debug] Optional libraries: Cryptodome, keyring, mutagen, sqlite, websockets + [debug] Proxy map: {} + yt-dlp is up to date (2021.12.01) + + render: shell diff --git a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml index 061158ed3..030d2cfe7 100644 --- a/.github/ISSUE_TEMPLATE_tmpl/6_question.yml +++ b/.github/ISSUE_TEMPLATE_tmpl/6_question.yml @@ -35,7 +35,7 @@ body: attributes: label: Verbose log description: | - If your question involes a yt-dlp command, provide the complete verbose output of that command. + If your question involves a yt-dlp command, provide the complete verbose output of that command. 
        Add the `-vU` flag to **your** command line you run yt-dlp with (`yt-dlp -vU `), copy the WHOLE output and insert it below.
        It should look similar to this:
      placeholder: |
diff --git a/.gitignore b/.gitignore
index fd51ad66e..c815538e8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -116,3 +116,6 @@ yt-dlp.zip
 ytdlp_plugins/extractor/*
 !ytdlp_plugins/extractor/__init__.py
 !ytdlp_plugins/extractor/sample.py
+ytdlp_plugins/postprocessor/*
+!ytdlp_plugins/postprocessor/__init__.py
+!ytdlp_plugins/postprocessor/sample.py
diff --git a/README.md b/README.md
index a75441e35..6b4f39b9e 100644
--- a/README.md
+++ b/README.md
@@ -125,6 +125,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu
 * The options `--auto-number` (`-A`), `--title` (`-t`) and `--literal` (`-l`), no longer work. See [removed options](#Removed) for details
+* yt-dlp stores config files in slightly different locations to youtube-dl. See [configuration](#configuration) for a list of correct locations
 * The default [output template](#output-template) is `%(title)s [%(id)s].%(ext)s`. There is no real reason for this change. This was changed before yt-dlp was ever made public and now there are no plans to change it back to `%(title)s-%(id)s.%(ext)s`. Instead, you may use `--compat-options filename`
 * The default [format sorting](#sorting-formats) is different from youtube-dl and prefers higher resolution and better codecs rather than higher bitrates. You can use the `--format-sort` option to change this to any order you prefer, or use `--compat-options format-sort` to use youtube-dl's sorting order
 * The default format selector is `bv*+ba/b`. This means that if a combined video + audio format that is better than the best video-only format is found, the former will be preferred. Use `-f bv+ba/b` or `--compat-options format-spec` to revert this
@@ -431,24 +432,24 @@ You can also fork the project on github and run your fork's [build workflow](.gi
     --dateafter DATE                Download only videos uploaded on or after
                                     this date. The date formats accepted is the
                                     same as --date
-    --match-filter FILTER           Generic video filter. Any field (see
+    --match-filters FILTER          Generic video filter. Any field (see
                                     "OUTPUT TEMPLATE") can be compared with a
                                     number or a string using the operators
                                     defined in "Filtering formats". You can
                                     also simply specify a field to match if the
-                                    field is present and "!field" to check if
-                                    the field is not present. In addition,
-                                    Python style regular expression matching
-                                    can be done using "~=", and multiple
-                                    filters can be checked with "&". Use a "\"
-                                    to escape "&" or quotes if needed. Eg:
-                                    --match-filter "!is_live & like_count>?100
-                                    & description~='(?i)\bcats \& dogs\b'"
-                                    matches only videos that are not live, has
-                                    a like count more than 100 (or the like
-                                    field is not available), and also has a
-                                    description that contains the phrase "cats
-                                    & dogs" (ignoring case)
+                                    field is present, use "!field" to check if
+                                    the field is not present, and "&" to check
+                                    multiple conditions. Use a "\" to escape
+                                    "&" or quotes if needed. If used multiple
+                                    times, the filter matches if at least one of
+                                    the conditions is met.
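When embedding yt-dlp, the same filter syntax is exposed through the Python API. A minimal sketch, assuming `yt_dlp.utils.match_filter_func` keeps its current single-string signature:

```python
from yt_dlp import YoutubeDL
from yt_dlp.utils import match_filter_func

ydl_opts = {
    # Equivalent to: --match-filters "!is_live & like_count>?100"
    # "!is_live" skips live streams; "like_count>?100" keeps videos whose
    # like_count exceeds 100 or is missing ("?" lets the comparison pass
    # when the field is unavailable)
    'match_filter': match_filter_func('!is_live & like_count>?100'),
}

with YoutubeDL(ydl_opts) as ydl:
    ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
```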
Eg: --match-filter + !is_live --match-filter "like_count>?100 & + description~='(?i)\bcats \& dogs\b'" + matches only videos that are not live OR + those that have a like count more than 100 + (or the like field is not available) and + also has a description that contains the + phrase "cats & dogs" (ignoring case) --no-match-filter Do not use generic video filter (default) --no-playlist Download only the video, if the URL refers to a video and a playlist @@ -840,15 +841,17 @@ You can also fork the project on github and run your fork's [build workflow](.gi (requires ffmpeg and ffprobe) --audio-format FORMAT Specify audio format to convert the audio to when -x is used. Currently supported - formats are: best (default) or one of - best|aac|flac|mp3|m4a|opus|vorbis|wav|alac - --audio-quality QUALITY Specify ffmpeg audio quality, insert a + formats are: best (default) or one of aac, + flac, mp3, m4a, opus, vorbis, wav, alac + --audio-quality QUALITY Specify ffmpeg audio quality to use when + converting the audio with -x. Insert a value between 0 (best) and 10 (worst) for VBR or a specific bitrate like 128K (default 5) --remux-video FORMAT Remux the video into another container if - necessary (currently supported: mp4|mkv|flv - |webm|mov|avi|mp3|mka|m4a|ogg|opus). If + necessary (currently supported: mp4, mkv, + flv, webm, mov, avi, mka, ogg, aac, flac, + mp3, m4a, opus, vorbis, wav, alac). If target container does not support the video/audio codec, remuxing will fail. You can specify multiple rules; Eg. @@ -948,10 +951,10 @@ You can also fork the project on github and run your fork's [build workflow](.gi option can be used multiple times --no-exec Remove any previously defined --exec --convert-subs FORMAT Convert the subtitles to another format - (currently supported: srt|vtt|ass|lrc) + (currently supported: srt, vtt, ass, lrc) (Alias: --convert-subtitles) --convert-thumbnails FORMAT Convert the thumbnails to another format - (currently supported: jpg|png|webp) + (currently supported: jpg, png, webp) --split-chapters Split video into multiple files based on internal chapters. The "chapter:" prefix can be used with "--paths" and "--output" @@ -1638,7 +1641,11 @@ $ yt-dlp --parse-metadata "description:Artist - (?P.+)" # Set title as "Series name S01E05" $ yt-dlp --parse-metadata "%(series)s S%(season_number)02dE%(episode_number)02d:%(title)s" -# Set "comment" field in video metadata using description instead of webpage_url +# Prioritize uploader as the "artist" field in video metadata +$ yt-dlp --parse-metadata "%(uploader|)s:%(meta_artist)s" --add-metadata + +# Set "comment" field in video metadata using description instead of webpage_url, +# handling multiple lines correctly $ yt-dlp --parse-metadata "description:(?s)(?P.+)" --add-metadata # Remove "formats" field from the infojson by setting it to an empty string @@ -1651,7 +1658,7 @@ $ yt-dlp --replace-in-metadata "title,uploader" "[ _]" "-" # EXTRACTOR ARGUMENTS -Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. Eg: `--extractor-args "youtube:player-client=android_agegate,web;include_live_dash" --extractor-args "funimation:version=uncut"` +Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. 
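When embedding, the same arguments are passed through the `extractor_args` option, which (per its docstring in `YoutubeDL.py`) takes a nested dict with one list of values per `ARG`. A rough sketch:

```python
from yt_dlp import YoutubeDL

# Roughly equivalent to: --extractor-args "youtube:skip=dash,hls"
ydl_opts = {
    'extractor_args': {'youtube': {'skip': ['dash', 'hls']}},
}

with YoutubeDL(ydl_opts) as ydl:
    # Probe metadata only; DASH/HLS manifests are skipped during extraction
    info = ydl.extract_info('http://www.youtube.com/watch?v=BaW_jenozKc', download=False)
    print(info['title'])
```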
Eg: `--extractor-args "youtube:player-client=android_embedded,web;include_live_dash" --extractor-args "funimation:version=uncut"` The following extractors use this feature: @@ -1661,10 +1668,8 @@ The following extractors use this feature: * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `include_live_dash`: Include live dash formats even without `--live-from-start` (These formats don't download properly) * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) -* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`. - * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total. -* `max_comment_depth` Maximum depth for nested comments. YouTube supports depths 1 or 2 (default) - * **Deprecated**: Set `max-replies` to `0` or `all` in `max_comments` instead (e.g. `max_comments=all,all,0` to get no replies) +* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` + * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) @@ -1743,7 +1748,7 @@ with YoutubeDL(ydl_opts) as ydl: ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) ``` -Most likely, you'll want to use various options. For a list of options available, have a look at [`yt_dlp/YoutubeDL.py`](yt_dlp/YoutubeDL.py#L191). +Most likely, you'll want to use various options. For a list of options available, have a look at [`yt_dlp/YoutubeDL.py`](yt_dlp/YoutubeDL.py#L195). 
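For instance, a minimal sketch that spells out a few common options (all key names as defined in `YoutubeDL.py`; the first two simply restate the defaults):

```python
from yt_dlp import YoutubeDL

ydl_opts = {
    'format': 'bv*+ba/b',                     # the default format selector
    'outtmpl': '%(title)s [%(id)s].%(ext)s',  # the default output template
    'writethumbnail': True,                   # also save the video thumbnail
}

with YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
```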
Here's a more complete example demonstrating various functionality: diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 51d83bde0..d03229d86 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -517,7 +517,7 @@ class YoutubeDL(object): _format_fields = { # NB: Keep in sync with the docstring of extractor/common.py - 'url', 'manifest_url', 'ext', 'format', 'format_id', 'format_note', + 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note', 'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', @@ -938,7 +938,7 @@ class YoutubeDL(object): def deprecation_warning(self, message): if self.params.get('logger') is not None: - self.params['logger'].warning('DeprecationWarning: {message}') + self.params['logger'].warning(f'DeprecationWarning: {message}') else: self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True) @@ -2478,8 +2478,9 @@ class YoutubeDL(object): if info_dict.get('is_live') and formats: formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start] if get_from_start and not formats: - self.raise_no_formats(info_dict, msg='--live-from-start is passed, but there are no formats that can be downloaded from the start. ' - 'If you want to download from the current time, pass --no-live-from-start') + self.raise_no_formats(info_dict, msg=( + '--live-from-start is passed, but there are no formats that can be downloaded from the start. ' + 'If you want to download from the current time, use --no-live-from-start')) if not formats: self.raise_no_formats(info_dict) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index a445d8621..ebf2d227a 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -379,7 +379,7 @@ def validate_options(opts): 'To let yt-dlp download and merge the best available formats, simply do not pass any format selection', 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning'))) - # --(post-processor/downloader)-args without name + # --(postprocessor/downloader)-args without name def report_args_compat(name, value, key1, key2=None): if key1 in value and key2 not in value: warnings.append(f'{name} arguments given without specifying name. 
The arguments will be given to all {name}s') diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 7265cad81..1f08a3664 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -21,6 +21,7 @@ from .compat import ( compat_cookiejar_Cookie, ) from .utils import ( + error_to_str, expand_path, Popen, YoutubeDLCookieJar, @@ -721,7 +722,7 @@ def _get_kwallet_network_wallet(logger): network_wallet = stdout.decode('utf-8').strip() logger.debug('NetworkWallet = "{}"'.format(network_wallet)) return network_wallet - except BaseException as e: + except Exception as e: logger.warning('exception while obtaining NetworkWallet: {}'.format(e)) return default_wallet @@ -766,8 +767,8 @@ def _get_kwallet_password(browser_keyring_name, logger): if stdout[-1:] == b'\n': stdout = stdout[:-1] return stdout - except BaseException as e: - logger.warning(f'exception running kwallet-query: {type(e).__name__}({e})') + except Exception as e: + logger.warning(f'exception running kwallet-query: {error_to_str(e)}') return b'' @@ -823,8 +824,8 @@ def _get_mac_keyring_password(browser_keyring_name, logger): if stdout[-1:] == b'\n': stdout = stdout[:-1] return stdout - except BaseException as e: - logger.warning(f'exception running find-generic-password: {type(e).__name__}({e})') + except Exception as e: + logger.warning(f'exception running find-generic-password: {error_to_str(e)}') return None diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 6b75dfc62..c45a8a476 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -403,7 +403,7 @@ class FragmentFD(FileDownloader): pass if compat_os_name == 'nt': - def bindoj_result(future): + def future_result(future): while True: try: return future.result(0.1) @@ -412,7 +412,7 @@ class FragmentFD(FileDownloader): except concurrent.futures.TimeoutError: continue else: - def bindoj_result(future): + def future_result(future): return future.result() def interrupt_trigger_iter(fg): @@ -430,7 +430,7 @@ class FragmentFD(FileDownloader): result = True for tpe, job in spins: try: - result = result and bindoj_result(job) + result = result and future_result(job) except KeyboardInterrupt: interrupt_trigger[0] = False finally: @@ -494,16 +494,14 @@ class FragmentFD(FileDownloader): self.report_error('Giving up after %s fragment retries' % fragment_retries) def append_fragment(frag_content, frag_index, ctx): - if not frag_content: - if not is_fatal(frag_index - 1): - self.report_skip_fragment(frag_index, 'fragment not found') - return True - else: - ctx['dest_stream'].close() - self.report_error( - 'fragment %s not found, unable to continue' % frag_index) - return False - self._append_fragment(ctx, pack_func(frag_content, frag_index)) + if frag_content: + self._append_fragment(ctx, pack_func(frag_content, frag_index)) + elif not is_fatal(frag_index - 1): + self.report_skip_fragment(frag_index, 'fragment not found') + else: + ctx['dest_stream'].close() + self.report_error(f'fragment {frag_index} not found, unable to continue') + return False return True decrypt_fragment = self.decrypter(info_dict) diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index cabf401a7..591a9b08d 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -7,7 +7,6 @@ import random from .common import FileDownloader from ..compat import ( - compat_str, compat_urllib_error, compat_http_client ) @@ -58,8 +57,6 @@ class HttpFD(FileDownloader): ctx.resume_len = 0 ctx.block_size = self.params.get('buffersize', 1024) ctx.start_time = 
time.time() - ctx.chunk_size = None - throttle_start = None # parse given Range req_start, req_end, _ = parse_http_range(headers.get('Range')) @@ -85,12 +82,6 @@ class HttpFD(FileDownloader): class NextFragment(Exception): pass - def set_range(req, start, end): - range_header = 'bytes=%d-' % start - if end: - range_header += compat_str(end) - req.add_header('Range', range_header) - def establish_connection(): ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size) if not is_test and chunk_size else chunk_size) @@ -131,7 +122,7 @@ class HttpFD(FileDownloader): request = sanitized_Request(url, request_data, headers) has_range = range_start is not None if has_range: - set_range(request, range_start, range_end) + request.add_header('Range', f'bytes={int(range_start)}-{int_or_none(range_end) or ""}') # Establish connection try: ctx.data = self.ydl.urlopen(request) @@ -214,7 +205,6 @@ class HttpFD(FileDownloader): raise RetryDownload(err) def download(): - nonlocal throttle_start data_len = ctx.data.info().get('Content-length', None) # Range HTTP header may be ignored/unsupported by a webserver @@ -329,14 +319,14 @@ class HttpFD(FileDownloader): if speed and speed < (self.params.get('throttledratelimit') or 0): # The speed must stay below the limit for 3 seconds # This prevents raising error when the speed temporarily goes down - if throttle_start is None: - throttle_start = now - elif now - throttle_start > 3: + if ctx.throttle_start is None: + ctx.throttle_start = now + elif now - ctx.throttle_start > 3: if ctx.stream is not None and ctx.tmpfilename != '-': ctx.stream.close() raise ThrottledDownload() elif speed: - throttle_start = None + ctx.throttle_start = None if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len: ctx.resume_len = byte_counter diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index dd1ff512e..3212f3328 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -926,9 +926,9 @@ class BiliIntlIE(BiliIntlBaseIE): if season_id and not video_data: # Non-Bstation layout, read through episode list season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) - video_data = next( - episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict) - if str(episode.get('episode_id')) == ep_id) + video_data = traverse_obj(season_json, + ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), + expected_type=dict, get_all=False) return self._extract_video_info(video_data, ep_id=ep_id, aid=aid) diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py index 31e7d7de6..8b9903774 100644 --- a/yt_dlp/extractor/canvas.py +++ b/yt_dlp/extractor/canvas.py @@ -245,10 +245,6 @@ class VrtNUIE(GigyaBaseIE): 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', - 'params': { - 'username': '', - 'password': '', - }, 'expected_warnings': ['is not a supported codec'], }, { # Only available via new API endpoint @@ -264,10 +260,6 @@ class VrtNUIE(GigyaBaseIE): 'episode_number': 5, }, 'skip': 'This video is only available for registered users', - 'params': { - 'username': '', - 'password': '', - }, 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], }] _NETRC_MACHINE = 'vrtnu' diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 81688eb54..e2605c1f4 100644 --- 
a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -139,6 +139,8 @@ class InfoExtractor(object): for HDS - URL of the F4M manifest, for DASH - URL of the MPD manifest, for MSS - URL of the ISM manifest. + * manifest_stream_number (For internal use only) + The index of the stream in the manifest file * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). @@ -215,7 +217,7 @@ class InfoExtractor(object): (HTTP or RTMP) download. Boolean. * has_drm The format has DRM and cannot be downloaded. Boolean * downloader_options A dictionary of downloader options as - described in FileDownloader + described in FileDownloader (For internal use only) RTMP formats can also have the additional fields: page_url, app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn, rtmp_protocol, rtmp_real_time @@ -3684,9 +3686,9 @@ class InfoExtractor(object): def _merge_subtitle_items(subtitle_list1, subtitle_list2): """ Merge subtitle items for one language. Items with duplicated URLs/data will be dropped. """ - list1_data = set([item.get('url') or item['data'] for item in subtitle_list1]) + list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1) ret = list(subtitle_list1) - ret.extend([item for item in subtitle_list2 if (item.get('url') or item['data']) not in list1_data]) + ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data) return ret @classmethod diff --git a/yt_dlp/extractor/dropout.py b/yt_dlp/extractor/dropout.py index a7442d8f0..2fa61950c 100644 --- a/yt_dlp/extractor/dropout.py +++ b/yt_dlp/extractor/dropout.py @@ -123,7 +123,7 @@ class DropoutIE(InfoExtractor): self._login(display_id) webpage = self._download_webpage(url, display_id, note='Downloading video webpage') finally: - self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out') + self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out', fatal=False) embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url') thumbnail = self._og_search_thumbnail(webpage) @@ -139,7 +139,7 @@ class DropoutIE(InfoExtractor): '_type': 'url_transparent', 'ie_key': VHXEmbedIE.ie_key(), 'url': embed_url, - 'id': self._search_regex(r'embed.vhx.tv/videos/(.+?)\?', embed_url, 'id'), + 'id': self._search_regex(r'embed\.vhx\.tv/videos/(.+?)\?', embed_url, 'id'), 'display_id': display_id, 'title': title, 'description': self._html_search_meta('description', webpage, fatal=False), diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 2deed585f..5e0e2facf 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -397,8 +397,10 @@ class FacebookIE(InfoExtractor): r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] post = traverse_obj(post_data, ( ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] - media = [m for m in traverse_obj(post, (..., 'attachments', ..., 'media'), expected_type=dict) or [] - if str(m.get('id')) == video_id and m.get('__typename') == 'Video'] + media = traverse_obj( + post, + (..., 'attachments', ..., 'media', lambda _, m: str(m['id']) == video_id and m['__typename'] == 'Video'), + expected_type=dict) title = get_first(media, ('title', 'text')) description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) uploader_data = get_first(media, 'owner') or 
get_first(post, ('node', 'actors', ...)) or {} diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 2c503e581..bd56ad289 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2523,7 +2523,7 @@ class GenericIE(InfoExtractor): 'title': 'Riku ja Tunna lähtevät peurajahtiin tv:stä tutun biologin kanssa – metsästysreissu huipentuu kasvissyöjän painajaiseen!', 'thumbnail': r're:^https?://.+\.jpg$', 'duration': 108, - 'series' : 'Madventures Suomi', + 'series': 'Madventures Suomi', 'description': 'md5:aa55b44bd06a1e337a6f1d0b46507381', 'categories': ['Matkailu', 'Elämäntyyli'], 'age_limit': 0, @@ -3886,8 +3886,8 @@ class GenericIE(InfoExtractor): if RtmpIE.suitable(vurl): return True vpath = compat_urlparse.urlparse(vurl).path - vext = determine_ext(vpath) - return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') + vext = determine_ext(vpath, None) + return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') def filter_video(urls): return list(filter(check_video, urls)) diff --git a/yt_dlp/extractor/limelight.py b/yt_dlp/extractor/limelight.py index 369141d67..b20681ad1 100644 --- a/yt_dlp/extractor/limelight.py +++ b/yt_dlp/extractor/limelight.py @@ -194,7 +194,7 @@ class LimelightBaseIE(InfoExtractor): cc_url = cc.get('webvttFileUrl') if not cc_url: continue - lang = cc.get('languageCode') or self._search_regex(r'/[a-z]{2}\.vtt', cc_url, 'lang', default='en') + lang = cc.get('languageCode') or self._search_regex(r'/([a-z]{2})\.vtt', cc_url, 'lang', default='en') subtitles.setdefault(lang, []).append({ 'url': cc_url, }) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index a5a1a01e0..4eb6ed070 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -469,7 +469,7 @@ class NiconicoIE(InfoExtractor): comment_user_key = traverse_obj(api_data, ('comment', 'keys', 'userKey')) user_id_str = session_api_data.get('serviceUserId') - thread_ids = [x for x in traverse_obj(api_data, ('comment', 'threads')) or [] if x['isActive']] + thread_ids = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive'])) raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key) if not raw_danmaku: self.report_warning(f'Failed to get comments. 
{bug_reports_message()}') diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 6f8c32882..c1d6c5477 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -264,7 +264,7 @@ class TikTokBaseIE(InfoExtractor): return { 'id': aweme_id, 'title': aweme_detail.get('desc'), - 'description': aweme_detail['desc'], + 'description': aweme_detail.get('desc'), 'view_count': int_or_none(stats_info.get('play_count')), 'like_count': int_or_none(stats_info.get('digg_count')), 'repost_count': int_or_none(stats_info.get('share_count')), @@ -387,6 +387,9 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'artist': 'Ysrbeats', + 'album': 'Lehanga', + 'track': 'Lehanga', } }, { 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en', @@ -410,6 +413,8 @@ class TikTokIE(TikTokBaseIE): 'like_count': int, 'repost_count': int, 'comment_count': int, + 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson', + 'track': 'Big Fun', } }, { # Banned audio, only available on the app @@ -463,7 +468,7 @@ class TikTokIE(TikTokBaseIE): 'info_dict': { 'id': '7059698374567611694', 'ext': 'mp4', - 'title': 'N/A', + 'title': 'tiktok video #7059698374567611694', 'description': '', 'uploader': 'pokemonlife22', 'creator': 'Pokemon', @@ -480,7 +485,7 @@ class TikTokIE(TikTokBaseIE): 'repost_count': int, 'comment_count': int, }, - 'expected_warnings': ['Video not available'] + 'expected_warnings': ['Video not available', 'Creating a generic title'] }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index a101af67e..7d3966bf1 100644 --- a/yt_dlp/extractor/yandexvideo.py +++ b/yt_dlp/extractor/yandexvideo.py @@ -163,7 +163,6 @@ class YandexVideoPreviewIE(InfoExtractor): 'thumbnail': 'https://i.mycdn.me/videoPreview?id=544866765315&type=37&idx=13&tkn=TY5qjLYZHxpmcnK8U2LgzYkgmaU&fn=external_8', 'uploader_id': '481054701571', 'title': 'LOFT - summer, summer, summer HD', - 'manifest_stream_number': 0, 'uploader': 'АРТЁМ КУДРОВ', }, }, { # youtube diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 485849ba9..017554c88 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -837,17 +837,20 @@ class YoutubeBaseInfoExtractor(InfoExtractor): uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') channel_id = traverse_obj( - renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False) + renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), + expected_type=str, get_all=False) timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText') scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) overlay_style = traverse_obj( - renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) + renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), + get_all=False, expected_type=str) badges = self._extract_badges(renderer) thumbnails = self._extract_thumbnails(renderer, 'thumbnail') navigation_url = urljoin('https://www.youtube.com/', traverse_obj( - renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) + 
renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), + expected_type=str)) or '' url = f'https://www.youtube.com/watch?v={video_id}' - if overlay_style == 'SHORTS' or (navigation_url and '/shorts/' in navigation_url): + if overlay_style == 'SHORTS' or '/shorts/' in navigation_url: url = f'https://www.youtube.com/shorts/{video_id}' return { @@ -862,7 +865,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'uploader': uploader, 'channel_id': channel_id, 'thumbnails': thumbnails, - 'upload_date': strftime_or_none(timestamp, '%Y%m%d') if self._configuration_arg('approximate_date', ie_key='youtubetab') else None, + 'upload_date': (strftime_or_none(timestamp, '%Y%m%d') + if self._configuration_arg('approximate_date', ie_key='youtubetab') + else None), 'live_status': ('is_upcoming' if scheduled_timestamp is not None else 'was_live' if 'streamed' in time_text.lower() else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges diff --git a/yt_dlp/options.py b/yt_dlp/options.py index eb306898a..06c613262 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -163,6 +163,8 @@ def create_parser(): values = [process(value)] if delim is None else list(map(process, value.split(delim)[::-1])) while values: actual_val = val = values.pop() + if not val: + raise optparse.OptionValueError(f'Invalid {option.metavar} for {opt_str}: {value}') if val == 'all': current.update(allowed_values) elif val == '-all': @@ -1311,7 +1313,7 @@ def create_parser(): '--audio-format', metavar='FORMAT', dest='audioformat', default='best', help=( 'Specify audio format to convert the audio to when -x is used. Currently supported formats are: ' - 'best (default) or one of %s' % '|'.join(FFmpegExtractAudioPP.SUPPORTED_EXTS))) + 'best (default) or one of %s' % ', '.join(FFmpegExtractAudioPP.SUPPORTED_EXTS))) postproc.add_option( '--audio-quality', metavar='QUALITY', dest='audioquality', default='5', @@ -1323,7 +1325,7 @@ def create_parser(): 'Remux the video into another container if necessary (currently supported: %s). ' 'If target container does not support the video/audio codec, remuxing will fail. ' 'You can specify multiple rules; Eg. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 ' - 'and anything else to mkv.' % '|'.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS))) + 'and anything else to mkv.' % ', '.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS))) postproc.add_option( '--recode-video', metavar='FORMAT', dest='recodevideo', default=None, @@ -1438,7 +1440,7 @@ def create_parser(): '"multi_video" (default; only when the videos form a single show). ' 'All the video files must have same codecs and number of streams to be concatable. ' 'The "pl_video:" prefix can be used with "--paths" and "--output" to ' - 'set the output filename for the split files. See "OUTPUT TEMPLATE" for details')) + 'set the output filename for the concatenated files. 
See "OUTPUT TEMPLATE" for details')) postproc.add_option( '--fixup', metavar='POLICY', dest='fixup', default=None, @@ -1486,20 +1488,20 @@ def create_parser(): help=optparse.SUPPRESS_HELP) postproc.add_option( '--no-exec-before-download', - action='store_const', dest='exec_before_dl_cmd', const=[], + action='store_const', dest='exec_before_dl_cmd', const=None, help=optparse.SUPPRESS_HELP) postproc.add_option( '--convert-subs', '--convert-sub', '--convert-subtitles', metavar='FORMAT', dest='convertsubtitles', default=None, help=( 'Convert the subtitles to another format (currently supported: %s) ' - '(Alias: --convert-subtitles)' % '|'.join(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))) + '(Alias: --convert-subtitles)' % ', '.join(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))) postproc.add_option( '--convert-thumbnails', metavar='FORMAT', dest='convertthumbnails', default=None, help=( 'Convert the thumbnails to another format ' - '(currently supported: %s) ' % '|'.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS))) + '(currently supported: %s) ' % ', '.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS))) postproc.add_option( '--split-chapters', '--split-tracks', dest='split_chapters', action='store_true', default=False, diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 5216acbfb..643290286 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -500,6 +500,9 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): temp_path = new_path = prefix + sep + extension if new_path == path: + if acodec == 'copy': + self.to_screen(f'File is already in target format {self._preferredcodec}, skipping') + return [], information orig_path = prepend_extension(path, 'orig') temp_path = prepend_extension(path, 'temp') if (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)) @@ -1122,6 +1125,11 @@ class FFmpegConcatPP(FFmpegPostProcessor): self._only_multi_video = only_multi_video super().__init__(downloader) + def _get_codecs(self, file): + codecs = traverse_obj(self.get_metadata_object(file), ('streams', ..., 'codec_name')) + self.write_debug(f'Codecs = {", ".join(codecs)}') + return tuple(codecs) + def concat_files(self, in_files, out_file): if not self._downloader._ensure_dir_exists(out_file): return @@ -1131,8 +1139,7 @@ class FFmpegConcatPP(FFmpegPostProcessor): os.replace(in_files[0], out_file) return [] - codecs = [traverse_obj(self.get_metadata_object(file), ('streams', ..., 'codec_name')) for file in in_files] - if len(set(map(tuple, codecs))) > 1: + if len(set(map(self._get_codecs, in_files))) > 1: raise PostProcessingError( 'The files have different streams/codecs and cannot be concatenated. 
' 'Either select different formats or --recode-video them to a common format') @@ -1146,7 +1153,7 @@ class FFmpegConcatPP(FFmpegPostProcessor): entries = info.get('entries') or [] if not any(entries) or (self._only_multi_video and info['_type'] != 'multi_video'): return [], info - elif any(len(entry) > 1 for entry in traverse_obj(entries, (..., 'requested_downloads')) or []): + elif traverse_obj(entries, (..., 'requested_downloads', lambda _, v: len(v) > 1)): raise PostProcessingError('Concatenation is not supported when downloading multiple separate formats') in_files = traverse_obj(entries, (..., 'requested_downloads', 0, 'filepath')) or [] diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index ce918750d..6663583fc 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -1040,7 +1040,7 @@ def make_HTTPS_handler(params, **kwargs): def bug_reports_message(before=';'): - msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , ' + msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , ' 'filling out the appropriate issue template. ' 'Confirm you are on the latest version using yt-dlp -U') @@ -2883,6 +2883,7 @@ class PagedList: class OnDemandPagedList(PagedList): + """Download pages until a page with less than maximum results""" def _getslice(self, start, end): for pagenum in itertools.count(start // self._pagesize): firstid = pagenum * self._pagesize @@ -2922,6 +2923,7 @@ class OnDemandPagedList(PagedList): class InAdvancePagedList(PagedList): + """PagedList with total number of pages known in advance""" def __init__(self, pagefunc, pagecount, pagesize): PagedList.__init__(self, pagefunc, pagesize, True) self._pagecount = pagecount @@ -3090,13 +3092,10 @@ def multipart_encode(data, boundary=None): def dict_get(d, key_or_keys, default=None, skip_false_values=True): - if isinstance(key_or_keys, (list, tuple)): - for key in key_or_keys: - if key not in d or d[key] is None or skip_false_values and not d[key]: - continue - return d[key] - return default - return d.get(key_or_keys, default) + for val in map(d.get, variadic(key_or_keys)): + if val is not None and (val or not skip_false_values): + return val + return default def try_call(*funcs, expected_type=None, args=[], kwargs={}): @@ -3324,6 +3323,10 @@ def error_to_compat_str(err): return err_str +def error_to_str(err): + return f'{type(err).__name__}: {err}' + + def mimetype2ext(mt): if mt is None: return None -- cgit v1.2.3
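A pattern worth noting across these patches (bilibili, facebook, niconico, FFmpegConcatPP): ad-hoc list comprehensions are replaced with `traverse_obj` paths that embed the filter as a callable path element. A minimal sketch of the idiom, using an invented data shape for illustration:

```python
from yt_dlp.utils import traverse_obj

api_data = {'comment': {'threads': [
    {'id': 'a', 'isActive': True},
    {'id': 'b', 'isActive': False},
]}}

# A callable path element receives (key, value) and keeps the values it
# returns true for. This is equivalent to
#     [x for x in api_data['comment']['threads'] if x['isActive']]
# except that missing keys anywhere along the path yield None instead of
# raising KeyError.
active = traverse_obj(api_data, ('comment', 'threads', lambda _, v: v['isActive']))
assert active == [{'id': 'a', 'isActive': True}]
```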