-rw-r--r-- | test/test_YoutubeDL.py               |  20
-rw-r--r-- | yt_dlp/YoutubeDL.py                  |  43
-rw-r--r-- | yt_dlp/__init__.py                   |   4
-rw-r--r-- | yt_dlp/extractor/common.py           |   4
-rw-r--r-- | yt_dlp/extractor/extractors.py       |   3
-rw-r--r-- | yt_dlp/extractor/gettr.py            |   6
-rw-r--r-- | yt_dlp/extractor/nhk.py              | 149
-rw-r--r-- | yt_dlp/extractor/spiegel.py          |   2
-rw-r--r-- | yt_dlp/extractor/youtube.py          |  12
-rw-r--r-- | yt_dlp/options.py                    |   8
-rw-r--r-- | yt_dlp/postprocessor/ffmpeg.py       |   2
-rw-r--r-- | yt_dlp/postprocessor/sponsorblock.py |   4
-rw-r--r-- | yt_dlp/utils.py                      |   3
13 files changed, 205 insertions(+), 55 deletions(-)
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 34ed814b4..7637297be 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -30,9 +30,7 @@ class YDL(FakeYDL):
         self.msgs = []

     def process_info(self, info_dict):
-        info_dict = info_dict.copy()
-        info_dict.pop('__original_infodict', None)
-        self.downloaded_info_dicts.append(info_dict)
+        self.downloaded_info_dicts.append(info_dict.copy())

     def to_screen(self, msg):
         self.msgs.append(msg)
@@ -898,20 +896,6 @@ class TestYoutubeDL(unittest.TestCase):
             os.unlink(filename)

     def test_match_filter(self):
-        class FilterYDL(YDL):
-            def __init__(self, *args, **kwargs):
-                super(FilterYDL, self).__init__(*args, **kwargs)
-                self.params['simulate'] = True
-
-            def process_info(self, info_dict):
-                super(YDL, self).process_info(info_dict)
-
-            def _match_entry(self, info_dict, incomplete=False):
-                res = super(FilterYDL, self)._match_entry(info_dict, incomplete)
-                if res is None:
-                    self.downloaded_info_dicts.append(info_dict.copy())
-                return res
-
         first = {
             'id': '1',
             'url': TEST_URL,
@@ -939,7 +923,7 @@ class TestYoutubeDL(unittest.TestCase):
         videos = [first, second]

         def get_videos(filter_=None):
-            ydl = FilterYDL({'match_filter': filter_})
+            ydl = YDL({'match_filter': filter_, 'simulate': True})
             for v in videos:
                 ydl.process_ie_result(v, download=True)
             return [v['id'] for v in ydl.downloaded_info_dicts]
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index a96fc0bdd..2910fd3bc 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -1036,8 +1036,7 @@ class YoutubeDL(object):
     @staticmethod
     def _copy_infodict(info_dict):
         info_dict = dict(info_dict)
-        for key in ('__original_infodict', '__postprocessors'):
-            info_dict.pop(key, None)
+        info_dict.pop('__postprocessors', None)
         return info_dict

     def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
@@ -2511,8 +2510,6 @@ class YoutubeDL(object):
         if '__x_forwarded_for_ip' in info_dict:
             del info_dict['__x_forwarded_for_ip']

-        # TODO Central sorting goes here
-
         if self.params.get('check_formats') is True:
             formats = LazyList(self._check_formats(formats[::-1]), reverse=True)
@@ -2525,6 +2522,12 @@ class YoutubeDL(object):

         info_dict, _ = self.pre_process(info_dict)

+        if self._match_entry(info_dict) is not None:
+            return info_dict
+
+        self.post_extract(info_dict)
+        info_dict, _ = self.pre_process(info_dict, 'after_filter')
+
         # The pre-processors may have modified the formats
         formats = info_dict.get('formats', [info_dict])
@@ -2609,15 +2612,12 @@ class YoutubeDL(object):
             + ', '.join([f['format_id'] for f in formats_to_download]))
         max_downloads_reached = False
         for i, fmt in enumerate(formats_to_download):
-            formats_to_download[i] = new_info = dict(info_dict)
-            # Save a reference to the original info_dict so that it can be modified in process_info if needed
+            formats_to_download[i] = new_info = self._copy_infodict(info_dict)
             new_info.update(fmt)
-            new_info['__original_infodict'] = info_dict
             try:
                 self.process_info(new_info)
             except MaxDownloadsReached:
                 max_downloads_reached = True
-            new_info.pop('__original_infodict')
             # Remove copied info
             for key, val in tuple(new_info.items()):
                 if info_dict.get(key) == val:
@@ -2825,7 +2825,7 @@ class YoutubeDL(object):
         return None

     def process_info(self, info_dict):
-        """Process a single resolved IE result. (Modified it in-place)"""
+        """Process a single resolved IE result. (Modifies it in-place)"""

         assert info_dict.get('_type', 'video') == 'video'
         original_infodict = info_dict
@@ -2833,18 +2833,22 @@ class YoutubeDL(object):
         if 'format' not in info_dict and 'ext' in info_dict:
             info_dict['format'] = info_dict['ext']

+        # This is mostly just for backward compatibility of process_info
+        # As a side-effect, this allows for format-specific filters
         if self._match_entry(info_dict) is not None:
             info_dict['__write_download_archive'] = 'ignore'
             return

+        # Does nothing under normal operation - for backward compatibility of process_info
         self.post_extract(info_dict)
-        self._num_downloads += 1

         # info_dict['_filename'] needs to be set for backward compatibility
         info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True)
         temp_filename = self.prepare_filename(info_dict, 'temp')
         files_to_move = {}

+        self._num_downloads += 1
+
         # Forced printings
         self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict))
@@ -3258,17 +3262,14 @@ class YoutubeDL(object):
             return info_dict
         info_dict.setdefault('epoch', int(time.time()))
         info_dict.setdefault('_type', 'video')
-        remove_keys = {'__original_infodict'}  # Always remove this since this may contain a copy of the entire dict
-        keep_keys = ['_type']  # Always keep this to facilitate load-info-json
+
         if remove_private_keys:
-            remove_keys |= {
+            reject = lambda k, v: v is None or (k.startswith('_') and k != '_type') or k in {
                 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries',
                 'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber',
             }
-            reject = lambda k, v: k not in keep_keys and (
-                k.startswith('_') or k in remove_keys or v is None)
         else:
-            reject = lambda k, v: k in remove_keys
+            reject = lambda k, v: False

         def filter_fn(obj):
             if isinstance(obj, dict):
@@ -3295,14 +3296,8 @@ class YoutubeDL(object):
                     actual_post_extract(video_dict or {})
                 return

-            post_extractor = info_dict.get('__post_extractor') or (lambda: {})
-            extra = post_extractor().items()
-            info_dict.update(extra)
-            info_dict.pop('__post_extractor', None)
-
-            original_infodict = info_dict.get('__original_infodict') or {}
-            original_infodict.update(extra)
-            original_infodict.pop('__post_extractor', None)
+            post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {})
+            info_dict.update(post_extractor())

         actual_post_extract(info_dict or {})
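
For context, a minimal sketch of how the reworked flow above is driven from the embedding API (not part of the patch; the URL is a placeholder): a `match_filter` callable returns `None` to accept a video or a string explaining the rejection, and since filtering now happens before format selection, rejected entries never reach `process_info`.

```python
import yt_dlp

def longer_than_a_minute(info_dict):
    """Return None to accept the video, or a reason string to reject it."""
    if (info_dict.get('duration') or 0) < 60:
        return 'rejected: shorter than a minute'
    return None

# Mirrors the simplified test above: a plain YDL with
# {'match_filter': ..., 'simulate': True} replaces the old FilterYDL helper.
with yt_dlp.YoutubeDL({'match_filter': longer_than_a_minute, 'simulate': True}) as ydl:
    ydl.download(['https://example.com/watch/placeholder'])  # placeholder URL
```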
(Modifies it in-place)""" assert info_dict.get('_type', 'video') == 'video' original_infodict = info_dict @@ -2833,18 +2833,22 @@ class YoutubeDL(object): if 'format' not in info_dict and 'ext' in info_dict: info_dict['format'] = info_dict['ext'] + # This is mostly just for backward compatibility of process_info + # As a side-effect, this allows for format-specific filters if self._match_entry(info_dict) is not None: info_dict['__write_download_archive'] = 'ignore' return + # Does nothing under normal operation - for backward compatibility of process_info self.post_extract(info_dict) - self._num_downloads += 1 # info_dict['_filename'] needs to be set for backward compatibility info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True) temp_filename = self.prepare_filename(info_dict, 'temp') files_to_move = {} + self._num_downloads += 1 + # Forced printings self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict)) @@ -3258,17 +3262,14 @@ class YoutubeDL(object): return info_dict info_dict.setdefault('epoch', int(time.time())) info_dict.setdefault('_type', 'video') - remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict - keep_keys = ['_type'] # Always keep this to facilitate load-info-json + if remove_private_keys: - remove_keys |= { + reject = lambda k, v: v is None or (k.startswith('_') and k != '_type') or k in { 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries', 'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber', } - reject = lambda k, v: k not in keep_keys and ( - k.startswith('_') or k in remove_keys or v is None) else: - reject = lambda k, v: k in remove_keys + reject = lambda k, v: False def filter_fn(obj): if isinstance(obj, dict): @@ -3295,14 +3296,8 @@ class YoutubeDL(object): actual_post_extract(video_dict or {}) return - post_extractor = info_dict.get('__post_extractor') or (lambda: {}) - extra = post_extractor().items() - info_dict.update(extra) - info_dict.pop('__post_extractor', None) - - original_infodict = info_dict.get('__original_infodict') or {} - original_infodict.update(extra) - original_infodict.pop('__post_extractor', None) + post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {}) + info_dict.update(post_extractor()) actual_post_extract(info_dict or {}) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 7469b0f61..f308f6a89 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -471,8 +471,8 @@ def _real_main(argv=None): 'key': 'SponsorBlock', 'categories': sponsorblock_query, 'api': opts.sponsorblock_api, - # Run this immediately after extraction is complete - 'when': 'pre_process' + # Run this after filtering videos + 'when': 'after_filter' }) if opts.parse_metadata: postprocessors.append({ diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 04d4c0733..d8bb21137 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1307,6 +1307,10 @@ class InfoExtractor(object): def _og_search_url(self, html, **kargs): return self._og_search_property('url', html, **kargs) + def _html_extract_title(self, html, name, **kwargs): + return self._html_search_regex( + r'(?s)<title>(.*?)</title>', html, name, **kwargs) + def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): name = variadic(name) if display_name is None: diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 15bc74915..7d4262acf 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -994,6 +994,9 @@ from .nfl import (
 from .nhk import (
     NhkVodIE,
     NhkVodProgramIE,
+    NhkForSchoolBangumiIE,
+    NhkForSchoolSubjectIE,
+    NhkForSchoolProgramListIE,
 )
 from .nhl import NHLIE
 from .nick import (
diff --git a/yt_dlp/extractor/gettr.py b/yt_dlp/extractor/gettr.py
index 179bd7c47..9842edd81 100644
--- a/yt_dlp/extractor/gettr.py
+++ b/yt_dlp/extractor/gettr.py
@@ -39,6 +39,7 @@ class GettrIE(GettrBaseIE):
             'thumbnail': r're:^https?://.+/out\.jpg',
             'timestamp': 1632782451058,
             'duration': 58.5585,
+            'tags': ['hornofafrica', 'explorations'],
         }
     }, {
         'url': 'https://gettr.com/post/p4iahp',
@@ -52,6 +53,7 @@ class GettrIE(GettrBaseIE):
             'thumbnail': r're:^https?://.+/out\.jpg',
             'timestamp': 1626594455017,
             'duration': 23,
+            'tags': 'count:12',
         }
     }]
@@ -84,7 +86,7 @@ class GettrIE(GettrBaseIE):

         formats = self._extract_m3u8_formats(
             urljoin(self._MEDIA_BASE_URL, vid), post_id, 'mp4',
-            entry_protocol='m3u8_native', m3u8_id='hls') if vid else []
+            entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if vid else []

         if ovid:
             formats.append({
@@ -93,8 +95,6 @@ class GettrIE(GettrBaseIE):
                 'ext': 'mp4',
                 'width': int_or_none(post_data.get('vid_wid')),
                 'height': int_or_none(post_data.get('vid_hgt')),
-                'source_preference': 1,
-                'quality': 1,
             })

         self._sort_formats(formats)
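
A sketch (not part of the patch; the payload and base URL are invented) of the fallback the gettr.py hunks rely on: with `fatal=False`, a broken HLS manifest yields an empty format list instead of aborting, and the direct upload below, now without the hard-coded `quality`/`source_preference` overrides, is still added and ranked by the default format sorting.

```python
from yt_dlp.utils import int_or_none, urljoin

post_data = {'ovid': 'group3/video.mp4', 'vid_wid': '1280', 'vid_hgt': None}  # invented payload
media_base = 'https://media.gettr.com/'  # assumed value of _MEDIA_BASE_URL

fmt = {
    'url': urljoin(media_base, post_data['ovid']),
    'format_id': 'ovid',
    'ext': 'mp4',
    'width': int_or_none(post_data.get('vid_wid')),   # '1280' -> 1280
    'height': int_or_none(post_data.get('vid_hgt')),  # missing value stays None
}
print(fmt)
```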
diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py
index 4998fed83..626c6379b 100644
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -1,8 +1,15 @@
 from __future__ import unicode_literals

+import re

 from .common import InfoExtractor
-from ..utils import urljoin
+from ..utils import (
+    parse_duration,
+    traverse_obj,
+    unescapeHTML,
+    unified_timestamp,
+    urljoin
+)


 class NhkBaseIE(InfoExtractor):
@@ -176,3 +183,143 @@ class NhkVodProgramIE(NhkBaseIE):
             program_title = entries[0].get('series')

         return self.playlist_result(entries, program_id, program_title)
+
+
+class NhkForSchoolBangumiIE(InfoExtractor):
+    _VALID_URL = r'https?://www2\.nhk\.or\.jp/school/movie/(?P<type>bangumi|clip)\.cgi\?das_id=(?P<id>[a-zA-Z0-9_-]+)'
+    _TESTS = [{
+        'url': 'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id=D0005150191_00000',
+        'info_dict': {
+            'id': 'D0005150191_00003',
+            'title': 'にている かな',
+            'duration': 599.999,
+            'timestamp': 1396414800,
+
+            'upload_date': '20140402',
+            'ext': 'mp4',
+
+            'chapters': 'count:12'
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        program_type, video_id = self._match_valid_url(url).groups()
+
+        webpage = self._download_webpage(
+            f'https://www2.nhk.or.jp/school/movie/{program_type}.cgi?das_id={video_id}', video_id)
+
+        # searches all variables
+        base_values = {g.group(1): g.group(2) for g in re.finditer(r'var\s+([a-zA-Z_]+)\s*=\s*"([^"]+?)";', webpage)}
+        # and programObj values too
+        program_values = {g.group(1): g.group(3) for g in re.finditer(r'(?:program|clip)Obj\.([a-zA-Z_]+)\s*=\s*(["\'])([^"]+?)\2;', webpage)}
+        # extract all chapters
+        chapter_durations = [parse_duration(g.group(1)) for g in re.finditer(r'chapterTime\.push\(\'([0-9:]+?)\'\);', webpage)]
+        chapter_titles = [' '.join([g.group(1) or '', unescapeHTML(g.group(2))]).strip() for g in re.finditer(r'<div class="cpTitle"><span>(scene\s*\d+)?</span>([^<]+?)</div>', webpage)]
+
+        # this is how player_core.js is actually doing (!)
+        version = base_values.get('r_version') or program_values.get('version')
+        if version:
+            video_id = f'{video_id.split("_")[0]}_{version}'
+
+        formats = self._extract_m3u8_formats(
+            f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8',
+            video_id, ext='mp4', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        duration = parse_duration(base_values.get('r_duration'))
+
+        chapters = None
+        if chapter_durations and chapter_titles and len(chapter_durations) == len(chapter_titles):
+            start_time = chapter_durations
+            end_time = chapter_durations[1:] + [duration]
+            chapters = [{
+                'start_time': s,
+                'end_time': e,
+                'title': t,
+            } for s, e, t in zip(start_time, end_time, chapter_titles)]
+
+        return {
+            'id': video_id,
+            'title': program_values.get('name'),
+            'duration': parse_duration(base_values.get('r_duration')),
+            'timestamp': unified_timestamp(base_values['r_upload']),
+            'formats': formats,
+            'chapters': chapters,
+        }
+
+
+class NhkForSchoolSubjectIE(InfoExtractor):
+    IE_DESC = 'Portal page for each school subjects, like Japanese (kokugo, 国語) or math (sansuu/suugaku or 算数・数学)'
+    KNOWN_SUBJECTS = (
+        'rika', 'syakai', 'kokugo',
+        'sansuu', 'seikatsu', 'doutoku',
+        'ongaku', 'taiiku', 'zukou',
+        'gijutsu', 'katei', 'sougou',
+        'eigo', 'tokkatsu',
+        'tokushi', 'sonota',
+    )
+    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>%s)/?(?:[\?#].*)?$' % '|'.join(re.escape(s) for s in KNOWN_SUBJECTS)
+
+    _TESTS = [{
+        'url': 'https://www.nhk.or.jp/school/sougou/',
+        'info_dict': {
+            'id': 'sougou',
+            'title': '総合的な学習の時間',
+        },
+        'playlist_mincount': 16,
+    }, {
+        'url': 'https://www.nhk.or.jp/school/rika/',
+        'info_dict': {
+            'id': 'rika',
+            'title': '理科',
+        },
+        'playlist_mincount': 15,
+    }]
+
+    def _real_extract(self, url):
+        subject_id = self._match_id(url)
+        webpage = self._download_webpage(url, subject_id)
+
+        return self.playlist_from_matches(
+            re.finditer(rf'href="((?:https?://www\.nhk\.or\.jp)?/school/{re.escape(subject_id)}/[^/]+/)"', webpage),
+            subject_id,
+            self._html_search_regex(r'(?s)<span\s+class="subjectName">\s*<img\s*[^<]+>\s*([^<]+?)</span>', webpage, 'title', fatal=False),
+            lambda g: urljoin(url, g.group(1)))
+
+
+class NhkForSchoolProgramListIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.nhk\.or\.jp/school/(?P<id>(?:%s)/[a-zA-Z0-9_-]+)' % (
+        '|'.join(re.escape(s) for s in NhkForSchoolSubjectIE.KNOWN_SUBJECTS)
+    )
+    _TESTS = [{
+        'url': 'https://www.nhk.or.jp/school/sougou/q/',
+        'info_dict': {
+            'id': 'sougou/q',
+            'title': 'Q~こどものための哲学',
+        },
+        'playlist_mincount': 20,
+    }]
+
+    def _real_extract(self, url):
+        program_id = self._match_id(url)
+
+        webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
+
+        title = self._og_search_title(webpage, fatal=False) or self._html_extract_title(webpage, fatal=False) or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)
+        title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
+        description = self._html_search_regex(
+            r'(?s)<div\s+class="programDetail\s*">\s*<p>[^<]+</p>',
+            webpage, 'description', fatal=False, group=0)
+
+        bangumi_list = self._download_json(
+            f'https://www.nhk.or.jp/school/{program_id}/meta/program.json', program_id)
+        # they're always bangumi
+        bangumis = [
+            self.url_result(f'https://www2.nhk.or.jp/school/movie/bangumi.cgi?das_id={x}')
+            for x in traverse_obj(bangumi_list, ('part', ..., 'part-video-dasid')) or []]
+
+        return self.playlist_result(bangumis, program_id, title, description)
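
A standalone rerun (not part of the patch; sample values are made up) of the chapter assembly in `NhkForSchoolBangumiIE` above: N chapter start times plus the total duration pair up into N `(start, end)` intervals.

```python
chapter_durations = [0.0, 63.0, 170.5]  # parsed from the chapterTime.push(...) calls
chapter_titles = ['scene 1 opening', 'scene 2 main part', 'scene 3 recap']
duration = 599.999  # parsed from r_duration

start_time = chapter_durations
end_time = chapter_durations[1:] + [duration]
chapters = [{
    'start_time': s,
    'end_time': e,
    'title': t,
} for s, e, t in zip(start_time, end_time, chapter_titles)]
print(chapters[-1])  # {'start_time': 170.5, 'end_time': 599.999, 'title': 'scene 3 recap'}
```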
diff --git a/yt_dlp/extractor/spiegel.py b/yt_dlp/extractor/spiegel.py
index 2da32b9b2..58f2ed353 100644
--- a/yt_dlp/extractor/spiegel.py
+++ b/yt_dlp/extractor/spiegel.py
@@ -7,7 +7,7 @@ from .jwplatform import JWPlatformIE

 class SpiegelIE(InfoExtractor):
     _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
-    _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE
+    _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:$|[#?])' % _UUID_RE
     _TESTS = [{
         'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
         'md5': '50c7948883ec85a3e431a0a44b7ad1d6',
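
Why the anchor change matters, as a standalone check (not part of the patch; the URL is invented): the old `(?:#.*)?$` tail only tolerated a fragment after the optional `.html`, so any query string broke the match, while `(?:$|[#?])` accepts end-of-string, `#`, or `?`.

```python
import re

_UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
BASE = (r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+'
        r'/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?' % _UUID_RE)

url = 'https://www.spiegel.de/video/some-video-1259285.html?context=issue'
print(bool(re.match(BASE + r'(?:#.*)?$', url)))   # False: the query string defeats the old anchor
print(bool(re.match(BASE + r'(?:$|[#?])', url)))  # True
```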
' - 'It can be one of "pre_process" (after extraction), ' - '"before_dl" (before video download), "post_process" (after video download; default), ' - '"after_move" (after moving file to their final locations), ' + 'It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), ' + '"before_dl" (before each video download), "post_process" (after each video download; default), ' + '"after_move" (after moving video file to it\'s final locations), ' '"after_video" (after downloading and processing all formats of a video), ' - 'or "playlist" (end of playlist). ' + 'or "playlist" (at end of playlist). ' 'This option can be used multiple times to add different postprocessors')) sponsorblock = optparse.OptionGroup(parser, 'SponsorBlock Options', description=( diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index d4495b4a2..907627381 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -1055,7 +1055,7 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor): class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor): - SUPPORTED_EXTS = ('jpg', 'png') + SUPPORTED_EXTS = ('jpg', 'png', 'webp') def __init__(self, downloader=None, format=None): super(FFmpegThumbnailsConvertorPP, self).__init__(downloader) diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index cd48b15ae..e7e04e86e 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -49,6 +49,9 @@ class SponsorBlockPP(FFmpegPostProcessor): def duration_filter(s): start_end = s['segment'] + # Ignore entire video segments (https://wiki.sponsor.ajay.app/w/Types). + if start_end == (0, 0): + return False # Ignore milliseconds difference at the start. if start_end[0] <= 1: start_end[0] = 0 @@ -89,6 +92,7 @@ class SponsorBlockPP(FFmpegPostProcessor): url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + compat_urllib_parse_urlencode({ 'service': service, 'categories': json.dumps(self._categories), + 'actionTypes': json.dumps(['skip', 'poi']) }) self.write_debug(f'SponsorBlock query: {url}') for d in self._get_json(url): diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index b3f2a0625..90502dbc0 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3040,6 +3040,9 @@ def qualities(quality_ids): return q +POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'} + + DEFAULT_OUTTMPL = { 'default': '%(title)s [%(id)s].%(ext)s', 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s', |