diff options
Diffstat (limited to 'yt_dlp')
-rw-r--r-- | yt_dlp/__init__.py | 84 | ||||
-rw-r--r-- | yt_dlp/options.py | 113 | ||||
-rw-r--r-- | yt_dlp/postprocessor/__init__.py | 4 | ||||
-rw-r--r-- | yt_dlp/postprocessor/ffmpeg.py | 174 | ||||
-rw-r--r-- | yt_dlp/postprocessor/modify_chapters.py | 333 | ||||
-rw-r--r-- | yt_dlp/postprocessor/sponskrub.py | 1 | ||||
-rw-r--r-- | yt_dlp/postprocessor/sponsorblock.py | 96 |
7 files changed, 714 insertions, 91 deletions
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 58e8ea5d9..91b2bcb85 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -13,7 +13,6 @@ import random import re import sys - from .options import ( parseOpts, ) @@ -307,6 +306,7 @@ def _real_main(argv=None): opts.forceprint = opts.forceprint or [] for tmpl in opts.forceprint or []: validate_outtmpl(tmpl, 'print template') + validate_outtmpl(opts.sponsorblock_chapter_title, 'SponsorBlock chapter title') if opts.extractaudio and not opts.keepvideo and opts.format is None: opts.format = 'bestaudio/best' @@ -353,16 +353,35 @@ def _real_main(argv=None): if opts.getcomments and not printing_json: opts.writeinfojson = True + if opts.no_sponsorblock: + opts.sponsorblock_mark = set() + opts.sponsorblock_remove = set() + sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove + + if (opts.addmetadata or opts.sponsorblock_mark) and opts.addchapters is None: + opts.addchapters = True + opts.remove_chapters = opts.remove_chapters or [] + def report_conflict(arg1, arg2): warnings.append('%s is ignored since %s was given' % (arg2, arg1)) - if opts.remuxvideo and opts.recodevideo: - report_conflict('--recode-video', '--remux-video') - opts.remuxvideo = False + if (opts.remove_chapters or sponsorblock_query) and opts.sponskrub is not False: + if opts.sponskrub: + if opts.remove_chapters: + report_conflict('--remove-chapters', '--sponskrub') + if opts.sponsorblock_mark: + report_conflict('--sponsorblock-mark', '--sponskrub') + if opts.sponsorblock_remove: + report_conflict('--sponsorblock-remove', '--sponskrub') + opts.sponskrub = False if opts.sponskrub_cut and opts.split_chapters and opts.sponskrub is not False: report_conflict('--split-chapter', '--sponskrub-cut') opts.sponskrub_cut = False + if opts.remuxvideo and opts.recodevideo: + report_conflict('--recode-video', '--remux-video') + opts.remuxvideo = False + if opts.allow_unplayable_formats: if opts.extractaudio: 
report_conflict('--allow-unplayable-formats', '--extract-audio') @@ -388,12 +407,26 @@ def _real_main(argv=None): if opts.fixup and opts.fixup.lower() not in ('never', 'ignore'): report_conflict('--allow-unplayable-formats', '--fixup') opts.fixup = 'never' + if opts.remove_chapters: + report_conflict('--allow-unplayable-formats', '--remove-chapters') + opts.remove_chapters = [] + if opts.sponsorblock_remove: + report_conflict('--allow-unplayable-formats', '--sponsorblock-remove') + opts.sponsorblock_remove = set() if opts.sponskrub: report_conflict('--allow-unplayable-formats', '--sponskrub') opts.sponskrub = False # PostProcessors postprocessors = [] + if sponsorblock_query: + postprocessors.append({ + 'key': 'SponsorBlock', + 'categories': sponsorblock_query, + 'api': opts.sponsorblock_api, + # Run this immediately after extraction is complete + 'when': 'pre_process' + }) if opts.parse_metadata: postprocessors.append({ 'key': 'MetadataParser', @@ -439,16 +472,7 @@ def _real_main(argv=None): 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, }) - # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and - # FFmpegExtractAudioPP as containers before conversion may not support - # metadata (3gp, webm, etc.) - # And this post-processor should be placed before other metadata - # manipulating post-processors (FFmpegEmbedSubtitle) to prevent loss of - # extra metadata. By default ffmpeg preserves metadata applicable for both - # source and target containers. From this point the container won't change, - # so metadata can be added here. - if opts.addmetadata: - postprocessors.append({'key': 'FFmpegMetadata'}) + # If ModifyChapters is going to remove chapters, subtitles must already be in the container. if opts.embedsubtitles: already_have_subtitle = opts.writesubtitles and 'no-keep-subs' not in compat_opts postprocessors.append({ @@ -462,6 +486,33 @@ def _real_main(argv=None): # this was the old behaviour if only --all-sub was given. 
if opts.allsubtitles and not opts.writeautomaticsub: opts.writesubtitles = True + # ModifyChapters must run before FFmpegMetadataPP + remove_chapters_patterns = [] + for regex in opts.remove_chapters: + try: + remove_chapters_patterns.append(re.compile(regex)) + except re.error as err: + parser.error(f'invalid --remove-chapters regex {regex!r} - {err}') + if opts.remove_chapters or sponsorblock_query: + postprocessors.append({ + 'key': 'ModifyChapters', + 'remove_chapters_patterns': remove_chapters_patterns, + 'remove_sponsor_segments': opts.sponsorblock_remove, + 'sponsorblock_chapter_title': opts.sponsorblock_chapter_title, + 'force_keyframes': opts.force_keyframes_at_cuts + }) + # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and + # FFmpegExtractAudioPP as containers before conversion may not support + # metadata (3gp, webm, etc.) + # By default ffmpeg preserves metadata applicable for both + # source and target containers. From this point the container won't change, + # so metadata can be added here. 
+ if opts.addmetadata or opts.addchapters: + postprocessors.append({ + 'key': 'FFmpegMetadata', + 'add_chapters': opts.addchapters, + 'add_metadata': opts.addmetadata, + }) # This should be above EmbedThumbnail since sponskrub removes the thumbnail attachment # but must be below EmbedSubtitle and FFmpegMetadata # See https://github.com/yt-dlp/yt-dlp/issues/204 , https://github.com/faissaloo/SponSkrub/issues/29 @@ -485,7 +536,10 @@ def _real_main(argv=None): if not already_have_thumbnail: opts.writethumbnail = True if opts.split_chapters: - postprocessors.append({'key': 'FFmpegSplitChapters'}) + postprocessors.append({ + 'key': 'FFmpegSplitChapters', + 'force_keyframes': opts.force_keyframes_at_cuts, + }) # XAttrMetadataPP should be run after post-processors that may change file contents if opts.xattrs: postprocessors.append({'key': 'XAttrMetadata'}) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 0f8ce8ce8..483cce8d8 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -28,7 +28,9 @@ from .postprocessor import ( FFmpegSubtitlesConvertorPP, FFmpegThumbnailsConvertorPP, FFmpegVideoRemuxerPP, + SponsorBlockPP, ) +from .postprocessor.modify_chapters import DEFAULT_SPONSORBLOCK_CHAPTER_TITLE def _hide_login_info(opts): @@ -1218,10 +1220,10 @@ def parseOpts(overrideArguments=None): 'Give these arguments to the postprocessors. ' 'Specify the postprocessor/executable name and the arguments separated by a colon ":" ' 'to give the argument to the specified postprocessor/executable. Supported PP are: ' - 'Merger, ExtractAudio, SplitChapters, Metadata, EmbedSubtitle, EmbedThumbnail, ' - 'SubtitlesConvertor, ThumbnailsConvertor, VideoRemuxer, VideoConvertor, ' - 'SponSkrub, FixupStretched, FixupM4a, FixupM3u8, FixupTimestamp and FixupDuration. ' - 'The supported executables are: AtomicParsley, FFmpeg, FFprobe, and SponSkrub. 
' + 'Merger, ModifyChapters, SplitChapters, ExtractAudio, VideoRemuxer, VideoConvertor, ' + 'Metadata, EmbedSubtitle, EmbedThumbnail, SubtitlesConvertor, ThumbnailsConvertor, ' + 'FixupStretched, FixupM4a, FixupM3u8, FixupTimestamp and FixupDuration. ' + 'The supported executables are: AtomicParsley, FFmpeg and FFprobe. ' 'You can also specify "PP+EXE:ARGS" to give the arguments to the specified executable ' 'only when being used by the specified postprocessor. Additionally, for ffmpeg/ffprobe, ' '"_i"/"_o" can be appended to the prefix optionally followed by a number to pass the argument ' @@ -1263,11 +1265,19 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--embed-metadata', '--add-metadata', action='store_true', dest='addmetadata', default=False, - help='Embed metadata including chapter markers (if supported by the format) to the video file (Alias: --add-metadata)') + help='Embed metadata to the video file. Also adds chapters to file unless --no-add-chapters is used (Alias: --add-metadata)') postproc.add_option( '--no-embed-metadata', '--no-add-metadata', action='store_false', dest='addmetadata', - help='Do not write metadata (default) (Alias: --no-add-metadata)') + help='Do not add metadata to file (default) (Alias: --no-add-metadata)') + postproc.add_option( + '--embed-chapters', '--add-chapters', + action='store_true', dest='addchapters', default=None, + help='Add chapter markers to the video file (Alias: --add-chapters)') + postproc.add_option( + '--no-embed-chapters', '--no-add-chapters', + action='store_false', dest='addchapters', + help='Do not add chapter markers (default) (Alias: --no-add-chapters)') postproc.add_option( '--metadata-from-title', metavar='FORMAT', dest='metafromtitle', @@ -1354,41 +1364,90 @@ def parseOpts(overrideArguments=None): '--no-split-chapters', '--no-split-tracks', dest='split_chapters', action='store_false', help='Do not split video based on chapters (default)') + postproc.add_option( + '--remove-chapters', + 
metavar='REGEX', dest='remove_chapters', action='append', + help='Remove chapters whose title matches the given regular expression. This option can be used multiple times') + postproc.add_option( + '--no-remove-chapters', dest='remove_chapters', action='store_const', const=None, + help='Do not remove any chapters from the file (default)') + postproc.add_option( + '--force-keyframes-at-cuts', + action='store_true', dest='force_keyframes_at_cuts', default=False, + help=( + 'Force keyframes around the chapters before removing/splitting them. ' + 'Requires a reencode and thus is very slow, but the resulting video ' + 'may have fewer artifacts around the cuts')) + postproc.add_option( + '--no-force-keyframes-at-cuts', + action='store_false', dest='force_keyframes_at_cuts', + help='Do not force keyframes around the chapters when cutting/splitting (default)') - sponskrub = optparse.OptionGroup(parser, 'SponSkrub (SponsorBlock) Options', description=( - 'SponSkrub (https://github.com/yt-dlp/SponSkrub) is a utility to mark/remove sponsor segments ' - 'from downloaded YouTube videos using SponsorBlock API (https://sponsor.ajay.app)')) - sponskrub.add_option( + sponsorblock = optparse.OptionGroup(parser, 'SponsorBlock Options', description=( + 'Make chapter entries for, or remove various segments (sponsor, introductions, etc.) ' + 'from downloaded YouTube videos using the SponsorBlock API (https://sponsor.ajay.app)')) + sponsorblock.add_option( + '--sponsorblock-mark', metavar='CATS', + dest='sponsorblock_mark', default=set(), action='callback', type='str', + callback=_set_from_options_callback, callback_kwargs={'allowed_values': SponsorBlockPP.CATEGORIES.keys()}, + help=( + 'SponsorBlock categories to create chapters for, separated by commas. ' + 'Available categories are all, %s. You can prefix the category with a "-" to exempt it. ' + 'See https://wiki.sponsor.ajay.app/index.php/Segment_Categories for description of the categories. 
' + 'Eg: --sponsorblock-mark all,-preview' % ', '.join(SponsorBlockPP.CATEGORIES.keys()))) + sponsorblock.add_option( + '--sponsorblock-remove', metavar='CATS', + dest='sponsorblock_remove', default=set(), action='callback', type='str', + callback=_set_from_options_callback, callback_kwargs={'allowed_values': SponsorBlockPP.CATEGORIES.keys()}, + help=( + 'SponsorBlock categories to be removed from the video file, separated by commas. ' + 'If a category is present in both mark and remove, remove takes precedence. ' + 'The syntax and available categories are the same as for --sponsorblock-mark')) + sponsorblock.add_option( + '--sponsorblock-chapter-title', metavar='TEMPLATE', + default=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, dest='sponsorblock_chapter_title', + help=( + 'The title template for SponsorBlock chapters created by --sponsorblock-mark. ' + 'The same syntax as the output template is used, but the only available fields are ' + 'start_time, end_time, category, categories, name, category_names. Defaults to "%default"')) + sponsorblock.add_option( + '--no-sponsorblock', default=False, + action='store_true', dest='no_sponsorblock', + help='Disable both --sponsorblock-mark and --sponsorblock-remove') + sponsorblock.add_option( + '--sponsorblock-api', metavar='URL', + default='https://sponsor.ajay.app', dest='sponsorblock_api', + help='SponsorBlock API location, defaults to %default') + + sponsorblock.add_option( '--sponskrub', action='store_true', dest='sponskrub', default=None, - help=( - 'Use sponskrub to mark sponsored sections. 
' - 'This is enabled by default if the sponskrub binary exists (Youtube only)')) - sponskrub.add_option( + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( '--no-sponskrub', action='store_false', dest='sponskrub', - help='Do not use sponskrub') - sponskrub.add_option( + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( '--sponskrub-cut', default=False, action='store_true', dest='sponskrub_cut', - help='Cut out the sponsor sections instead of simply marking them') - sponskrub.add_option( + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( '--no-sponskrub-cut', action='store_false', dest='sponskrub_cut', - help='Simply mark the sponsor sections, not cut them out (default)') - sponskrub.add_option( + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( '--sponskrub-force', default=False, action='store_true', dest='sponskrub_force', - help='Run sponskrub even if the video was already downloaded') - sponskrub.add_option( + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( '--no-sponskrub-force', action='store_true', dest='sponskrub_force', - help='Do not cut out the sponsor sections if the video was already downloaded (default)') - sponskrub.add_option( + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( '--sponskrub-location', metavar='PATH', dest='sponskrub_path', default='', - help='Location of the sponskrub binary; either the path to the binary or its containing directory') - sponskrub.add_option( + help=optparse.SUPPRESS_HELP) + sponsorblock.add_option( '--sponskrub-args', dest='sponskrub_args', metavar='ARGS', help=optparse.SUPPRESS_HELP) @@ -1457,7 +1516,7 @@ def parseOpts(overrideArguments=None): parser.add_option_group(subtitles) parser.add_option_group(authentication) parser.add_option_group(postproc) - parser.add_option_group(sponskrub) + parser.add_option_group(sponsorblock) parser.add_option_group(extractor) if overrideArguments is not None: diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py 
index 31c2d7c68..adbcd3755 100644 --- a/yt_dlp/postprocessor/__init__.py +++ b/yt_dlp/postprocessor/__init__.py @@ -26,7 +26,9 @@ from .metadataparser import ( MetadataParserPP, ) from .movefilesafterdownload import MoveFilesAfterDownloadPP +from .sponsorblock import SponsorBlockPP from .sponskrub import SponSkrubPP +from .modify_chapters import ModifyChaptersPP def get_postprocessor(key): @@ -56,6 +58,8 @@ __all__ = [ 'MetadataFromFieldPP', 'MetadataFromTitlePP', 'MoveFilesAfterDownloadPP', + 'SponsorBlockPP', 'SponSkrubPP', + 'ModifyChaptersPP', 'XAttrMetadataPP', ] diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 7537d5db4..806334645 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -8,22 +8,22 @@ import time import re import json - from .common import AudioConversionError, PostProcessor from ..compat import compat_str, compat_numeric_types from ..utils import ( + dfxp2srt, encodeArgument, encodeFilename, get_exe_version, is_outdated_version, + ISO639Utils, + orderedSet, PostProcessingError, prepend_extension, - shell_quote, - dfxp2srt, - ISO639Utils, process_communicate_or_kill, replace_extension, + shell_quote, traverse_obj, variadic, ) @@ -281,7 +281,8 @@ class FFmpegPostProcessor(PostProcessor): def run_ffmpeg(self, path, out_path, opts, **kwargs): return self.run_ffmpeg_multiple_files([path], out_path, opts, **kwargs) - def _ffmpeg_filename_argument(self, fn): + @staticmethod + def _ffmpeg_filename_argument(fn): # Always use 'file:' because the filename may contain ':' (ffmpeg # interprets that as a protocol) or can start with '-' (-- is broken in # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details) @@ -290,6 +291,62 @@ class FFmpegPostProcessor(PostProcessor): return fn return 'file:' + fn if fn != '-' else fn + @staticmethod + def _quote_for_ffmpeg(string): + # See https://ffmpeg.org/ffmpeg-utils.html#toc-Quoting-and-escaping + # A sequence of '' produces '\'''\''; + # 
final replace removes the empty '' between \' \'. + string = string.replace("'", r"'\''").replace("'''", "'") + # Handle potential ' at string boundaries. + string = string[1:] if string[0] == "'" else "'" + string + return string[:-1] if string[-1] == "'" else string + "'" + + def force_keyframes(self, filename, timestamps): + timestamps = orderedSet(timestamps) + if timestamps[0] == 0: + timestamps = timestamps[1:] + keyframe_file = prepend_extension(filename, 'keyframes.temp') + self.to_screen(f'Re-encoding "{filename}" with appropriate keyframes') + self.run_ffmpeg(filename, keyframe_file, ['-force_key_frames', ','.join( + f'{t:.6f}' for t in timestamps)]) + return keyframe_file + + def concat_files(self, in_files, out_file, concat_opts=None): + """ + Use concat demuxer to concatenate multiple files having identical streams. + + Only inpoint, outpoint, and duration concat options are supported. + See https://ffmpeg.org/ffmpeg-formats.html#concat-1 for details + """ + concat_file = f'{out_file}.concat' + self.write_debug(f'Writing concat spec to {concat_file}') + with open(concat_file, 'wt', encoding='utf-8') as f: + f.writelines(self._concat_spec(in_files, concat_opts)) + + out_flags = ['-c', 'copy'] + if out_file.rpartition('.')[-1] in ('mp4', 'mov'): + # For some reason, '-c copy' is not enough to copy subtitles + out_flags.extend(['-c:s', 'mov_text', '-movflags', '+faststart']) + + try: + self.real_run_ffmpeg( + [(concat_file, ['-hide_banner', '-nostdin', '-f', 'concat', '-safe', '0'])], + [(out_file, out_flags)]) + finally: + os.remove(concat_file) + + @classmethod + def _concat_spec(cls, in_files, concat_opts=None): + if concat_opts is None: + concat_opts = [{}] * len(in_files) + yield 'ffconcat version 1.0\n' + for file, opts in zip(in_files, concat_opts): + yield f'file {cls._quote_for_ffmpeg(cls._ffmpeg_filename_argument(file))}\n' + # Iterate explicitly to yield the following directives in order, ignoring the rest. 
+ for directive in 'inpoint', 'outpoint', 'duration': + if directive in opts: + yield f'{directive} {opts[directive]}\n' + class FFmpegExtractAudioPP(FFmpegPostProcessor): COMMON_AUDIO_EXTS = ('wav', 'flac', 'm4a', 'aiff', 'mp3', 'ogg', 'mka', 'opus', 'wma') @@ -531,6 +588,11 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): class FFmpegMetadataPP(FFmpegPostProcessor): + def __init__(self, downloader, add_metadata=True, add_chapters=True): + FFmpegPostProcessor.__init__(self, downloader) + self._add_metadata = add_metadata + self._add_chapters = add_chapters + @staticmethod def _options(target_ext): yield from ('-map', '0', '-dn') @@ -541,6 +603,46 @@ class FFmpegMetadataPP(FFmpegPostProcessor): @PostProcessor._restrict_to(images=False) def run(self, info): + filename, metadata_filename = info['filepath'], None + options = [] + if self._add_chapters and info.get('chapters'): + metadata_filename = replace_extension(filename, 'meta') + options.extend(self._get_chapter_opts(info['chapters'], metadata_filename)) + if self._add_metadata: + options.extend(self._get_metadata_opts(info)) + + if not options: + self.to_screen('There isn\'t any metadata to add') + return [], info + + temp_filename = prepend_extension(filename, 'temp') + self.to_screen('Adding metadata to "%s"' % filename) + self.run_ffmpeg_multiple_files( + (filename, metadata_filename), temp_filename, + itertools.chain(self._options(info['ext']), *options)) + if metadata_filename: + os.remove(metadata_filename) + os.replace(temp_filename, filename) + return [], info + + @staticmethod + def _get_chapter_opts(chapters, metadata_filename): + with io.open(metadata_filename, 'wt', encoding='utf-8') as f: + def ffmpeg_escape(text): + return re.sub(r'([\\=;#\n])', r'\\\1', text) + + metadata_file_content = ';FFMETADATA1\n' + for chapter in chapters: + metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n' + metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000) + metadata_file_content += 
'END=%d\n' % (chapter['end_time'] * 1000) + chapter_title = chapter.get('title') + if chapter_title: + metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title) + f.write(metadata_file_content) + yield ('-map_metadata', '1') + + def _get_metadata_opts(self, info): metadata = {} def add(meta_list, info_list=None): @@ -577,61 +679,27 @@ class FFmpegMetadataPP(FFmpegPostProcessor): for key in filter(lambda k: k.startswith(prefix), info.keys()): add(key[len(prefix):], key) - filename, metadata_filename = info['filepath'], None - options = [('-metadata', f'{name}={value}') for name, value in metadata.items()] + for name, value in metadata.items(): + yield ('-metadata', f'{name}={value}') stream_idx = 0 for fmt in info.get('requested_formats') or []: stream_count = 2 if 'none' not in (fmt.get('vcodec'), fmt.get('acodec')) else 1 if fmt.get('language'): lang = ISO639Utils.short2long(fmt['language']) or fmt['language'] - options.extend(('-metadata:s:%d' % (stream_idx + i), 'language=%s' % lang) - for i in range(stream_count)) + for i in range(stream_count): + yield ('-metadata:s:%d' % (stream_idx + i), 'language=%s' % lang) stream_idx += stream_count - chapters = info.get('chapters', []) - if chapters: - metadata_filename = replace_extension(filename, 'meta') - with io.open(metadata_filename, 'wt', encoding='utf-8') as f: - def ffmpeg_escape(text): - return re.sub(r'([\\=;#\n])', r'\\\1', text) - - metadata_file_content = ';FFMETADATA1\n' - for chapter in chapters: - metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n' - metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000) - metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000) - chapter_title = chapter.get('title') - if chapter_title: - metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title) - f.write(metadata_file_content) - options.append(('-map_metadata', '1')) - if ('no-attach-info-json' not in self.get_param('compat_opts', []) and '__infojson_filename' in info 
and info['ext'] in ('mkv', 'mka')): - old_stream, new_stream = self.get_stream_number(filename, ('tags', 'mimetype'), 'application/json') + old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json') if old_stream is not None: - options.append(('-map', '-0:%d' % old_stream)) + yield ('-map', '-0:%d' % old_stream) new_stream -= 1 - options.append(( - '-attach', info['__infojson_filename'], - '-metadata:s:%d' % new_stream, 'mimetype=application/json' - )) - - if not options: - self.to_screen('There isn\'t any metadata to add') - return [], info - - temp_filename = prepend_extension(filename, 'temp') - self.to_screen('Adding metadata to "%s"' % filename) - self.run_ffmpeg_multiple_files( - (filename, metadata_filename), temp_filename, - itertools.chain(self._options(info['ext']), *options)) - if chapters: - os.remove(metadata_filename) - os.replace(temp_filename, filename) - return [], info + yield ('-attach', info['__infojson_filename'], + '-metadata:s:%d' % new_stream, 'mimetype=application/json') class FFmpegMergerPP(FFmpegPostProcessor): @@ -808,6 +876,9 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): class FFmpegSplitChaptersPP(FFmpegPostProcessor): + def __init__(self, downloader, force_keyframes=False): + FFmpegPostProcessor.__init__(self, downloader) + self._force_keyframes = force_keyframes def _prepare_filename(self, number, chapter, info): info = info.copy() @@ -835,13 +906,18 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor): def run(self, info): chapters = info.get('chapters') or [] if not chapters: - self.report_warning('Chapter information is unavailable') + self.to_screen('Chapter information is unavailable') return [], info + in_file = info['filepath'] + if self._force_keyframes and len(chapters) > 1: + in_file = self.force_keyframes(in_file, (c['start_time'] for c in chapters)) self.to_screen('Splitting video by chapters; %d chapters found' % len(chapters)) for idx, chapter in 
enumerate(chapters): destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info) - self.real_run_ffmpeg([(info['filepath'], opts)], [(destination, ['-c', 'copy'])]) + self.real_run_ffmpeg([(in_file, opts)], [(destination, ['-c', 'copy'])]) + if in_file != info['filepath']: + os.remove(in_file) return [], info diff --git a/yt_dlp/postprocessor/modify_chapters.py b/yt_dlp/postprocessor/modify_chapters.py new file mode 100644 index 000000000..3d6493b68 --- /dev/null +++ b/yt_dlp/postprocessor/modify_chapters.py @@ -0,0 +1,333 @@ +import copy +import heapq +import os + +from .common import PostProcessor +from .ffmpeg import ( + FFmpegPostProcessor, + FFmpegSubtitlesConvertorPP +) +from .sponsorblock import SponsorBlockPP +from ..utils import ( + float_or_none, + orderedSet, + PostProcessingError, + prepend_extension, + traverse_obj, +) + + +_TINY_SPONSOR_OVERLAP_DURATION = 1 +DEFAULT_SPONSORBLOCK_CHAPTER_TITLE = '[SponsorBlock]: %(category_names)l' + + +class ModifyChaptersPP(FFmpegPostProcessor): + def __init__(self, downloader, remove_chapters_patterns=None, remove_sponsor_segments=None, + sponsorblock_chapter_title=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, force_keyframes=False): + FFmpegPostProcessor.__init__(self, downloader) + self._remove_chapters_patterns = set(remove_chapters_patterns or []) + self._remove_sponsor_segments = set(remove_sponsor_segments or []) + self._sponsorblock_chapter_title = sponsorblock_chapter_title + self._force_keyframes = force_keyframes + + @PostProcessor._restrict_to(images=False) + def run(self, info): + chapters, sponsor_chapters = self._mark_chapters_to_remove( + info.get('chapters') or [], info.get('sponsorblock_chapters') or []) + if not chapters and not sponsor_chapters: + return [], info + + real_duration = self._get_real_video_duration(info['filepath']) + if not chapters: + chapters = [{'start_time': 0, 'end_time': real_duration, 'title': info['title']}] + + info['chapters'], cuts = 
self._remove_marked_arrange_sponsors(chapters + sponsor_chapters) + if not cuts: + return [], info + + if abs(real_duration - info['duration']) > 1: + if abs(real_duration - info['chapters'][-1]['end_time']) < 1: + self.to_screen(f'Skipping {self.pp_key()} since the video appears to be already cut') + return [], info + if not info.get('__real_download'): + raise PostProcessingError('Cannot cut video since the real and expected durations mismatch. ' + 'Different chapters may have already been removed') + return [], info + else: + self.write_debug('Expected and actual durations mismatch') + + concat_opts = self._make_concat_opts(cuts, real_duration) + + def remove_chapters(file, is_sub): + return file, self.remove_chapters(file, cuts, concat_opts, self._force_keyframes and not is_sub) + + in_out_files = [remove_chapters(info['filepath'], False)] + in_out_files.extend(remove_chapters(in_file, True) for in_file in self._get_supported_subs(info)) + + # Renaming should only happen after all files are processed + files_to_remove = [] + for in_file, out_file in in_out_files: + uncut_file = prepend_extension(in_file, 'uncut') + os.replace(in_file, uncut_file) + os.replace(out_file, in_file) + files_to_remove.append(uncut_file) + + return files_to_remove, info + + def _mark_chapters_to_remove(self, chapters, sponsor_chapters): + if self._remove_chapters_patterns: + warn_no_chapter_to_remove = True + if not chapters: + self.to_screen('Chapter information is unavailable') + warn_no_chapter_to_remove = False + for c in chapters: + if any(regex.search(c['title']) for regex in self._remove_chapters_patterns): + c['remove'] = True + warn_no_chapter_to_remove = False + if warn_no_chapter_to_remove: + self.to_screen('There are no chapters matching the regex') + + if self._remove_sponsor_segments: + warn_no_chapter_to_remove = True + if not sponsor_chapters: + self.to_screen('SponsorBlock information is unavailable') + warn_no_chapter_to_remove = False + for c in sponsor_chapters: + 
+ if c['category'] in self._remove_sponsor_segments: + c['remove'] = True + warn_no_chapter_to_remove = False + if warn_no_chapter_to_remove: + self.to_screen('There are no matching SponsorBlock chapters') + + return chapters, sponsor_chapters + + def _get_real_video_duration(self, filename): + duration = float_or_none( + traverse_obj(self.get_metadata_object(filename), ('format', 'duration'))) + if duration is None: + raise PostProcessingError('ffprobe returned empty duration') + return duration + + def _get_supported_subs(self, info): + for sub in (info.get('requested_subtitles') or {}).values(): + sub_file = sub.get('filepath') + # The file might have been removed by --embed-subs + if not sub_file or not os.path.exists(sub_file): + continue + ext = sub['ext'] + if ext not in FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS: + self.report_warning(f'Cannot remove chapters from external {ext} subtitles; "{sub_file}" is now out of sync') + continue + # TODO: create __real_download for subs? + yield sub_file + + def _remove_marked_arrange_sponsors(self, chapters): + # Store cuts separately, since adjacent and overlapping cuts must be merged. + cuts = [] + + def append_cut(c): + assert 'remove' in c + last_to_cut = cuts[-1] if cuts else None + if last_to_cut and last_to_cut['end_time'] >= c['start_time']: + last_to_cut['end_time'] = max(last_to_cut['end_time'], c['end_time']) + else: + cuts.append(c) + return len(cuts) - 1 + + def excess_duration(c): + # Cuts that are completely within the chapter reduce chapters' duration. + # Since cuts can overlap, excess duration may be less than the sum of cuts' durations. + # To avoid that, chapter stores the index to the first cut within the chapter, + # instead of storing excess duration. append_cut ensures that subsequent cuts (if any) + # will be merged with previous ones (if necessary). 
+ cut_idx, excess = c.pop('cut_idx', len(cuts)), 0 + while cut_idx < len(cuts): + cut = cuts[cut_idx] + if cut['start_time'] >= c['end_time']: + break + if cut['end_time'] > c['start_time']: + excess += min(cut['end_time'], c['end_time']) + excess -= max(cut['start_time'], c['start_time']) + cut_idx += 1 + return excess + + new_chapters = [] + + def chapter_length(c): + return c['end_time'] - c['start_time'] + + def original_uncut_chapter(c): + return '_was_cut' not in c and '_categories' not in c + + def append_chapter(c): + assert 'remove' not in c + length = chapter_length(c) - excess_duration(c) + # Chapter is completely covered by cuts or sponsors. + if length <= 0: + return + start = new_chapters[-1]['end_time'] if new_chapters else 0 + c.update(start_time=start, end_time=start + length) + # Append without checking for tininess to prevent having + # a completely empty chapter list. + if not new_chapters: + new_chapters.append(c) + return + old_c = new_chapters[-1] + # Merge with the previous if the chapter is tiny. + # Only tiny chapters resulting from a cut can be skipped. + # Chapters that were already tiny in the original list will be preserved. + if not original_uncut_chapter(c) and length < _TINY_SPONSOR_OVERLAP_DURATION: + old_c['end_time'] = c['end_time'] + # Previous tiny chapter was appended for the sake of preventing an empty chapter list. + # Replace it with the current one. + elif not original_uncut_chapter(old_c) and chapter_length(old_c) < _TINY_SPONSOR_OVERLAP_DURATION: + c['start_time'] = old_c['start_time'] + new_chapters[-1] = c + else: + new_chapters.append(c) + + # Turn into a priority queue, index is a tie breaker. + # Plain stack sorted by start_time is not enough: after splitting the chapter, + # the part returned to the stack is not guaranteed to have start_time + # less than or equal to that of the stack's head. 
+ chapters = [(c['start_time'], i, c) for i, c in enumerate(chapters)] + heapq.heapify(chapters) + + _, cur_i, cur_chapter = heapq.heappop(chapters) + while chapters: + _, i, c = heapq.heappop(chapters) + # Non-overlapping chapters or cuts can be appended directly. However, + # adjacent non-overlapping cuts must be merged, which is handled by append_cut. + if cur_chapter['end_time'] <= c['start_time']: + (append_chapter if 'remove' not in cur_chapter else append_cut)(cur_chapter) + cur_i, cur_chapter = i, c + continue + + # Eight possibilities for overlapping chapters: (cut, cut), (cut, sponsor), + # (cut, normal), (sponsor, cut), (normal, cut), (sponsor, sponsor), + # (sponsor, normal), and (normal, sponsor). There is no (normal, normal): + # normal chapters are assumed not to overlap. + if 'remove' in cur_chapter: + # (cut, cut): adjust end_time. + if 'remove' in c: + cur_chapter['end_time'] = max(cur_chapter['end_time'], c['end_time']) + # (cut, sponsor/normal): chop the beginning of the later chapter + # (if it's not completely hidden by the cut). Push to the priority queue + # to restore sorting by start_time: with beginning chopped, c may actually + # start later than the remaining chapters from the queue. + elif cur_chapter['end_time'] < c['end_time']: + c['start_time'] = cur_chapter['end_time'] + c['_was_cut'] = True + heapq.heappush(chapters, (c['start_time'], i, c)) + # (sponsor/normal, cut). + elif 'remove' in c: + cur_chapter['_was_cut'] = True + # Chop the end of the current chapter if the cut is not contained within it. + # Chopping the end doesn't break start_time sorting, no PQ push is necessary. + if cur_chapter['end_time'] <= c['end_time']: + cur_chapter['end_time'] = c['start_time'] + append_chapter(cur_chapter) + cur_i, cur_chapter = i, c + continue + # Current chapter contains the cut within it. If the current chapter is + # a sponsor chapter, check whether the categories before and after the cut differ. 
+ if '_categories' in cur_chapter: + after_c = dict(cur_chapter, start_time=c['end_time'], _categories=[]) + cur_cats = [] + for cat_start_end in cur_chapter['_categories']: + if cat_start_end[1] < c['start_time']: + cur_cats.append(cat_start_end) + if cat_start_end[2] > c['end_time']: + after_c['_categories'].append(cat_start_end) + cur_chapter['_categories'] = cur_cats + if cur_chapter['_categories'] != after_c['_categories']: + # Categories before and after the cut differ: push the after part to PQ. + heapq.heappush(chapters, (after_c['start_time'], cur_i, after_c)) + cur_chapter['end_time'] = c['start_time'] + append_chapter(cur_chapter) + cur_i, cur_chapter = i, c + continue + # Either sponsor categories before and after the cut are the same or + # we're dealing with a normal chapter. Just register an outstanding cut: + # subsequent append_chapter will reduce the duration. + cur_chapter.setdefault('cut_idx', append_cut(c)) + # (sponsor, normal): if a normal chapter is not completely overlapped, + # chop the beginning of it and push it to PQ. + elif '_categories' in cur_chapter and '_categories' not in c: + if cur_chapter['end_time'] < c['end_time']: + c['start_time'] = cur_chapter['end_time'] + c['_was_cut'] = True + heapq.heappush(chapters, (c['start_time'], i, c)) + # (normal, sponsor) and (sponsor, sponsor) + else: + assert '_categories' in c + cur_chapter['_was_cut'] = True + c['_was_cut'] = True + # Push the part after the sponsor to PQ. + if cur_chapter['end_time'] > c['end_time']: + # deepcopy to make categories in after_c and cur_chapter/c refer to different lists. + after_c = dict(copy.deepcopy(cur_chapter), start_time=c['end_time']) + heapq.heappush(chapters, (after_c['start_time'], cur_i, after_c)) + # Push the part after the overlap to PQ. 
+ elif c['end_time'] > cur_chapter['end_time']: + after_cur = dict(copy.deepcopy(c), start_time=cur_chapter['end_time']) + heapq.heappush(chapters, (after_cur['start_time'], cur_i, after_cur)) + c['end_time'] = cur_chapter['end_time'] + # (sponsor, sponsor): merge categories in the overlap. + if '_categories' in cur_chapter: + c['_categories'] = cur_chapter['_categories'] + c['_categories'] + # Inherit the cuts that the current chapter has accumulated within it. + if 'cut_idx' in cur_chapter: + c['cut_idx'] = cur_chapter['cut_idx'] + cur_chapter['end_time'] = c['start_time'] + append_chapter(cur_chapter) + cur_i, cur_chapter = i, c + (append_chapter if 'remove' not in cur_chapter else append_cut)(cur_chapter) + + i = -1 + for c in new_chapters.copy(): + i += 1 + c.pop('_was_cut', None) + cats = c.pop('_categories', None) + if cats: + category = min(cats, key=lambda c: c[2] - c[1])[0] + cats = orderedSet(x[0] for x in cats) + c.update({ + 'category': category, + 'categories': cats, + 'name': SponsorBlockPP.CATEGORIES[category], + 'category_names': [SponsorBlockPP.CATEGORIES[c] for c in cats] + }) + outtmpl, tmpl_dict = self._downloader.prepare_outtmpl(self._sponsorblock_chapter_title, c) + c['title'] = self._downloader.escape_outtmpl(outtmpl) % tmpl_dict + if i > 0 and c['title'] == new_chapters[i - 1]['title']: + new_chapters[i - 1]['end_time'] = c['end_time'] + new_chapters.pop(i) + i -= 1 + + return new_chapters, cuts + + def remove_chapters(self, filename, ranges_to_cut, concat_opts, force_keyframes=False): + in_file = filename + out_file = prepend_extension(in_file, 'temp') + if force_keyframes: + in_file = self.force_keyframes(in_file, (t for r in ranges_to_cut for t in r)) + self.to_screen(f'Removing chapters from {filename}') + self.concat_files([in_file] * len(concat_opts), out_file, concat_opts) + if in_file != filename: + os.remove(in_file) + return out_file + + @staticmethod + def _make_concat_opts(chapters_to_remove, duration): + opts = [{}] + for s in 
chapters_to_remove: + # Do not create 0 duration chunk at the beginning. + if s['start_time'] == 0: + opts[-1]['inpoint'] = f'{s["end_time"]:.6f}' + continue + opts[-1]['outpoint'] = f'{s["start_time"]:.6f}' + # Do not create 0 duration chunk at the end. + if s['end_time'] != duration: + opts.append({'inpoint': f'{s["end_time"]:.6f}'}) + return opts diff --git a/yt_dlp/postprocessor/sponskrub.py b/yt_dlp/postprocessor/sponskrub.py index 588f0ae12..932555a0e 100644 --- a/yt_dlp/postprocessor/sponskrub.py +++ b/yt_dlp/postprocessor/sponskrub.py @@ -17,6 +17,7 @@ from ..utils import ( ) +# Deprecated in favor of the native implementation class SponSkrubPP(PostProcessor): _temp_ext = 'spons' _exe_name = 'sponskrub' diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py new file mode 100644 index 000000000..6264d45c5 --- /dev/null +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -0,0 +1,96 @@ +import json +import re +from hashlib import sha256 + +from .ffmpeg import FFmpegPostProcessor +from ..compat import compat_urllib_parse_urlencode, compat_HTTPError +from ..utils import PostProcessingError, sanitized_Request + + +class SponsorBlockPP(FFmpegPostProcessor): + + EXTRACTORS = { + 'Youtube': 'YouTube', + } + CATEGORIES = { + 'sponsor': 'Sponsor', + 'intro': 'Intermission/Intro Animation', + 'outro': 'Endcards/Credits', + 'selfpromo': 'Unpaid/Self Promotion', + 'interaction': 'Interaction Reminder', + 'preview': 'Preview/Recap', + 'music_offtopic': 'Non-Music Section' + } + + def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'): + FFmpegPostProcessor.__init__(self, downloader) + self._categories = tuple(categories or self.CATEGORIES.keys()) + self._API_URL = api if re.match('^https?://', api) else 'https://' + api + + def run(self, info): + extractor = info['extractor_key'] + if extractor not in self.EXTRACTORS: + self.to_screen(f'SponsorBlock is not supported for {extractor}') + return [], info + + 
info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info['duration']) + return [], info + + def _get_sponsor_chapters(self, info, duration): + segments = self._get_sponsor_segments(info['id'], self.EXTRACTORS[info['extractor_key']]) + + def duration_filter(s): + start_end = s['segment'] + # Ignore milliseconds difference at the start. + if start_end[0] <= 1: + start_end[0] = 0 + # Ignore milliseconds difference at the end. + # Never allow the segment to exceed the video. + if duration and duration - start_end[1] <= 1: + start_end[1] = duration + # SponsorBlock duration may be absent or it may deviate from the real one. + return s['videoDuration'] == 0 or not duration or abs(duration - s['videoDuration']) <= 1 + + duration_match = [s for s in segments if duration_filter(s)] + if len(duration_match) != len(segments): + self.report_warning('Some SponsorBlock segments are from a video of different duration, maybe from an old version of this video') + + def to_chapter(s): + (start, end), cat = s['segment'], s['category'] + return { + 'start_time': start, + 'end_time': end, + 'category': cat, + 'title': self.CATEGORIES[cat], + '_categories': [(cat, start, end)] + } + + sponsor_chapters = [to_chapter(s) for s in duration_match] + if not sponsor_chapters: + self.to_screen('No segments were found in the SponsorBlock database') + else: + self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database') + return sponsor_chapters + + def _get_sponsor_segments(self, video_id, service): + hash = sha256(video_id.encode('ascii')).hexdigest() + # SponsorBlock API recommends using first 4 hash characters. + url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' 
+ compat_urllib_parse_urlencode({ + 'service': service, + 'categories': json.dumps(self._categories), + }) + for d in self._get_json(url): + if d['videoID'] == video_id: + return d['segments'] + return [] + + def _get_json(self, url): + self.write_debug(f'SponsorBlock query: {url}') + try: + rsp = self._downloader.urlopen(sanitized_Request(url)) + except compat_HTTPError as e: + if e.code == 404: + return [] + raise PostProcessingError(f'Error communicating with SponsorBlock API - {e}') + + return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8')) |