Diffstat (limited to 'yt_dlp/postprocessor')
-rw-r--r--  yt_dlp/postprocessor/__init__.py        |   4
-rw-r--r--  yt_dlp/postprocessor/ffmpeg.py          | 174
-rw-r--r--  yt_dlp/postprocessor/modify_chapters.py | 333
-rw-r--r--  yt_dlp/postprocessor/sponskrub.py       |   1
-rw-r--r--  yt_dlp/postprocessor/sponsorblock.py    |  96
5 files changed, 559 insertions, 49 deletions
diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py
index 31c2d7c68..adbcd3755 100644
--- a/yt_dlp/postprocessor/__init__.py
+++ b/yt_dlp/postprocessor/__init__.py
@@ -26,7 +26,9 @@ from .metadataparser import (
     MetadataParserPP,
 )
 from .movefilesafterdownload import MoveFilesAfterDownloadPP
+from .sponsorblock import SponsorBlockPP
 from .sponskrub import SponSkrubPP
+from .modify_chapters import ModifyChaptersPP
 
 
 def get_postprocessor(key):
@@ -56,6 +58,8 @@ __all__ = [
     'MetadataFromFieldPP',
     'MetadataFromTitlePP',
     'MoveFilesAfterDownloadPP',
+    'SponsorBlockPP',
     'SponSkrubPP',
+    'ModifyChaptersPP',
     'XAttrMetadataPP',
 ]
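Note (editor's illustration, not part of the patch): the two new post-processors are designed to be chained. SponsorBlockPP only annotates the info dict with 'sponsorblock_chapters'; ModifyChaptersPP performs the actual cutting. A minimal sketch of wiring them up through the embedding API, with a made-up video URL; the command-line options added alongside this patch arrange roughly the same thing automatically:

    from yt_dlp import YoutubeDL
    from yt_dlp.postprocessor import ModifyChaptersPP, SponsorBlockPP

    with YoutubeDL() as ydl:
        # SponsorBlockPP runs first and stores 'sponsorblock_chapters' in the info dict
        ydl.add_post_processor(SponsorBlockPP(ydl, categories=['sponsor']))
        # ModifyChaptersPP then cuts the marked ranges out of the downloaded file
        ydl.add_post_processor(ModifyChaptersPP(ydl, remove_sponsor_segments=['sponsor']))
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])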
diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py
index 7537d5db4..806334645 100644
--- a/yt_dlp/postprocessor/ffmpeg.py
+++ b/yt_dlp/postprocessor/ffmpeg.py
@@ -8,22 +8,22 @@ import time
 import re
 import json
 
-
 from .common import AudioConversionError, PostProcessor
 from ..compat import compat_str, compat_numeric_types
 from ..utils import (
+    dfxp2srt,
     encodeArgument,
     encodeFilename,
     get_exe_version,
     is_outdated_version,
+    ISO639Utils,
+    orderedSet,
     PostProcessingError,
     prepend_extension,
-    shell_quote,
-    dfxp2srt,
-    ISO639Utils,
     process_communicate_or_kill,
     replace_extension,
+    shell_quote,
     traverse_obj,
     variadic,
 )
@@ -281,7 +281,8 @@ class FFmpegPostProcessor(PostProcessor):
     def run_ffmpeg(self, path, out_path, opts, **kwargs):
         return self.run_ffmpeg_multiple_files([path], out_path, opts, **kwargs)
 
-    def _ffmpeg_filename_argument(self, fn):
+    @staticmethod
+    def _ffmpeg_filename_argument(fn):
         # Always use 'file:' because the filename may contain ':' (ffmpeg
         # interprets that as a protocol) or can start with '-' (-- is broken in
         # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details)
@@ -290,6 +291,62 @@ class FFmpegPostProcessor(PostProcessor):
             return fn
         return 'file:' + fn if fn != '-' else fn
 
+    @staticmethod
+    def _quote_for_ffmpeg(string):
+        # See https://ffmpeg.org/ffmpeg-utils.html#toc-Quoting-and-escaping
+        # A sequence of '' produces '\'''\'';
+        # final replace removes the empty '' between \' \'.
+        string = string.replace("'", r"'\''").replace("'''", "'")
+        # Handle potential ' at string boundaries.
+        string = string[1:] if string[0] == "'" else "'" + string
+        return string[:-1] if string[-1] == "'" else string + "'"
+
+    def force_keyframes(self, filename, timestamps):
+        timestamps = orderedSet(timestamps)
+        if timestamps[0] == 0:
+            timestamps = timestamps[1:]
+        keyframe_file = prepend_extension(filename, 'keyframes.temp')
+        self.to_screen(f'Re-encoding "{filename}" with appropriate keyframes')
+        self.run_ffmpeg(filename, keyframe_file, ['-force_key_frames', ','.join(
+            f'{t:.6f}' for t in timestamps)])
+        return keyframe_file
+
+    def concat_files(self, in_files, out_file, concat_opts=None):
+        """
+        Use concat demuxer to concatenate multiple files having identical streams.
+
+        Only inpoint, outpoint, and duration concat options are supported.
+        See https://ffmpeg.org/ffmpeg-formats.html#concat-1 for details
+        """
+        concat_file = f'{out_file}.concat'
+        self.write_debug(f'Writing concat spec to {concat_file}')
+        with open(concat_file, 'wt', encoding='utf-8') as f:
+            f.writelines(self._concat_spec(in_files, concat_opts))
+
+        out_flags = ['-c', 'copy']
+        if out_file.rpartition('.')[-1] in ('mp4', 'mov'):
+            # For some reason, '-c copy' is not enough to copy subtitles
+            out_flags.extend(['-c:s', 'mov_text', '-movflags', '+faststart'])
+
+        try:
+            self.real_run_ffmpeg(
+                [(concat_file, ['-hide_banner', '-nostdin', '-f', 'concat', '-safe', '0'])],
+                [(out_file, out_flags)])
+        finally:
+            os.remove(concat_file)
+
+    @classmethod
+    def _concat_spec(cls, in_files, concat_opts=None):
+        if concat_opts is None:
+            concat_opts = [{}] * len(in_files)
+        yield 'ffconcat version 1.0\n'
+        for file, opts in zip(in_files, concat_opts):
+            yield f'file {cls._quote_for_ffmpeg(cls._ffmpeg_filename_argument(file))}\n'
+            # Iterate explicitly to yield the following directives in order, ignoring the rest.
+            for directive in 'inpoint', 'outpoint', 'duration':
+                if directive in opts:
+                    yield f'{directive} {opts[directive]}\n'
+
 
 class FFmpegExtractAudioPP(FFmpegPostProcessor):
     COMMON_AUDIO_EXTS = ('wav', 'flac', 'm4a', 'aiff', 'mp3', 'ogg', 'mka', 'opus', 'wma')
@@ -531,6 +588,11 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
 
 
 class FFmpegMetadataPP(FFmpegPostProcessor):
+    def __init__(self, downloader, add_metadata=True, add_chapters=True):
+        FFmpegPostProcessor.__init__(self, downloader)
+        self._add_metadata = add_metadata
+        self._add_chapters = add_chapters
+
     @staticmethod
     def _options(target_ext):
         yield from ('-map', '0', '-dn')
@@ -541,6 +603,46 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
 
     @PostProcessor._restrict_to(images=False)
     def run(self, info):
+        filename, metadata_filename = info['filepath'], None
+        options = []
+        if self._add_chapters and info.get('chapters'):
+            metadata_filename = replace_extension(filename, 'meta')
+            options.extend(self._get_chapter_opts(info['chapters'], metadata_filename))
+        if self._add_metadata:
+            options.extend(self._get_metadata_opts(info))
+
+        if not options:
+            self.to_screen('There isn\'t any metadata to add')
+            return [], info
+
+        temp_filename = prepend_extension(filename, 'temp')
+        self.to_screen('Adding metadata to "%s"' % filename)
+        self.run_ffmpeg_multiple_files(
+            (filename, metadata_filename), temp_filename,
+            itertools.chain(self._options(info['ext']), *options))
+        if metadata_filename:
+            os.remove(metadata_filename)
+        os.replace(temp_filename, filename)
+        return [], info
+
+    @staticmethod
+    def _get_chapter_opts(chapters, metadata_filename):
+        with io.open(metadata_filename, 'wt', encoding='utf-8') as f:
+            def ffmpeg_escape(text):
+                return re.sub(r'([\\=;#\n])', r'\\\1', text)
+
+            metadata_file_content = ';FFMETADATA1\n'
+            for chapter in chapters:
+                metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n'
+                metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000)
+                metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000)
+                chapter_title = chapter.get('title')
+                if chapter_title:
+                    metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title)
+            f.write(metadata_file_content)
+        yield ('-map_metadata', '1')
+
+    def _get_metadata_opts(self, info):
         metadata = {}
 
         def add(meta_list, info_list=None):
@@ -577,61 +679,27 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
             for key in filter(lambda k: k.startswith(prefix), info.keys()):
                 add(key[len(prefix):], key)
 
-        filename, metadata_filename = info['filepath'], None
-        options = [('-metadata', f'{name}={value}') for name, value in metadata.items()]
+        for name, value in metadata.items():
+            yield ('-metadata', f'{name}={value}')
 
         stream_idx = 0
         for fmt in info.get('requested_formats') or []:
             stream_count = 2 if 'none' not in (fmt.get('vcodec'), fmt.get('acodec')) else 1
             if fmt.get('language'):
                 lang = ISO639Utils.short2long(fmt['language']) or fmt['language']
-                options.extend(('-metadata:s:%d' % (stream_idx + i), 'language=%s' % lang)
-                               for i in range(stream_count))
+                for i in range(stream_count):
+                    yield ('-metadata:s:%d' % (stream_idx + i), 'language=%s' % lang)
             stream_idx += stream_count
 
-        chapters = info.get('chapters', [])
-        if chapters:
-            metadata_filename = replace_extension(filename, 'meta')
-            with io.open(metadata_filename, 'wt', encoding='utf-8') as f:
-                def ffmpeg_escape(text):
-                    return re.sub(r'([\\=;#\n])', r'\\\1', text)
-
-                metadata_file_content = ';FFMETADATA1\n'
-                for chapter in chapters:
-                    metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n'
-                    metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000)
-                    metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000)
-                    chapter_title = chapter.get('title')
-                    if chapter_title:
-                        metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title)
-                f.write(metadata_file_content)
-            options.append(('-map_metadata', '1'))
-
         if ('no-attach-info-json' not in self.get_param('compat_opts', [])
                 and '__infojson_filename' in info and info['ext'] in ('mkv', 'mka')):
-            old_stream, new_stream = self.get_stream_number(filename, ('tags', 'mimetype'), 'application/json')
+            old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json')
             if old_stream is not None:
-                options.append(('-map', '-0:%d' % old_stream))
+                yield ('-map', '-0:%d' % old_stream)
                 new_stream -= 1
-            options.append((
-                '-attach', info['__infojson_filename'],
-                '-metadata:s:%d' % new_stream, 'mimetype=application/json'
-            ))
-
-        if not options:
-            self.to_screen('There isn\'t any metadata to add')
-            return [], info
-
-        temp_filename = prepend_extension(filename, 'temp')
-        self.to_screen('Adding metadata to "%s"' % filename)
-        self.run_ffmpeg_multiple_files(
-            (filename, metadata_filename), temp_filename,
-            itertools.chain(self._options(info['ext']), *options))
-        if chapters:
-            os.remove(metadata_filename)
-        os.replace(temp_filename, filename)
-        return [], info
+            yield ('-attach', info['__infojson_filename'],
+                   '-metadata:s:%d' % new_stream, 'mimetype=application/json')
 
 
 class FFmpegMergerPP(FFmpegPostProcessor):
@@ -808,6 +876,9 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
 
 
 class FFmpegSplitChaptersPP(FFmpegPostProcessor):
+    def __init__(self, downloader, force_keyframes=False):
+        FFmpegPostProcessor.__init__(self, downloader)
+        self._force_keyframes = force_keyframes
 
     def _prepare_filename(self, number, chapter, info):
         info = info.copy()
@@ -835,13 +906,18 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor):
     def run(self, info):
         chapters = info.get('chapters') or []
         if not chapters:
-            self.report_warning('Chapter information is unavailable')
+            self.to_screen('Chapter information is unavailable')
             return [], info
 
+        in_file = info['filepath']
+        if self._force_keyframes and len(chapters) > 1:
+            in_file = self.force_keyframes(in_file, (c['start_time'] for c in chapters))
         self.to_screen('Splitting video by chapters; %d chapters found' % len(chapters))
         for idx, chapter in enumerate(chapters):
             destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info)
-            self.real_run_ffmpeg([(info['filepath'], opts)], [(destination, ['-c', 'copy'])])
+            self.real_run_ffmpeg([(in_file, opts)], [(destination, ['-c', 'copy'])])
+        if in_file != info['filepath']:
+            os.remove(in_file)
         return [], info
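Note (editor's illustration, not part of the patch): to make the concat machinery above concrete, suppose the range 15-45s is cut out of a hypothetical video.mp4. ModifyChaptersPP (below) passes the same source file once per kept chunk, and _concat_spec emits an ffconcat listing like this:

    # Assumed inputs, mirroring what ModifyChaptersPP.remove_chapters() builds:
    in_files = ['video.mp4', 'video.mp4']
    concat_opts = [{'outpoint': '15.000000'}, {'inpoint': '45.000000'}]

    # _concat_spec(in_files, concat_opts) yields the spec below, which
    # concat_files() writes to video.mp4.concat and runs through
    # `ffmpeg -f concat -safe 0 -i video.mp4.concat -c copy ...`:
    #
    #   ffconcat version 1.0
    #   file 'file:video.mp4'
    #   outpoint 15.000000
    #   file 'file:video.mp4'
    #   inpoint 45.000000
    #
    # i.e. keep [0, 15) and [45, end), dropping the 15-45s range.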
diff --git a/yt_dlp/postprocessor/modify_chapters.py b/yt_dlp/postprocessor/modify_chapters.py
new file mode 100644
index 000000000..3d6493b68
--- /dev/null
+++ b/yt_dlp/postprocessor/modify_chapters.py
@@ -0,0 +1,333 @@
+import copy
+import heapq
+import os
+
+from .common import PostProcessor
+from .ffmpeg import (
+    FFmpegPostProcessor,
+    FFmpegSubtitlesConvertorPP
+)
+from .sponsorblock import SponsorBlockPP
+from ..utils import (
+    float_or_none,
+    orderedSet,
+    PostProcessingError,
+    prepend_extension,
+    traverse_obj,
+)
+
+
+_TINY_SPONSOR_OVERLAP_DURATION = 1
+DEFAULT_SPONSORBLOCK_CHAPTER_TITLE = '[SponsorBlock]: %(category_names)l'
+
+
+class ModifyChaptersPP(FFmpegPostProcessor):
+    def __init__(self, downloader, remove_chapters_patterns=None, remove_sponsor_segments=None,
+                 sponsorblock_chapter_title=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, force_keyframes=False):
+        FFmpegPostProcessor.__init__(self, downloader)
+        self._remove_chapters_patterns = set(remove_chapters_patterns or [])
+        self._remove_sponsor_segments = set(remove_sponsor_segments or [])
+        self._sponsorblock_chapter_title = sponsorblock_chapter_title
+        self._force_keyframes = force_keyframes
+
+    @PostProcessor._restrict_to(images=False)
+    def run(self, info):
+        chapters, sponsor_chapters = self._mark_chapters_to_remove(
+            info.get('chapters') or [], info.get('sponsorblock_chapters') or [])
+        if not chapters and not sponsor_chapters:
+            return [], info
+
+        real_duration = self._get_real_video_duration(info['filepath'])
+        if not chapters:
+            chapters = [{'start_time': 0, 'end_time': real_duration, 'title': info['title']}]
+
+        info['chapters'], cuts = self._remove_marked_arrange_sponsors(chapters + sponsor_chapters)
+        if not cuts:
+            return [], info
+
+        if abs(real_duration - info['duration']) > 1:
+            if abs(real_duration - info['chapters'][-1]['end_time']) < 1:
+                self.to_screen(f'Skipping {self.pp_key()} since the video appears to be already cut')
+                return [], info
+            if not info.get('__real_download'):
+                raise PostProcessingError('Cannot cut video since the real and expected durations mismatch. '
+                                          'Different chapters may have already been removed')
+            self.write_debug('Expected and actual durations mismatch')
+
+        concat_opts = self._make_concat_opts(cuts, real_duration)
+
+        def remove_chapters(file, is_sub):
+            return file, self.remove_chapters(file, cuts, concat_opts, self._force_keyframes and not is_sub)
+
+        in_out_files = [remove_chapters(info['filepath'], False)]
+        in_out_files.extend(remove_chapters(in_file, True) for in_file in self._get_supported_subs(info))
+
+        # Renaming should only happen after all files are processed
+        files_to_remove = []
+        for in_file, out_file in in_out_files:
+            uncut_file = prepend_extension(in_file, 'uncut')
+            os.replace(in_file, uncut_file)
+            os.replace(out_file, in_file)
+            files_to_remove.append(uncut_file)
+
+        return files_to_remove, info
+
+    def _mark_chapters_to_remove(self, chapters, sponsor_chapters):
+        if self._remove_chapters_patterns:
+            warn_no_chapter_to_remove = True
+            if not chapters:
+                self.to_screen('Chapter information is unavailable')
+                warn_no_chapter_to_remove = False
+            for c in chapters:
+                if any(regex.search(c['title']) for regex in self._remove_chapters_patterns):
+                    c['remove'] = True
+                    warn_no_chapter_to_remove = False
+            if warn_no_chapter_to_remove:
+                self.to_screen('There are no chapters matching the regex')
+
+        if self._remove_sponsor_segments:
+            warn_no_chapter_to_remove = True
+            if not sponsor_chapters:
+                self.to_screen('SponsorBlock information is unavailable')
+                warn_no_chapter_to_remove = False
+            for c in sponsor_chapters:
+                if c['category'] in self._remove_sponsor_segments:
+                    c['remove'] = True
+                    warn_no_chapter_to_remove = False
+            if warn_no_chapter_to_remove:
+                self.to_screen('There are no matching SponsorBlock chapters')
+
+        return chapters, sponsor_chapters
+
+    def _get_real_video_duration(self, filename):
+        duration = float_or_none(
+            traverse_obj(self.get_metadata_object(filename), ('format', 'duration')))
+        if duration is None:
+            raise PostProcessingError('ffprobe returned empty duration')
+        return duration
+
+    def _get_supported_subs(self, info):
+        for sub in (info.get('requested_subtitles') or {}).values():
+            sub_file = sub.get('filepath')
+            # The file might have been removed by --embed-subs
+            if not sub_file or not os.path.exists(sub_file):
+                continue
+            ext = sub['ext']
+            if ext not in FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS:
+                self.report_warning(f'Cannot remove chapters from external {ext} subtitles; "{sub_file}" is now out of sync')
+                continue
+            # TODO: create __real_download for subs?
+            yield sub_file
+
+    def _remove_marked_arrange_sponsors(self, chapters):
+        # Store cuts separately, since adjacent and overlapping cuts must be merged.
+        cuts = []
+
+        def append_cut(c):
+            assert 'remove' in c
+            last_to_cut = cuts[-1] if cuts else None
+            if last_to_cut and last_to_cut['end_time'] >= c['start_time']:
+                last_to_cut['end_time'] = max(last_to_cut['end_time'], c['end_time'])
+            else:
+                cuts.append(c)
+            return len(cuts) - 1
+
+        def excess_duration(c):
+            # Cuts that are completely within the chapter reduce the chapter's duration.
+            # Since cuts can overlap, excess duration may be less than the sum of the cuts' durations.
+            # To avoid that, the chapter stores the index of the first cut within it,
+            # instead of storing excess duration. append_cut ensures that subsequent cuts (if any)
+            # will be merged with previous ones (if necessary).
+            cut_idx, excess = c.pop('cut_idx', len(cuts)), 0
+            while cut_idx < len(cuts):
+                cut = cuts[cut_idx]
+                if cut['start_time'] >= c['end_time']:
+                    break
+                if cut['end_time'] > c['start_time']:
+                    excess += min(cut['end_time'], c['end_time'])
+                    excess -= max(cut['start_time'], c['start_time'])
+                cut_idx += 1
+            return excess
+
+        new_chapters = []
+
+        def chapter_length(c):
+            return c['end_time'] - c['start_time']
+
+        def original_uncut_chapter(c):
+            return '_was_cut' not in c and '_categories' not in c
+
+        def append_chapter(c):
+            assert 'remove' not in c
+            length = chapter_length(c) - excess_duration(c)
+            # Chapter is completely covered by cuts or sponsors.
+            if length <= 0:
+                return
+            start = new_chapters[-1]['end_time'] if new_chapters else 0
+            c.update(start_time=start, end_time=start + length)
+            # Append without checking for tininess to prevent having
+            # a completely empty chapter list.
+            if not new_chapters:
+                new_chapters.append(c)
+                return
+            old_c = new_chapters[-1]
+            # Merge with the previous if the chapter is tiny.
+            # Only tiny chapters resulting from a cut can be skipped.
+            # Chapters that were already tiny in the original list will be preserved.
+            if not original_uncut_chapter(c) and length < _TINY_SPONSOR_OVERLAP_DURATION:
+                old_c['end_time'] = c['end_time']
+            # Previous tiny chapter was appended for the sake of preventing an empty chapter list.
+            # Replace it with the current one.
+            elif not original_uncut_chapter(old_c) and chapter_length(old_c) < _TINY_SPONSOR_OVERLAP_DURATION:
+                c['start_time'] = old_c['start_time']
+                new_chapters[-1] = c
+            else:
+                new_chapters.append(c)
+
+        # Turn into a priority queue, index is a tie breaker.
+        # Plain stack sorted by start_time is not enough: after splitting the chapter,
+        # the part returned to the stack is not guaranteed to have start_time
+        # less than or equal to that of the stack's head.
+        chapters = [(c['start_time'], i, c) for i, c in enumerate(chapters)]
+        heapq.heapify(chapters)
+
+        _, cur_i, cur_chapter = heapq.heappop(chapters)
+        while chapters:
+            _, i, c = heapq.heappop(chapters)
+            # Non-overlapping chapters or cuts can be appended directly. However,
+            # adjacent non-overlapping cuts must be merged, which is handled by append_cut.
+            if cur_chapter['end_time'] <= c['start_time']:
+                (append_chapter if 'remove' not in cur_chapter else append_cut)(cur_chapter)
+                cur_i, cur_chapter = i, c
+                continue
+
+            # Eight possibilities for overlapping chapters: (cut, cut), (cut, sponsor),
+            # (cut, normal), (sponsor, cut), (normal, cut), (sponsor, sponsor),
+            # (sponsor, normal), and (normal, sponsor). There is no (normal, normal):
+            # normal chapters are assumed not to overlap.
+            if 'remove' in cur_chapter:
+                # (cut, cut): adjust end_time.
+                if 'remove' in c:
+                    cur_chapter['end_time'] = max(cur_chapter['end_time'], c['end_time'])
+                # (cut, sponsor/normal): chop the beginning of the later chapter
+                # (if it's not completely hidden by the cut). Push to the priority queue
+                # to restore sorting by start_time: with beginning chopped, c may actually
+                # start later than the remaining chapters from the queue.
+                elif cur_chapter['end_time'] < c['end_time']:
+                    c['start_time'] = cur_chapter['end_time']
+                    c['_was_cut'] = True
+                    heapq.heappush(chapters, (c['start_time'], i, c))
+            # (sponsor/normal, cut).
+            elif 'remove' in c:
+                cur_chapter['_was_cut'] = True
+                # Chop the end of the current chapter if the cut is not contained within it.
+                # Chopping the end doesn't break start_time sorting, no PQ push is necessary.
+                if cur_chapter['end_time'] <= c['end_time']:
+                    cur_chapter['end_time'] = c['start_time']
+                    append_chapter(cur_chapter)
+                    cur_i, cur_chapter = i, c
+                    continue
+                # Current chapter contains the cut within it. If the current chapter is
+                # a sponsor chapter, check whether the categories before and after the cut differ.
+                if '_categories' in cur_chapter:
+                    after_c = dict(cur_chapter, start_time=c['end_time'], _categories=[])
+                    cur_cats = []
+                    for cat_start_end in cur_chapter['_categories']:
+                        if cat_start_end[1] < c['start_time']:
+                            cur_cats.append(cat_start_end)
+                        if cat_start_end[2] > c['end_time']:
+                            after_c['_categories'].append(cat_start_end)
+                    cur_chapter['_categories'] = cur_cats
+                    if cur_chapter['_categories'] != after_c['_categories']:
+                        # Categories before and after the cut differ: push the after part to PQ.
+                        heapq.heappush(chapters, (after_c['start_time'], cur_i, after_c))
+                        cur_chapter['end_time'] = c['start_time']
+                        append_chapter(cur_chapter)
+                        cur_i, cur_chapter = i, c
+                        continue
+                # Either sponsor categories before and after the cut are the same or
+                # we're dealing with a normal chapter. Just register an outstanding cut:
+                # subsequent append_chapter will reduce the duration.
+                cur_chapter.setdefault('cut_idx', append_cut(c))
+            # (sponsor, normal): if a normal chapter is not completely overlapped,
+            # chop the beginning of it and push it to PQ.
+            elif '_categories' in cur_chapter and '_categories' not in c:
+                if cur_chapter['end_time'] < c['end_time']:
+                    c['start_time'] = cur_chapter['end_time']
+                    c['_was_cut'] = True
+                    heapq.heappush(chapters, (c['start_time'], i, c))
+            # (normal, sponsor) and (sponsor, sponsor)
+            else:
+                assert '_categories' in c
+                cur_chapter['_was_cut'] = True
+                c['_was_cut'] = True
+                # Push the part after the sponsor to PQ.
+                if cur_chapter['end_time'] > c['end_time']:
+                    # deepcopy to make categories in after_c and cur_chapter/c refer to different lists.
+                    after_c = dict(copy.deepcopy(cur_chapter), start_time=c['end_time'])
+                    heapq.heappush(chapters, (after_c['start_time'], cur_i, after_c))
+                # Push the part after the overlap to PQ.
+                elif c['end_time'] > cur_chapter['end_time']:
+                    after_cur = dict(copy.deepcopy(c), start_time=cur_chapter['end_time'])
+                    heapq.heappush(chapters, (after_cur['start_time'], cur_i, after_cur))
+                    c['end_time'] = cur_chapter['end_time']
+                # (sponsor, sponsor): merge categories in the overlap.
+                if '_categories' in cur_chapter:
+                    c['_categories'] = cur_chapter['_categories'] + c['_categories']
+                # Inherit the cuts that the current chapter has accumulated within it.
+                if 'cut_idx' in cur_chapter:
+                    c['cut_idx'] = cur_chapter['cut_idx']
+                cur_chapter['end_time'] = c['start_time']
+                append_chapter(cur_chapter)
+                cur_i, cur_chapter = i, c
+        (append_chapter if 'remove' not in cur_chapter else append_cut)(cur_chapter)
+
+        i = -1
+        for c in new_chapters.copy():
+            i += 1
+            c.pop('_was_cut', None)
+            cats = c.pop('_categories', None)
+            if cats:
+                category = min(cats, key=lambda c: c[2] - c[1])[0]
+                cats = orderedSet(x[0] for x in cats)
+                c.update({
+                    'category': category,
+                    'categories': cats,
+                    'name': SponsorBlockPP.CATEGORIES[category],
+                    'category_names': [SponsorBlockPP.CATEGORIES[c] for c in cats]
+                })
+                outtmpl, tmpl_dict = self._downloader.prepare_outtmpl(self._sponsorblock_chapter_title, c)
+                c['title'] = self._downloader.escape_outtmpl(outtmpl) % tmpl_dict
+                if i > 0 and c['title'] == new_chapters[i - 1]['title']:
+                    new_chapters[i - 1]['end_time'] = c['end_time']
+                    new_chapters.pop(i)
+                    i -= 1
+
+        return new_chapters, cuts
+
+    def remove_chapters(self, filename, ranges_to_cut, concat_opts, force_keyframes=False):
+        in_file = filename
+        out_file = prepend_extension(in_file, 'temp')
+        if force_keyframes:
+            in_file = self.force_keyframes(in_file, (t for c in ranges_to_cut for t in (c['start_time'], c['end_time'])))
+        self.to_screen(f'Removing chapters from {filename}')
+        self.concat_files([in_file] * len(concat_opts), out_file, concat_opts)
+        if in_file != filename:
+            os.remove(in_file)
+        return out_file
+
+    @staticmethod
+    def _make_concat_opts(chapters_to_remove, duration):
+        opts = [{}]
+        for s in chapters_to_remove:
+            # Do not create 0 duration chunk at the beginning.
+            if s['start_time'] == 0:
+                opts[-1]['inpoint'] = f'{s["end_time"]:.6f}'
+                continue
+            opts[-1]['outpoint'] = f'{s["start_time"]:.6f}'
+            # Do not create 0 duration chunk at the end.
+            if s['end_time'] != duration:
+                opts.append({'inpoint': f'{s["end_time"]:.6f}'})
+        return opts
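Note (editor's illustration, not part of the patch): a worked example of _make_concat_opts, assuming a 60-second video with cuts at 0-5s and 20-30s:

    cuts = [
        {'start_time': 0, 'end_time': 5},    # cut at the start: no leading chunk is created
        {'start_time': 20, 'end_time': 30},  # interior cut: close one chunk, open the next
    ]
    # _make_concat_opts(cuts, 60) returns
    #   [{'inpoint': '5.000000', 'outpoint': '20.000000'}, {'inpoint': '30.000000'}]
    # i.e. keep [5, 20) and [30, 60]; remove_chapters() then concatenates these
    # two chunks of the same source file via concat_files().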
diff --git a/yt_dlp/postprocessor/sponskrub.py b/yt_dlp/postprocessor/sponskrub.py
index 588f0ae12..932555a0e 100644
--- a/yt_dlp/postprocessor/sponskrub.py
+++ b/yt_dlp/postprocessor/sponskrub.py
@@ -17,6 +17,7 @@ from ..utils import (
 )
 
 
+# Deprecated in favor of the native implementation
 class SponSkrubPP(PostProcessor):
     _temp_ext = 'spons'
     _exe_name = 'sponskrub'
diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py
new file mode 100644
index 000000000..6264d45c5
--- /dev/null
+++ b/yt_dlp/postprocessor/sponsorblock.py
@@ -0,0 +1,96 @@
+import json
+import re
+from hashlib import sha256
+
+from .ffmpeg import FFmpegPostProcessor
+from ..compat import compat_urllib_parse_urlencode, compat_HTTPError
+from ..utils import PostProcessingError, sanitized_Request
+
+
+class SponsorBlockPP(FFmpegPostProcessor):
+
+    EXTRACTORS = {
+        'Youtube': 'YouTube',
+    }
+    CATEGORIES = {
+        'sponsor': 'Sponsor',
+        'intro': 'Intermission/Intro Animation',
+        'outro': 'Endcards/Credits',
+        'selfpromo': 'Unpaid/Self Promotion',
+        'interaction': 'Interaction Reminder',
+        'preview': 'Preview/Recap',
+        'music_offtopic': 'Non-Music Section'
+    }
+
+    def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'):
+        FFmpegPostProcessor.__init__(self, downloader)
+        self._categories = tuple(categories or self.CATEGORIES.keys())
+        self._API_URL = api if re.match('^https?://', api) else 'https://' + api
+
+    def run(self, info):
+        extractor = info['extractor_key']
+        if extractor not in self.EXTRACTORS:
+            self.to_screen(f'SponsorBlock is not supported for {extractor}')
+            return [], info
+
+        info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info['duration'])
+        return [], info
+
+    def _get_sponsor_chapters(self, info, duration):
+        segments = self._get_sponsor_segments(info['id'], self.EXTRACTORS[info['extractor_key']])
+
+        def duration_filter(s):
+            start_end = s['segment']
+            # Ignore milliseconds difference at the start.
+            if start_end[0] <= 1:
+                start_end[0] = 0
+            # Ignore milliseconds difference at the end.
+            # Never allow the segment to exceed the video.
+            if duration and duration - start_end[1] <= 1:
+                start_end[1] = duration
+            # SponsorBlock duration may be absent or it may deviate from the real one.
+            return s['videoDuration'] == 0 or not duration or abs(duration - s['videoDuration']) <= 1
+
+        duration_match = [s for s in segments if duration_filter(s)]
+        if len(duration_match) != len(segments):
+            self.report_warning('Some SponsorBlock segments are from a video of different duration, maybe from an old version of this video')
+
+        def to_chapter(s):
+            (start, end), cat = s['segment'], s['category']
+            return {
+                'start_time': start,
+                'end_time': end,
+                'category': cat,
+                'title': self.CATEGORIES[cat],
+                '_categories': [(cat, start, end)]
+            }
+
+        sponsor_chapters = [to_chapter(s) for s in duration_match]
+        if not sponsor_chapters:
+            self.to_screen('No segments were found in the SponsorBlock database')
+        else:
+            self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database')
+        return sponsor_chapters
+
+    def _get_sponsor_segments(self, video_id, service):
+        hash = sha256(video_id.encode('ascii')).hexdigest()
+        # SponsorBlock API recommends using first 4 hash characters.
+        url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + compat_urllib_parse_urlencode({
+            'service': service,
+            'categories': json.dumps(self._categories),
+        })
+        for d in self._get_json(url):
+            if d['videoID'] == video_id:
+                return d['segments']
+        return []
+
+    def _get_json(self, url):
+        self.write_debug(f'SponsorBlock query: {url}')
+        try:
+            rsp = self._downloader.urlopen(sanitized_Request(url))
+        except compat_HTTPError as e:
+            if e.code == 404:
+                return []
+            raise PostProcessingError(f'Error communicating with SponsorBlock API - {e}')
+
+        return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8'))
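Note (editor's illustration, not part of the patch): _get_sponsor_segments uses the privacy-preserving variant of the SponsorBlock API, sending only a 4-character sha256 prefix of the video ID and filtering the response locally. A standalone sketch of the same query, assuming the public sponsor.ajay.app endpoint and a made-up video ID:

    import json
    from hashlib import sha256
    from urllib.parse import urlencode
    from urllib.request import urlopen

    video_id = 'BaW_jenozKc'  # illustrative
    prefix = sha256(video_id.encode('ascii')).hexdigest()[:4]
    url = f'https://sponsor.ajay.app/api/skipSegments/{prefix}?' + urlencode({
        'service': 'YouTube',
        'categories': json.dumps(['sponsor', 'selfpromo']),
    })
    with urlopen(url) as rsp:
        # The response covers every video whose hash starts with the prefix;
        # keep only the segments of the video actually requested.
        segments = next(
            (d['segments'] for d in json.load(rsp) if d['videoID'] == video_id), [])
    print(segments)  # e.g. [{'segment': [4.2, 9.8], 'category': 'sponsor', ...}]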