[webvtt] Merge daisy-chained duplicate cues (#638)

Fixes: https://github.com/yt-dlp/yt-dlp/issues/631#issuecomment-893338552

The previous deduplication algorithm only removed duplicate cues with identical text, styles and timestamps. This change also merges cues that come in ‘daisy chains’: sequences of cues with identical text and styles in which the ending timestamp of each cue equals the starting timestamp of the next.

This deduplication algorithm has the somewhat unfortunate side effect that NOTE blocks between cues, if found, will be emitted in a different order relative to their original cues. This may be unwanted if perfect fidelity is desired, but then so is daisy-chain deduplication itself. NOTE blocks ought to be ignored by WebVTT players in any case.

Authored by: fstirlitz
author: Felix S <felix.von.s@posteo.de> 2021-08-09 20:22:30 +0000
committer: GitHub <noreply@github.com> 2021-08-10 01:52:30 +0530
commit: 25a3f4f5d6de2bd5fb60cd11a46f0ac232882a56 (patch)
tree: 3e3bbe3078fe903abbd0e6593a3a0e230564cdda
parent: ad3dc496bbf2e2a574a16244ddde0740778e5daf (diff)
download: hypervideo-pre-25a3f4f5d6de2bd5fb60cd11a46f0ac232882a56.tar.lz
hypervideo-pre-25a3f4f5d6de2bd5fb60cd11a46f0ac232882a56.tar.xz
hypervideo-pre-25a3f4f5d6de2bd5fb60cd11a46f0ac232882a56.zip
3 files changed, 61 insertions, 19 deletions
diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py
index 1cc99a4e9..e3af140fd 100644
--- a/yt_dlp/downloader/fragment.py
+++ b/yt_dlp/downloader/fragment.py
@@ -329,7 +329,7 @@ class FragmentFD(FileDownloader):
             'fragment_index': 0,
         })
 
-    def download_and_append_fragments(self, ctx, fragments, info_dict, pack_func=None):
+    def download_and_append_fragments(self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None):
         fragment_retries = self.params.get('fragment_retries', 0)
         is_fatal = (lambda idx: idx == 0) if self.params.get('skip_unavailable_fragments', True) else (lambda _: True)
         if not pack_func:
@@ -424,5 +424,8 @@ class FragmentFD(FileDownloader):
                 if not result:
                     return False
 
+        if finish_func is not None:
+            ctx['dest_stream'].write(finish_func())
+            ctx['dest_stream'].flush()
         self._finish_frag_download(ctx, info_dict)
         return True
diff --git a/yt_dlp/downloader/hls.py b/yt_dlp/downloader/hls.py
index 9cbd5a584..9cfc191cb 100644
--- a/yt_dlp/downloader/hls.py
+++ b/yt_dlp/downloader/hls.py
@@ -260,29 +260,35 @@ class HlsFD(FragmentFD):
                         block.end += adjust
 
                         dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
-                        cue = block.as_json
 
-                        # skip the cue if an identical one appears
-                        # in the window of potential duplicates
-                        # and prune the window of unviable candidates
+                        ready = []
+
                         i = 0
-                        skip = True
+                        is_new = True
                         while i < len(dedup_window):
-                            window_cue = dedup_window[i]
-                            if window_cue == cue:
-                                break
-                            if window_cue['end'] >= cue['start']:
-                                i += 1
+                            wcue = dedup_window[i]
+                            wblock = webvtt.CueBlock.from_json(wcue)
+                            i += 1
+                            if wblock.hinges(block):
+                                wcue['end'] = block.end
+                                is_new = False
+                                continue
+                            if wblock == block:
+                                is_new = False
+                                continue
+                            if wblock.end > block.start:
                                 continue
+                            ready.append(wblock)
+                            i -= 1
                             del dedup_window[i]
-                        else:
-                            skip = False
 
-                        if skip:
-                            continue
+                        if is_new:
+                            dedup_window.append(block.as_json)
+                        for block in ready:
+                            block.write_into(output)
 
-                        # add the cue to the window
-                        dedup_window.append(cue)
+                        # we only emit cues once they fall out of the duplicate window
+                        continue
                     elif isinstance(block, webvtt.Magic):
                         # take care of MPEG PES timestamp overflow
                         if block.mpegts is None:
@@ -317,6 +323,19 @@ class HlsFD(FragmentFD):
                     block.write_into(output)
 
                 return output.getvalue().encode('utf-8')
+
+            def fin_fragments():
+                dedup_window = extra_state.get('webvtt_dedup_window')
+                if not dedup_window:
+                    return b''
+
+                output = io.StringIO()
+                for cue in dedup_window:
+                    webvtt.CueBlock.from_json(cue).write_into(output)
+
+                return output.getvalue().encode('utf-8')
+
+            self.download_and_append_fragments(
+                ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments)
         else:
-            pack_fragment = None
-        return self.download_and_append_fragments(ctx, fragments, info_dict, pack_fragment)
+            return self.download_and_append_fragments(ctx, fragments, info_dict)
diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py
index ef55e6459..eee2a4a2d 100644
--- a/yt_dlp/webvtt.py
+++ b/yt_dlp/webvtt.py
@@ -331,6 +331,26 @@ class CueBlock(Block):
             'settings': self.settings,
         }
 
+    def __eq__(self, other):
+        return self.as_json == other.as_json
+
+    @classmethod
+    def from_json(cls, json):
+        return cls(
+            id=json['id'],
+            start=json['start'],
+            end=json['end'],
+            text=json['text'],
+            settings=json['settings']
+        )
+
+    def hinges(self, other):
+        if self.text != other.text:
+            return False
+        if self.settings != other.settings:
+            return False
+        return self.start <= self.end == other.start <= other.end
+
 
 def parse_fragment(frag_content):
     """
author	Felix S <felix.von.s@posteo.de>	2021-08-09 20:22:30 +0000
committer	GitHub <noreply@github.com>	2021-08-10 01:52:30 +0530
commit	25a3f4f5d6de2bd5fb60cd11a46f0ac232882a56 (patch)
tree	3e3bbe3078fe903abbd0e6593a3a0e230564cdda
parent	ad3dc496bbf2e2a574a16244ddde0740778e5daf (diff)
download	hypervideo-pre-25a3f4f5d6de2bd5fb60cd11a46f0ac232882a56.tar.lz hypervideo-pre-25a3f4f5d6de2bd5fb60cd11a46f0ac232882a56.tar.xz hypervideo-pre-25a3f4f5d6de2bd5fb60cd11a46f0ac232882a56.zip