[extractor] Extract chapters from JSON-LD (#2031)

Authored by: iw0nderhow, pukkandan
author: chris <6024426+iw0nderhow@users.noreply.github.com> 2022-01-01 22:07:00 +0100
committer: GitHub <noreply@github.com> 2022-01-02 02:37:00 +0530
commit: f5225737877a78f63b9a6f1de675c95c650f65d6 (patch)
tree: 6481494f9e2fc6063646be74df7cec4199f327d4 /yt_dlp
parent: 7592749cbe377675688dfcad5b7c1d46bbb684e1 (diff)
download: hypervideo-pre-f5225737877a78f63b9a6f1de675c95c650f65d6.tar.lz
hypervideo-pre-f5225737877a78f63b9a6f1de675c95c650f65d6.tar.xz
hypervideo-pre-f5225737877a78f63b9a6f1de675c95c650f65d6.zip
1 files changed, 18 insertions, 0 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 1d694293e..79f53c9c2 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1429,6 +1429,23 @@ class InfoExtractor(object):
                     continue
                 info[count_key] = interaction_count
 
+        def extract_chapter_information(e):  # fill info['chapters'] from the 'Clip' entries of e['hasPart']
+            chapters = [{
+                'title': part.get('name'),  # each field is None when the key is absent
+                'start_time': part.get('startOffset'),
+                'end_time': part.get('endOffset'),
+            } for part in e.get('hasPart', []) if part.get('@type') == 'Clip']  # non-'Clip' parts are ignored
+            for idx, (last_c, current_c, next_c) in enumerate(zip(  # (previous, current, next) sliding window
+                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):  # dummy predecessor ends at 0; zip stops at chapters[1:], so the last chapter is not visited
+                current_c['end_time'] = current_c['end_time'] or next_c['start_time']  # falsy end -> next chapter's start
+                current_c['start_time'] = current_c['start_time'] or last_c['end_time']  # falsy start -> previous chapter's (already filled) end
+                if None in current_c.values():  # covers 'title' as well as both times
+                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
+                    return  # bail out without setting info['chapters']
+            if chapters:
+                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']  # NOTE(review): assumes enclosing-scope info has 'duration' - confirm; last chapter's start_time/title are never back-filled or None-checked
+                info['chapters'] = chapters
+
         def extract_video_object(e):
             assert e['@type'] == 'VideoObject'
             author = e.get('author')
@@ -1452,6 +1469,7 @@ class InfoExtractor(object):
                 'view_count': int_or_none(e.get('interactionCount')),
             })
             extract_interaction_statistic(e)
+            extract_chapter_information(e)
 
         def traverse_json_ld(json_ld, at_top_level=True):
             for e in json_ld:
author	chris <6024426+iw0nderhow@users.noreply.github.com>	2022-01-01 22:07:00 +0100
committer	GitHub <noreply@github.com>	2022-01-02 02:37:00 +0530
commit	f5225737877a78f63b9a6f1de675c95c650f65d6 (patch)
tree	6481494f9e2fc6063646be74df7cec4199f327d4 /yt_dlp
parent	7592749cbe377675688dfcad5b7c1d46bbb684e1 (diff)
download	hypervideo-pre-f5225737877a78f63b9a6f1de675c95c650f65d6.tar.lz hypervideo-pre-f5225737877a78f63b9a6f1de675c95c650f65d6.tar.xz hypervideo-pre-f5225737877a78f63b9a6f1de675c95c650f65d6.zip