[extractor] Support default implicit graph in JSON-LD (#1983)

Original PR: https://github.com/ytdl-org/youtube-dl/pull/30229

Per W3C JSON-LD v1.1 §4.9 (non-normative ref):

> When a JSON-LD document's top-level structure is a map that contains no other keys than @graph and optionally @context (properties that are not mapped to an IRI or a keyword are ignored), @graph is considered to express the otherwise implicit default graph.

Authored by: zmousm
author: Zenon Mousmoulas <zmousm@users.noreply.github.com> 2021-12-16 22:46:30 +0200
committer: GitHub <noreply@github.com> 2021-12-17 02:16:30 +0530
commit: d5c3254889208a75d57c74868a7e7ce62be6b636 (patch)
tree: 9c9ee121c83f50417124aebda732a39af06054c8 /yt_dlp/extractor/common.py
parent: fed13096518de5ba6f2125a2a93df2113214d5db (diff)
download: hypervideo-pre-d5c3254889208a75d57c74868a7e7ce62be6b636.tar.lz
hypervideo-pre-d5c3254889208a75d57c74868a7e7ce62be6b636.tar.xz
hypervideo-pre-d5c3254889208a75d57c74868a7e7ce62be6b636.zip
1 file changed, 10 insertions, 3 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index ebf2e3cea..52099b4b4 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1451,8 +1451,13 @@ class InfoExtractor(object):
             })
             extract_interaction_statistic(e)
 
-        for e in json_ld:
-            if '@context' in e:
+        def traverse_json_ld(json_ld, at_top_level=True):
+            for e in json_ld:
+                if at_top_level and '@context' not in e:
+                    continue
+                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
+                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
+                    break
                 item_type = e.get('@type')
                 if expected_type is not None and expected_type != item_type:
                     continue
@@ -1488,7 +1493,7 @@ class InfoExtractor(object):
                     info.update({
                         'timestamp': parse_iso8601(e.get('datePublished')),
                         'title': unescapeHTML(e.get('headline')),
-                        'description': unescapeHTML(e.get('articleBody')),
+                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                     })
                 elif item_type == 'VideoObject':
                     extract_video_object(e)
@@ -1503,6 +1508,8 @@ class InfoExtractor(object):
                     continue
                 else:
                     break
+        traverse_json_ld(json_ld)
+
         return dict((k, v) for k, v in info.items() if v is not None)
 
     def _search_nextjs_data(self, webpage, video_id, **kw):
author	Zenon Mousmoulas <zmousm@users.noreply.github.com>	2021-12-16 22:46:30 +0200
committer	GitHub <noreply@github.com>	2021-12-17 02:16:30 +0530
commit	d5c3254889208a75d57c74868a7e7ce62be6b636 (patch)
tree	9c9ee121c83f50417124aebda732a39af06054c8 /yt_dlp/extractor/common.py
parent	fed13096518de5ba6f2125a2a93df2113214d5db (diff)
download	hypervideo-pre-d5c3254889208a75d57c74868a7e7ce62be6b636.tar.lz hypervideo-pre-d5c3254889208a75d57c74868a7e7ce62be6b636.tar.xz hypervideo-pre-d5c3254889208a75d57c74868a7e7ce62be6b636.zip