diff options
author | Zenon Mousmoulas <zmousm@users.noreply.github.com> | 2021-12-16 22:46:30 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-12-17 02:16:30 +0530 |
commit | d5c3254889208a75d57c74868a7e7ce62be6b636 (patch) | |
tree | 9c9ee121c83f50417124aebda732a39af06054c8 /yt_dlp/extractor/common.py | |
parent | fed13096518de5ba6f2125a2a93df2113214d5db (diff) | |
download | hypervideo-pre-d5c3254889208a75d57c74868a7e7ce62be6b636.tar.lz hypervideo-pre-d5c3254889208a75d57c74868a7e7ce62be6b636.tar.xz hypervideo-pre-d5c3254889208a75d57c74868a7e7ce62be6b636.zip |
[extractor] Support default implicit graph in JSON-LD (#1983)
Original PR: https://github.com/ytdl-org/youtube-dl/pull/30229
Per W3C JSON-LD v1.1 §4.9 (non-normative ref):
When a JSON-LD document's top-level structure is a map that contains
no other keys than @graph and optionally @context (properties that
are not mapped to an IRI or a keyword are ignored), @graph is
considered to express the otherwise implicit default graph.
Authored by: zmousm
Diffstat (limited to 'yt_dlp/extractor/common.py')
-rw-r--r-- | yt_dlp/extractor/common.py | 13 |
1 file changed, 10 insertions, 3 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ebf2e3cea..52099b4b4 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1451,8 +1451,13 @@ class InfoExtractor(object): }) extract_interaction_statistic(e) - for e in json_ld: - if '@context' in e: + def traverse_json_ld(json_ld, at_top_level=True): + for e in json_ld: + if at_top_level and '@context' not in e: + continue + if at_top_level and set(e.keys()) == {'@context', '@graph'}: + traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) + break item_type = e.get('@type') if expected_type is not None and expected_type != item_type: continue @@ -1488,7 +1493,7 @@ class InfoExtractor(object): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), - 'description': unescapeHTML(e.get('articleBody')), + 'description': unescapeHTML(e.get('articleBody') or e.get('description')), }) elif item_type == 'VideoObject': extract_video_object(e) @@ -1503,6 +1508,8 @@ class InfoExtractor(object): continue else: break + traverse_json_ld(json_ld) + return dict((k, v) for k, v in info.items() if v is not None) def _search_nextjs_data(self, webpage, video_id, **kw): |