about summary refs log tree commit diff stats
path: root/youtube/yt_data_extract
diff options
context:
space:
mode:
Diffstat (limited to 'youtube/yt_data_extract')
-rw-r--r-- youtube/yt_data_extract/common.py 34
-rw-r--r-- youtube/yt_data_extract/watch_extraction.py 31
2 files changed, 56 insertions, 9 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py
index 74ac1d6..e0a3f7f 100644
--- a/youtube/yt_data_extract/common.py
+++ b/youtube/yt_data_extract/common.py
@@ -1,6 +1,7 @@
import re
import urllib.parse
import collections
+import collections.abc
def get(object, key, default=None, types=()):
'''Like dict.get(), but returns default if the result doesn't match one of the types.
@@ -62,17 +63,40 @@ def multi_deep_get(object, *key_sequences, default=None, types=()):
continue
return default
+
+def _is_empty(value):
+ '''Determines if value is None or an empty iterable, such as '' and []'''
+ if value is None:
+ return True
+ elif isinstance(value, collections.abc.Iterable) and not value:
+ return True
+ return False
+
+
def liberal_update(obj, key, value):
- '''Updates obj[key] with value as long as value is not None.
- Ensures obj[key] will at least get a value of None, however'''
- if (value is not None) or (key not in obj):
+ '''Updates obj[key] with value as long as value is neither None nor empty.
+ If key is not yet in obj, it is set regardless, so obj[key] always exists afterwards'''
+ if (not _is_empty(value)) or (key not in obj):
obj[key] = value
def conservative_update(obj, key, value):
- '''Only updates obj if it doesn't have key or obj[key] is None'''
- if obj.get(key) is None:
+ '''Only updates obj if it doesn't have key or obj[key] is None/empty'''
+ if _is_empty(obj.get(key)):
obj[key] = value
+
+def liberal_dict_update(dict1, dict2):
+ '''Update dict1 with keys from dict2 using liberal_update'''
+ for key, value in dict2.items():
+ liberal_update(dict1, key, value)
+
+
+def conservative_dict_update(dict1, dict2):
+ '''Update dict1 with keys from dict2 using conservative_update'''
+ for key, value in dict2.items():
+ conservative_update(dict1, key, value)
+
+
def concat_or_none(*strings):
'''Concatenates strings. Returns None if any of the arguments are None'''
result = ''
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 04380fa..b3d3cb4 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -2,7 +2,8 @@ from .common import (get, multi_get, deep_get, multi_deep_get,
liberal_update, conservative_update, remove_redirect, normalize_url,
extract_str, extract_formatted_text, extract_int, extract_approx_int,
extract_date, check_missing_keys, extract_item_info, extract_items,
- extract_response, concat_or_none)
+ extract_response, concat_or_none, liberal_dict_update,
+ conservative_dict_update)
import json
import urllib.parse
@@ -211,13 +212,33 @@ def _extract_metadata_row_info(renderer_content):
return info
+def _extract_from_video_metadata(renderer_content):
+ info = _extract_from_video_information_renderer(renderer_content)
+ liberal_dict_update(info, _extract_likes_dislikes(renderer_content))
+ liberal_dict_update(info, _extract_from_owner_renderer(renderer_content))
+ liberal_dict_update(info, _extract_metadata_row_info(deep_get(
+ renderer_content, 'metadataRowContainer',
+ 'metadataRowContainerRenderer', default={}
+ )))
+ liberal_update(info, 'title', extract_str(renderer_content.get('title')))
+ liberal_update(
+ info, 'description',
+ extract_str(renderer_content.get('description'), recover_urls=True)
+ )
+ liberal_update(info, 'time_published',
+ extract_date(renderer_content.get('dateText')))
+ return info
+
visible_extraction_dispatch = {
+ # Either these renderers appear individually, spread around in various places
'slimVideoInformationRenderer': _extract_from_video_information_renderer,
'slimVideoActionBarRenderer': _extract_likes_dislikes,
'slimOwnerRenderer': _extract_from_owner_renderer,
'videoDescriptionHeaderRenderer': _extract_from_video_header_renderer,
'expandableVideoDescriptionRenderer': _extract_from_description_renderer,
'metadataRowContainerRenderer': _extract_metadata_row_info,
+ # OR just this single renderer, which contains SOME of the above inside it
+ 'slimVideoMetadataRenderer': _extract_from_video_metadata,
}
def _extract_watch_info_mobile(top_level):
@@ -265,12 +286,14 @@ def _extract_watch_info_mobile(top_level):
for renderer in items:
name, renderer_content = list(renderer.items())[0]
found.add(name)
- info.update(visible_extraction_dispatch[name](renderer_content))
+ liberal_dict_update(
+ info,
+ visible_extraction_dispatch[name](renderer_content)
+ )
# Call the function on blank dict for any that weren't found
# so that the empty keys get added
for name in visible_extraction_dispatch.keys() - found:
- info.update(visible_extraction_dispatch[name]({}))
-
+ liberal_dict_update(info, visible_extraction_dispatch[name]({}))
# comment section info
items, _ = extract_items(response, item_types={