diff options
-rw-r--r-- | youtube/yt_data_extract/common.py | 34 | ||||
-rw-r--r-- | youtube/yt_data_extract/watch_extraction.py | 31 |
2 files changed, 56 insertions, 9 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index 74ac1d6..e0a3f7f 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -1,6 +1,7 @@ import re import urllib.parse import collections +import collections.abc def get(object, key, default=None, types=()): '''Like dict.get(), but returns default if the result doesn't match one of the types. @@ -62,17 +63,40 @@ def multi_deep_get(object, *key_sequences, default=None, types=()): continue return default + +def _is_empty(value): + '''Determines if value is None or an empty iterable, such as '' and []''' + if value is None: + return True + elif isinstance(value, collections.abc.Iterable) and not value: + return True + return False + + def liberal_update(obj, key, value): - '''Updates obj[key] with value as long as value is not None. - Ensures obj[key] will at least get a value of None, however''' - if (value is not None) or (key not in obj): + '''Updates obj[key] with value as long as value is not None or empty. + Ensures obj[key] will at least get an empty value, however''' + if (not _is_empty(value)) or (key not in obj): obj[key] = value def conservative_update(obj, key, value): - '''Only updates obj if it doesn't have key or obj[key] is None''' - if obj.get(key) is None: + '''Only updates obj if it doesn't have key or obj[key] is None/empty''' + if _is_empty(obj.get(key)): obj[key] = value + +def liberal_dict_update(dict1, dict2): + '''Update dict1 with keys from dict2 using liberal_update''' + for key, value in dict2.items(): + liberal_update(dict1, key, value) + + +def conservative_dict_update(dict1, dict2): + '''Update dict1 with keys from dict2 using conservative_update''' + for key, value in dict2.items(): + conservative_update(dict1, key, value) + + def concat_or_none(*strings): '''Concatenates strings. Returns None if any of the arguments are None''' result = '' diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index 04380fa..b3d3cb4 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -2,7 +2,8 @@ from .common import (get, multi_get, deep_get, multi_deep_get, liberal_update, conservative_update, remove_redirect, normalize_url, extract_str, extract_formatted_text, extract_int, extract_approx_int, extract_date, check_missing_keys, extract_item_info, extract_items, - extract_response, concat_or_none) + extract_response, concat_or_none, liberal_dict_update, + conservative_dict_update) import json import urllib.parse @@ -211,13 +212,33 @@ def _extract_metadata_row_info(renderer_content): return info +def _extract_from_video_metadata(renderer_content): + info = _extract_from_video_information_renderer(renderer_content) + liberal_dict_update(info, _extract_likes_dislikes(renderer_content)) + liberal_dict_update(info, _extract_from_owner_renderer(renderer_content)) + liberal_dict_update(info, _extract_metadata_row_info(deep_get( + renderer_content, 'metadataRowContainer', + 'metadataRowContainerRenderer', default={} + ))) + liberal_update(info, 'title', extract_str(renderer_content.get('title'))) + liberal_update( + info, 'description', + extract_str(renderer_content.get('description'), recover_urls=True) + ) + liberal_update(info, 'time_published', + extract_date(renderer_content.get('dateText'))) + return info + visible_extraction_dispatch = { + # Either these ones spread around in various places 'slimVideoInformationRenderer': _extract_from_video_information_renderer, 'slimVideoActionBarRenderer': _extract_likes_dislikes, 'slimOwnerRenderer': _extract_from_owner_renderer, 'videoDescriptionHeaderRenderer': _extract_from_video_header_renderer, 'expandableVideoDescriptionRenderer': _extract_from_description_renderer, 'metadataRowContainerRenderer': _extract_metadata_row_info, + # OR just this one, which contains SOME of the above inside it + 'slimVideoMetadataRenderer': _extract_from_video_metadata, } def _extract_watch_info_mobile(top_level): @@ -265,12 +286,14 @@ def _extract_watch_info_mobile(top_level): for renderer in items: name, renderer_content = list(renderer.items())[0] found.add(name) - info.update(visible_extraction_dispatch[name](renderer_content)) + liberal_dict_update( + info, + visible_extraction_dispatch[name](renderer_content) + ) # Call the function on blank dict for any that weren't found # so that the empty keys get added for name in visible_extraction_dispatch.keys() - found: - info.update(visible_extraction_dispatch[name]({})) - + liberal_dict_update(info, visible_extraction_dispatch[name]({})) # comment section info items, _ = extract_items(response, item_types={ |