diff options
Diffstat (limited to 'youtube/yt_data_extract')
-rw-r--r-- | youtube/yt_data_extract/common.py | 17 | ||||
-rw-r--r-- | youtube/yt_data_extract/everything_else.py | 4 | ||||
-rw-r--r-- | youtube/yt_data_extract/watch_extraction.py | 54 |
3 files changed, 37 insertions, 38 deletions
diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index 459b5e9..4681a86 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -322,7 +322,7 @@ item_types = { 'channelAboutFullMetadataRenderer', } -def traverse_browse_renderer(renderer): +def _traverse_browse_renderer(renderer): for tab in get(renderer, 'tabs', (), types=(list, tuple)): tab_renderer = multi_deep_get(tab, ['tabRenderer'], ['expandableTabRenderer'], default=None, types=dict) if tab_renderer is None: @@ -332,24 +332,24 @@ def traverse_browse_renderer(renderer): print('Could not find tab with content') return {} -def traverse_standard_list(renderer): +def _traverse_standard_list(renderer): renderer_list = multi_deep_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple)) continuation = deep_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation') return renderer_list, continuation # these renderers contain one inside them nested_renderer_dispatch = { - 'singleColumnBrowseResultsRenderer': traverse_browse_renderer, - 'twoColumnBrowseResultsRenderer': traverse_browse_renderer, + 'singleColumnBrowseResultsRenderer': _traverse_browse_renderer, + 'twoColumnBrowseResultsRenderer': _traverse_browse_renderer, 'twoColumnSearchResultsRenderer': lambda renderer: get(renderer, 'primaryContents', {}, types=dict), } # these renderers contain a list of renderers inside them nested_renderer_list_dispatch = { - 'sectionListRenderer': traverse_standard_list, - 'itemSectionRenderer': traverse_standard_list, - 'gridRenderer': traverse_standard_list, - 'playlistVideoListRenderer': traverse_standard_list, + 'sectionListRenderer': _traverse_standard_list, + 'itemSectionRenderer': _traverse_standard_list, + 'gridRenderer': _traverse_standard_list, + 'playlistVideoListRenderer': _traverse_standard_list, 'singleColumnWatchNextResults': lambda r: (deep_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None), } @@ -411,6 +411,5 @@ def extract_items(response, item_types=item_types): current_iter = iter_stack.pop() # go back up the stack except IndexError: return items, ctoken - else: return [], None diff --git a/youtube/yt_data_extract/everything_else.py b/youtube/yt_data_extract/everything_else.py index 6277c8d..6dc5248 100644 --- a/youtube/yt_data_extract/everything_else.py +++ b/youtube/yt_data_extract/everything_else.py @@ -192,7 +192,7 @@ def extract_playlist_info(polymer_json): return info -def ctoken_metadata(ctoken): +def _ctoken_metadata(ctoken): result = dict() params = proto.parse(proto.b64_to_bytes(ctoken)) result['video_id'] = proto.parse(params[2])[2].decode('ascii') @@ -220,7 +220,7 @@ def extract_comments_info(polymer_json): url = multi_deep_get(polymer_json, [1, 'url'], ['url']) if url: ctoken = urllib.parse.parse_qs(url[url.find('?')+1:])['ctoken'][0] - metadata = ctoken_metadata(ctoken) + metadata = _ctoken_metadata(ctoken) else: metadata = {} info['video_id'] = metadata.get('video_id') diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py index 1166344..ff39f62 100644 --- a/youtube/yt_data_extract/watch_extraction.py +++ b/youtube/yt_data_extract/watch_extraction.py @@ -115,7 +115,7 @@ _formats = { '397': {'vcodec': 'av01.0.05M.08'}, } -def extract_metadata_row_info(video_renderer_info): +def _extract_metadata_row_info(video_renderer_info): # extract category and music list info = { 'category': None, @@ -145,7 +145,7 @@ def extract_metadata_row_info(video_renderer_info): return info -def extract_watch_info_mobile(top_level): +def _extract_watch_info_mobile(top_level): info = {} microformat = deep_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={}) @@ -167,7 +167,7 @@ def extract_watch_info_mobile(top_level): print('Failed to extract video metadata') video_info = {} - info.update(extract_metadata_row_info(video_info)) + info.update(_extract_metadata_row_info(video_info)) info['description'] = extract_str(video_info.get('description'), recover_urls=True) info['view_count'] = extract_int(extract_str(video_info.get('expandedSubtitle'))) info['author'] = extract_str(deep_get(video_info, 'owner', 'slimOwnerRenderer', 'title')) @@ -228,7 +228,7 @@ def extract_watch_info_mobile(top_level): return info month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'} -def extract_watch_info_desktop(top_level): +def _extract_watch_info_desktop(top_level): info = { 'comment_count': None, 'comments_disabled': None, @@ -241,7 +241,7 @@ def extract_watch_info_desktop(top_level): if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'): video_info.update(list(renderer.values())[0]) - info.update(extract_metadata_row_info(video_info)) + info.update(_extract_metadata_row_info(video_info)) info['description'] = extract_str(video_info.get('description', None), recover_urls=True) info['time_published'] = extract_date(extract_str(video_info.get('dateText', None))) @@ -263,21 +263,7 @@ def extract_watch_info_desktop(top_level): return info -def get_caption_url(info, language, format, automatic=False, translation_language=None): - '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.''' - url = info['_captions_base_url'] - url += '&lang=' + language - url += '&fmt=' + format - if automatic: - url += '&kind=asr' - elif language in info['_manual_caption_language_names']: - url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='') - - if translation_language: - url += '&tlang=' + translation_language - return url - -def extract_formats(info, player_response): +def _extract_formats(info, player_response): streaming_data = player_response.get('streamingData', {}) yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', []) @@ -305,7 +291,7 @@ def extract_formats(info, player_response): info['formats'].append(fmt) -def extract_playability_error(info, player_response, error_prefix=''): +def _extract_playability_error(info, player_response, error_prefix=''): if info['formats']: info['playability_status'] = None info['playability_error'] = None @@ -379,10 +365,10 @@ def extract_watch_info(polymer_json): print('WARNING: Found non-translatable caption language') # formats - extract_formats(info, player_response) + _extract_formats(info, player_response) # playability errors - extract_playability_error(info, player_response) + _extract_playability_error(info, player_response) # check age-restriction info['age_restricted'] = (info['playability_status'] == 'LOGIN_REQUIRED' and info['playability_error'] and ' age' in info['playability_error']) @@ -394,9 +380,9 @@ def extract_watch_info(polymer_json): mobile = 'singleColumnWatchNextResults' in deep_get(top_level, 'response', 'contents', default={}) if mobile: - info.update(extract_watch_info_mobile(top_level)) + info.update(_extract_watch_info_mobile(top_level)) else: - info.update(extract_watch_info_desktop(top_level)) + info.update(_extract_watch_info_desktop(top_level)) # stuff from videoDetails. Use liberal_update to prioritize info from videoDetails over existing info vd = deep_get(top_level, 'playerResponse', 'videoDetails', default={}) @@ -430,6 +416,20 @@ def extract_watch_info(polymer_json): info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None return info +def get_caption_url(info, language, format, automatic=False, translation_language=None): + '''Gets the url for captions with the given language and format. If automatic is True, get the automatic captions for that language. If translation_language is given, translate the captions from `language` to `translation_language`. If automatic is true and translation_language is given, the automatic captions will be translated.''' + url = info['_captions_base_url'] + url += '&lang=' + language + url += '&fmt=' + format + if automatic: + url += '&kind=asr' + elif language in info['_manual_caption_language_names']: + url += '&name=' + urllib.parse.quote(info['_manual_caption_language_names'][language], safe='') + + if translation_language: + url += '&tlang=' + translation_language + return url + def update_with_age_restricted_info(info, video_info_page): ERROR_PREFIX = 'Error bypassing age-restriction: ' @@ -445,5 +445,5 @@ def update_with_age_restricted_info(info, video_info_page): info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response' return - extract_formats(info, player_response) - extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) + _extract_formats(info, player_response) + _extract_playability_error(info, player_response, error_prefix=ERROR_PREFIX) |