diff options
author | James Taylor <user234683@users.noreply.github.com> | 2019-12-19 21:33:54 -0800 |
---|---|---|
committer | James Taylor <user234683@users.noreply.github.com> | 2019-12-19 21:33:54 -0800 |
commit | b4406df9cf33c53b6e942e6a5c72d955f57c4b5f (patch) | |
tree | 4de0082ac9eb26a05188dd424835ea50b1483113 /youtube/yt_data_extract.py | |
parent | b614fcdb8579ba29fccfa47eab1e2965cfb0beaa (diff) | |
parent | 6b7a1212e30b713453aa7d2b3a7122e97689dad0 (diff) | |
download | yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.tar.lz yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.tar.xz yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.zip |
Merge branch 'modular-data-extract'
Commits in this branch are prefixed with "Extraction:"
This branch refactors data extraction. All such functionality has been moved to the yt_data_extract module.
Responses from requests are given to the module and it parses them into a consistent, more useful format.
The dependency on youtube-dl has also been dropped and this functionality has been built from scratch for these reasons:
(1) I've noticed youtube-dl breaks more often than Invidious (which uses watch page extraction built from scratch) in response to changes from YouTube, so I'm hoping what I wrote will also be less brittle.
(2) Such breakage is inconvenient: because I had to modify youtube-dl to make it do things such as extracting related videos, I have to merge its fixes manually.
(3) I have no control over error handling and request pooling with youtube-dl, since it makes all the requests itself (gaining that control would require intrusive changes I don't want to maintain).
(4) I will finally be able to display the number of comments and whether comments are disabled without making additional requests.
Diffstat (limited to 'youtube/yt_data_extract.py')
-rw-r--r-- | youtube/yt_data_extract.py | 273 |
1 files changed, 0 insertions, 273 deletions
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py deleted file mode 100644 index 5419084..0000000 --- a/youtube/yt_data_extract.py +++ /dev/null @@ -1,273 +0,0 @@ -from youtube import util - -import html -import json - -# videos (all of type str): - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# published -# duration -# likes -# dislikes -# views -# playlist_index - -# playlists: - -# id -# title -# url -# author -# author_url -# thumbnail -# description -# updated -# size -# first_video_id - - - - - -def get_plain_text(node): - try: - return node['simpleText'] - except KeyError: - return ''.join(text_run['text'] for text_run in node['runs']) - -def format_text_runs(runs): - if isinstance(runs, str): - return runs - result = '' - for text_run in runs: - if text_run.get("bold", False): - result += "<b>" + html.escape(text_run["text"]) + "</b>" - elif text_run.get('italics', False): - result += "<i>" + html.escape(text_run["text"]) + "</i>" - else: - result += html.escape(text_run["text"]) - return result - - - - - - - - -def get_url(node): - try: - return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - except KeyError: - return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - - -def get_text(node): - if node == {}: - return '' - try: - return node['simpleText'] - except KeyError: - pass - try: - return node['runs'][0]['text'] - except IndexError: # empty text runs - return '' - except KeyError: - print(node) - raise - -def get_formatted_text(node): - try: - return node['runs'] - except KeyError: - return node['simpleText'] - -def get_badges(node): - badges = [] - for badge_node in node: - badge = badge_node['metadataBadgeRenderer']['label'] - badges.append(badge) - return badges - -def get_thumbnail(node): - try: - return node['thumbnails'][0]['url'] # polymer format - except KeyError: - return node['url'] # ajax format - -dispatch = { - -# polymer 
format - 'title': ('title', get_text), - 'publishedTimeText': ('published', get_text), - 'videoId': ('id', lambda node: node), - 'descriptionSnippet': ('description', get_formatted_text), - 'lengthText': ('duration', get_text), - 'thumbnail': ('thumbnail', get_thumbnail), - 'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']), - - 'viewCountText': ('views', get_text), - 'numVideosText': ('size', lambda node: get_text(node).split(' ')[0]), # the format is "324 videos" - 'videoCountText': ('size', get_text), - 'playlistId': ('id', lambda node: node), - 'descriptionText': ('description', get_formatted_text), - - 'subscriberCountText': ('subscriber_count', get_text), - 'channelId': ('id', lambda node: node), - 'badges': ('badges', get_badges), - -# ajax format - 'view_count_text': ('views', get_text), - 'num_videos_text': ('size', lambda node: get_text(node).split(' ')[0]), - 'owner_text': ('author', get_text), - 'owner_endpoint': ('author_url', lambda node: node['url']), - 'description': ('description', get_formatted_text), - 'index': ('playlist_index', get_text), - 'short_byline': ('author', get_text), - 'length': ('duration', get_text), - 'video_id': ('id', lambda node: node), - -} - -def ajax_info(item_json): - try: - info = {} - for key, node in item_json.items(): - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - return info - except KeyError: - print(item_json) - raise - - - -def prefix_urls(item): - try: - item['thumbnail'] = '/' + item['thumbnail'].lstrip('/') - except KeyError: - pass - - try: - item['author_url'] = util.URL_ORIGIN + item['author_url'] - except KeyError: - pass - -def add_extra_html_info(item): - if item['type'] == 'video': - item['url'] = util.URL_ORIGIN + '/watch?v=' + item['id'] - - video_info = {} - for key in ('id', 'title', 'author', 'duration'): - try: - video_info[key] = item[key] - except KeyError: - video_info[key] = '' - - item['video_info'] = 
json.dumps(video_info) - - elif item['type'] == 'playlist': - item['url'] = util.URL_ORIGIN + '/playlist?list=' + item['id'] - elif item['type'] == 'channel': - item['url'] = util.URL_ORIGIN + "/channel/" + item['id'] - - -def renderer_info(renderer, additional_info={}): - type = list(renderer.keys())[0] - renderer = renderer[type] - info = {} - if type == 'itemSectionRenderer': - return renderer_info(renderer['contents'][0], additional_info) - - if type in ('movieRenderer', 'clarificationRenderer'): - info['type'] = 'unsupported' - return info - - info.update(additional_info) - - - if type in ('compactVideoRenderer', 'videoRenderer', 'playlistVideoRenderer', 'gridVideoRenderer'): - info['type'] = 'video' - elif type in ('playlistRenderer', 'compactPlaylistRenderer', 'gridPlaylistRenderer', - 'radioRenderer', 'compactRadioRenderer', 'gridRadioRenderer', - 'showRenderer', 'compactShowRenderer', 'gridShowRenderer'): - info['type'] = 'playlist' - elif type == 'channelRenderer': - info['type'] = 'channel' - elif type == 'playlistHeaderRenderer': - info['type'] = 'playlist_metadata' - else: - info['type'] = 'unsupported' - return info - - try: - if 'viewCountText' in renderer: # prefer this one as it contains all the digits - info['views'] = get_text(renderer['viewCountText']) - elif 'shortViewCountText' in renderer: - info['views'] = get_text(renderer['shortViewCountText']) - - if 'ownerText' in renderer: - info['author'] = renderer['ownerText']['runs'][0]['text'] - info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] - try: - overlays = renderer['thumbnailOverlays'] - except KeyError: - pass - else: - for overlay in overlays: - if 'thumbnailOverlayTimeStatusRenderer' in overlay: - info['duration'] = get_text(overlay['thumbnailOverlayTimeStatusRenderer']['text']) - # show renderers don't have videoCountText - elif 'thumbnailOverlayBottomPanelRenderer' in overlay: - info['size'] = 
get_text(overlay['thumbnailOverlayBottomPanelRenderer']['text']) - - # show renderers don't have playlistId, have to dig into the url to get it - try: - info['id'] = renderer['navigationEndpoint']['watchEndpoint']['playlistId'] - except KeyError: - pass - for key, node in renderer.items(): - if key in ('longBylineText', 'shortBylineText'): - info['author'] = get_text(node) - try: - info['author_url'] = get_url(node) - except KeyError: - pass - - # show renderers don't have thumbnail key at top level, dig into thumbnailRenderer - elif key == 'thumbnailRenderer' and 'showCustomThumbnailRenderer' in node: - info['thumbnail'] = node['showCustomThumbnailRenderer']['thumbnail']['thumbnails'][0]['url'] - else: - try: - simple_key, function = dispatch[key] - except KeyError: - continue - info[simple_key] = function(node) - if info['type'] == 'video' and 'duration' not in info: - info['duration'] = 'Live' - - return info - except KeyError: - print(renderer) - raise - - -def parse_info_prepare_for_html(renderer, additional_info={}): - item = renderer_info(renderer, additional_info) - prefix_urls(item) - add_extra_html_info(item) - - return item - - |