aboutsummaryrefslogtreecommitdiffstats
path: root/youtube/yt_data_extract.py
diff options
context:
space:
mode:
author: James Taylor <user234683@users.noreply.github.com> 2019-12-19 21:33:54 -0800
committer: James Taylor <user234683@users.noreply.github.com> 2019-12-19 21:33:54 -0800
commit: b4406df9cf33c53b6e942e6a5c72d955f57c4b5f (patch)
tree: 4de0082ac9eb26a05188dd424835ea50b1483113 /youtube/yt_data_extract.py
parent: b614fcdb8579ba29fccfa47eab1e2965cfb0beaa (diff)
parent: 6b7a1212e30b713453aa7d2b3a7122e97689dad0 (diff)
downloadyt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.tar.lz
yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.tar.xz
yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.zip
Merge branch 'modular-data-extract'
Commits in this branch are prefixed with "Extraction:". This branch refactors data extraction. All such functionality has been moved to the yt_data_extract module. Responses from requests are given to the module, which parses them into a consistent, more useful format. The dependency on youtube-dl has also been dropped, and this functionality has been rebuilt from scratch for these reasons: (1) I've noticed youtube-dl breaks more often than Invidious (which uses watch page extraction built from scratch) in response to changes from YouTube, so I'm hoping what I wrote will also be less brittle. (2) Such breakage is inconvenient because I have to manually merge the fixes, since I had to make changes to youtube-dl to make it do things such as extracting related videos. (3) I have no control over error handling and request pooling with youtube-dl, since it does all the requests (these would require intrusive changes I don't want to maintain). (4) I will now finally be able to display the number of comments and whether comments are disabled without making additional requests.
Diffstat (limited to 'youtube/yt_data_extract.py')
-rw-r--r-- youtube/yt_data_extract.py 273
1 files changed, 0 insertions, 273 deletions
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
deleted file mode 100644
index 5419084..0000000
--- a/youtube/yt_data_extract.py
+++ /dev/null
@@ -1,273 +0,0 @@
-from youtube import util
-
-import html
-import json
-
-# videos (all of type str):
-
-# id
-# title
-# url
-# author
-# author_url
-# thumbnail
-# description
-# published
-# duration
-# likes
-# dislikes
-# views
-# playlist_index
-
-# playlists:
-
-# id
-# title
-# url
-# author
-# author_url
-# thumbnail
-# description
-# updated
-# size
-# first_video_id
-
-
-
-
-
def get_plain_text(node):
    """Return the complete text of a text node as one plain string.

    A node carrying a 'simpleText' entry yields that value directly.
    Otherwise the 'text' of every entry in node['runs'] is concatenated
    in order.

    Raises:
        KeyError: if the node has neither 'simpleText' nor 'runs', or a
            run has no 'text'.
    """
    if 'simpleText' in node:
        return node['simpleText']
    pieces = [run['text'] for run in node['runs']]
    return ''.join(pieces)
-
def format_text_runs(runs):
    """Render a sequence of text runs as an HTML fragment.

    Args:
        runs: either a plain string, which is returned unchanged, or an
            iterable of dicts that each have a 'text' entry and optional
            truthy 'bold' / 'italics' flags.

    Returns:
        A string in which every run's text is HTML-escaped and wrapped
        in <b> and/or <i> tags according to its flags. A run flagged as
        both bold and italic gets both tags (<b><i>...</i></b>).

    Raises:
        KeyError: if a run has no 'text' entry.
    """
    # NOTE(review): a plain-string input is returned WITHOUT escaping,
    # unlike run text -- confirm the caller/template escapes it.
    if isinstance(runs, str):
        return runs

    fragments = []
    for text_run in runs:
        fragment = html.escape(text_run["text"])
        # The flags are independent: apply each one that is set instead
        # of letting bold hide italics.
        if text_run.get('italics', False):
            fragment = "<i>" + fragment + "</i>"
        if text_run.get("bold", False):
            fragment = "<b>" + fragment + "</b>"
        fragments.append(fragment)
    return ''.join(fragments)
-
-
-
-
-
-
-
-
def get_url(node):
    """Return the web URL that a text node's navigation endpoint points to.

    The endpoint attached to the first text run is tried first. If that
    path is unavailable -- a key along it is missing or the 'runs' list
    is empty -- the endpoint stored directly on the node is used.

    Raises:
        KeyError: if neither location holds a complete
            navigationEndpoint -> commandMetadata -> webCommandMetadata
            -> url path.
    """
    try:
        return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
    except (KeyError, IndexError):
        # IndexError covers an empty 'runs' list; treating it like a
        # missing key keeps the failure mode a KeyError, which is what
        # callers of this function catch.
        return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
-
-
def get_text(node):
    """Return the text of a node, using only the first run if it has runs.

    An empty dict gives ''. A node with 'simpleText' gives that value.
    Otherwise the 'text' of the first entry of node['runs'] is returned,
    or '' when the runs list is empty.

    Raises:
        KeyError: if the node has neither form of text; the node is
            printed before the error is re-raised.
    """
    if node == {}:
        return ''
    if 'simpleText' in node:
        return node['simpleText']
    try:
        runs = node['runs']
        return runs[0]['text']
    except IndexError:
        # an empty list of text runs
        return ''
    except KeyError:
        print(node)
        raise
-
def get_formatted_text(node):
    """Return the node's text in its richest available form.

    Gives the raw node['runs'] value when present, otherwise the
    node['simpleText'] value.

    Raises:
        KeyError: if the node has neither entry.
    """
    if 'runs' in node:
        return node['runs']
    return node['simpleText']
-
def get_badges(node):
    """Collect the label of every badge in a list of badge nodes.

    Each element of `node` must have a 'metadataBadgeRenderer' entry
    with a 'label'; the labels are returned as a list in the same order.

    Raises:
        KeyError: if an element lacks either key.
    """
    return [entry['metadataBadgeRenderer']['label'] for entry in node]
-
def get_thumbnail(node):
    """Return the thumbnail URL stored in a thumbnail node.

    Two layouts are handled: a 'thumbnails' list whose first entry has a
    'url' (polymer format), or a 'url' directly on the node (ajax
    format).

    Raises:
        KeyError: if neither layout provides a url.
    """
    try:
        # polymer format
        first_thumbnail = node['thumbnails'][0]
        return first_thumbnail['url']
    except KeyError:
        # ajax format
        return node['url']
-
# `dispatch` maps a key found in a raw item/renderer dict to a pair
# (simplified key, extractor). The extractor is called with the raw value
# stored under the original key and its result is stored under the
# simplified key.

def _identity(node):
    """Return the node unchanged (for values that are already plain ids)."""
    return node


def _leading_word(node):
    """Return the part of the node's text before the first space."""
    # the format is "324 videos"
    return get_text(node).split(' ')[0]


# polymer format
_polymer_dispatch = {
    'title': ('title', get_text),
    'publishedTimeText': ('published', get_text),
    'videoId': ('id', _identity),
    'descriptionSnippet': ('description', get_formatted_text),
    'lengthText': ('duration', get_text),
    'thumbnail': ('thumbnail', get_thumbnail),
    'thumbnails': ('thumbnail', lambda node: node[0]['thumbnails'][0]['url']),

    'viewCountText': ('views', get_text),
    'numVideosText': ('size', _leading_word),
    'videoCountText': ('size', get_text),
    'playlistId': ('id', _identity),
    'descriptionText': ('description', get_formatted_text),

    'subscriberCountText': ('subscriber_count', get_text),
    'channelId': ('id', _identity),
    'badges': ('badges', get_badges),
}

# ajax format
_ajax_dispatch = {
    'view_count_text': ('views', get_text),
    'num_videos_text': ('size', _leading_word),
    'owner_text': ('author', get_text),
    'owner_endpoint': ('author_url', lambda node: node['url']),
    'description': ('description', get_formatted_text),
    'index': ('playlist_index', get_text),
    'short_byline': ('author', get_text),
    'length': ('duration', get_text),
    'video_id': ('id', _identity),
}

# The two tables share no keys, so merging keeps every entry.
dispatch = {}
dispatch.update(_polymer_dispatch)
dispatch.update(_ajax_dispatch)
-
def ajax_info(item_json):
    """Translate a raw item dict into a dict of simplified fields.

    Every key of `item_json` that has an entry in `dispatch` is run
    through its extractor and stored under the simplified key; keys
    without an entry are ignored.

    Raises:
        KeyError: if an extractor fails on a missing key; `item_json` is
            printed before the error is re-raised.
    """
    info = {}
    try:
        for key, node in item_json.items():
            entry = dispatch.get(key)
            if entry is None:
                # no extractor registered for this key
                continue
            simple_key, function = entry
            info[simple_key] = function(node)
    except KeyError:
        print(item_json)
        raise
    return info
-
-
-
def prefix_urls(item):
    """Rewrite the URL fields of an item in place.

    If present, 'thumbnail' is normalised to start with exactly one '/'
    and 'author_url' is prefixed with util.URL_ORIGIN. Absent fields are
    left absent.
    """
    if 'thumbnail' in item:
        item['thumbnail'] = '/' + item['thumbnail'].lstrip('/')

    if 'author_url' in item:
        item['author_url'] = util.URL_ORIGIN + item['author_url']
-
def add_extra_html_info(item):
    """Add the fields derived from an item's type and id, in place.

    Sets item['url'] for 'video', 'playlist' and 'channel' items. Videos
    also get item['video_info']: a JSON string holding the item's id,
    title, author and duration, with '' for any that are missing. Items
    of any other type are left untouched.

    Raises:
        KeyError: if the item has no 'type', or a supported type without
            an 'id'.
    """
    item_type = item['type']

    if item_type == 'video':
        item['url'] = util.URL_ORIGIN + '/watch?v=' + item['id']
        summary = {
            key: item.get(key, '')
            for key in ('id', 'title', 'author', 'duration')
        }
        item['video_info'] = json.dumps(summary)
    elif item_type == 'playlist':
        item['url'] = util.URL_ORIGIN + '/playlist?list=' + item['id']
    elif item_type == 'channel':
        item['url'] = util.URL_ORIGIN + "/channel/" + item['id']
-
-
def renderer_info(renderer, additional_info=None):
    """Extract simplified item info from a single-key renderer dict.

    Args:
        renderer: dict whose first key names the renderer type and whose
            value under that key is the renderer's data.
        additional_info: optional mapping copied into the result before
            extraction, so extracted fields override it. Defaults to
            nothing extra.

    Returns:
        A dict with at least a 'type' entry: 'video', 'playlist',
        'channel', 'playlist_metadata' or 'unsupported'. An
        'itemSectionRenderer' is unwrapped and its first content entry
        is processed instead. For 'movieRenderer' and
        'clarificationRenderer' the result is only
        {'type': 'unsupported'}; for any other unrecognised type it is
        `additional_info` plus 'type': 'unsupported'.

    Raises:
        IndexError: if `renderer` is empty.
        KeyError: if an expected key is missing during extraction; the
            renderer data is printed before the error is re-raised.
    """
    # Named renderer_type rather than `type` to avoid shadowing the builtin.
    renderer_type = list(renderer.keys())[0]
    renderer = renderer[renderer_type]

    if renderer_type == 'itemSectionRenderer':
        return renderer_info(renderer['contents'][0], additional_info)

    info = {}
    if renderer_type in ('movieRenderer', 'clarificationRenderer'):
        info['type'] = 'unsupported'
        return info

    if additional_info is not None:
        info.update(additional_info)

    if renderer_type in ('compactVideoRenderer', 'videoRenderer', 'playlistVideoRenderer', 'gridVideoRenderer'):
        info['type'] = 'video'
    elif renderer_type in ('playlistRenderer', 'compactPlaylistRenderer', 'gridPlaylistRenderer',
                           'radioRenderer', 'compactRadioRenderer', 'gridRadioRenderer',
                           'showRenderer', 'compactShowRenderer', 'gridShowRenderer'):
        info['type'] = 'playlist'
    elif renderer_type == 'channelRenderer':
        info['type'] = 'channel'
    elif renderer_type == 'playlistHeaderRenderer':
        info['type'] = 'playlist_metadata'
    else:
        info['type'] = 'unsupported'
        return info

    # The steps below may write the same info key more than once; their
    # order decides which value wins, so it must be kept.
    try:
        if 'viewCountText' in renderer:     # prefer this one as it contains all the digits
            info['views'] = get_text(renderer['viewCountText'])
        elif 'shortViewCountText' in renderer:
            info['views'] = get_text(renderer['shortViewCountText'])

        if 'ownerText' in renderer:
            owner_run = renderer['ownerText']['runs'][0]
            info['author'] = owner_run['text']
            info['author_url'] = owner_run['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']

        for overlay in renderer.get('thumbnailOverlays', ()):
            if 'thumbnailOverlayTimeStatusRenderer' in overlay:
                info['duration'] = get_text(overlay['thumbnailOverlayTimeStatusRenderer']['text'])
            # show renderers don't have videoCountText
            elif 'thumbnailOverlayBottomPanelRenderer' in overlay:
                info['size'] = get_text(overlay['thumbnailOverlayBottomPanelRenderer']['text'])

        # show renderers don't have playlistId, have to dig into the url to get it
        try:
            info['id'] = renderer['navigationEndpoint']['watchEndpoint']['playlistId']
        except KeyError:
            pass

        for key, node in renderer.items():
            if key in ('longBylineText', 'shortBylineText'):
                info['author'] = get_text(node)
                try:
                    info['author_url'] = get_url(node)
                except KeyError:
                    pass

            # show renderers don't have thumbnail key at top level, dig into thumbnailRenderer
            elif key == 'thumbnailRenderer' and 'showCustomThumbnailRenderer' in node:
                info['thumbnail'] = node['showCustomThumbnailRenderer']['thumbnail']['thumbnails'][0]['url']
            else:
                try:
                    simple_key, function = dispatch[key]
                except KeyError:
                    continue
                info[simple_key] = function(node)

        # A video for which no duration was found is labelled 'Live'.
        if info['type'] == 'video' and 'duration' not in info:
            info['duration'] = 'Live'

        return info
    except KeyError:
        print(renderer)
        raise
-
-
def parse_info_prepare_for_html(renderer, additional_info=None):
    """Extract an item from a renderer and add its derived URL fields.

    Runs renderer_info on the renderer, then prefix_urls and
    add_extra_html_info on the result.

    Args:
        renderer: renderer dict, passed to renderer_info.
        additional_info: optional mapping of extra fields, passed to
            renderer_info. Defaults to no extra fields.

    Returns:
        The item dict produced by renderer_info, modified in place by
        the two later steps.
    """
    # A fresh dict per call replaces the shared mutable default.
    if additional_info is None:
        additional_info = {}

    item = renderer_info(renderer, additional_info)
    prefix_urls(item)
    add_extra_html_info(item)

    return item
-
-