Merge branch 'modular-data-extract'

Commits in this branch are prefixed with "Extraction:". This branch refactors data extraction. All such functionality has been moved to the yt_data_extract module. Responses from requests are given to the module, and it parses them into a consistent, more useful format. The dependency on youtube-dl has also been dropped, and this functionality has been built from scratch for these reasons: (1) I've noticed youtube-dl breaks more often than Invidious (which uses watch page extraction built from scratch) in response to changes from YouTube, so I'm hoping what I wrote will also be less brittle. (2) Such breakage is inconvenient because I have to manually merge the fixes, since I had to make changes to youtube-dl to make it do things such as extracting related videos. (3) I have no control over error handling and request pooling with youtube-dl, since it does all the requests (these would require intrusive changes I don't want to maintain). (4) I will finally be able to display the number of comments and whether comments are disabled without making additional requests.
author: James Taylor <user234683@users.noreply.github.com> 2019-12-19 21:33:54 -0800
committer: James Taylor <user234683@users.noreply.github.com> 2019-12-19 21:33:54 -0800
commit: b4406df9cf33c53b6e942e6a5c72d955f57c4b5f (patch)
tree: 4de0082ac9eb26a05188dd424835ea50b1483113 /youtube/yt_data_extract.py
parent: b614fcdb8579ba29fccfa47eab1e2965cfb0beaa (diff)
parent: 6b7a1212e30b713453aa7d2b3a7122e97689dad0 (diff)
download: yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.tar.lz
yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.tar.xz
yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.zip
1 files changed, 0 insertions, 273 deletions
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
deleted file mode 100644
index 5419084..0000000
--- a/youtube/yt_data_extract.py
+++ /dev/null
@@ -1,273 +0,0 @@
-from youtube import util
-
-import html
-import json
-
-# videos (all of type str):
-
-# id
-# title
-# url
-# author
-# author_url
-# thumbnail
-# description
-# published
-# duration
-# likes
-# dislikes
-# views
-# playlist_index
-
-# playlists:
-
-# id
-# title
-# url
-# author
-# author_url
-# thumbnail
-# description
-# updated
-# size
-# first_video_id
-
-
-
-
-
-def get_plain_text(node):
-    try:
-        return node['simpleText']
-    except KeyError:
-        return ''.join(text_run['text'] for text_run in node['runs'])
-
-def format_text_runs(runs):
-    if isinstance(runs, str):
-        return runs
-    result = ''
-    for text_run in runs:
-        if text_run.get("bold", False):
-            result += "<b>" + html.escape(text_run["text"]) + "</b>"
-        elif text_run.get('italics', False):
-            result += "<i>" + html.escape(text_run["text"]) + "</i>"
-        else:
-            result += html.escape(text_run["text"])
-    return result
-
-
-
-
-
-
-
-
-def get_url(node):
-    try:
-        return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
-    except KeyError:
-        return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
-
-
-def get_text(node):
-    if node == {}:
-        return ''
-    try:
-        return node['simpleText']
-    except KeyError:
-        pass
-    try:
-        return node['runs'][0]['text']
-    except IndexError: # empty text runs
-        return ''
-    except KeyError:
-        print(node)
-        raise
-
-def get_formatted_text(node):
-    try:
-        return node['runs']
-    except KeyError:
-        return node['simpleText']
-
-def get_badges(node):
-    badges = []
-    for badge_node in node:
-        badge = badge_node['metadataBadgeRenderer']['label']
-        badges.append(badge)
-    return badges
-
-def get_thumbnail(node):
-    try:
-        return node['thumbnails'][0]['url']     # polymer format
-    except KeyError:
-        return node['url']     # ajax format
-
-dispatch = {
-
-# polymer format    
-    'title':                ('title',       get_text),
-    'publishedTimeText':    ('published',   get_text),
-    'videoId':              ('id',          lambda node: node),
-    'descriptionSnippet':   ('description', get_formatted_text),
-    'lengthText':           ('duration',    get_text),
-    'thumbnail':            ('thumbnail',   get_thumbnail),
-    'thumbnails':           ('thumbnail',   lambda node: node[0]['thumbnails'][0]['url']),
-
-    'viewCountText':        ('views',       get_text),
-    'numVideosText':        ('size',        lambda node: get_text(node).split(' ')[0]),     # the format is "324 videos"
-    'videoCountText':       ('size',        get_text),
-    'playlistId':           ('id',          lambda node: node),
-    'descriptionText':      ('description', get_formatted_text),
-
-    'subscriberCountText':  ('subscriber_count',    get_text),
-    'channelId':            ('id',          lambda node: node),
-    'badges':               ('badges',      get_badges),
-
-# ajax format
-    'view_count_text':  ('views',       get_text),
-    'num_videos_text':  ('size',        lambda node: get_text(node).split(' ')[0]),
-    'owner_text':       ('author',      get_text),
-    'owner_endpoint':   ('author_url',  lambda node: node['url']),
-    'description':      ('description', get_formatted_text),
-    'index':            ('playlist_index', get_text),
-    'short_byline':     ('author',      get_text),
-    'length':           ('duration',    get_text),
-    'video_id':         ('id',          lambda node: node),
-
-}
-
-def ajax_info(item_json):
-    try:
-        info = {}          
-        for key, node in item_json.items():
-            try:
-                simple_key, function = dispatch[key]
-            except KeyError:
-                continue
-            info[simple_key] = function(node)
-        return info
-    except KeyError:
-        print(item_json)
-        raise
-
-
-
-def prefix_urls(item):
-    try:
-        item['thumbnail'] = '/' + item['thumbnail'].lstrip('/')
-    except KeyError:
-        pass
-
-    try:
-        item['author_url'] = util.URL_ORIGIN + item['author_url']
-    except KeyError:
-        pass
-
-def add_extra_html_info(item):
-    if item['type'] == 'video':
-        item['url'] = util.URL_ORIGIN + '/watch?v=' + item['id']
-
-        video_info = {}
-        for key in ('id', 'title', 'author', 'duration'):
-            try:
-                video_info[key] = item[key]
-            except KeyError:
-                video_info[key] = ''
-
-        item['video_info'] = json.dumps(video_info)
-
-    elif item['type'] == 'playlist':
-        item['url'] = util.URL_ORIGIN + '/playlist?list=' + item['id']
-    elif item['type'] == 'channel':
-        item['url'] = util.URL_ORIGIN + "/channel/" + item['id']
-
-
-def renderer_info(renderer, additional_info={}):
-    type = list(renderer.keys())[0]
-    renderer = renderer[type]
-    info = {}
-    if type == 'itemSectionRenderer':
-        return renderer_info(renderer['contents'][0], additional_info)
-    
-    if type in ('movieRenderer', 'clarificationRenderer'):
-        info['type'] = 'unsupported'
-        return info
-
-    info.update(additional_info)
-
-
-    if type in ('compactVideoRenderer', 'videoRenderer', 'playlistVideoRenderer', 'gridVideoRenderer'):
-        info['type'] = 'video'
-    elif type in ('playlistRenderer', 'compactPlaylistRenderer', 'gridPlaylistRenderer',
-                  'radioRenderer', 'compactRadioRenderer', 'gridRadioRenderer',
-                  'showRenderer', 'compactShowRenderer', 'gridShowRenderer'):
-        info['type'] = 'playlist'
-    elif type == 'channelRenderer':
-        info['type'] = 'channel'
-    elif type == 'playlistHeaderRenderer':
-        info['type'] = 'playlist_metadata'
-    else:
-        info['type'] = 'unsupported'
-        return info
-
-    try:
-        if 'viewCountText' in renderer:     # prefer this one as it contains all the digits
-            info['views'] = get_text(renderer['viewCountText'])
-        elif 'shortViewCountText' in renderer:
-            info['views'] = get_text(renderer['shortViewCountText'])
-
-        if 'ownerText' in renderer:
-            info['author'] = renderer['ownerText']['runs'][0]['text']
-            info['author_url'] = renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
-        try:
-            overlays = renderer['thumbnailOverlays']
-        except KeyError:
-            pass
-        else:
-            for overlay in overlays:
-                if 'thumbnailOverlayTimeStatusRenderer' in overlay:
-                    info['duration'] = get_text(overlay['thumbnailOverlayTimeStatusRenderer']['text'])
-                # show renderers don't have videoCountText
-                elif 'thumbnailOverlayBottomPanelRenderer' in overlay:
-                    info['size'] = get_text(overlay['thumbnailOverlayBottomPanelRenderer']['text'])
-
-        # show renderers don't have playlistId, have to dig into the url to get it
-        try:
-            info['id'] = renderer['navigationEndpoint']['watchEndpoint']['playlistId']
-        except KeyError:
-            pass
-        for key, node in renderer.items():
-            if key in ('longBylineText', 'shortBylineText'):
-                info['author'] = get_text(node)
-                try:
-                    info['author_url'] = get_url(node)
-                except KeyError:
-                    pass
-
-            # show renderers don't have thumbnail key at top level, dig into thumbnailRenderer
-            elif key == 'thumbnailRenderer' and 'showCustomThumbnailRenderer' in node:
-                info['thumbnail'] = node['showCustomThumbnailRenderer']['thumbnail']['thumbnails'][0]['url']
-            else:
-                try:
-                    simple_key, function = dispatch[key]
-                except KeyError:
-                    continue
-                info[simple_key] = function(node)
-        if info['type'] == 'video' and 'duration' not in info:
-            info['duration'] = 'Live'
-
-        return info
-    except KeyError:
-        print(renderer)
-        raise
-
-
-def parse_info_prepare_for_html(renderer, additional_info={}):
-    item = renderer_info(renderer, additional_info)
-    prefix_urls(item)
-    add_extra_html_info(item)
-
-    return item
-
-
author	James Taylor <user234683@users.noreply.github.com>	2019-12-19 21:33:54 -0800
committer	James Taylor <user234683@users.noreply.github.com>	2019-12-19 21:33:54 -0800
commit	b4406df9cf33c53b6e942e6a5c72d955f57c4b5f (patch)
tree	4de0082ac9eb26a05188dd424835ea50b1483113 /youtube/yt_data_extract.py
parent	b614fcdb8579ba29fccfa47eab1e2965cfb0beaa (diff)
parent	6b7a1212e30b713453aa7d2b3a7122e97689dad0 (diff)
download	yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.tar.lz yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.tar.xz yt-local-b4406df9cf33c53b6e942e6a5c72d955f57c4b5f.zip