Extraction: Rewrite item_extraction for better error handling and readability, rename extracted names for more consistency

author: James Taylor <user234683@users.noreply.github.com> 2019-12-18 19:39:16 -0800
committer: James Taylor <user234683@users.noreply.github.com> 2019-12-18 19:39:16 -0800
commit: 98777ee82561ae205f156a7f8497728aecfa080c (patch)
tree: aaaf3e82dcdac00abda588b6cfb15e5382a49cd0
parent: ee0a118a6c7ed0e371fed18dcdace1f18a3cabf6 (diff)
download: yt-local-98777ee82561ae205f156a7f8497728aecfa080c.tar.lz
yt-local-98777ee82561ae205f156a7f8497728aecfa080c.tar.xz
yt-local-98777ee82561ae205f156a7f8497728aecfa080c.zip
12 files changed, 305 insertions, 340 deletions
diff --git a/youtube/__init__.py b/youtube/__init__.py
index 0137e86..534b9f8 100644
--- a/youtube/__init__.py
+++ b/youtube/__init__.py
@@ -23,3 +23,10 @@ def inject_theme_preference():
         'theme_path': '/youtube.com/static/' + theme_names[settings.theme] + '.css',
     }
 
+@yt_app.template_filter('commatize')
+def commatize(num):
+    if num is None:
+        return ''
+    if isinstance(num, str):
+        num = int(num)
+    return '{:,}'.format(num)
diff --git a/youtube/comments.py b/youtube/comments.py
index 250a95f..e237f0f 100644
--- a/youtube/comments.py
+++ b/youtube/comments.py
@@ -91,33 +91,33 @@ def post_process_comments_info(comments_info):
         comment['author_url'] = util.URL_ORIGIN + comment['author_url']
         comment['author_avatar'] = '/' + comment['author_avatar']
 
-        comment['permalink'] = util.URL_ORIGIN + '/watch?v=' + comments_info['video_id'] + '&lc=' + comment['comment_id']
+        comment['permalink'] = util.URL_ORIGIN + '/watch?v=' + comments_info['video_id'] + '&lc=' + comment['id']
 
         if comment['author_channel_id'] in accounts.accounts:
             comment['delete_url'] = (util.URL_ORIGIN + '/delete_comment?video_id='
                 + comments_info['video_id']
                 + '&channel_id='+ comment['author_channel_id']
                 + '&author_id=' + comment['author_id']
-                + '&comment_id=' + comment['comment_id'])
+                + '&comment_id=' + comment['id'])
 
-        num_replies = comment['number_of_replies']
-        if num_replies == 0:
-            comment['replies_url'] = util.URL_ORIGIN + '/post_comment?parent_id=' + comment['comment_id'] + "&video_id=" + comments_info['video_id']
+        reply_count = comment['reply_count']
+        if reply_count == 0:
+            comment['replies_url'] = util.URL_ORIGIN + '/post_comment?parent_id=' + comment['id'] + "&video_id=" + comments_info['video_id']
         else:
-            comment['replies_url'] = util.URL_ORIGIN + '/comments?parent_id=' + comment['comment_id'] + "&video_id=" + comments_info['video_id']
+            comment['replies_url'] = util.URL_ORIGIN + '/comments?parent_id=' + comment['id'] + "&video_id=" + comments_info['video_id']
 
-        if num_replies == 0:
+        if reply_count == 0:
             comment['view_replies_text'] = 'Reply'
-        elif num_replies == 1:
+        elif reply_count == 1:
             comment['view_replies_text'] = '1 reply'
         else:
-            comment['view_replies_text'] = str(num_replies) + ' replies'
+            comment['view_replies_text'] = str(reply_count) + ' replies'
 
 
-        if comment['likes'] == 1:
+        if comment['like_count'] == 1:
             comment['likes_text'] = '1 like'
         else:
-            comment['likes_text'] = str(comment['likes']) + ' likes'
+            comment['likes_text'] = str(comment['like_count']) + ' likes'
 
     comments_info['include_avatars'] = settings.enable_comment_avatars
     if comments_info['ctoken']:
diff --git a/youtube/playlist.py b/youtube/playlist.py
index bc2c417..ced0644 100644
--- a/youtube/playlist.py
+++ b/youtube/playlist.py
@@ -98,13 +98,19 @@ def get_playlist_page():
         info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json)
 
     yt_data_extract.prefix_urls(info['metadata'])
-    for item in info['items']:
+    for item in info.get('items', ()):
         yt_data_extract.prefix_urls(item)
         yt_data_extract.add_extra_html_info(item)
+        if 'id' in item:
+            item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg'
+
+    video_count = yt_data_extract.default_multi_get(info, 'metadata', 'video_count')
+    if video_count is None:
+        video_count = 40
 
     return flask.render_template('playlist.html',
-        video_list = info['items'],
-        num_pages = math.ceil(info['metadata']['size']/20),
+        video_list = info.get('items', []),
+        num_pages = math.ceil(video_count/20),
         parameters_dictionary = request.args,
 
         **info['metadata']
diff --git a/youtube/search.py b/youtube/search.py
index cb66744..a881557 100644
--- a/youtube/search.py
+++ b/youtube/search.py
@@ -79,9 +79,9 @@ def get_search_page():
     if search_info['error']:
         return flask.render_template('error.html', error_message = search_info['error'])
 
-    for item_info in search_info['items']:
-        yt_data_extract.prefix_urls(item_info)
-        yt_data_extract.add_extra_html_info(item_info)
+    for extract_item_info in search_info['items']:
+        yt_data_extract.prefix_urls(extract_item_info)
+        yt_data_extract.add_extra_html_info(extract_item_info)
 
     corrections = search_info['corrections']
     if corrections['type'] == 'did_you_mean':
diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py
index e0c71f5..9709467 100644
--- a/youtube/subscriptions.py
+++ b/youtube/subscriptions.py
@@ -172,7 +172,7 @@ def _get_videos(cursor, number_per_page, offset, tag = None):
             'id':   db_video[0],
             'title':    db_video[1],
             'duration': db_video[2],
-            'published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
+            'time_published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
             'author':   db_video[5],
         })
 
@@ -462,8 +462,10 @@ def _get_upstream_videos(channel_id):
 
     videos = channel_info['items']
     for i, video_item in enumerate(videos):
-        if 'description' not in video_item:
+        if not video_item.get('description'):
             video_item['description'] = ''
+        else:
+            video_item['description'] = ''.join(run.get('text', '') for run in video_item['description'])
 
         if video_item['id'] in times_published:
             video_item['time_published'] = times_published[video_item['id']]
@@ -471,7 +473,7 @@ def _get_upstream_videos(channel_id):
         else:
             video_item['is_time_published_exact'] = False
             try:
-                video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i  # subtract a few seconds off the videos so they will be in the right order
+                video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i  # subtract a few seconds off the videos so they will be in the right order
             except KeyError:
                 print(video_item)
 
diff --git a/youtube/templates/comments.html b/youtube/templates/comments.html
index 20cde4e..396852a 100644
--- a/youtube/templates/comments.html
+++ b/youtube/templates/comments.html
@@ -12,11 +12,11 @@
                 <a class="author" href="{{ comment['author_url'] }}" title="{{ comment['author'] }}">{{ comment['author'] }}</a>
             </address>
             <a class="permalink" href="{{ comment['permalink'] }}" title="permalink">
-                <time datetime="">{{ comment['published'] }}</time>
+                <time datetime="">{{ comment['time_published'] }}</time>
             </a>
             <span class="text">{{ common_elements.text_runs(comment['text']) }}</span>
 
-            <span class="likes">{{ comment['likes_text'] if comment['likes'] else ''}}</span>
+            <span class="likes">{{ comment['likes_text'] if comment['like_count'] else ''}}</span>
             <div class="bottom-row">
                 <a href="{{ comment['replies_url'] }}" class="replies">{{ comment['view_replies_text'] }}</a>
                 {% if 'delete_url' is in comment %}
diff --git a/youtube/templates/common_elements.html b/youtube/templates/common_elements.html
index 1a417ae..4c776b6 100644
--- a/youtube/templates/common_elements.html
+++ b/youtube/templates/common_elements.html
@@ -9,55 +9,59 @@
                 {{ text_run["text"] }}
             {%- endif -%}
         {%- endfor -%}
-    {%- else -%}
+    {%- elif runs -%}
         {{ runs }}
     {%- endif -%}
 {% endmacro %}
 
 {% macro item(info, description=false, horizontal=true, include_author=true, include_badges=true) %}
     <div class="item-box {{ info['type'] + '-item-box' }} {{'horizontal-item-box' if horizontal else 'vertical-item-box'}} {{'has-description' if description else 'no-description'}}">
-        <div class="item {{ info['type'] + '-item' }}">
-            <a class="thumbnail-box" href="{{ info['url'] }}" title="{{ info['title'] }}">
-                <img class="thumbnail-img" src="{{ info['thumbnail'] }}">
-                {% if info['type'] != 'channel' %}
-                    <div class="thumbnail-info">
-                        <span>{{ info['size'] if info['type'] == 'playlist' else info['duration'] }}</span>
-                    </div>
-                {% endif %}
-            </a>
+        {% if info['error'] %}
+            {{ info['error'] }}
+        {% else %}
+            <div class="item {{ info['type'] + '-item' }}">
+                <a class="thumbnail-box" href="{{ info['url'] }}" title="{{ info['title'] }}">
+                    <img class="thumbnail-img" src="{{ info['thumbnail'] }}">
+                    {% if info['type'] != 'channel' %}
+                        <div class="thumbnail-info">
+                            <span>{{ (info['video_count']|string + ' videos') if info['type'] == 'playlist' else info['duration'] }}</span>
+                        </div>
+                    {% endif %}
+                </a>
 
-            <div class="title"><a class="title" href="{{ info['url'] }}" title="{{ info['title'] }}">{{ info['title'] }}</a></div>
+                <div class="title"><a class="title" href="{{ info['url'] }}" title="{{ info['title'] }}">{{ info['title'] }}</a></div>
 
-            <ul class="stats {{'vertical-stats' if horizontal and not description and include_author else 'horizontal-stats'}}">
-                {% if info['type'] == 'channel' %}
-                    <li><span>{{ info['subscriber_count'] }} subscribers</span></li>
-                    <li><span>{{ info['size'] }} videos</span></li>
-                {% else %}
-                    {% if include_author %}
-                        {% if 'author_url' is in(info) %}
-                            <li><address title="{{ info['author'] }}">By <a href="{{ info['author_url'] }}">{{ info['author'] }}</a></address></li>
-                        {% else %}
-                            <li><address title="{{ info['author'] }}"><b>{{ info['author'] }}</b></address></li>
+                <ul class="stats {{'vertical-stats' if horizontal and not description and include_author else 'horizontal-stats'}}">
+                    {% if info['type'] == 'channel' %}
+                        <li><span>{{ info['approx_subscriber_count'] }} subscribers</span></li>
+                        <li><span>{{ info['video_count'] }} videos</span></li>
+                    {% else %}
+                        {% if include_author %}
+                            {% if info.get('author_url') %}
+                                <li><address title="{{ info['author'] }}">By <a href="{{ info['author_url'] }}">{{ info['author'] }}</a></address></li>
+                            {% else %}
+                                <li><address title="{{ info['author'] }}"><b>{{ info['author'] }}</b></address></li>
+                            {% endif %}
+                        {% endif %}
+                        {% if info.get('approx_view_count') %}
+                            <li><span class="views">{{ info['approx_view_count'] }} views</span></li>
+                        {% endif %}
+                        {% if info.get('time_published') %}
+                            <li><time>{{ info['time_published'] }}</time></li>
                         {% endif %}
                     {% endif %}
-                    {% if 'views' is in(info) %}
-                        <li><span class="views">{{ info['views'] }}</span></li>
-                    {% endif %}
-                    {% if 'published' is in(info) %}
-                        <li><time>{{ info['published'] }}</time></li>
-                    {% endif %}
-                {% endif %}
-            </ul>
+                </ul>
 
-            {% if description %}
-                <span class="description">{{ text_runs(info.get('description', '')) }}</span>
-            {% endif %}
-            {% if include_badges %}
-                <span class="badges">{{ info['badges']|join(' | ') }}</span>
+                {% if description %}
+                    <span class="description">{{ text_runs(info.get('description', '')) }}</span>
+                {% endif %}
+                {% if include_badges %}
+                    <span class="badges">{{ info['badges']|join(' | ') }}</span>
+                {% endif %}
+            </div>
+            {% if info['type'] == 'video' %}
+                <input class="item-checkbox" type="checkbox" name="video_info_list" value="{{ info['video_info'] }}" form="playlist-edit">
             {% endif %}
-        </div>
-        {% if info['type'] == 'video' %}
-            <input class="item-checkbox" type="checkbox" name="video_info_list" value="{{ info['video_info'] }}" form="playlist-edit">
         {% endif %}
     </div>
 
diff --git a/youtube/templates/playlist.html b/youtube/templates/playlist.html
index 52c468e..ebd152b 100644
--- a/youtube/templates/playlist.html
+++ b/youtube/templates/playlist.html
@@ -54,8 +54,9 @@
         <h2 class="playlist-title">{{ title }}</h2>
         <a class="playlist-author" href="{{ author_url }}">{{ author }}</a>
         <div class="playlist-stats">
-            <div>{{ views }}</div>
-            <div>{{ size }} videos</div>
+            <div>{{ video_count|commatize }} videos</div>
+            <div>{{ view_count|commatize }} views</div>
+            <div>Last updated {{ time_published }}</div>
         </div>
         <div class="playlist-description">{{ common_elements.text_runs(description) }}</div>
     </div>
diff --git a/youtube/templates/watch.html b/youtube/templates/watch.html
index 0ffa358..5bd2a25 100644
--- a/youtube/templates/watch.html
+++ b/youtube/templates/watch.html
@@ -261,11 +261,11 @@
             {%- endif -%}
         </ul>
         <address>Uploaded by <a href="{{ uploader_channel_url }}">{{ uploader }}</a></address>
-        <span class="views">{{ views }} views</span>
+        <span class="views">{{ view_count }} views</span>
 
 
-        <time datetime="$upload_date">Published on {{ upload_date }}</time>
-        <span class="likes-dislikes">{{ likes }} likes {{ dislikes }} dislikes</span>
+        <time datetime="$upload_date">Published on {{ time_published }}</time>
+        <span class="likes-dislikes">{{ like_count }} likes {{ dislike_count }} dislikes</span>
         <details class="download-dropdown">
             <summary class="download-dropdown-label">Download</summary>
             <ul class="download-dropdown-content">
diff --git a/youtube/util.py b/youtube/util.py
index 474e7b5..9023b98 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -310,6 +310,8 @@ def uppercase_escape(s):
          lambda m: chr(int(m.group(1), base=16)), s)
 
 def prefix_url(url):
+    if url is None:
+        return None
     url = url.lstrip('/')     # some urls have // before them, which has a special meaning
     return '/' + url
 
diff --git a/youtube/watch.py b/youtube/watch.py
index fca794e..2118319 100644
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -405,10 +405,10 @@ def get_watch_page():
     return flask.render_template('watch.html',
         header_playlist_names   = local_playlist.get_playlist_names(),
         uploader_channel_url    = ('/' + info['author_url']) if info['author_url'] else '',
-        upload_date             = info['published_date'],
-        views           = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)),
-        likes           = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
-        dislikes        = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
+        time_published             = info['time_published'],
+        view_count    = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)),
+        like_count    = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
+        dislike_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
         download_formats        = download_formats,
         video_info              = json.dumps(video_info),
         video_sources           = video_sources,
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index 653a79f..ea67383 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -8,7 +8,7 @@ import collections
 from math import ceil
 import traceback
 
-# videos (all of type str):
+# videos:
 
 # id
 # title
@@ -17,11 +17,12 @@ import traceback
 # author_url
 # thumbnail
 # description
-# published
-# duration
-# likes
-# dislikes
-# views
+# time_published (str)
+# duration (str)
+# like_count (int)
+# dislike_count (int)
+# view_count (int)
+# approx_view_count (str)
 # playlist_index
 
 # playlists:
@@ -33,8 +34,8 @@ import traceback
 # author_url
 # thumbnail
 # description
-# updated
-# size
+# time_published (str)
+# video_count (int)
 # first_video_id
 
 # from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
@@ -144,26 +145,6 @@ _formats = {
     '397': {'vcodec': 'av01.0.05M.08'},
 }
 
-
-def get_plain_text(node):
-    try:
-        return node['simpleText']
-    except KeyError:
-        return ''.join(text_run['text'] for text_run in node['runs'])
-
-def format_text_runs(runs):
-    if isinstance(runs, str):
-        return runs
-    result = ''
-    for text_run in runs:
-        if text_run.get("bold", False):
-            result += "<b>" + html.escape(text_run["text"]) + "</b>"
-        elif text_run.get('italics', False):
-            result += "<i>" + html.escape(text_run["text"]) + "</i>"
-        else:
-            result += html.escape(text_run["text"])
-    return result
-
 def default_get(object, key, default=None, types=()):
     '''Like dict.get(), but returns default if the result doesn't match one of the types.
        Also works for indexing lists.'''
@@ -177,6 +158,19 @@ def default_get(object, key, default=None, types=()):
     else:
         return default
 
+def multi_default_get(object, *keys, default=None, types=()):
+    '''Like default_get, but try other keys if the first fails'''
+    for key in keys:
+        try:
+            result = object[key]
+        except (TypeError, IndexError, KeyError):
+            pass
+        else:
+            if not types or isinstance(result, types):
+                return result
+            else:
+                continue
+    return default
 
 
 def default_multi_get(object, *keys, default=None, types=()):
@@ -211,101 +205,85 @@ def multi_default_multi_get(object, *key_sequences, default=None, types=()):
                 continue
     return default
 
+def liberal_update(obj, key, value):
+    '''Updates obj[key] with value as long as value is not None.
+    Ensures obj[key] will at least get a value of None, however'''
+    if (value is not None) or (key not in obj):
+        obj[key] = value
+
+def conservative_update(obj, key, value):
+    '''Only updates obj if it doesn't have key or obj[key] is None'''
+    if obj.get(key) is None:
+        obj[key] = value
+
 def remove_redirect(url):
     if re.fullmatch(r'(((https?:)?//)?(www.)?youtube.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking
         query_string = url[url.find('?')+1: ]
         return urllib.parse.parse_qs(query_string)['q'][0]
     return url
 
-def get_url(node):
-    try:
-        return node['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
-    except KeyError:
-        return node['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url']
+def _recover_urls(runs):
+    for run in runs:
+        url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
+        text = run.get('text', '')
+        # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text
+        if url is not None and (text.startswith('http://') or text.startswith('https://')):
+            url = remove_redirect(url)
+            run['url'] = url
+            run['text'] = url # youtube truncates the url text, use actual url instead
 
+def extract_str(node, default=None, recover_urls=False):
+    '''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)'''
+    if isinstance(node, str):
+        return node
 
-def get_text(node):
-    if node == {}:
-        return ''
     try:
         return node['simpleText']
-    except KeyError:
+    except (KeyError, TypeError):
         pass
-    try:
-        return node['runs'][0]['text']
-    except IndexError: # empty text runs
-        return ''
-    except KeyError:
-        print(node)
-        raise
 
-def get_formatted_text(node):
-    try:
-        return node['runs']
-    except KeyError:
-        return node['simpleText']
-
-def get_badges(node):
-    badges = []
-    for badge_node in node:
-        badge = badge_node['metadataBadgeRenderer']['label']
-        badges.append(badge)
-    return badges
+    if isinstance(node, dict) and 'runs' in node:
+        if recover_urls:
+            _recover_urls(node['runs'])
+        return ''.join(text_run.get('text', '') for text_run in node['runs'])
 
-def get_thumbnail(node):
-    try:
-        return node['thumbnails'][0]['url']     # polymer format
-    except KeyError:
-        return node['url']     # ajax format
-
-dispatch = {
-
-# polymer format    
-    'title':                ('title',       get_text),
-    'publishedTimeText':    ('published',   get_text),
-    'videoId':              ('id',          lambda node: node),
-    'descriptionSnippet':   ('description', get_formatted_text),
-    'lengthText':           ('duration',    get_text),
-    'thumbnail':            ('thumbnail',   get_thumbnail),
-    'thumbnails':           ('thumbnail',   lambda node: node[0]['thumbnails'][0]['url']),
-
-    'viewCountText':        ('views',       get_text),
-    'numVideosText':        ('size',        lambda node: get_text(node).split(' ')[0]),     # the format is "324 videos"
-    'videoCountText':       ('size',        get_text),
-    'playlistId':           ('id',          lambda node: node),
-    'descriptionText':      ('description', get_formatted_text),
-
-    'subscriberCountText':  ('subscriber_count',    get_text),
-    'channelId':            ('id',          lambda node: node),
-    'badges':               ('badges',      get_badges),
-
-# ajax format
-    'view_count_text':  ('views',       get_text),
-    'num_videos_text':  ('size',        lambda node: get_text(node).split(' ')[0]),
-    'owner_text':       ('author',      get_text),
-    'owner_endpoint':   ('author_url',  lambda node: node['url']),
-    'description':      ('description', get_formatted_text),
-    'index':            ('playlist_index', get_text),
-    'short_byline':     ('author',      get_text),
-    'length':           ('duration',    get_text),
-    'video_id':         ('id',          lambda node: node),
+    return default
 
-}
+def extract_formatted_text(node):
+    if not node:
+        return []
+    if 'runs' in node:
+        _recover_urls(node['runs'])
+        return node['runs']
+    elif 'simpleText' in node:
+        return [{'text': node['simpleText']}]
+    return []
 
-def ajax_info(item_json):
+def extract_int(string):
+    if isinstance(string, int):
+        return string
+    if not isinstance(string, str):
+        string = extract_str(string)
+    if not string:
+        return None
+    match = re.search(r'(\d+)', string.replace(',', ''))
+    if match is None:
+        return None
     try:
-        info = {}          
-        for key, node in item_json.items():
-            try:
-                simple_key, function = dispatch[key]
-            except KeyError:
-                continue
-            info[simple_key] = function(node)
-        return info
-    except KeyError:
-        print(item_json)
-        raise
+        return int(match.group(1))
+    except ValueError:
+        return None
 
+def extract_approx_int(string):
+    '''e.g. "15M" from "15M subscribers"'''
+    if not isinstance(string, str):
+        string = extract_str(string)
+    if not string:
+        return None
+    match = re.search(r'(\d+[KMBTkmbt])', string.replace(',', ''))
+    if match is None:
+        return None
+    return match.group(1)
 
 youtube_url_re = re.compile(r'^(?:(?:(?:https?:)?//)?(?:www\.)?youtube\.com)?(/.*)$')
 def normalize_url(url):
@@ -330,7 +308,7 @@ def prefix_urls(item):
 
 def add_extra_html_info(item):
     if item['type'] == 'video':
-        item['url'] = util.URL_ORIGIN + '/watch?v=' + item['id']
+        item['url'] = (util.URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None
 
         video_info = {}
         for key in ('id', 'title', 'author', 'duration'):
@@ -342,17 +320,22 @@ def add_extra_html_info(item):
         item['video_info'] = json.dumps(video_info)
 
     elif item['type'] == 'playlist':
-        item['url'] = util.URL_ORIGIN + '/playlist?list=' + item['id']
+        item['url'] = (util.URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None
     elif item['type'] == 'channel':
-        item['url'] = util.URL_ORIGIN + "/channel/" + item['id']
+        item['url'] = (util.URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None
 
+def extract_item_info(item, additional_info={}):
+    if not item:
+        return {'error': 'No item given'}
 
-def renderer_info(renderer, additional_info={}):
-    type = list(renderer.keys())[0]
-    renderer = renderer[type]
-    info = {}
+    type = default_get(list(item.keys()), 0)
+    if not type:
+        return {'error': 'Could not find type'}
+    item = item[type]
+
+    info = {'error': None}
     if type in ('itemSectionRenderer', 'compactAutoplayRenderer'):
-        return renderer_info(renderer['contents'][0], additional_info)
+        return extract_item_info(default_multi_get(item, 'contents', 0), additional_info)
 
     if type in ('movieRenderer', 'clarificationRenderer'):
         info['type'] = 'unsupported'
@@ -360,75 +343,78 @@ def renderer_info(renderer, additional_info={}):
 
     info.update(additional_info)
 
-
-    if type in ('compactVideoRenderer', 'videoRenderer', 'playlistVideoRenderer', 'gridVideoRenderer'):
+    # type looks like e.g. 'compactVideoRenderer' or 'gridVideoRenderer'
+    # camelCase split, https://stackoverflow.com/a/37697078
+    type_parts = [s.lower() for s in re.sub(r'([A-Z][a-z]+)', r' \1', type).split()]
+    if len(type_parts) < 2:
+        info['type'] = 'unsupported'
+        return
+    primary_type = type_parts[-2]
+    if primary_type == 'video':
         info['type'] = 'video'
-    elif type in ('playlistRenderer', 'compactPlaylistRenderer', 'gridPlaylistRenderer',
-                  'radioRenderer', 'compactRadioRenderer', 'gridRadioRenderer',
-                  'showRenderer', 'compactShowRenderer', 'gridShowRenderer'):
+    elif primary_type in ('playlist', 'radio', 'show'):
         info['type'] = 'playlist'
-    elif type == 'channelRenderer':
+    elif primary_type == 'channel':
         info['type'] = 'channel'
-    elif type == 'playlistHeaderRenderer':
-        info['type'] = 'playlist_metadata'
     else:
         info['type'] = 'unsupported'
-        return info
 
-    try:
-        if 'viewCountText' in renderer:     # prefer this one as it contains all the digits
-            info['views'] = get_text(renderer['viewCountText'])
-        elif 'shortViewCountText' in renderer:
-            info['views'] = get_text(renderer['shortViewCountText'])
-
-        if 'ownerText' in renderer:
-            info['author'] = renderer['ownerText']['runs'][0]['text']
-            info['author_url'] = normalize_url(renderer['ownerText']['runs'][0]['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'])
-        try:
-            overlays = renderer['thumbnailOverlays']
-        except KeyError:
-            pass
-        else:
-            for overlay in overlays:
-                if 'thumbnailOverlayTimeStatusRenderer' in overlay:
-                    info['duration'] = get_text(overlay['thumbnailOverlayTimeStatusRenderer']['text'])
-                # show renderers don't have videoCountText
-                elif 'thumbnailOverlayBottomPanelRenderer' in overlay:
-                    info['size'] = get_text(overlay['thumbnailOverlayBottomPanelRenderer']['text'])
-
-        # show renderers don't have playlistId, have to dig into the url to get it
-        try:
-            info['id'] = renderer['navigationEndpoint']['watchEndpoint']['playlistId']
-        except KeyError:
-            pass
-        for key, node in renderer.items():
-            if key in ('longBylineText', 'shortBylineText'):
-                info['author'] = get_text(node)
-                try:
-                    info['author_url'] = normalize_url(get_url(node))
-                except KeyError:
-                    pass
+    info['title'] = extract_str(item.get('title'))
+    info['author'] = extract_str(multi_default_get(item, 'longBylineText', 'shortBylineText', 'ownerText'))
+    info['author_id'] = extract_str(multi_default_multi_get(item,
+        ['longBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
+        ['shortBylineText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
+        ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId']
+    ))
+    info['author_url'] = ('https://www.youtube.com/channel/' + info['author_id']) if info['author_id'] else None
+    info['description'] = extract_formatted_text(multi_default_get(item, 'descriptionSnippet', 'descriptionText'))
+    info['thumbnail'] = multi_default_multi_get(item,
+        ['thumbnail', 'thumbnails', 0, 'url'],      # videos
+        ['thumbnails', 0, 'thumbnails', 0, 'url'],  # playlists
+        ['thumbnailRenderer', 'showCustomThumbnailRenderer', 'thumbnail', 'thumbnails', 0, 'url'], # shows
+    )
 
-            # show renderers don't have thumbnail key at top level, dig into thumbnailRenderer
-            elif key == 'thumbnailRenderer' and 'showCustomThumbnailRenderer' in node:
-                info['thumbnail'] = node['showCustomThumbnailRenderer']['thumbnail']['thumbnails'][0]['url']
-            else:
-                try:
-                    simple_key, function = dispatch[key]
-                except KeyError:
-                    continue
-                info[simple_key] = function(node)
-        if info['type'] == 'video' and 'duration' not in info:
-            info['duration'] = 'Live'
+    info['badges'] = []
+    for badge_node in multi_default_get(item, 'badges', 'ownerBadges', default=()):
+        badge = default_multi_get(badge_node, 'metadataBadgeRenderer', 'label')
+        if badge:
+            info['badges'].append(badge)
 
-        return info
-    except KeyError:
-        print(renderer)
-        raise
+    if primary_type in ('video', 'playlist'):
+        info['time_published'] = extract_str(item.get('publishedTimeText'))
 
+    if primary_type == 'video':
+        info['id'] = item.get('videoId')
+        info['view_count'] = extract_int(item.get('viewCountText'))
+        if info['view_count']:
+            info['approx_view_count'] = '{:,}'.format(info['view_count'])
+        else:
+            info['approx_view_count'] = extract_approx_int(multi_default_get(item, 'shortViewCountText'))
+        info['duration'] = extract_str(item.get('lengthText'))
+    elif primary_type == 'playlist':
+        info['id'] = item.get('playlistId')
+        info['video_count'] = extract_int(item.get('videoCount'))
+    elif primary_type == 'channel':
+        info['id'] = item.get('channelId')
+        info['approx_subscriber_count'] = extract_approx_int(item.get('subscriberCountText'))
+    elif primary_type == 'show':
+        info['id'] = default_multi_get(item, 'navigationEndpoint', 'watchEndpoint', 'playlistId')
+
+    if primary_type in ('playlist', 'channel'):
+        conservative_update(info, 'video_count', extract_int(item.get('videoCountText')))
+
+    for overlay in item.get('thumbnailOverlays', []):
+        conservative_update(info, 'duration', extract_str(default_multi_get(
+            overlay, 'thumbnailOverlayTimeStatusRenderer', 'text'
+        )))
+        # show renderers don't have videoCountText
+        conservative_update(info, 'video_count', extract_int(default_multi_get(
+            overlay, 'thumbnailOverlayBottomPanelRenderer', 'text'
+        )))
+    return info
 
 def parse_info_prepare_for_html(renderer, additional_info={}):
-    item = renderer_info(renderer, additional_info)
+    item = extract_item_info(renderer, additional_info)
     prefix_urls(item)
     add_extra_html_info(item)
 
@@ -616,7 +602,7 @@ def extract_channel_info(polymer_json, tab):
     items, _ = extract_items(response)
     if tab in ('videos', 'playlists', 'search'):
         additional_info = {'author': info['channel_name'], 'author_url': 'https://www.youtube.com/channel/' + channel_id}
-        info['items'] = [renderer_info(renderer, additional_info) for renderer in items]
+        info['items'] = [extract_item_info(renderer, additional_info) for renderer in items]
 
     elif tab == 'about':
         for item in items:
@@ -633,7 +619,7 @@ def extract_channel_info(polymer_json, tab):
         for link_json in channel_metadata.get('primaryLinks', ()):
             url = remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url'])
 
-            text = get_plain_text(link_json['title'])
+            text = extract_str(link_json['title'])
 
             info['links'].append( (text, url) )
 
@@ -644,10 +630,10 @@ def extract_channel_info(polymer_json, tab):
                 stat = channel_metadata[stat_name]
             except KeyError:
                 continue
-            info['stats'].append(get_plain_text(stat))
+            info['stats'].append(extract_str(stat))
 
         if 'description' in channel_metadata:
-            info['description'] = get_text(channel_metadata['description'])
+            info['description'] = extract_str(channel_metadata['description'])
         else:
             info['description'] = ''
 
@@ -693,9 +679,9 @@ def extract_search_info(polymer_json):
             }
             continue
 
-        item_info = renderer_info(renderer)
-        if item_info['type'] != 'unsupported':
-            info['items'].append(item_info)
+        i_info = extract_item_info(renderer)
+        if i_info.get('type') != 'unsupported':
+            info['items'].append(i_info)
 
 
     return info
@@ -704,13 +690,41 @@ def extract_playlist_metadata(polymer_json):
     response, err = extract_response(polymer_json)
     if err:
         return {'error': err}
-    metadata = renderer_info(response['header'])
-    metadata['error'] = None
 
-    if 'description' not in metadata:
-        metadata['description'] = ''
-
-    metadata['size'] = int(metadata['size'].replace(',', ''))
+    metadata = {'error': None}
+    header = default_multi_get(response, 'header', 'playlistHeaderRenderer', default={})
+    metadata['title'] = extract_str(header.get('title'))
+
+    metadata['first_video_id'] = default_multi_get(header, 'playEndpoint', 'watchEndpoint', 'videoId')
+    first_id = re.search(r'([a-z_\-]{11})', default_multi_get(header,
+        'thumbnail', 'thumbnails', 0, 'url', default=''))
+    if first_id:
+        conservative_update(metadata, 'first_video_id', first_id.group(1))
+    if metadata['first_video_id'] is None:
+        metadata['thumbnail'] = None
+    else:
+        metadata['thumbnail'] = 'https://i.ytimg.com/vi/' + metadata['first_video_id'] + '/mqdefault.jpg'
+
+    metadata['video_count'] = extract_int(header.get('numVideosText'))
+    metadata['description'] = extract_str(header.get('descriptionText'), default='')
+    metadata['author'] = extract_str(header.get('ownerText'))
+    metadata['author_id'] = multi_default_multi_get(header, 
+        ['ownerText', 'runs', 0, 'navigationEndpoint', 'browseEndpoint', 'browseId'],
+        ['ownerEndpoint', 'browseEndpoint', 'browseId'])
+    if metadata['author_id']:
+        metadata['author_url'] = 'https://www.youtube.com/channel/' + metadata['author_id']
+    else:
+        metadata['author_url'] = None
+    metadata['view_count'] = extract_int(header.get('viewCountText'))
+    metadata['like_count'] = extract_int(header.get('likesCountWithoutLikeText'))
+    for stat in header.get('stats', ()):
+        text = extract_str(stat)
+        if 'videos' in text:
+            conservative_update(metadata, 'video_count', extract_int(text))
+        elif 'views' in text:
+            conservative_update(metadata, 'view_count', extract_int(text))
+        elif 'updated' in text:
+            metadata['time_published'] = extract_date(text)
 
     return metadata
 
@@ -722,7 +736,7 @@ def extract_playlist_info(polymer_json):
     first_page = 'continuationContents' not in response
     video_list, _ = extract_items(response)
 
-    info['items'] = [renderer_info(renderer) for renderer in video_list]
+    info['items'] = [extract_item_info(renderer) for renderer in video_list]
 
     if first_page:
         info['metadata'] = extract_playlist_metadata(polymer_json)
@@ -777,7 +791,7 @@ def parse_comments_polymer(polymer_json):
                     video_title = comment_thread['commentTargetTitle']['runs'][0]['text']
 
                 if 'replies' in comment_thread:
-                    view_replies_text = get_plain_text(comment_thread['replies']['commentRepliesRenderer']['moreText'])
+                    view_replies_text = extract_str(comment_thread['replies']['commentRepliesRenderer']['moreText'])
                     view_replies_text = view_replies_text.replace(',', '')
                     match = re.search(r'(\d+)', view_replies_text)
                     if match is None:
@@ -789,15 +803,15 @@ def parse_comments_polymer(polymer_json):
             comment = {
                 'author_id': comment_renderer.get('authorId', ''),
                 'author_avatar': comment_renderer['authorThumbnail']['thumbnails'][0]['url'],
-                'likes': comment_renderer['likeCount'],
-                'published': get_plain_text(comment_renderer['publishedTimeText']),
+                'like_count': comment_renderer['likeCount'],
+                'time_published': extract_str(comment_renderer['publishedTimeText']),
                 'text': comment_renderer['contentText'].get('runs', ''),
-                'number_of_replies': number_of_replies,
-                'comment_id': comment_renderer['commentId'],
+                'reply_count': number_of_replies,
+                'id': comment_renderer['commentId'],
             }
 
             if 'authorText' in comment_renderer:     # deleted channels have no name or channel link
-                comment['author'] = get_plain_text(comment_renderer['authorText'])
+                comment['author'] = extract_str(comment_renderer['authorText'])
                 comment['author_url'] = comment_renderer['authorEndpoint']['commandMetadata']['webCommandMetadata']['url']
                 comment['author_channel_id'] = comment_renderer['authorEndpoint']['browseEndpoint']['browseId']
             else:
@@ -832,66 +846,6 @@ def check_missing_keys(object, *key_sequences):
 
     return None
 
-def extract_str(node, default=None, recover_urls=False):
-    '''default is the value returned if the extraction fails. If recover_urls is true, will attempt to fix Youtube's truncation of url text (most prominently seen in descriptions)'''
-    if isinstance(node, str):
-        return node
-
-    try:
-        return node['simpleText']
-    except (KeyError, TypeError):
-        pass
-
-    if isinstance(node, dict) and 'runs' in node:
-        if recover_urls:
-            result = ''
-            for run in node['runs']:
-                url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
-                text = run.get('text', '')
-                # second condition is necessary because youtube makes other things into urls, such as hashtags, which we want to keep as text
-                if url is not None and (text.startswith('http://') or text.startswith('https://')):
-                    url = remove_redirect(url)
-                    result += url # youtube truncates the url text, use actual url instead
-                else:
-                    result += text
-            return result
-        else:
-            return ''.join(text_run.get('text', '') for text_run in node['runs'])
-
-    return default
-
-def extract_formatted_text(node):
-    try:
-        result = []
-        runs = node['runs']
-        for run in runs:
-            url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
-            if url is not None:
-                run['url'] = remove_redirect(url)
-                run['text'] = run['url'] # youtube truncates the url text, we don't want that nonsense
-        return runs
-    except (KeyError, TypeError):
-        traceback.print_exc()
-        pass
-
-    try:
-        return [{'text': node['simpleText']}]
-    except (KeyError, TypeError):
-        pass
-
-    return []
-
-def extract_int(string):
-    if not isinstance(string, str):
-        return None
-    match = re.search(r'(\d+)', string.replace(',', ''))
-    if match is None:
-        return None
-    try:
-        return int(match.group(1))
-    except ValueError:
-        return None
-
 def extract_metadata_row_info(video_renderer_info):
     # extract category and music list
     info = {
@@ -944,7 +898,7 @@ def extract_watch_info_mobile(top_level):
     else:
         info['age_restricted'] = not family_safe
     info['allowed_countries'] = microformat.get('availableCountries', [])
-    info['published_date'] = microformat.get('publishDate')
+    info['time_published'] = microformat.get('publishDate')
 
     response = top_level.get('response', {})
 
@@ -962,15 +916,15 @@ def extract_watch_info_mobile(top_level):
     info['author'] = extract_str(default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'title'))
     info['author_id'] = default_multi_get(video_info, 'owner', 'slimOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
     info['title'] = extract_str(video_info.get('title'))
-    info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'))
+    info['live'] = 'watching' in extract_str(video_info.get('expandedSubtitle'), default='')
     info['unlisted'] = False
     for badge in video_info.get('badges', []):
         if default_multi_get(badge, 'metadataBadgeRenderer', 'label') == 'Unlisted':
             info['unlisted'] = True
     info['like_count'] = None
     info['dislike_count'] = None
-    if not info['published_date']:
-        info['published_date'] = extract_date(extract_str(video_info.get('dateText', None)))
+    if not info['time_published']:
+        info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
     for button in video_info.get('buttons', ()):
         button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
 
@@ -1012,7 +966,7 @@ def extract_watch_info_mobile(top_level):
 
     # related videos
     related, _ = extract_items(response)
-    info['related_videos'] = [renderer_info(renderer) for renderer in related]
+    info['related_videos'] = [extract_item_info(renderer) for renderer in related]
 
     return info
 
@@ -1032,7 +986,7 @@ def extract_watch_info_desktop(top_level):
 
     info.update(extract_metadata_row_info(video_info))
     info['description'] = extract_str(video_info.get('description', None), recover_urls=True)
-    info['published_date'] = extract_date(extract_str(video_info.get('dateText', None)))
+    info['time_published'] = extract_date(extract_str(video_info.get('dateText', None)))
 
     likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
     if len(likes_dislikes) == 2:
@@ -1048,7 +1002,7 @@ def extract_watch_info_desktop(top_level):
     info['view_count'] = extract_int(extract_str(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))
 
     related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
-    info['related_videos'] = [renderer_info(renderer) for renderer in related]
+    info['related_videos'] = [extract_item_info(renderer) for renderer in related]
 
     return info
 
@@ -1114,17 +1068,6 @@ def extract_playability_error(info, player_response, error_prefix=''):
     else:
         info['playability_error'] = error_prefix + 'Unknown playability error'
 
-def liberal_update(obj, key, value):
-    '''Updates obj[key] with value as long as value is not None.
-    Ensures obj[key] will at least get a value of None, however'''
-    if (value is not None) or (key not in obj):
-        obj[key] = value
-
-def conservative_update(obj, key, value):
-    '''Only updates obj if it doesn't have key or obj[key] is None'''
-    if obj.get(key) is None:
-        obj[key] = value
-
 SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
 def extract_watch_info(polymer_json):
     info = {'playability_error': None, 'error': None}
@@ -1223,8 +1166,8 @@ def extract_watch_info(polymer_json):
     conservative_update(info, 'author_id', mf.get('externalChannelId'))
     liberal_update(info, 'unlisted', mf.get('isUnlisted'))
     liberal_update(info, 'category', mf.get('category'))
-    liberal_update(info, 'published_date', mf.get('publishDate'))
-    liberal_update(info, 'uploaded_date', mf.get('uploadDate'))
+    liberal_update(info, 'time_published', mf.get('publishDate'))
+    liberal_update(info, 'time_uploaded', mf.get('uploadDate'))
 
     # other stuff
     info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
author	James Taylor <user234683@users.noreply.github.com>	2019-12-18 19:39:16 -0800
committer	James Taylor <user234683@users.noreply.github.com>	2019-12-18 19:39:16 -0800
commit	98777ee82561ae205f156a7f8497728aecfa080c (patch)
tree	aaaf3e82dcdac00abda588b6cfb15e5382a49cd0
parent	ee0a118a6c7ed0e371fed18dcdace1f18a3cabf6 (diff)
download	yt-local-98777ee82561ae205f156a7f8497728aecfa080c.tar.lz yt-local-98777ee82561ae205f156a7f8497728aecfa080c.tar.xz yt-local-98777ee82561ae205f156a7f8497728aecfa080c.zip