aboutsummaryrefslogtreecommitdiffstats
path: root/youtube
diff options
context:
space:
mode:
authorJames Taylor <user234683@users.noreply.github.com>2019-10-17 19:58:13 -0700
committerJames Taylor <user234683@users.noreply.github.com>2019-10-17 19:58:13 -0700
commit4c07546e7a5e5882abdda896009b744e947df1c4 (patch)
tree25870ecb94999df109895840810609e1d2167d96 /youtube
parent9abb83fdbc05294f186daeefff8c85cfda06b7d2 (diff)
downloadyt-local-4c07546e7a5e5882abdda896009b744e947df1c4.tar.lz
yt-local-4c07546e7a5e5882abdda896009b744e947df1c4.tar.xz
yt-local-4c07546e7a5e5882abdda896009b744e947df1c4.zip
Extraction: Replace youtube-dl with custom-built watch page extraction
Diffstat (limited to 'youtube')
-rw-r--r--youtube/templates/watch.html21
-rw-r--r--youtube/util.py9
-rw-r--r--youtube/watch.py154
-rw-r--r--youtube/yt_data_extract.py435
4 files changed, 523 insertions, 96 deletions
diff --git a/youtube/templates/watch.html b/youtube/templates/watch.html
index 14e953b..e97b638 100644
--- a/youtube/templates/watch.html
+++ b/youtube/templates/watch.html
@@ -187,8 +187,17 @@
.format-ext{
width: 60px;
}
- .format-res{
- width:90px;
+ .format-video-quality{
+ width: 140px;
+ }
+ .format-audio-quality{
+ width: 120px;
+ }
+ .format-file-size{
+ width: 80px;
+ }
+ .format-codecs{
+ width: 120px;
}
{% endblock style %}
@@ -227,8 +236,10 @@
<a class="download-link" href="{{ format['url'] }}">
<ol class="format-attributes">
<li class="format-ext">{{ format['ext'] }}</li>
- <li class="format-res">{{ format['resolution'] }}</li>
- <li class="format-note">{{ format['note'] }}</li>
+ <li class="format-video-quality">{{ format['video_quality'] }}</li>
+ <li class="format-audio-quality">{{ format['audio_quality'] }}</li>
+ <li class="format-file-size">{{ format['file_size'] }}</li>
+ <li class="format-codecs">{{ format['codecs'] }}</li>
</ol>
</a>
</li>
@@ -238,7 +249,7 @@
<input class="checkbox" name="video_info_list" value="{{ video_info }}" form="playlist-edit" type="checkbox">
- <span class="description">{{ description }}</span>
+ <span class="description">{{ common_elements.text_runs(description) }}</span>
<div class="music-list">
{% if music_list.__len__() != 0 %}
<hr>
diff --git a/youtube/util.py b/youtube/util.py
index 5b63e2a..474e7b5 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -176,7 +176,7 @@ def fetch_url(url, headers=(), timeout=15, report_text=None, data=None, cookieja
return content, response
return content
-mobile_user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
+mobile_user_agent = 'Mozilla/5.0 (Linux; Android 7.0; Redmi Note 4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36'
mobile_ua = (('User-Agent', mobile_user_agent),)
desktop_user_agent = 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0'
desktop_ua = (('User-Agent', desktop_user_agent),)
@@ -312,3 +312,10 @@ def uppercase_escape(s):
def prefix_url(url):
url = url.lstrip('/') # some urls have // before them, which has a special meaning
return '/' + url
+
+def left_remove(string, substring):
+ '''removes substring from the start of string, if present'''
+ if string.startswith(substring):
+ return string[len(substring):]
+ return string
+
diff --git a/youtube/watch.py b/youtube/watch.py
index 41c90e4..a5e0759 100644
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -5,49 +5,15 @@ import settings
from flask import request
import flask
-from youtube_dl.YoutubeDL import YoutubeDL
-from youtube_dl.extractor.youtube import YoutubeError
import json
import html
import gevent
import os
+import math
+import traceback
+
-def get_related_items(info):
- results = []
- for item in info['related_vids']:
- if 'list' in item: # playlist:
- result = watch_page_related_playlist_info(item)
- else:
- result = watch_page_related_video_info(item)
- yt_data_extract.prefix_urls(result)
- yt_data_extract.add_extra_html_info(result)
- results.append(result)
- return results
-
-
-# json of related items retrieved directly from the watch page has different names for everything
-# converts these to standard names
-def watch_page_related_video_info(item):
- result = {key: item[key] for key in ('id', 'title', 'author')}
- result['duration'] = util.seconds_to_timestamp(item['length_seconds'])
- try:
- result['views'] = item['short_view_count_text']
- except KeyError:
- result['views'] = ''
- result['thumbnail'] = util.get_thumbnail_url(item['id'])
- result['type'] = 'video'
- return result
-
-def watch_page_related_playlist_info(item):
- return {
- 'size': item['playlist_length'] if item['playlist_length'] != "0" else "50+",
- 'title': item['playlist_title'],
- 'id': item['list'],
- 'first_video_id': item['video_id'],
- 'thumbnail': util.get_thumbnail_url(item['video_id']),
- 'type': 'playlist',
- }
def get_video_sources(info):
video_sources = []
@@ -55,9 +21,10 @@ def get_video_sources(info):
max_resolution = 360
else:
max_resolution = settings.default_resolution
-
for format in info['formats']:
- if format['acodec'] != 'none' and format['vcodec'] != 'none' and format['height'] <= max_resolution:
+ if not all(attr in format for attr in ('height', 'width', 'ext', 'url')):
+ continue
+ if 'acodec' in format and 'vcodec' in format and format['height'] <= max_resolution:
video_sources.append({
'src': format['url'],
'type': 'video/' + format['ext'],
@@ -134,14 +101,57 @@ def get_ordered_music_list_attributes(music_list):
return ordered_attributes
+headers = (
+ ('Accept', '*/*'),
+ ('Accept-Language', 'en-US,en;q=0.5'),
+ ('X-YouTube-Client-Name', '2'),
+ ('X-YouTube-Client-Version', '2.20180830'),
+) + util.mobile_ua
-def extract_info(downloader, *args, **kwargs):
+def extract_info(video_id):
+ polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch')
try:
- return downloader.extract_info(*args, **kwargs)
- except YoutubeError as e:
- return str(e)
-
-
+ polymer_json = json.loads(polymer_json)
+ except json.decoder.JSONDecodeError:
+ traceback.print_exc()
+ return {'error': 'Failed to parse json response'}
+ return yt_data_extract.extract_watch_info(polymer_json)
+
+def video_quality_string(format):
+ if 'vcodec' in format:
+ result = str(format.get('width', '?')) + 'x' + str(format.get('height', '?'))
+ if 'fps' in format:
+ result += ' ' + str(format['fps']) + 'fps'
+ return result
+ elif 'acodec' in format:
+ return 'audio only'
+
+ return '?'
+
+def audio_quality_string(format):
+ if 'acodec' in format:
+ result = str(format.get('abr', '?')) + 'k'
+ if 'audio_sample_rate' in format:
+ result += ' ' + str(format['audio_sample_rate']) + ' Hz'
+ return result
+ elif 'vcodec' in format:
+ return 'video only'
+
+ return '?'
+
+# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py
+def format_bytes(bytes):
+ if bytes is None:
+ return 'N/A'
+ if type(bytes) is str:
+ bytes = float(bytes)
+ if bytes == 0.0:
+ exponent = 0
+ else:
+ exponent = int(math.log(bytes, 1024.0))
+ suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
+ converted = float(bytes) / float(1024 ** exponent)
+ return '%.2f%s' % (converted, suffix)
@yt_app.route('/watch')
@@ -152,38 +162,26 @@ def get_watch_page():
flask.abort(flask.Response('Incomplete video id (too short): ' + video_id))
lc = request.args.get('lc', '')
- if settings.route_tor:
- proxy = 'socks5://127.0.0.1:9150/'
- else:
- proxy = ''
- yt_dl_downloader = YoutubeDL(params={'youtube_include_dash_manifest':False, 'proxy':proxy})
tasks = (
gevent.spawn(comments.video_comments, video_id, int(settings.default_comment_sorting), lc=lc ),
- gevent.spawn(extract_info, yt_dl_downloader, "https://www.youtube.com/watch?v=" + video_id, download=False)
+ gevent.spawn(extract_info, video_id)
)
gevent.joinall(tasks)
comments_info, info = tasks[0].value, tasks[1].value
- if isinstance(info, str): # youtube error
- return flask.render_template('error.html', error_message = info)
+ if info['error']:
+ return flask.render_template('error.html', error_message = info['error'])
video_info = {
- "duration": util.seconds_to_timestamp(info["duration"]),
+ "duration": util.seconds_to_timestamp(info["duration"] or 0),
"id": info['id'],
"title": info['title'],
- "author": info['uploader'],
+ "author": info['author'],
}
- upload_year = info["upload_date"][0:4]
- upload_month = info["upload_date"][4:6]
- upload_day = info["upload_date"][6:8]
- upload_date = upload_month + "/" + upload_day + "/" + upload_year
-
- if settings.related_videos_mode:
- related_videos = get_related_items(info)
- else:
- related_videos = []
-
+ for item in info['related_videos']:
+ yt_data_extract.prefix_urls(item)
+ yt_data_extract.add_extra_html_info(item)
if settings.gather_googlevideo_domains:
with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f:
@@ -195,23 +193,29 @@ def get_watch_page():
download_formats = []
for format in info['formats']:
+ if 'acodec' in format and 'vcodec' in format:
+ codecs_string = format['acodec'] + ', ' + format['vcodec']
+ else:
+ codecs_string = format.get('acodec') or format.get('vcodec') or '?'
download_formats.append({
'url': format['url'],
- 'ext': format['ext'],
- 'resolution': yt_dl_downloader.format_resolution(format),
- 'note': yt_dl_downloader._format_note(format),
+ 'ext': format.get('ext', '?'),
+ 'audio_quality': audio_quality_string(format),
+ 'video_quality': video_quality_string(format),
+ 'file_size': format_bytes(format['file_size']),
+ 'codecs': codecs_string,
})
video_sources = get_video_sources(info)
- video_height = video_sources[0]['height']
-
+ video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360)
+ video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640)
# 1 second per pixel, or the actual video width
- theater_video_target_width = max(640, info['duration'], video_sources[0]['width'])
+ theater_video_target_width = max(640, info['duration'] or 0, video_width)
return flask.render_template('watch.html',
header_playlist_names = local_playlist.get_playlist_names(),
- uploader_channel_url = '/' + info['uploader_url'],
- upload_date = upload_date,
+ uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '',
+ upload_date = info['published_date'],
views = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)),
likes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
dislikes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
@@ -219,7 +223,7 @@ def get_watch_page():
video_info = json.dumps(video_info),
video_sources = video_sources,
subtitle_sources = get_subtitle_sources(info),
- related = related_videos,
+ related = info['related_videos'],
music_list = info['music_list'],
music_attributes = get_ordered_music_list_attributes(info['music_list']),
comments_info = comments_info,
@@ -232,7 +236,7 @@ def get_watch_page():
theater_video_target_width = theater_video_target_width,
title = info['title'],
- uploader = info['uploader'],
+ uploader = info['author'],
description = info['description'],
unlisted = info['unlisted'],
)
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index cccd679..81604fd 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -6,6 +6,7 @@ import re
import urllib
import collections
from math import ceil
+import traceback
# videos (all of type str):
@@ -36,8 +37,112 @@ from math import ceil
# size
# first_video_id
-
-
+# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/youtube.py
+_formats = {
+ '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+ '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+ '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
+ '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
+ '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
+ '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
+ '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
+ '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+ '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+ '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+ '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+ '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+
+
+ # 3D videos
+ '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+ '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+ '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+
+ # Apple HTTP Live Streaming
+ '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
+ '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
+ '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'},
+ '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264'},
+ '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264'},
+ '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264'},
+
+ # DASH mp4 video
+ '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
+ '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
+
+ # Dash mp4 audio
+ '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
+ '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
+ '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
+ '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
+ '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
+
+ # Dash webm
+ '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
+ '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
+ '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+
+ # Dash webm audio
+ '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
+ '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
+
+ # Dash webm audio with opus inside
+ '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
+ '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
+ '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
+
+ # RTMP (unnamed)
+ '_rtmp': {'protocol': 'rtmp'},
+
+ # av01 video only formats sometimes served with "unknown" codecs
+ '394': {'vcodec': 'av01.0.05M.08'},
+ '395': {'vcodec': 'av01.0.05M.08'},
+ '396': {'vcodec': 'av01.0.05M.08'},
+ '397': {'vcodec': 'av01.0.05M.08'},
+}
def get_plain_text(node):
@@ -59,7 +164,7 @@ def format_text_runs(runs):
result += html.escape(text_run["text"])
return result
-def default_get(object, key, default, types=()):
+def default_get(object, key, default=None, types=()):
'''Like dict.get(), but returns default if the result doesn't match one of the types.
Also works for indexing lists.'''
try:
@@ -74,7 +179,7 @@ def default_get(object, key, default, types=()):
-def default_multi_get(object, *keys, default, types=()):
+def default_multi_get(object, *keys, default=None, types=()):
'''Like dict.get(), but for nested dictionaries/sequences, supporting keys or indices.
Last argument is the default value to use in case of any IndexErrors or KeyErrors.
If types is given and the result doesn't match one of those types, default is returned'''
@@ -106,6 +211,11 @@ def multi_default_multi_get(object, *key_sequences, default=None, types=()):
continue
return default
+def remove_redirect(url):
+ if re.fullmatch(r'(((https?:)?//)?(www\.)?youtube\.com)?/redirect\?.*', url) is not None: # youtube puts these on external links to do tracking
+ query_string = url[url.find('?')+1: ]
+ return urllib.parse.parse_qs(query_string)['q'][0]
+ return url
def get_url(node):
try:
@@ -239,9 +349,9 @@ def renderer_info(renderer, additional_info={}):
type = list(renderer.keys())[0]
renderer = renderer[type]
info = {}
- if type == 'itemSectionRenderer':
+ if type in ('itemSectionRenderer', 'compactAutoplayRenderer'):
return renderer_info(renderer['contents'][0], additional_info)
-
+
if type in ('movieRenderer', 'clarificationRenderer'):
info['type'] = 'unsupported'
return info
@@ -345,6 +455,7 @@ item_types = {
'videoRenderer',
'compactVideoRenderer',
+ 'compactAutoplayRenderer',
'gridVideoRenderer',
'playlistVideoRenderer',
@@ -378,6 +489,11 @@ def traverse_browse_renderer(renderer):
print('Could not find tab with content')
return {}
+def traverse_standard_list(renderer):
+ renderer_list = multi_default_multi_get(renderer, ['contents'], ['items'], default=(), types=(list, tuple))
+ continuation = default_multi_get(renderer, 'continuations', 0, 'nextContinuationData', 'continuation')
+ return renderer_list, continuation
+
# these renderers contain one inside them
nested_renderer_dispatch = {
'singleColumnBrowseResultsRenderer': traverse_browse_renderer,
@@ -385,7 +501,16 @@ nested_renderer_dispatch = {
'twoColumnSearchResultsRenderer': lambda renderer: default_get(renderer, 'primaryContents', {}, types=dict),
}
-def extract_items(response):
+# these renderers contain a list of renderers in side them
+nested_renderer_list_dispatch = {
+ 'sectionListRenderer': traverse_standard_list,
+ 'itemSectionRenderer': traverse_standard_list,
+ 'gridRenderer': traverse_standard_list,
+ 'playlistVideoListRenderer': traverse_standard_list,
+ 'singleColumnWatchNextResults': lambda r: (default_multi_get(r, 'results', 'results', 'contents', default=[], types=(list, tuple)), None),
+}
+
+def extract_items(response, item_types=item_types):
'''return items, ctoken'''
if 'continuationContents' in response:
# always has just the one [something]Continuation key, but do this just in case they add some tracking key or something
@@ -414,13 +539,11 @@ def extract_items(response):
key, value = list(renderer.items())[0]
# has a list in it, add it to the iter stack
- if key in list_types:
- renderer_list = multi_default_multi_get(value, ['contents'], ['items'], default=(), types=(list, tuple))
+ if key in nested_renderer_list_dispatch:
+ renderer_list, continuation = nested_renderer_list_dispatch[key](value)
if renderer_list:
iter_stack.append(current_iter)
current_iter = iter(renderer_list)
-
- continuation = default_multi_get(value, 'continuations', 0, 'nextContinuationData', 'continuation', default=None, types=str)
if continuation:
ctoken = continuation
@@ -506,10 +629,7 @@ def extract_channel_info(polymer_json, tab):
info['links'] = []
for link_json in channel_metadata.get('primaryLinks', ()):
- url = link_json['navigationEndpoint']['urlEndpoint']['url']
- if url.startswith('/redirect'): # youtube puts these on external links to do tracking
- query_string = url[url.find('?')+1: ]
- url = urllib.parse.parse_qs(query_string)['q'][0]
+ url = remove_redirect(link_json['navigationEndpoint']['urlEndpoint']['url'])
text = get_plain_text(link_json['title'])
@@ -699,5 +819,290 @@ def parse_comments_polymer(polymer_json):
'sort': metadata['sort'],
}
+def check_missing_keys(object, *key_sequences):
+ for key_sequence in key_sequences:
+ _object = object
+ try:
+ for key in key_sequence:
+ _object = _object[key]
+ except (KeyError, IndexError, TypeError):
+ return 'Could not find ' + key
+
+ return None
+
+def extract_plain_text(node, default=None):
+ if isinstance(node, str):
+ return node
+
+ try:
+ return node['simpleText']
+ except (KeyError, TypeError):
+ pass
+
+ try:
+ return ''.join(text_run['text'] for text_run in node['runs'])
+ except (KeyError, TypeError):
+ pass
+
+ return default
+
+def extract_formatted_text(node):
+ try:
+ result = []
+ runs = node['runs']
+ for run in runs:
+ url = default_multi_get(run, 'navigationEndpoint', 'urlEndpoint', 'url')
+ if url is not None:
+ run['url'] = remove_redirect(url)
+ run['text'] = run['url'] # youtube truncates the url text, we don't want that nonsense
+ return runs
+ except (KeyError, TypeError):
+ traceback.print_exc()
+ pass
+
+ try:
+ return [{'text': node['simpleText']}]
+ except (KeyError, TypeError):
+ pass
+
+ return []
+
+def extract_integer(string):
+ if not isinstance(string, str):
+ return None
+ match = re.search(r'(\d+)', string.replace(',', ''))
+ if match is None:
+ return None
+ try:
+ return int(match.group(1))
+ except ValueError:
+ return None
+
+def extract_metadata_row_info(video_renderer_info):
+ # extract category and music list
+ info = {
+ 'category': None,
+ 'music_list': [],
+ }
+
+ current_song = {}
+ for row in default_multi_get(video_renderer_info, 'metadataRowContainer', 'metadataRowContainerRenderer', 'rows', default=[]):
+ row_title = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'title'), default='')
+ row_content = extract_plain_text(default_multi_get(row, 'metadataRowRenderer', 'contents', 0))
+ if row_title == 'Category':
+ info['category'] = row_content
+ elif row_title in ('Song', 'Music'):
+ if current_song:
+ info['music_list'].append(current_song)
+ current_song = {'title': row_content}
+ elif row_title == 'Artist':
+ current_song['artist'] = row_content
+ elif row_title == 'Album':
+ current_song['album'] = row_content
+ elif row_title == 'Writers':
+ current_song['writers'] = row_content
+ elif row_title.startswith('Licensed'):
+ current_song['licensor'] = row_content
+ if current_song:
+ info['music_list'].append(current_song)
+ return info
+
+def extract_watch_info_mobile(top_level):
+ info = {}
+ microformat = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
+
+ info['allowed_countries'] = microformat.get('availableCountries', [])
+ info['published_date'] = microformat.get('publishDate')
+
+ response = top_level.get('response', {})
+
+ # video info from metadata renderers
+ items, _ = extract_items(response, item_types={'slimVideoMetadataRenderer'})
+ if items:
+ video_info = items[0]['slimVideoMetadataRenderer']
+ else:
+ print('Failed to extract video metadata')
+ video_info = {}
+
+ info.update(extract_metadata_row_info(video_info))
+ #info['description'] = extract_formatted_text(video_info.get('description'))
+ info['like_count'] = None
+ info['dislike_count'] = None
+ for button in video_info.get('buttons', ()):
+ button_renderer = button.get('slimMetadataToggleButtonRenderer', {})
+
+ # all the digits can be found in the accessibility data
+ count = extract_integer(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText', 'accessibility', 'accessibilityData', 'label'))
+
+ # this count doesn't have all the digits, it's like 53K for instance
+ dumb_count = extract_integer(extract_plain_text(default_multi_get(button_renderer, 'button', 'toggleButtonRenderer', 'defaultText')))
+
+ # the accessibility text will be "No likes" or "No dislikes" or something like that, but dumb count will be 0
+ if dumb_count == 0:
+ count = 0
+
+ if 'isLike' in button_renderer:
+ info['like_count'] = count
+ elif 'isDislike' in button_renderer:
+ info['dislike_count'] = count
+
+ # comment section info
+ items, _ = extract_items(response, item_types={'commentSectionRenderer'})
+ if items:
+ comment_info = items[0]['commentSectionRenderer']
+ comment_count_text = extract_plain_text(default_multi_get(comment_info, 'header', 'commentSectionHeaderRenderer', 'countText'))
+ if comment_count_text == 'Comments': # just this with no number, means 0 comments
+ info['comment_count'] = 0
+ else:
+ info['comment_count'] = extract_integer(comment_count_text)
+ info['comments_disabled'] = False
+ else: # no comment section present means comments are disabled
+ info['comment_count'] = 0
+ info['comments_disabled'] = True
+
+ # related videos
+ related, _ = extract_items(response)
+ info['related_videos'] = [renderer_info(renderer) for renderer in related]
+
+ return info
+
+month_abbreviations = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6', 'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'}
+def extract_watch_info_desktop(top_level):
+ info = {
+ 'comment_count': None,
+ 'comments_disabled': None,
+ 'allowed_countries': None,
+ }
+
+ video_info = {}
+ for renderer in default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', default=()):
+ if renderer and list(renderer.keys())[0] in ('videoPrimaryInfoRenderer', 'videoSecondaryInfoRenderer'):
+ video_info.update(list(renderer.values())[0])
+
+ info.update(extract_metadata_row_info(video_info))
+ #info['description'] = extract_formatted_text(video_info.get('description', None))
+ info['published_date'] = None
+ date_text = extract_plain_text(video_info.get('dateText', None))
+ if date_text is not None:
+ date_text = util.left_remove(date_text.lower(), 'published on ').replace(',', '')
+ parts = date_text.split()
+ if len(parts) == 3:
+ month, day, year = date_text.split()
+ month = month_abbreviations.get(month[0:3]) # slicing in case they start writing out the full month name
+ if month and (re.fullmatch(r'\d\d?', day) is not None) and (re.fullmatch(r'\d{4}', year) is not None):
+ info['published_date'] = year + '-' + month + '-' + day
+
+ likes_dislikes = default_multi_get(video_info, 'sentimentBar', 'sentimentBarRenderer', 'tooltip', default='').split('/')
+ if len(likes_dislikes) == 2:
+ info['like_count'] = extract_integer(likes_dislikes[0])
+ info['dislike_count'] = extract_integer(likes_dislikes[1])
+ else:
+ info['like_count'] = None
+ info['dislike_count'] = None
+
+ #info['title'] = extract_plain_text(video_info.get('title', None))
+ #info['author'] = extract_plain_text(default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'title'))
+ #info['author_id'] = default_multi_get(video_info, 'owner', 'videoOwnerRenderer', 'navigationEndpoint', 'browseEndpoint', 'browseId')
+ #info['view_count'] = extract_integer(extract_plain_text(default_multi_get(video_info, 'viewCount', 'videoViewCountRenderer', 'viewCount')))
+
+ related = default_multi_get(top_level, 'response', 'contents', 'twoColumnWatchNextResults', 'secondaryResults', 'secondaryResults', 'results', default=[])
+ info['related_videos'] = [renderer_info(renderer) for renderer in related]
+
+ return info
+
+
+def extract_watch_info(polymer_json):
+ info = {'playability_error': None, 'error': None}
+
+ if isinstance(polymer_json, dict):
+ top_level = polymer_json
+ elif isinstance(polymer_json, (list, tuple)):
+ top_level = {}
+ for page_part in polymer_json:
+ if not isinstance(page_part, dict):
+ return {'error': 'Invalid page part'}
+ top_level.update(page_part)
+ else:
+ return {'error': 'Invalid top level polymer data'}
+
+ error = check_missing_keys(top_level,
+ ['playerResponse'],
+ )
+ if error:
+ return {'error': error}
+
+ error = check_missing_keys(top_level,
+ ['player', 'args'],
+ ['player', 'assets', 'js'],
+ )
+ if error:
+ info['playability_error'] = error
+
+
+ player_args = default_multi_get(top_level, 'player', 'args', default={})
+ parsed_formats = []
+
+ if 'url_encoded_fmt_stream_map' in player_args:
+ string_formats = player_args['url_encoded_fmt_stream_map'].split(',')
+ parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string]
+
+ if 'adaptive_fmts' in player_args:
+ string_formats = player_args['adaptive_fmts'].split(',')
+ parsed_formats += [dict(urllib.parse.parse_qsl(fmt_string)) for fmt_string in string_formats if fmt_string]
+
+ info['formats'] = []
+
+ for parsed_fmt in parsed_formats:
+ # start with defaults from the big table at the top
+ if 'itag' in parsed_fmt:
+ fmt = _formats.get(parsed_fmt['itag'], {}).copy()
+ else:
+ fmt = {}
+
+ # then override them
+ fmt.update(parsed_fmt)
+ try:
+ fmt['width'], fmt['height'] = map(int, fmt['size'].split('x'))
+ except (KeyError, ValueError, TypeError):
+ pass
+
+ fmt['file_size'] = None
+ if 'clen' in fmt:
+ fmt['file_size'] = int(fmt.get('clen'))
+ else:
+ match = re.search(r'&clen=(\d+)', fmt.get('url', ''))
+ if match:
+ fmt['file_size'] = int(match.group(1))
+ info['formats'].append(fmt)
+
+ info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js')
+ if info['base_js']:
+ info['base_js'] = normalize_url(info['base_js'])
+
+ mobile = 'singleColumnWatchNextResults' in default_multi_get(top_level, 'response', 'contents', default={})
+ if mobile:
+ info.update(extract_watch_info_mobile(top_level))
+ else:
+ info.update(extract_watch_info_desktop(top_level))
+
+ # stuff from videoDetails
+ video_details = default_multi_get(top_level, 'playerResponse', 'videoDetails', default={})
+ info['title'] = extract_plain_text(video_details.get('title'))
+ info['duration'] = extract_integer(video_details.get('lengthSeconds'))
+ info['view_count'] = extract_integer(video_details.get('viewCount'))
+ # videos with no description have a blank string
+ info['description'] = video_details.get('shortDescription')
+ info['id'] = video_details.get('videoId')
+ info['author'] = video_details.get('author')
+ info['author_id'] = video_details.get('channelId')
+ info['live'] = video_details.get('isLiveContent')
+ info['unlisted'] = not video_details.get('isCrawlable', True)
+ info['tags'] = video_details.get('keywords', [])
+
+ # other stuff
+ info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
+ info['subtitles'] = {} # TODO
+
+ return info