From 4c07546e7a5e5882abdda896009b744e947df1c4 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 17 Oct 2019 19:58:13 -0700 Subject: Extraction: Replace youtube-dl with custom-built watch page extraction --- youtube/watch.py | 154 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 79 insertions(+), 75 deletions(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index 41c90e4..a5e0759 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -5,49 +5,15 @@ import settings from flask import request import flask -from youtube_dl.YoutubeDL import YoutubeDL -from youtube_dl.extractor.youtube import YoutubeError import json import html import gevent import os +import math +import traceback + -def get_related_items(info): - results = [] - for item in info['related_vids']: - if 'list' in item: # playlist: - result = watch_page_related_playlist_info(item) - else: - result = watch_page_related_video_info(item) - yt_data_extract.prefix_urls(result) - yt_data_extract.add_extra_html_info(result) - results.append(result) - return results - - -# json of related items retrieved directly from the watch page has different names for everything -# converts these to standard names -def watch_page_related_video_info(item): - result = {key: item[key] for key in ('id', 'title', 'author')} - result['duration'] = util.seconds_to_timestamp(item['length_seconds']) - try: - result['views'] = item['short_view_count_text'] - except KeyError: - result['views'] = '' - result['thumbnail'] = util.get_thumbnail_url(item['id']) - result['type'] = 'video' - return result - -def watch_page_related_playlist_info(item): - return { - 'size': item['playlist_length'] if item['playlist_length'] != "0" else "50+", - 'title': item['playlist_title'], - 'id': item['list'], - 'first_video_id': item['video_id'], - 'thumbnail': util.get_thumbnail_url(item['video_id']), - 'type': 'playlist', - } def get_video_sources(info): video_sources = [] @@ -55,9 +21,10 @@ def get_video_sources(info): max_resolution = 360 else: max_resolution = settings.default_resolution - for format in info['formats']: - if format['acodec'] != 'none' and format['vcodec'] != 'none' and format['height'] <= max_resolution: + if not all(attr in format for attr in ('height', 'width', 'ext', 'url')): + continue + if 'acodec' in format and 'vcodec' in format and format['height'] <= max_resolution: video_sources.append({ 'src': format['url'], 'type': 'video/' + format['ext'], @@ -134,14 +101,57 @@ def get_ordered_music_list_attributes(music_list): return ordered_attributes +headers = ( + ('Accept', '*/*'), + ('Accept-Language', 'en-US,en;q=0.5'), + ('X-YouTube-Client-Name', '2'), + ('X-YouTube-Client-Version', '2.20180830'), +) + util.mobile_ua -def extract_info(downloader, *args, **kwargs): +def extract_info(video_id): + polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch') try: - return downloader.extract_info(*args, **kwargs) - except YoutubeError as e: - return str(e) - - + polymer_json = json.loads(polymer_json) + except json.decoder.JSONDecodeError: + traceback.print_exc() + return {'error': 'Failed to parse json response'} + return yt_data_extract.extract_watch_info(polymer_json) + +def video_quality_string(format): + if 'vcodec' in format: + result =str(format.get('width', '?')) + 'x' + str(format.get('height', '?')) + if 'fps' in format: + result += ' ' + format['fps'] + 'fps' + return result + elif 'acodec' in format: + return 'audio only' + + return '?' + +def audio_quality_string(format): + if 'acodec' in format: + result = str(format.get('abr', '?')) + 'k' + if 'audio_sample_rate' in format: + result += ' ' + str(format['audio_sample_rate']) + ' Hz' + return result + elif 'vcodec' in format: + return 'video only' + + return '?' + +# from https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py +def format_bytes(bytes): + if bytes is None: + return 'N/A' + if type(bytes) is str: + bytes = float(bytes) + if bytes == 0.0: + exponent = 0 + else: + exponent = int(math.log(bytes, 1024.0)) + suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent] + converted = float(bytes) / float(1024 ** exponent) + return '%.2f%s' % (converted, suffix) @yt_app.route('/watch') @@ -152,38 +162,26 @@ def get_watch_page(): flask.abort(flask.Response('Incomplete video id (too short): ' + video_id)) lc = request.args.get('lc', '') - if settings.route_tor: - proxy = 'socks5://127.0.0.1:9150/' - else: - proxy = '' - yt_dl_downloader = YoutubeDL(params={'youtube_include_dash_manifest':False, 'proxy':proxy}) tasks = ( gevent.spawn(comments.video_comments, video_id, int(settings.default_comment_sorting), lc=lc ), - gevent.spawn(extract_info, yt_dl_downloader, "https://www.youtube.com/watch?v=" + video_id, download=False) + gevent.spawn(extract_info, video_id) ) gevent.joinall(tasks) comments_info, info = tasks[0].value, tasks[1].value - if isinstance(info, str): # youtube error - return flask.render_template('error.html', error_message = info) + if info['error']: + return flask.render_template('error.html', error_message = info['error']) video_info = { - "duration": util.seconds_to_timestamp(info["duration"]), + "duration": util.seconds_to_timestamp(info["duration"] or 0), "id": info['id'], "title": info['title'], - "author": info['uploader'], + "author": info['author'], } - upload_year = info["upload_date"][0:4] - upload_month = info["upload_date"][4:6] - upload_day = info["upload_date"][6:8] - upload_date = upload_month + "/" + upload_day + "/" + upload_year - - if settings.related_videos_mode: - related_videos = get_related_items(info) - else: - related_videos = [] - + for item in info['related_videos']: + yt_data_extract.prefix_urls(item) + yt_data_extract.add_extra_html_info(item) if settings.gather_googlevideo_domains: with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f: @@ -195,23 +193,29 @@ def get_watch_page(): download_formats = [] for format in info['formats']: + if 'acodec' in format and 'vcodec' in format: + codecs_string = format['acodec'] + ', ' + format['vcodec'] + else: + codecs_string = format.get('acodec') or format.get('vcodec') or '?' download_formats.append({ 'url': format['url'], - 'ext': format['ext'], - 'resolution': yt_dl_downloader.format_resolution(format), - 'note': yt_dl_downloader._format_note(format), + 'ext': format.get('ext', '?'), + 'audio_quality': audio_quality_string(format), + 'video_quality': video_quality_string(format), + 'file_size': format_bytes(format['file_size']), + 'codecs': codecs_string, }) video_sources = get_video_sources(info) - video_height = video_sources[0]['height'] - + video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360) + video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640) # 1 second per pixel, or the actual video width - theater_video_target_width = max(640, info['duration'], video_sources[0]['width']) + theater_video_target_width = max(640, info['duration'] or 0, video_width) return flask.render_template('watch.html', header_playlist_names = local_playlist.get_playlist_names(), - uploader_channel_url = '/' + info['uploader_url'], - upload_date = upload_date, + uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '', + upload_date = info['published_date'], views = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)), likes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)), dislikes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)), @@ -219,7 +223,7 @@ def get_watch_page(): video_info = json.dumps(video_info), video_sources = video_sources, subtitle_sources = get_subtitle_sources(info), - related = related_videos, + related = info['related_videos'], music_list = info['music_list'], music_attributes = get_ordered_music_list_attributes(info['music_list']), comments_info = comments_info, @@ -232,7 +236,7 @@ def get_watch_page(): theater_video_target_width = theater_video_target_width, title = info['title'], - uploader = info['uploader'], + uploader = info['author'], description = info['description'], unlisted = info['unlisted'], ) -- cgit v1.2.3 From 70b56d6eef4fd9d6c46c8fbf48dfec3ae7a2937e Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 18 Oct 2019 14:02:28 -0700 Subject: Extraction: Add signature decryption --- youtube/watch.py | 149 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 133 insertions(+), 16 deletions(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index a5e0759..959dca2 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -11,8 +11,14 @@ import gevent import os import math import traceback +import re +import urllib - +try: + with open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'r') as f: + decrypt_cache = json.loads(f.read())['decrypt_cache'] +except FileNotFoundError: + decrypt_cache = {} def get_video_sources(info): @@ -22,9 +28,9 @@ def get_video_sources(info): else: max_resolution = settings.default_resolution for format in info['formats']: - if not all(attr in format for attr in ('height', 'width', 'ext', 'url')): + if not all(format[attr] for attr in ('height', 'width', 'ext', 'url')): continue - if 'acodec' in format and 'vcodec' in format and format['height'] <= max_resolution: + if format['acodec'] and format['vcodec'] and format['height'] <= max_resolution: video_sources.append({ 'src': format['url'], 'type': 'video/' + format['ext'], @@ -101,6 +107,112 @@ def get_ordered_music_list_attributes(music_list): return ordered_attributes +def save_decrypt_cache(): + try: + f = open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'w') + except FileNotFoundError: + os.makedirs(settings.data_dir) + f = open(os.path.join(settings.data_dir, 'decrypt_function_cache.json'), 'w') + + f.write(json.dumps({'version': 1, 'decrypt_cache':decrypt_cache}, indent=4, sort_keys=True)) + f.close() + +# adapted from youtube-dl and invidious: +# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr +decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}]+)\}') +op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)') +def decrypt_signatures(info): + '''return error string, or False if no errors''' + if not info['formats'] or not info['formats'][0]['s']: + return False # No decryption needed + if not info['base_js']: + return 'Failed to find base.js' + player_name = yt_data_extract.default_get(info['base_js'].split('/'), -2) + if not player_name: + return 'Could not find player name' + + if player_name in decrypt_cache: + print('Using cached decryption function for: ' + player_name) + decryption_function = decrypt_cache[player_name] + else: + base_js = util.fetch_url(info['base_js'], debug_name='base.js', report_text='Fetched player ' + player_name) + base_js = base_js.decode('utf-8') + + decrypt_function_match = decrypt_function_re.search(base_js) + if decrypt_function_match is None: + return 'Could not find decryption function in base.js' + + function_body = decrypt_function_match.group(1).split(';')[1:-1] + if not function_body: + return 'Empty decryption function body' + + var_name = yt_data_extract.default_get(function_body[0].split('.'), 0) + if var_name is None: + return 'Could not find var_name' + + var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL) + if var_body_match is None: + return 'Could not find var_body' + + operations = var_body_match.group(1).replace('\n', '').split('},') + if not operations: + return 'Did not find any definitions in var_body' + operations[-1] = operations[-1][:-1] # remove the trailing '}' since we split by '},' on the others + operation_definitions = {} + for op in operations: + colon_index = op.find(':') + opening_brace_index = op.find('{') + + if colon_index == -1 or opening_brace_index == -1: + return 'Could not parse operation' + op_name = op[:colon_index] + op_body = op[opening_brace_index+1:] + if op_body == 'a.reverse()': + operation_definitions[op_name] = 0 + elif op_body == 'a.splice(0,b)': + operation_definitions[op_name] = 1 + elif op_body.startswith('var c=a[0]'): + operation_definitions[op_name] = 2 + else: + return 'Unknown op_body: ' + op_body + + decryption_function = [] + for op_with_arg in function_body: + match = op_with_arg_re.fullmatch(op_with_arg) + if match is None: + return 'Could not parse operation with arg' + op_name = match.group(1) + if op_name not in operation_definitions: + return 'Unknown op_name: ' + op_name + op_argument = match.group(2) + decryption_function.append([operation_definitions[op_name], int(op_argument)]) + + decrypt_cache[player_name] = decryption_function + save_decrypt_cache() + + for format in info['formats']: + if not format['s'] or not format['sp'] or not format['url']: + print('Warning: s, sp, or url not in format') + continue + + a = list(format['s']) + for op, argument in decryption_function: + if op == 0: + a.reverse() + elif op == 1: + a = a[argument:] + else: + operation_2(a, argument) + + signature = ''.join(a) + format['url'] += '&' + format['sp'] + '=' + signature + return False + +def operation_2(a, b): + c = a[0] + a[0] = a[b % len(a)] + a[b % len(a)] = c + headers = ( ('Accept', '*/*'), ('Accept-Language', 'en-US,en;q=0.5'), @@ -115,26 +227,31 @@ def extract_info(video_id): except json.decoder.JSONDecodeError: traceback.print_exc() return {'error': 'Failed to parse json response'} - return yt_data_extract.extract_watch_info(polymer_json) + info = yt_data_extract.extract_watch_info(polymer_json) + error = decrypt_signatures(info) + if error: + print('Error decrypting url signatures: ' + error) + info['playability_error'] = error + return info def video_quality_string(format): - if 'vcodec' in format: - result =str(format.get('width', '?')) + 'x' + str(format.get('height', '?')) - if 'fps' in format: - result += ' ' + format['fps'] + 'fps' + if format['vcodec']: + result =str(format['width'] or '?') + 'x' + str(format['height'] or '?') + if format['fps']: + result += ' ' + str(format['fps']) + 'fps' return result - elif 'acodec' in format: + elif format['acodec']: return 'audio only' return '?' def audio_quality_string(format): - if 'acodec' in format: - result = str(format.get('abr', '?')) + 'k' - if 'audio_sample_rate' in format: + if format['acodec']: + result = str(format['audio_bitrate'] or '?') + 'k' + if format['audio_sample_rate']: result += ' ' + str(format['audio_sample_rate']) + ' Hz' return result - elif 'vcodec' in format: + elif format['vcodec']: return 'video only' return '?' @@ -193,13 +310,13 @@ def get_watch_page(): download_formats = [] for format in info['formats']: - if 'acodec' in format and 'vcodec' in format: + if format['acodec'] and format['vcodec']: codecs_string = format['acodec'] + ', ' + format['vcodec'] else: - codecs_string = format.get('acodec') or format.get('vcodec') or '?' + codecs_string = format['acodec'] or format['vcodec'] or '?' download_formats.append({ 'url': format['url'], - 'ext': format.get('ext', '?'), + 'ext': format['ext'] or '?', 'audio_quality': audio_quality_string(format), 'video_quality': video_quality_string(format), 'file_size': format_bytes(format['file_size']), -- cgit v1.2.3 From 79d9a18f815a03498e21dd5769a2e70c7ae7afa5 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 22 Nov 2019 14:56:53 -0800 Subject: Extraction: return and display any errors preventing video playback --- youtube/watch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index 959dca2..8a396a7 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -123,7 +123,7 @@ decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}]+)\}') op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)') def decrypt_signatures(info): '''return error string, or False if no errors''' - if not info['formats'] or not info['formats'][0]['s']: + if ('formats' not in info) or (not info['formats']) or (not info['formats'][0]['s']): return False # No decryption needed if not info['base_js']: return 'Failed to find base.js' @@ -356,6 +356,7 @@ def get_watch_page(): uploader = info['author'], description = info['description'], unlisted = info['unlisted'], + playability_error = info['playability_error'], ) -- cgit v1.2.3 From 205ad29cb0763dd263a5940cdcb3059d189bbfe7 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Fri, 29 Nov 2019 18:36:27 -0800 Subject: Extraction: Add general subtitle extraction and translation --- youtube/watch.py | 132 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 93 insertions(+), 39 deletions(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index 8a396a7..fa697ba 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -44,50 +44,104 @@ def get_video_sources(info): return video_sources +def make_caption_src(info, lang, auto=False, trans_lang=None): + label = lang + if auto: + label += ' (Automatic)' + if trans_lang: + label += ' -> ' + trans_lang + return { + 'url': '/' + yt_data_extract.get_caption_url(info, lang, 'vtt', auto, trans_lang), + 'label': label, + 'srclang': trans_lang[0:2] if trans_lang else lang[0:2], + 'on': False, + } + +def lang_in(lang, sequence): + '''Tests if the language is in sequence, with e.g. en and en-US considered the same''' + lang = lang[0:2] + return lang in (l[0:2] for l in sequence) + +def lang_eq(lang1, lang2): + '''Tests if two iso 639-1 codes are equal, with en and en-US considered the same. + Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model''' + return lang1[0:2] == lang2[0:2] + +def equiv_lang_in(lang, sequence): + '''Extracts a language in sequence which is equivalent to lang. + e.g. if lang is en, extracts en-GB from sequence. + Necessary because if only a specific variant like en-GB is available, can't ask Youtube for simply en. Need to get the available variant.''' + lang = lang[0:2] + for l in sequence: + if l[0:2] == lang: + return l + return None + def get_subtitle_sources(info): + '''Returns these sources, ordered from least to most intelligible: + native_video_lang (Automatic) + foreign_langs (Manual) + native_video_lang (Automatic) -> pref_lang + foreign_langs (Manual) -> pref_lang + native_video_lang (Manual) -> pref_lang + pref_lang (Automatic) + pref_lang (Manual)''' sources = [] - default_found = False - default = None - for language, formats in info['subtitles'].items(): - for format in formats: - if format['ext'] == 'vtt': - source = { - 'url': '/' + format['url'], - 'label': language, - 'srclang': language, - - # set as on by default if this is the preferred language and a default-on subtitles mode is in settings - 'on': language == settings.subtitles_language and settings.subtitles_mode > 0, - } - - if language == settings.subtitles_language: - default_found = True - default = source - else: - sources.append(source) - break - - # Put it at the end to avoid browser bug when there are too many languages - # (in firefox, it is impossible to select a language near the top of the list because it is cut off) - if default_found: - sources.append(default) + pref_lang = settings.subtitles_language + native_video_lang = None + if info['automatic_caption_languages']: + native_video_lang = info['automatic_caption_languages'][0] - try: - formats = info['automatic_captions'][settings.subtitles_language] - except KeyError: - pass - else: - for format in formats: - if format['ext'] == 'vtt': - sources.append({ - 'url': '/' + format['url'], - 'label': settings.subtitles_language + ' - Automatic', - 'srclang': settings.subtitles_language, + highest_fidelity_is_manual = False - # set as on by default if this is the preferred language and a default-on subtitles mode is in settings - 'on': settings.subtitles_mode == 2 and not default_found, + # Sources are added in very specific order outlined above + # More intelligible sources are put further down to avoid browser bug when there are too many languages + # (in firefox, it is impossible to select a language near the top of the list because it is cut off) - }) + # native_video_lang (Automatic) + if native_video_lang and not lang_eq(native_video_lang, pref_lang): + sources.append(make_caption_src(info, native_video_lang, auto=True)) + + # foreign_langs (Manual) + for lang in info['manual_caption_languages']: + if not lang_eq(lang, pref_lang): + sources.append(make_caption_src(info, lang)) + + if (lang_in(pref_lang, info['translation_languages']) + and not lang_in(pref_lang, info['automatic_caption_languages']) + and not lang_in(pref_lang, info['manual_caption_languages'])): + # native_video_lang (Automatic) -> pref_lang + if native_video_lang and not lang_eq(pref_lang, native_video_lang): + sources.append(make_caption_src(info, native_video_lang, auto=True, trans_lang=pref_lang)) + + # foreign_langs (Manual) -> pref_lang + for lang in info['manual_caption_languages']: + if not lang_eq(lang, native_video_lang): + sources.append(make_caption_src(info, lang, trans_lang=pref_lang)) + + # native_video_lang (Manual) -> pref_lang + if lang_in(native_video_lang, info['manual_caption_languages']): + sources.append(make_caption_src(info, native_video_lang, trans_lang=pref_lang)) + + # pref_lang (Automatic) + if lang_in(pref_lang, info['automatic_caption_languages']): + sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['automatic_caption_languages']), auto=True)) + + # pref_lang (Manual) + if lang_in(pref_lang, info['manual_caption_languages']): + sources.append(make_caption_src(info, equiv_lang_in(pref_lang, info['manual_caption_languages']))) + highest_fidelity_is_manual = True + + if sources and sources[-1]['srclang'] == pref_lang: + # set as on by default since it's manual a default-on subtitles mode is in settings + if highest_fidelity_is_manual and settings.subtitles_mode > 0: + sources[-1]['on'] = True + # set as on by default since settings indicate to set it as such even if it's not manual + elif settings.subtitles_mode == 2: + sources[-1]['on'] = True + + if len(sources) == 0: + assert len(info['automatic_caption_languages']) == 0 and len(info['manual_caption_languages']) == 0 return sources -- cgit v1.2.3 From 26f37521babbb2fc4b86ad59354e8c69da1f3897 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 12 Dec 2019 22:13:17 -0800 Subject: Extraction: Bypass age-restriction --- youtube/watch.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index fa697ba..4575c1e 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -275,17 +275,32 @@ headers = ( ) + util.mobile_ua def extract_info(video_id): - polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch') + polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch').decode('utf-8') + # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info try: polymer_json = json.loads(polymer_json) except json.decoder.JSONDecodeError: traceback.print_exc() return {'error': 'Failed to parse json response'} info = yt_data_extract.extract_watch_info(polymer_json) - error = decrypt_signatures(info) - if error: - print('Error decrypting url signatures: ' + error) - info['playability_error'] = error + + # age restriction bypass + if info['age_restricted']: + print('Fetching age restriction bypass page') + data = { + 'video_id': video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + } + url = 'https://www.youtube.com/get_video_info?' + urllib.parse.urlencode(data) + video_info_page = util.fetch_url(url, debug_name='get_video_info', report_text='Fetched age restriction bypass page').decode('utf-8') + yt_data_extract.update_with_age_restricted_info(info, video_info_page) + + # signature decryption + decryption_error = decrypt_signatures(info) + if decryption_error: + decryption_error = 'Error decrypting url signatures: ' + decryption_error + info['playability_error'] = decryption_error + return info def video_quality_string(format): @@ -410,6 +425,7 @@ def get_watch_page(): uploader = info['author'], description = info['description'], unlisted = info['unlisted'], + age_restricted = info['age_restricted'], playability_error = info['playability_error'], ) -- cgit v1.2.3 From a04aa63efee5813c6083dcdb3defcbcf32ce88f4 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Sat, 14 Dec 2019 14:42:39 -0800 Subject: Extraction: Fix subtitles error when video has no automatic captions but has foreign language captions --- youtube/watch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index 4575c1e..77a4b45 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -59,12 +59,16 @@ def make_caption_src(info, lang, auto=False, trans_lang=None): def lang_in(lang, sequence): '''Tests if the language is in sequence, with e.g. en and en-US considered the same''' + if lang is None: + return False lang = lang[0:2] return lang in (l[0:2] for l in sequence) def lang_eq(lang1, lang2): '''Tests if two iso 639-1 codes are equal, with en and en-US considered the same. Just because the codes are equal does not mean the dialects are mutually intelligible, but this will have to do for now without a complex language model''' + if lang1 is None or lang2 is None: + return False return lang1[0:2] == lang2[0:2] def equiv_lang_in(lang, sequence): @@ -116,7 +120,7 @@ def get_subtitle_sources(info): # foreign_langs (Manual) -> pref_lang for lang in info['manual_caption_languages']: - if not lang_eq(lang, native_video_lang): + if not lang_eq(lang, native_video_lang) and not lang_eq(lang, pref_lang): sources.append(make_caption_src(info, lang, trans_lang=pref_lang)) # native_video_lang (Manual) -> pref_lang -- cgit v1.2.3 From 81c7ecf161b528ba293678e0bdbf42952cc87386 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 17 Dec 2019 20:39:20 -0800 Subject: Extraction: Make limited state videos work --- youtube/watch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index 77a4b45..092885d 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -279,7 +279,7 @@ headers = ( ) + util.mobile_ua def extract_info(video_id): - polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch').decode('utf-8') + polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1&bpctr=9999999999', headers=headers, debug_name='watch').decode('utf-8') # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info try: polymer_json = json.loads(polymer_json) -- cgit v1.2.3 From 45a4ab5acedd2fd7531604d3e817e0742a036c4a Mon Sep 17 00:00:00 2001 From: James Taylor Date: Tue, 17 Dec 2019 20:58:15 -0800 Subject: Extraction: Detect limited state and fix false detection as unlisted --- youtube/watch.py | 1 + 1 file changed, 1 insertion(+) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index 092885d..fca794e 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -429,6 +429,7 @@ def get_watch_page(): uploader = info['author'], description = info['description'], unlisted = info['unlisted'], + limited_state = info['limited_state'], age_restricted = info['age_restricted'], playability_error = info['playability_error'], ) -- cgit v1.2.3 From 98777ee82561ae205f156a7f8497728aecfa080c Mon Sep 17 00:00:00 2001 From: James Taylor Date: Wed, 18 Dec 2019 19:39:16 -0800 Subject: Extraction: Rewrite item_extraction for better error handling and readability, rename extracted names for more consistency --- youtube/watch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index fca794e..2118319 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -405,10 +405,10 @@ def get_watch_page(): return flask.render_template('watch.html', header_playlist_names = local_playlist.get_playlist_names(), uploader_channel_url = ('/' + info['author_url']) if info['author_url'] else '', - upload_date = info['published_date'], - views = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)), - likes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)), - dislikes = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)), + time_published = info['time_published'], + view_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("view_count", None)), + like_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)), + dislike_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)), download_formats = download_formats, video_info = json.dumps(video_info), video_sources = video_sources, -- cgit v1.2.3 From f6bf5213a579b16e17e8d72b51b090ffe4bc9bdb Mon Sep 17 00:00:00 2001 From: James Taylor Date: Wed, 18 Dec 2019 19:43:55 -0800 Subject: Extraction: rename multi_get functions to more descriptive names --- youtube/watch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index 2118319..69ab87b 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -185,7 +185,7 @@ def decrypt_signatures(info): return False # No decryption needed if not info['base_js']: return 'Failed to find base.js' - player_name = yt_data_extract.default_get(info['base_js'].split('/'), -2) + player_name = yt_data_extract.get(info['base_js'].split('/'), -2) if not player_name: return 'Could not find player name' @@ -204,7 +204,7 @@ def decrypt_signatures(info): if not function_body: return 'Empty decryption function body' - var_name = yt_data_extract.default_get(function_body[0].split('.'), 0) + var_name = yt_data_extract.get(function_body[0].split('.'), 0) if var_name is None: return 'Could not find var_name' @@ -397,8 +397,8 @@ def get_watch_page(): }) video_sources = get_video_sources(info) - video_height = yt_data_extract.default_multi_get(video_sources, 0, 'height', default=360) - video_width = yt_data_extract.default_multi_get(video_sources, 0, 'width', default=640) + video_height = yt_data_extract.deep_get(video_sources, 0, 'height', default=360) + video_width = yt_data_extract.deep_get(video_sources, 0, 'width', default=640) # 1 second per pixel, or the actual video width theater_video_target_width = max(640, info['duration'] or 0, video_width) -- cgit v1.2.3 From d1d908d5b1aadb0dc75b25df1a47789c021f89e2 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 19 Dec 2019 19:48:53 -0800 Subject: Extraction: Move html post processing stuff from yt_data_extract to util --- youtube/watch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index 69ab87b..45d658f 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -370,8 +370,8 @@ def get_watch_page(): } for item in info['related_videos']: - yt_data_extract.prefix_urls(item) - yt_data_extract.add_extra_html_info(item) + util.prefix_urls(item) + util.add_extra_html_info(item) if settings.gather_googlevideo_domains: with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f: -- cgit v1.2.3 From 6b7a1212e30b713453aa7d2b3a7122e97689dad0 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 19 Dec 2019 21:28:21 -0800 Subject: Extraction: Move non-stateful signature decryption functionality into yt_data_extract --- youtube/watch.py | 97 +++++++------------------------------------------------- 1 file changed, 12 insertions(+), 85 deletions(-) (limited to 'youtube/watch.py') diff --git a/youtube/watch.py b/youtube/watch.py index 45d658f..429f272 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -11,7 +11,6 @@ import gevent import os import math import traceback -import re import urllib try: @@ -175,101 +174,29 @@ def save_decrypt_cache(): f.write(json.dumps({'version': 1, 'decrypt_cache':decrypt_cache}, indent=4, sort_keys=True)) f.close() -# adapted from youtube-dl and invidious: -# https://github.com/omarroth/invidious/blob/master/src/invidious/helpers/signatures.cr -decrypt_function_re = re.compile(r'function\(a\)\{(a=a\.split\(""\)[^\}]+)\}') -op_with_arg_re = re.compile(r'[^\.]+\.([^\(]+)\(a,(\d+)\)') def decrypt_signatures(info): '''return error string, or False if no errors''' - if ('formats' not in info) or (not info['formats']) or (not info['formats'][0]['s']): - return False # No decryption needed + if not yt_data_extract.requires_decryption(info): + return False + if not info['player_name']: + return 'Could not find player name' if not info['base_js']: return 'Failed to find base.js' - player_name = yt_data_extract.get(info['base_js'].split('/'), -2) - if not player_name: - return 'Could not find player name' + player_name = info['player_name'] if player_name in decrypt_cache: print('Using cached decryption function for: ' + player_name) - decryption_function = decrypt_cache[player_name] + info['decryption_function'] = decrypt_cache[player_name] else: base_js = util.fetch_url(info['base_js'], debug_name='base.js', report_text='Fetched player ' + player_name) base_js = base_js.decode('utf-8') - - decrypt_function_match = decrypt_function_re.search(base_js) - if decrypt_function_match is None: - return 'Could not find decryption function in base.js' - - function_body = decrypt_function_match.group(1).split(';')[1:-1] - if not function_body: - return 'Empty decryption function body' - - var_name = yt_data_extract.get(function_body[0].split('.'), 0) - if var_name is None: - return 'Could not find var_name' - - var_body_match = re.search(r'var ' + re.escape(var_name) + r'=\{(.*?)\};', base_js, flags=re.DOTALL) - if var_body_match is None: - return 'Could not find var_body' - - operations = var_body_match.group(1).replace('\n', '').split('},') - if not operations: - return 'Did not find any definitions in var_body' - operations[-1] = operations[-1][:-1] # remove the trailing '}' since we split by '},' on the others - operation_definitions = {} - for op in operations: - colon_index = op.find(':') - opening_brace_index = op.find('{') - - if colon_index == -1 or opening_brace_index == -1: - return 'Could not parse operation' - op_name = op[:colon_index] - op_body = op[opening_brace_index+1:] - if op_body == 'a.reverse()': - operation_definitions[op_name] = 0 - elif op_body == 'a.splice(0,b)': - operation_definitions[op_name] = 1 - elif op_body.startswith('var c=a[0]'): - operation_definitions[op_name] = 2 - else: - return 'Unknown op_body: ' + op_body - - decryption_function = [] - for op_with_arg in function_body: - match = op_with_arg_re.fullmatch(op_with_arg) - if match is None: - return 'Could not parse operation with arg' - op_name = match.group(1) - if op_name not in operation_definitions: - return 'Unknown op_name: ' + op_name - op_argument = match.group(2) - decryption_function.append([operation_definitions[op_name], int(op_argument)]) - - decrypt_cache[player_name] = decryption_function + err = yt_data_extract.extract_decryption_function(info, base_js) + if err: + return err + decrypt_cache[player_name] = info['decryption_function'] save_decrypt_cache() - - for format in info['formats']: - if not format['s'] or not format['sp'] or not format['url']: - print('Warning: s, sp, or url not in format') - continue - - a = list(format['s']) - for op, argument in decryption_function: - if op == 0: - a.reverse() - elif op == 1: - a = a[argument:] - else: - operation_2(a, argument) - - signature = ''.join(a) - format['url'] += '&' + format['sp'] + '=' + signature - return False - -def operation_2(a, b): - c = a[0] - a[0] = a[b % len(a)] - a[b % len(a)] = c + err = yt_data_extract.decrypt_signatures(info) + return err headers = ( ('Accept', '*/*'), -- cgit v1.2.3