From d1d908d5b1aadb0dc75b25df1a47789c021f89e2 Mon Sep 17 00:00:00 2001 From: James Taylor Date: Thu, 19 Dec 2019 19:48:53 -0800 Subject: Extraction: Move html post processing stuff from yt_data_extract to util --- youtube/channel.py | 4 ++-- youtube/local_playlist.py | 2 +- youtube/playlist.py | 6 +++--- youtube/search.py | 4 ++-- youtube/subscriptions.py | 2 +- youtube/util.py | 38 ++++++++++++++++++++++++++++++++++++ youtube/watch.py | 4 ++-- youtube/yt_data_extract/__init__.py | 3 +-- youtube/yt_data_extract/common.py | 39 ------------------------------------- 9 files changed, 50 insertions(+), 52 deletions(-) diff --git a/youtube/channel.py b/youtube/channel.py index 67a79ad..ad06e3f 100644 --- a/youtube/channel.py +++ b/youtube/channel.py @@ -142,8 +142,8 @@ def post_process_channel_info(info): info['avatar'] = util.prefix_url(info['avatar']) info['channel_url'] = util.prefix_url(info['channel_url']) for item in info['items']: - yt_data_extract.prefix_urls(item) - yt_data_extract.add_extra_html_info(item) + util.prefix_urls(item) + util.add_extra_html_info(item) diff --git a/youtube/local_playlist.py b/youtube/local_playlist.py index 2375ba2..0b47c72 100644 --- a/youtube/local_playlist.py +++ b/youtube/local_playlist.py @@ -57,7 +57,7 @@ def get_local_playlist_videos(name, offset=0, amount=50): info['thumbnail'] = util.get_thumbnail_url(info['id']) missing_thumbnails.append(info['id']) info['type'] = 'video' - yt_data_extract.add_extra_html_info(info) + util.add_extra_html_info(info) videos.append(info) except json.decoder.JSONDecodeError: if not video_json.strip() == '': diff --git a/youtube/playlist.py b/youtube/playlist.py index 5dc8ab7..3ca235a 100644 --- a/youtube/playlist.py +++ b/youtube/playlist.py @@ -97,10 +97,10 @@ def get_playlist_page(): if page != '1': info['metadata'] = yt_data_extract.extract_playlist_metadata(first_page_json) - yt_data_extract.prefix_urls(info['metadata']) + util.prefix_urls(info['metadata']) for item in info.get('items', ()): - yt_data_extract.prefix_urls(item) - yt_data_extract.add_extra_html_info(item) + util.prefix_urls(item) + util.add_extra_html_info(item) if 'id' in item: item['thumbnail'] = '/https://i.ytimg.com/vi/' + item['id'] + '/default.jpg' diff --git a/youtube/search.py b/youtube/search.py index a881557..0f6bbc4 100644 --- a/youtube/search.py +++ b/youtube/search.py @@ -80,8 +80,8 @@ def get_search_page(): return flask.render_template('error.html', error_message = search_info['error']) for extract_item_info in search_info['items']: - yt_data_extract.prefix_urls(extract_item_info) - yt_data_extract.add_extra_html_info(extract_item_info) + util.prefix_urls(extract_item_info) + util.add_extra_html_info(extract_item_info) corrections = search_info['corrections'] if corrections['type'] == 'did_you_mean': diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py index 9709467..dd058b3 100644 --- a/youtube/subscriptions.py +++ b/youtube/subscriptions.py @@ -766,7 +766,7 @@ def get_subscriptions_page(): video['thumbnail'] = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video['id'] + '.jpg' video['type'] = 'video' video['item_size'] = 'small' - yt_data_extract.add_extra_html_info(video) + util.add_extra_html_info(video) tags = _get_all_tags(cursor) diff --git a/youtube/util.py b/youtube/util.py index 9023b98..feeec8c 100644 --- a/youtube/util.py +++ b/youtube/util.py @@ -1,4 +1,5 @@ import settings +from youtube import yt_data_extract import socks, sockshandler import gzip import brotli @@ -6,6 +7,7 @@ import urllib.parse import re import time import os +import json import gevent import gevent.queue import gevent.lock @@ -321,3 +323,39 @@ def left_remove(string, substring): return string[len(substring):] return string + +def prefix_urls(item): + try: + item['thumbnail'] = prefix_url(item['thumbnail']) + except KeyError: + pass + + try: + item['author_url'] = prefix_url(item['author_url']) + except KeyError: + pass + +def add_extra_html_info(item): + if item['type'] == 'video': + item['url'] = (URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None + + video_info = {} + for key in ('id', 'title', 'author', 'duration'): + try: + video_info[key] = item[key] + except KeyError: + video_info[key] = '' + + item['video_info'] = json.dumps(video_info) + + elif item['type'] == 'playlist': + item['url'] = (URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None + elif item['type'] == 'channel': + item['url'] = (URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None + +def parse_info_prepare_for_html(renderer, additional_info={}): + item = yt_data_extract.extract_item_info(renderer, additional_info) + prefix_urls(item) + add_extra_html_info(item) + + return item diff --git a/youtube/watch.py b/youtube/watch.py index 69ab87b..45d658f 100644 --- a/youtube/watch.py +++ b/youtube/watch.py @@ -370,8 +370,8 @@ def get_watch_page(): } for item in info['related_videos']: - yt_data_extract.prefix_urls(item) - yt_data_extract.add_extra_html_info(item) + util.prefix_urls(item) + util.add_extra_html_info(item) if settings.gather_googlevideo_domains: with open(os.path.join(settings.data_dir, 'googlevideo-domains.txt'), 'a+', encoding='utf-8') as f: diff --git a/youtube/yt_data_extract/__init__.py b/youtube/yt_data_extract/__init__.py index f2a93a9..f2f07c0 100644 --- a/youtube/yt_data_extract/__init__.py +++ b/youtube/yt_data_extract/__init__.py @@ -1,8 +1,7 @@ from .common import (get, multi_get, deep_get, multi_deep_get, liberal_update, conservative_update, remove_redirect, normalize_url, extract_str, extract_formatted_text, extract_int, extract_approx_int, - extract_date, extract_item_info, extract_items, extract_response, - prefix_urls, add_extra_html_info, parse_info_prepare_for_html) + extract_date, extract_item_info, extract_items, extract_response) from .everything_else import (extract_channel_info, extract_search_info, extract_playlist_metadata, extract_playlist_info, extract_comments_info) diff --git a/youtube/yt_data_extract/common.py b/youtube/yt_data_extract/common.py index 5fa67bc..459b5e9 100644 --- a/youtube/yt_data_extract/common.py +++ b/youtube/yt_data_extract/common.py @@ -1,6 +1,3 @@ -from youtube import util - -import json import re import urllib.parse import collections @@ -179,35 +176,6 @@ def check_missing_keys(object, *key_sequences): return None -def prefix_urls(item): - try: - item['thumbnail'] = util.prefix_url(item['thumbnail']) - except KeyError: - pass - - try: - item['author_url'] = util.prefix_url(item['author_url']) - except KeyError: - pass - -def add_extra_html_info(item): - if item['type'] == 'video': - item['url'] = (util.URL_ORIGIN + '/watch?v=' + item['id']) if item.get('id') else None - - video_info = {} - for key in ('id', 'title', 'author', 'duration'): - try: - video_info[key] = item[key] - except KeyError: - video_info[key] = '' - - item['video_info'] = json.dumps(video_info) - - elif item['type'] == 'playlist': - item['url'] = (util.URL_ORIGIN + '/playlist?list=' + item['id']) if item.get('id') else None - elif item['type'] == 'channel': - item['url'] = (util.URL_ORIGIN + "/channel/" + item['id']) if item.get('id') else None - def extract_item_info(item, additional_info={}): if not item: return {'error': 'No item given'} @@ -307,13 +275,6 @@ def extract_item_info(item, additional_info={}): ))) return info -def parse_info_prepare_for_html(renderer, additional_info={}): - item = extract_item_info(renderer, additional_info) - prefix_urls(item) - add_extra_html_info(item) - - return item - def extract_response(polymer_json): '''return response, error''' response = multi_deep_get(polymer_json, [1, 'response'], ['response'], default=None, types=dict) -- cgit v1.2.3