Diffstat (limited to 'youtube')
-rw-r--r-- | youtube/subscriptions.py | 144
-rw-r--r-- | youtube/util.py          |  54
2 files changed, 171 insertions, 27 deletions
diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py
index ff74d94..ba27655 100644
--- a/youtube/subscriptions.py
+++ b/youtube/subscriptions.py
@@ -26,7 +26,7 @@ database_path = os.path.join(settings.data_dir, "subscriptions.sqlite")
 def open_database():
     if not os.path.exists(settings.data_dir):
         os.makedirs(settings.data_dir)
-    connection = sqlite3.connect(database_path)
+    connection = sqlite3.connect(database_path, check_same_thread=False)
 
     # Create tables if they don't exist
     try:
@@ -172,17 +172,75 @@ def youtube_timestamp_to_posix(dumb_timestamp):
         unit = unit[:-1] # remove s from end
     return now - number*units[unit]
 
+
+try:
+    existing_thumbnails = set(os.path.splitext(name)[0] for name in os.listdir(thumbnails_directory))
+except FileNotFoundError:
+    existing_thumbnails = set()
+
+
+thumbnails_queue = util.RateLimitedQueue()
+check_channels_queue = util.RateLimitedQueue()
+
+
 # Use this to mark a thumbnail acceptable to be retrieved at the request of the browser
+# can't simply check if it's in the queue because items are removed when the download starts, not when it finishes
 downloading_thumbnails = set()
 
-def download_thumbnails(thumbnails_directory, thumbnails):
-    try:
-        g = gevent.spawn(util.download_thumbnails, thumbnails_directory, thumbnails)
-        g.join()
-    finally:
-        downloading_thumbnails.difference_update(thumbnails)
+
+checking_channels = set()
+
+# Just to use for printing channel checking status to console without opening database
+channel_names = dict()
+
+def download_thumbnail_worker():
+    while True:
+        video_id = thumbnails_queue.get()
+        try:
+            success = util.download_thumbnail(thumbnails_directory, video_id)
+            if success:
+                existing_thumbnails.add(video_id)
+        except Exception:
+            traceback.print_exc()
+        finally:
+            downloading_thumbnails.remove(video_id)
+
+def check_channel_worker():
+    while True:
+        channel_id = check_channels_queue.get()
+        try:
+            _get_upstream_videos(channel_id)
+        finally:
+            checking_channels.remove(channel_id)
+
+for i in range(0,5):
+    gevent.spawn(download_thumbnail_worker)
+    gevent.spawn(check_channel_worker)
+
+
+
+
+
+
+def download_thumbnails_if_necessary(thumbnails):
+    for video_id in thumbnails:
+        if video_id not in existing_thumbnails and video_id not in downloading_thumbnails:
+            downloading_thumbnails.add(video_id)
+            thumbnails_queue.put(video_id)
+
+def check_channels_if_necessary(channel_ids):
+    for channel_id in channel_ids:
+        if channel_id not in checking_channels:
+            checking_channels.add(channel_id)
+            check_channels_queue.put(channel_id)
+
 
 def _get_upstream_videos(channel_id):
+    try:
+        print("Checking channel: " + channel_names[channel_id])
+    except KeyError:
+        print("Checking channel " + channel_id)
+
     videos = []
 
     json_channel_videos = channel.get_grid_items(channel.get_channel_tab(channel_id)[1]['response'])
@@ -190,23 +248,56 @@ def _get_upstream_videos(channel_id):
         info = yt_data_extract.renderer_info(json_video['gridVideoRenderer'])
         if 'description' not in info:
             info['description'] = ''
-        info['time_published'] = youtube_timestamp_to_posix(info['published']) - i # subtract a few seconds off the videos so they will be in the right order
-        videos.append(info)
+        try:
+            info['time_published'] = youtube_timestamp_to_posix(info['published']) - i # subtract a few seconds off the videos so they will be in the right order
+        except KeyError:
+            print(info)
+        videos.append((channel_id, info['id'], info['title'], info['duration'], info['time_published'], info['description']))
+
+    now = time.time()
+    download_thumbnails_if_necessary(video[1] for video in videos if (now - video[4]) < 30*24*3600) # Don't download thumbnails from videos older than a month
+
+    with open_database() as connection:
+        with connection as cursor:
+            cursor.executemany('''INSERT OR IGNORE INTO videos (sql_channel_id, video_id, title, duration, time_published, description)
+                                  VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?)''', videos)
+            cursor.execute('''UPDATE subscribed_channels
+                              SET time_last_checked = ?
+                              WHERE yt_channel_id=?''', [int(time.time()), channel_id])
 
-    try:
-        existing_thumbnails = set(os.path.splitext(name)[0] for name in os.listdir(thumbnails_directory))
-    except FileNotFoundError:
-        existing_thumbnails = set()
 
-    missing_thumbnails = set(video['id'] for video in videos) - existing_thumbnails
 
-    downloading_thumbnails.update(missing_thumbnails)
-    gevent.spawn(download_thumbnails, thumbnails_directory, missing_thumbnails)
 
-    return videos
+def check_all_channels():
+    with open_database() as connection:
+        with connection as cursor:
+            channel_id_name_list = cursor.execute('''SELECT yt_channel_id, channel_name FROM subscribed_channels''').fetchall()
+    channel_names.update(channel_id_name_list)
+    check_channels_if_necessary([item[0] for item in channel_id_name_list])
 
+def check_tags(tags):
+    channel_id_name_list = []
+    with open_database() as connection:
+        with connection as cursor:
+            for tag in tags:
+                channel_id_name_list += cursor.execute('''SELECT yt_channel_id, channel_name
+                                                          FROM subscribed_channels
+                                                          WHERE subscribed_channels.id IN (
+                                                              SELECT tag_associations.sql_channel_id FROM tag_associations WHERE tag=?
+                                                          )''', [tag]).fetchall()
+    channel_names.update(channel_id_name_list)
+    check_channels_if_necessary([item[0] for item in channel_id_name_list])
 
+def check_specific_channels(channel_ids):
+    with open_database() as connection:
+        with connection as cursor:
+            for channel_id in channel_ids:
+                channel_id_name_list += cursor.execute('''SELECT yt_channel_id, channel_name
+                                                          FROM subscribed_channels
+                                                          WHERE yt_channel_id=?''', [channel_id]).fetchall()
+    channel_names.update(channel_id_name_list)
+    check_channels_if_necessary(channel_ids)
@@ -408,15 +499,18 @@ def post_subscriptions_page(env, start_response):
         _unsubscribe(params['channel_id'])
 
     elif action == 'refresh':
-        with open_database() as connection:
-            with connection as cursor:
-                for sql_channel_id, yt_channel_id in cursor.execute('''SELECT id, yt_channel_id FROM subscribed_channels''').fetchall():
-                    db_videos = ( (sql_channel_id, info['id'], info['title'], info['duration'], info['time_published'], info['description']) for info in _get_upstream_videos(yt_channel_id) )
-                    cursor.executemany('''INSERT OR IGNORE INTO videos (sql_channel_id, video_id, title, duration, time_published, description) VALUES (?, ?, ?, ?, ?, ?)''', db_videos)
-
-                cursor.execute('''UPDATE subscribed_channels SET time_last_checked = ?''', ( int(time.time()), ) )
+        type = params['type'][0]
+        if type == 'all':
+            check_all_channels()
+        elif type == 'tag':
+            check_tags(params['tag_name'])
+        elif type == 'channel':
+            check_specific_channels(params['channel_id'])
+        else:
+            start_response('400 Bad Request', ())
+            return b'400 Bad Request'
 
-        start_response('303 See Other', [('Location', util.URL_ORIGIN + '/subscriptions'),] )
+        start_response('204 No Content', ())
         return b''
     else:
         start_response('400 Bad Request', ())
diff --git a/youtube/util.py b/youtube/util.py
index 42d76a3..c4e1aff 100644
--- a/youtube/util.py
+++ b/youtube/util.py
@@ -7,6 +7,8 @@
 import re
 import time
 import os
 import gevent
+import gevent.queue
+import gevent.lock
 # The trouble with the requests library: It ships its own certificate bundle via certifi
 # instead of using the system certificate store, meaning self-signed certificates
@@ -176,6 +178,53 @@ desktop_ua = (('User-Agent', desktop_user_agent),)
 
 
 
+class RateLimitedQueue(gevent.queue.Queue):
+    ''' Does initial_burst (def. 30) at first, then alternates between waiting waiting_period (def. 5) seconds and doing subsequent_bursts (def. 10) queries. After 5 seconds with nothing left in the queue, resets rate limiting. '''
+
+    def __init__(self, initial_burst=30, waiting_period=5, subsequent_bursts=10):
+        self.initial_burst = initial_burst
+        self.waiting_period = waiting_period
+        self.subsequent_bursts = subsequent_bursts
+
+        self.count_since_last_wait = 0
+        self.surpassed_initial = False
+
+        self.lock = gevent.lock.BoundedSemaphore(1)
+        self.currently_empty = False
+        self.empty_start = 0
+        gevent.queue.Queue.__init__(self)
+
+
+    def get(self):
+        self.lock.acquire() # blocks if another greenlet currently has the lock
+        if self.count_since_last_wait >= self.subsequent_bursts and self.surpassed_initial:
+            gevent.sleep(self.waiting_period)
+            self.count_since_last_wait = 0
+
+        elif self.count_since_last_wait >= self.initial_burst and not self.surpassed_initial:
+            self.surpassed_initial = True
+            gevent.sleep(self.waiting_period)
+            self.count_since_last_wait = 0
+
+        self.count_since_last_wait += 1
+
+        if not self.currently_empty and self.empty():
+            self.currently_empty = True
+            self.empty_start = time.monotonic()
+
+        item = gevent.queue.Queue.get(self) # blocks when nothing left
+
+        if self.currently_empty:
+            if time.monotonic() - self.empty_start >= self.waiting_period:
+                self.count_since_last_wait = 0
+                self.surpassed_initial = False
+
+            self.currently_empty = False
+
+        self.lock.release()
+
+        return item
+
 
 
 def download_thumbnail(save_directory, video_id):
@@ -185,14 +234,15 @@ def download_thumbnail(save_directory, video_id):
         thumbnail = fetch_url(url, report_text="Saved thumbnail: " + video_id)
     except urllib.error.HTTPError as e:
         print("Failed to download thumbnail for " + video_id + ": " + str(e))
-        return
+        return False
     try:
         f = open(save_location, 'wb')
     except FileNotFoundError:
-        os.makedirs(save_directory)
+        os.makedirs(save_directory, exist_ok = True)
        f = open(save_location, 'wb')
     f.write(thumbnail)
     f.close()
+    return True
 
 def download_thumbnails(save_directory, ids):
     if not isinstance(ids, (list, tuple)):
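
A minimal sketch of how the new RateLimitedQueue might be exercised on its own (not part of this commit; the small burst numbers, the worker function, and the import path are illustrative assumptions, and it requires gevent):

    # Hypothetical demo: two gevent workers draining a RateLimitedQueue.
    # With initial_burst=3, the first 3 items come out immediately; after that
    # the queue sleeps waiting_period seconds between bursts of subsequent_bursts items.
    import time
    import gevent
    from youtube.util import RateLimitedQueue  # assumes the repository root is importable

    queue = RateLimitedQueue(initial_burst=3, waiting_period=2, subsequent_bursts=2)
    start = time.monotonic()

    def worker(worker_id):
        while True:
            item = queue.get()  # blocks, and sleeps when the current burst is used up
            print('worker %d got %r at %.1fs' % (worker_id, item, time.monotonic() - start))

    workers = [gevent.spawn(worker, i) for i in range(2)]
    for n in range(10):
        queue.put(n)

    gevent.sleep(12)         # 3 items at once, then 2 more every 2 seconds
    gevent.killall(workers)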