 youtube/subscriptions.py | 100 ++++++++++++++++++++++++++++++--------------
 1 file changed, 62 insertions(+), 38 deletions(-)
diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py
index 2e821de..56bdf93 100644
--- a/youtube/subscriptions.py
+++ b/youtube/subscriptions.py
@@ -455,46 +455,40 @@ def _get_upstream_videos(channel_id):
         print('Failed to read atoma feed for ' + channel_status_name)
         traceback.print_exc()
 
-    with open_database() as connection:
-        with connection as cursor:
-            is_first_check = cursor.execute('''SELECT time_last_checked FROM subscribed_channels WHERE yt_channel_id=?''', [channel_id]).fetchone()[0] in (None, 0)
-            video_add_time = int(time.time())
+    videos = channel.extract_info(json.loads(channel_tab), 'videos')['items']
+    for i, video_item in enumerate(videos):
+        if 'description' not in video_item:
+            video_item['description'] = ''
+
+        if video_item['id'] in times_published:
+            video_item['time_published'] = times_published[video_item['id']]
+            video_item['is_time_published_exact'] = True
+        else:
+            video_item['is_time_published_exact'] = False
+            try:
+                video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i # subtract a few seconds off the videos so they will be in the right order
+            except KeyError:
+                print(video_item)
 
-            videos = []
-            channel_videos = channel.extract_info(json.loads(channel_tab), 'videos')['items']
-            for i, video_item in enumerate(channel_videos):
-                if 'description' not in video_item:
-                    video_item['description'] = ''
+        video_item['channel_id'] = channel_id
 
-                if video_item['id'] in times_published:
-                    time_published = times_published[video_item['id']]
-                    is_time_published_exact = True
-                else:
-                    is_time_published_exact = False
-                    try:
-                        time_published = youtube_timestamp_to_posix(video_item['published']) - i # subtract a few seconds off the videos so they will be in the right order
-                    except KeyError:
-                        print(video_item)
-                if is_first_check:
-                    time_noticed = time_published # don't want a crazy ordering on first check, since we're ordering by time_noticed
-                else:
-                    time_noticed = video_add_time
-                videos.append((channel_id, video_item['id'], video_item['title'], video_item['duration'], time_published, is_time_published_exact, time_noticed, video_item['description']))
+    if len(videos) == 0:
+        average_upload_period = 4*7*24*3600 # assume 1 month for channel with no videos
+    elif len(videos) < 5:
+        average_upload_period = int((time.time() - videos[len(videos)-1]['time_published'])/len(videos))
+    else:
+        average_upload_period = int((time.time() - videos[4]['time_published'])/5) # equivalent to averaging the time between videos for the last 5 videos
 
-            if len(videos) == 0:
-                average_upload_period = 4*7*24*3600 # assume 1 month for channel with no videos
-            elif len(videos) < 5:
-                average_upload_period = int((time.time() - videos[len(videos)-1][4])/len(videos))
-            else:
-                average_upload_period = int((time.time() - videos[4][4])/5) # equivalent to averaging the time between videos for the last 5 videos
+    # calculate when to check next for auto checking
+    # add some quantization and randomness to make pattern analysis by Youtube slightly harder
+    quantized_upload_period = average_upload_period - (average_upload_period % (4*3600)) + 4*3600 # round up to nearest 4 hours
+    randomized_upload_period = quantized_upload_period*(1 + secrets.randbelow(50)/50*0.5) # randomly between 1x and 1.5x
+    next_check_delay = randomized_upload_period/10 # check at 10x the channel posting rate. might want to fine tune this number
+    next_check_time = int(time.time() + next_check_delay)
 
-            # calculate when to check next for auto checking
-            # add some quantization and randomness to make pattern analysis by Youtube slightly harder
-            quantized_upload_period = average_upload_period - (average_upload_period % (4*3600)) + 4*3600 # round up to nearest 4 hours
-            randomized_upload_period = quantized_upload_period*(1 + secrets.randbelow(50)/50*0.5) # randomly between 1x and 1.5x
-            next_check_delay = randomized_upload_period/10 # check at 10x the channel posting rate. might want to fine tune this number
-            next_check_time = int(time.time() + next_check_delay)
+    with open_database() as connection:
+        with connection as cursor:
             # calculate how many new videos there are
             row = cursor.execute('''SELECT video_id
@@ -509,13 +503,43 @@ def _get_upstream_videos(channel_id):
             latest_video_id = row[0]
             index = 0
             for video in videos:
-                if video[1] == latest_video_id:
+                if video['id'] == latest_video_id:
                     break
                 index += 1
             number_of_new_videos = index
 
-            cursor.executemany('''INSERT OR IGNORE INTO videos (sql_channel_id, video_id, title, duration, time_published, is_time_published_exact, time_noticed, description)
-                                  VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?, ?, ?)''', videos)
+            is_first_check = cursor.execute('''SELECT time_last_checked FROM subscribed_channels WHERE yt_channel_id=?''', [channel_id]).fetchone()[0] in (None, 0)
+            time_videos_retrieved = int(time.time())
+            rows = []
+            for video_item in videos:
+                if is_first_check or number_of_new_videos > 6:
+                    # don't want a crazy ordering on first check or check in a long time, since we're ordering by time_noticed
+                    time_noticed = video_item['time_published']
+                else:
+                    time_noticed = time_videos_retrieved
+                rows.append((
+                    video_item['channel_id'],
+                    video_item['id'],
+                    video_item['title'],
+                    video_item['duration'],
+                    video_item['time_published'],
+                    video_item['is_time_published_exact'],
+                    time_noticed,
+                    video_item['description'],
+                ))
+
+
+            cursor.executemany('''INSERT OR IGNORE INTO videos (
+                sql_channel_id,
+                video_id,
+                title,
+                duration,
+                time_published,
+                is_time_published_exact,
+                time_noticed,
+                description
+            )
+            VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?, ?, ?)''', rows)
 
             cursor.execute('''UPDATE subscribed_channels
                 SET time_last_checked = ?, next_check_time = ?
                 WHERE yt_channel_id=?''', [int(time.time()), next_check_time, channel_id])
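
A standalone sketch of the scheduling arithmetic this commit moves outside the database block (the function name and the example period are illustrative, not part of the patch); it mirrors the quantize/randomize/divide steps above so the numbers can be checked in isolation:

    import secrets
    import time

    def compute_next_check_time(average_upload_period):
        # Round up to the next multiple of 4 hours. Note the formula adds a
        # full 4 hours even when the period is already an exact multiple.
        quantized = average_upload_period - (average_upload_period % (4*3600)) + 4*3600
        # secrets.randbelow(50) yields 0..49, so the factor lies in [1.0, 1.49].
        randomized = quantized*(1 + secrets.randbelow(50)/50*0.5)
        # Check at 10x the channel's (randomized) posting rate.
        return int(time.time() + randomized/10)

For example, a channel uploading every ~2 days (172800 s) quantizes to 187200 s (52 hours), so the next check lands between roughly 5.2 and 7.7 hours from now.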
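
The second hunk also widens the condition for backdating time_noticed: previously only the first-ever check fell back to the publication time, now any check that finds more than 6 new videos does too. Isolated as a hypothetical helper for clarity:

    def choose_time_noticed(video_item, is_first_check, number_of_new_videos,
                            time_videos_retrieved):
        # The subscriptions feed is sorted by time_noticed. Stamping a large
        # backlog with the retrieval time would dump it into the feed as one
        # undifferentiated block, so fall back to each video's publication
        # time instead.
        if is_first_check or number_of_new_videos > 6:
            return video_item['time_published']
        return time_videos_retrieved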
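
Finally, because the insert uses INSERT OR IGNORE, re-checking a channel re-submits every video but leaves existing rows untouched, so a video keeps the time_noticed it got when first seen. A minimal demo of that behavior, assuming (as the OR IGNORE pattern requires) that the real videos table enforces uniqueness on the video id:

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE videos (video_id TEXT UNIQUE, time_noticed INTEGER)')
    conn.execute('INSERT OR IGNORE INTO videos VALUES (?, ?)', ('abc123', 1000))
    # The second insert conflicts on video_id and is silently skipped, so the
    # original time_noticed survives.
    conn.execute('INSERT OR IGNORE INTO videos VALUES (?, ?)', ('abc123', 2000))
    print(conn.execute('SELECT time_noticed FROM videos').fetchall())  # [(1000,)]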