Diffstat (limited to 'youtube/subscriptions.py')
 youtube/subscriptions.py | 524 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 387 insertions(+), 137 deletions(-)
diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py
index 87e1659..04d3c5a 100644
--- a/youtube/subscriptions.py
+++ b/youtube/subscriptions.py
@@ -1,4 +1,4 @@
-from youtube import util, yt_data_extract, channel
+from youtube import util, yt_data_extract, channel, local_playlist, playlist
from youtube import yt_app
import settings
@@ -15,6 +15,8 @@ import math
import secrets
import collections
import calendar # bullshit! https://bugs.python.org/issue6280
+import csv
+import re
import flask
from flask import request
@@ -26,6 +28,7 @@ thumbnails_directory = os.path.join(settings.data_dir, "subscription_thumbnails"
database_path = os.path.join(settings.data_dir, "subscriptions.sqlite")
+
def open_database():
if not os.path.exists(settings.data_dir):
os.makedirs(settings.data_dir)
@@ -74,11 +77,13 @@ def open_database():
# https://stackoverflow.com/questions/19522505/using-sqlite3-in-python-with-with-keyword
return contextlib.closing(connection)
+
def with_open_db(function, *args, **kwargs):
with open_database() as connection:
with connection as cursor:
return function(cursor, *args, **kwargs)
+
def _is_subscribed(cursor, channel_id):
result = cursor.execute('''SELECT EXISTS(
SELECT 1
@@ -88,12 +93,14 @@ def _is_subscribed(cursor, channel_id):
)''', [channel_id]).fetchone()
return bool(result[0])
+
def is_subscribed(channel_id):
if not os.path.exists(database_path):
return False
return with_open_db(_is_subscribed, channel_id)
+
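[Editor's note: a minimal usage sketch of the with_open_db helper defined above; _count_subscriptions is a hypothetical caller, not part of this commit.]

def _count_subscriptions(cursor):
    # any function that takes a cursor as its first argument fits the pattern
    return cursor.execute('SELECT COUNT(*) FROM subscribed_channels').fetchone()[0]

total = with_open_db(_count_subscriptions)  # opens, commits on success, closes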
def _subscribe(channels):
''' channels is a list of (channel_id, channel_name) '''
channels = list(channels)
@@ -101,7 +108,7 @@ def _subscribe(channels):
with connection as cursor:
channel_ids_to_check = [channel[0] for channel in channels if not _is_subscribed(cursor, channel[0])]
- rows = ( (channel_id, channel_name, 0, 0) for channel_id, channel_name in channels)
+ rows = ((channel_id, channel_name, 0, 0) for channel_id, channel_name in channels)
cursor.executemany('''INSERT OR IGNORE INTO subscribed_channels (yt_channel_id, channel_name, time_last_checked, next_check_time)
VALUES (?, ?, ?, ?)''', rows)
@@ -111,6 +118,7 @@ def _subscribe(channels):
channel_names.update(channels)
check_channels_if_necessary(channel_ids_to_check)
+
def delete_thumbnails(to_delete):
for thumbnail in to_delete:
try:
@@ -122,6 +130,7 @@ def delete_thumbnails(to_delete):
print('Failed to delete thumbnail: ' + thumbnail)
traceback.print_exc()
+
def _unsubscribe(cursor, channel_ids):
''' channel_ids is a list of channel_ids '''
to_delete = []
@@ -138,7 +147,8 @@ def _unsubscribe(cursor, channel_ids):
gevent.spawn(delete_thumbnails, to_delete)
cursor.executemany("DELETE FROM subscribed_channels WHERE yt_channel_id=?", ((channel_id, ) for channel_id in channel_ids))
-def _get_videos(cursor, number_per_page, offset, tag = None):
+
+def _get_videos(cursor, number_per_page, offset, tag=None):
'''Returns a full page of videos with an offset, and a value good enough to be used as the total number of videos'''
# We ask for the next 9 pages from the database
# Then the actual length of the results tells us if there are more than 9 pages left, and if not, how many there actually are
@@ -149,7 +159,7 @@ def _get_videos(cursor, number_per_page, offset, tag = None):
# We cannot use tricks with the sql id for the video since we frequently have filters and other restrictions in place on the results anyway
# TODO: This is probably not the ideal solution
if tag is not None:
- db_videos = cursor.execute('''SELECT video_id, title, duration, time_published, is_time_published_exact, channel_name
+ db_videos = cursor.execute('''SELECT video_id, title, duration, time_published, is_time_published_exact, channel_name, yt_channel_id
FROM videos
INNER JOIN subscribed_channels on videos.sql_channel_id = subscribed_channels.id
INNER JOIN tag_associations on videos.sql_channel_id = tag_associations.sql_channel_id
@@ -157,7 +167,7 @@ def _get_videos(cursor, number_per_page, offset, tag = None):
ORDER BY time_noticed DESC, time_published DESC
LIMIT ? OFFSET ?''', (tag, number_per_page*9, offset)).fetchall()
else:
- db_videos = cursor.execute('''SELECT video_id, title, duration, time_published, is_time_published_exact, channel_name
+ db_videos = cursor.execute('''SELECT video_id, title, duration, time_published, is_time_published_exact, channel_name, yt_channel_id
FROM videos
INNER JOIN subscribed_channels on videos.sql_channel_id = subscribed_channels.id
WHERE muted = 0
@@ -172,15 +182,15 @@ def _get_videos(cursor, number_per_page, offset, tag = None):
'id': db_video[0],
'title': db_video[1],
'duration': db_video[2],
- 'published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
+ 'time_published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
'author': db_video[5],
+ 'author_id': db_video[6],
+ 'author_url': '/https://www.youtube.com/channel/' + db_video[6],
})
return videos, pseudo_number_of_videos
-
-
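[Editor's note: a rough sketch of the over-fetch trick described at the top of _get_videos, with made-up numbers: fetching up to nine pages past the offset lets the result length stand in for a COUNT(*) query.]

number_per_page = 60
offset = 120                                     # user is viewing page 3
rows = list(range(417))                          # stand-in for the filtered rows in sqlite
page = rows[offset:offset + number_per_page*9]   # what the LIMIT ? OFFSET ? query returns
pseudo_total = offset + len(page)                # 417 here, the exact total
# With 1000 rows instead, len(page) == 540 and pseudo_total == 660, so the
# pager shows ceil(660/60) == 11 pages: nine from the current offset onward.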
def _get_subscribed_channels(cursor):
for item in cursor.execute('''SELECT channel_name, yt_channel_id, muted
FROM subscribed_channels
@@ -202,7 +212,6 @@ def _remove_tags(cursor, channel_ids, tags):
)''', pairs)
-
def _get_tags(cursor, channel_id):
return [row[0] for row in cursor.execute('''SELECT tag
FROM tag_associations
@@ -210,9 +219,11 @@ def _get_tags(cursor, channel_id):
SELECT id FROM subscribed_channels WHERE yt_channel_id = ?
)''', (channel_id,))]
+
def _get_all_tags(cursor):
return [row[0] for row in cursor.execute('''SELECT DISTINCT tag FROM tag_associations''')]
+
def _get_channel_names(cursor, channel_ids):
''' returns list of (channel_id, channel_name) '''
result = []
@@ -220,7 +231,7 @@ def _get_channel_names(cursor, channel_ids):
row = cursor.execute('''SELECT channel_name
FROM subscribed_channels
WHERE yt_channel_id = ?''', (channel_id,)).fetchone()
- result.append( (channel_id, row[0]) )
+ result.append((channel_id, row[0]))
return result
@@ -245,12 +256,15 @@ def _channels_with_tag(cursor, tag, order=False, exclude_muted=False, include_mu
return cursor.execute(statement, [tag]).fetchall()
+
def _schedule_checking(cursor, channel_id, next_check_time):
cursor.execute('''UPDATE subscribed_channels SET next_check_time = ? WHERE yt_channel_id = ?''', [int(next_check_time), channel_id])
+
def _is_muted(cursor, channel_id):
return bool(cursor.execute('''SELECT muted FROM subscribed_channels WHERE yt_channel_id=?''', [channel_id]).fetchone()[0])
+
units = collections.OrderedDict([
('year', 31536000), # 365*24*3600
('month', 2592000), # 30*24*3600
@@ -260,6 +274,8 @@ units = collections.OrderedDict([
('minute', 60),
('second', 1),
])
+
+
def youtube_timestamp_to_posix(dumb_timestamp):
''' Given a dumbed down timestamp such as 1 year ago, 3 hours ago,
approximates the unix time (seconds since 1/1/1970) '''
@@ -273,6 +289,7 @@ def youtube_timestamp_to_posix(dumb_timestamp):
unit = unit[:-1] # remove s from end
return now - quantifier*units[unit]
+
def posix_to_dumbed_down(posix_time):
'''Inverse of youtube_timestamp_to_posix.'''
delta = int(time.time() - posix_time)
@@ -291,12 +308,14 @@ def posix_to_dumbed_down(posix_time):
else:
raise Exception()
+
def exact_timestamp(posix_time):
result = time.strftime('%I:%M %p %m/%d/%y', time.localtime(posix_time))
if result[0] == '0': # remove 0 in front of hour (like 01:00 PM)
return result[1:]
return result
+
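[Editor's note: a usage sketch of the three time helpers above; the printed values depend on the wall clock and locale, so treat them as illustrative.]

now = int(time.time())
approx = youtube_timestamp_to_posix('3 hours ago')  # ~ now - 3*3600
print(posix_to_dumbed_down(now - 2*86400))          # expected: '2 days ago'
print(exact_timestamp(now))                         # e.g. '3:05 PM 06/14/21'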
try:
existing_thumbnails = set(os.path.splitext(name)[0] for name in os.listdir(thumbnails_directory))
except FileNotFoundError:
@@ -312,23 +331,71 @@ checking_channels = set()
# Just to use for printing channel checking status to console without opening database
channel_names = dict()
+
def check_channel_worker():
while True:
channel_id = check_channels_queue.get()
try:
_get_upstream_videos(channel_id)
+ except Exception:
+ traceback.print_exc()
finally:
checking_channels.remove(channel_id)
-for i in range(0,5):
+
+for i in range(0, 5):
gevent.spawn(check_channel_worker)
# ----------------------------
-
# --- Auto checking system - Spaghetti code ---
+def autocheck_dispatcher():
+ '''Scans the auto_check_list. Sleeps until the earliest job is due, then adds that channel to the checking queue above. Can be sent a new job through autocheck_job_application'''
+ while True:
+ if len(autocheck_jobs) == 0:
+ new_job = autocheck_job_application.get()
+ autocheck_jobs.append(new_job)
+ else:
+ earliest_job_index = min(range(0, len(autocheck_jobs)), key=lambda index: autocheck_jobs[index]['next_check_time']) # https://stackoverflow.com/a/11825864
+ earliest_job = autocheck_jobs[earliest_job_index]
+ time_until_earliest_job = earliest_job['next_check_time'] - time.time()
+
+ if time_until_earliest_job <= -5: # should not happen unless we're running extremely slow
+ print('ERROR: autocheck_dispatcher got job scheduled in the past, skipping and rescheduling: ' + earliest_job['channel_id'] + ', ' + earliest_job['channel_name'] + ', ' + str(earliest_job['next_check_time']))
+ next_check_time = time.time() + 3600*secrets.randbelow(60)/60
+ with_open_db(_schedule_checking, earliest_job['channel_id'], next_check_time)
+ autocheck_jobs[earliest_job_index]['next_check_time'] = next_check_time
+ continue
+
+ # make sure it's not muted
+ if with_open_db(_is_muted, earliest_job['channel_id']):
+ del autocheck_jobs[earliest_job_index]
+ continue
+
+ if time_until_earliest_job > 0: # it can become less than zero (in the past) when it's set to go off while the dispatcher is doing something else at that moment
+ try:
+ new_job = autocheck_job_application.get(timeout=time_until_earliest_job) # sleep for time_until_earliest_job time, but allow to be interrupted by new jobs
+ except gevent.queue.Empty: # no new jobs
+ pass
+ else: # new job, add it to the list
+ autocheck_jobs.append(new_job)
+ continue
+
+ # no new jobs, time to execute the earliest job
+ channel_names[earliest_job['channel_id']] = earliest_job['channel_name']
+ checking_channels.add(earliest_job['channel_id'])
+ check_channels_queue.put(earliest_job['channel_id'])
+ del autocheck_jobs[earliest_job_index]
+
+
+dispatcher_greenlet = None
+
+
+def start_autocheck_system():
+ global autocheck_job_application
+ global autocheck_jobs
+ global dispatcher_greenlet
-if settings.autocheck_subscriptions:
# job application format: dict with keys (channel_id, channel_name, next_check_time)
autocheck_job_application = gevent.queue.Queue() # only really meant to hold 1 item, just reusing gevent's wait and timeout machinery
@@ -350,53 +417,30 @@ if settings.autocheck_subscriptions:
row = (row[0], row[1], next_check_time)
_schedule_checking(cursor, row[0], next_check_time)
autocheck_jobs.append({'channel_id': row[0], 'channel_name': row[1], 'next_check_time': next_check_time})
+ dispatcher_greenlet = gevent.spawn(autocheck_dispatcher)
+def stop_autocheck_system():
+ if dispatcher_greenlet is not None:
+ dispatcher_greenlet.kill()
- def autocheck_dispatcher():
- '''Scans the auto_check_list. Sleeps until the earliest job is due, then adds that channel to the checking queue above. Can be sent a new job through autocheck_job_application'''
- while True:
- if len(autocheck_jobs) == 0:
- new_job = autocheck_job_application.get()
- autocheck_jobs.append(new_job)
- else:
- earliest_job_index = min(range(0, len(autocheck_jobs)), key=lambda index: autocheck_jobs[index]['next_check_time']) # https://stackoverflow.com/a/11825864
- earliest_job = autocheck_jobs[earliest_job_index]
- time_until_earliest_job = earliest_job['next_check_time'] - time.time()
-
- if time_until_earliest_job <= -5: # should not happen unless we're running extremely slow
- print('ERROR: autocheck_dispatcher got job scheduled in the past, skipping and rescheduling: ' + earliest_job['channel_id'] + ', ' + earliest_job['channel_name'] + ', ' + str(earliest_job['next_check_time']))
- next_check_time = time.time() + 3600*secrets.randbelow(60)/60
- with_open_db(_schedule_checking, earliest_job['channel_id'], next_check_time)
- autocheck_jobs[earliest_job_index]['next_check_time'] = next_check_time
- continue
-
- # make sure it's not muted
- if with_open_db(_is_muted, earliest_job['channel_id']):
- del autocheck_jobs[earliest_job_index]
- continue
- if time_until_earliest_job > 0: # it can become less than zero (in the past) when it's set to go off while the dispatcher is doing something else at that moment
- try:
- new_job = autocheck_job_application.get(timeout = time_until_earliest_job) # sleep for time_until_earliest_job time, but allow to be interrupted by new jobs
- except gevent.queue.Empty: # no new jobs
- pass
- else: # new job, add it to the list
- autocheck_jobs.append(new_job)
- continue
-
- # no new jobs, time to execute the earliest job
- channel_names[earliest_job['channel_id']] = earliest_job['channel_name']
- checking_channels.add(earliest_job['channel_id'])
- check_channels_queue.put(earliest_job['channel_id'])
- del autocheck_jobs[earliest_job_index]
+def autocheck_setting_changed(old_value, new_value):
+ if new_value:
+ start_autocheck_system()
+ else:
+ stop_autocheck_system()
- gevent.spawn(autocheck_dispatcher)
+settings.add_setting_changed_hook(
+ 'autocheck_subscriptions',
+ autocheck_setting_changed
+)
+if settings.autocheck_subscriptions:
+ start_autocheck_system()
# ----------------------------
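[Editor's note: once the system is running, a channel is handed to the dispatcher above by putting a job dict on the queue; a hypothetical submission (the id and name are illustrative).]

autocheck_job_application.put({
    'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',  # illustrative channel id
    'channel_name': '3Blue1Brown',
    'next_check_time': time.time() + 3600,     # due in an hour
})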
-
def check_channels_if_necessary(channel_ids):
for channel_id in channel_ids:
if channel_id not in checking_channels:
@@ -404,6 +448,54 @@ def check_channels_if_necessary(channel_ids):
check_channels_queue.put(channel_id)
+def _get_atoma_feed(channel_id):
+ url = 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id
+ try:
+ return util.fetch_url(url).decode('utf-8')
+ except util.FetchError as e:
+ # 404 is expected for terminated channels
+ if e.code in ('404', '429'):
+ return ''
+ if e.code == '502':
+ return str(e)
+ raise
+
+
+def _get_channel_videos_first_page(channel_id, channel_status_name):
+ try:
+ # First try the playlist method
+ pl_json = playlist.get_videos(
+ 'UU' + channel_id[2:],
+ 1,
+ include_shorts=settings.include_shorts_in_subscriptions,
+ report_text=None
+ )
+ pl_info = yt_data_extract.extract_playlist_info(pl_json)
+ if pl_info.get('items'):
+ pl_info['items'] = pl_info['items'][0:30]
+ return pl_info
+
+ # Try the channel api method
+ channel_json = channel.get_channel_first_page(channel_id=channel_id)
+ channel_info = yt_data_extract.extract_channel_info(
+ json.loads(channel_json), 'videos'
+ )
+ return channel_info
+ except util.FetchError as e:
+ if e.code == '429' and settings.route_tor:
+ error_message = ('Error checking channel ' + channel_status_name
+ + ': YouTube blocked the request because the'
+ + ' Tor exit node is overutilized. Try getting a new exit node'
+ + ' by using the New Identity button in the Tor Browser.')
+ if e.ip:
+ error_message += ' Exit node IP address: ' + e.ip
+ print(error_message)
+ return None
+ elif e.code == '502':
+ print('Error checking channel', channel_status_name + ':', str(e))
+ return None
+ raise
+
def _get_upstream_videos(channel_id):
try:
@@ -414,12 +506,15 @@ def _get_upstream_videos(channel_id):
print("Checking channel: " + channel_status_name)
tasks = (
- gevent.spawn(channel.get_channel_tab, channel_id, print_status=False), # channel page, need for video duration
- gevent.spawn(util.fetch_url, 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id) # atoma feed, need for exact published time
+ # channel page, need for video duration
+ gevent.spawn(_get_channel_videos_first_page, channel_id,
+ channel_status_name),
+ # need atoma feed for exact published time
+ gevent.spawn(_get_atoma_feed, channel_id)
)
gevent.joinall(tasks)
- channel_tab, feed = tasks[0].value, tasks[1].value
+ channel_info, feed = tasks[0].value, tasks[1].value
# extract published times from atoma feed
times_published = {}
@@ -436,7 +531,7 @@ def _get_upstream_videos(channel_id):
return element
return None
- root = defusedxml.ElementTree.fromstring(feed.decode('utf-8'))
+ root = defusedxml.ElementTree.fromstring(feed)
assert remove_bullshit(root.tag) == 'feed'
for entry in root:
if (remove_bullshit(entry.tag) != 'entry'):
@@ -451,42 +546,70 @@ def _get_upstream_videos(channel_id):
time_published = int(calendar.timegm(time.strptime(time_published_element.text, '%Y-%m-%dT%H:%M:%S+00:00')))
times_published[video_id_element.text] = time_published
- except (AssertionError, defusedxml.ElementTree.ParseError) as e:
+ except AssertionError:
print('Failed to read atoma feed for ' + channel_status_name)
traceback.print_exc()
+ except defusedxml.ElementTree.ParseError:
+ print('Failed to read atoma feed for ' + channel_status_name)
- channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')
- if channel_info['errors']:
- print('Error checking channel ' + channel_status_name + ': ' + ', '.join(channel_info['errors']))
+ if channel_info is None: # there was an error
+ return
+ if channel_info['error']:
+ print('Error checking channel ' + channel_status_name + ': ' + channel_info['error'])
return
videos = channel_info['items']
for i, video_item in enumerate(videos):
- if 'description' not in video_item:
+ if not video_item.get('description'):
video_item['description'] = ''
+ else:
+ video_item['description'] = ''.join(run.get('text', '') for run in video_item['description'])
if video_item['id'] in times_published:
video_item['time_published'] = times_published[video_item['id']]
video_item['is_time_published_exact'] = True
- else:
+ elif video_item.get('time_published'):
video_item['is_time_published_exact'] = False
try:
- video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i # subtract a few seconds off the videos so they will be in the right order
- except KeyError:
+ video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i # subtract a few seconds off the videos so they will be in the right order
+ except Exception:
print(video_item)
-
+ else:
+ video_item['is_time_published_exact'] = False
+ video_item['time_published'] = None
video_item['channel_id'] = channel_id
+ if len(videos) > 1:
+ # Go back and fill in any videos that don't have a time published
+ # using the time published of the surrounding ones
+ for i in range(len(videos)-1):
+ if (videos[i+1]['time_published'] is None
+ and videos[i]['time_published'] is not None
+ ):
+ videos[i+1]['time_published'] = videos[i]['time_published'] - 1
+ for i in reversed(range(1,len(videos))):
+ if (videos[i-1]['time_published'] is None
+ and videos[i]['time_published'] is not None
+ ):
+ videos[i-1]['time_published'] = videos[i]['time_published'] + 1
+ # Special case: none of the videos have a time published.
+ # In this case, make something up
+ if videos and videos[0]['time_published'] is None:
+ assert all(v['time_published'] is None for v in videos)
+ now = time.time()
+ for i in range(len(videos)):
+ # 1 month between videos
+ videos[i]['time_published'] = now - i*3600*24*30
if len(videos) == 0:
- average_upload_period = 4*7*24*3600 # assume 1 month for channel with no videos
+ average_upload_period = 4*7*24*3600 # assume 1 month for channel with no videos
elif len(videos) < 5:
average_upload_period = int((time.time() - videos[len(videos)-1]['time_published'])/len(videos))
else:
average_upload_period = int((time.time() - videos[4]['time_published'])/5) # equivalent to averaging the time between videos for the last 5 videos
# calculate when to check next for auto checking
- # add some quantization and randomness to make pattern analysis by Youtube slightly harder
+ # add some quantization and randomness to make pattern analysis by YouTube slightly harder
quantized_upload_period = average_upload_period - (average_upload_period % (4*3600)) + 4*3600 # round up to nearest 4 hours
randomized_upload_period = quantized_upload_period*(1 + secrets.randbelow(50)/50*0.5) # randomly between 1x and 1.5x
next_check_delay = randomized_upload_period/10 # check at 10x the channel posting rate. might want to fine tune this number
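[Editor's note: walking one hypothetical channel through the arithmetic above.]

average_upload_period = 5*24*3600  # suppose one upload every ~5 days (432000 s)
quantized_upload_period = average_upload_period - (average_upload_period % (4*3600)) + 4*3600
# -> 446400 s, rounded up to the next 4-hour boundary
randomized_upload_period = quantized_upload_period*(1 + secrets.randbelow(50)/50*0.5)
# -> uniformly stepped between 446400 s (1.0x) and ~665136 s (~1.49x)
next_check_delay = randomized_upload_period/10
# -> roughly 12.4 to 18.5 hours until this channel is checked again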
@@ -495,44 +618,74 @@ def _get_upstream_videos(channel_id):
with open_database() as connection:
with connection as cursor:
- # calculate how many new videos there are
- row = cursor.execute('''SELECT video_id
- FROM videos
- INNER JOIN subscribed_channels ON videos.sql_channel_id = subscribed_channels.id
- WHERE yt_channel_id=?
- ORDER BY time_published DESC
- LIMIT 1''', [channel_id]).fetchone()
- if row is None:
- number_of_new_videos = len(videos)
- else:
- latest_video_id = row[0]
- index = 0
- for video in videos:
- if video['id'] == latest_video_id:
- break
- index += 1
- number_of_new_videos = index
+ # Get video ids and duration of existing vids so we
+ # can see how many new ones there are and update
+ # livestreams/premiers
+ existing_vids = list(cursor.execute(
+ '''SELECT video_id, duration
+ FROM videos
+ INNER JOIN subscribed_channels
+ ON videos.sql_channel_id = subscribed_channels.id
+ WHERE yt_channel_id=?
+ ORDER BY time_published DESC
+ LIMIT 30''', [channel_id]).fetchall())
+ existing_vid_ids = set(row[0] for row in existing_vids)
+ existing_durs = dict(existing_vids)
+
+ # new videos the channel has uploaded since last time we checked
+ number_of_new_videos = 0
+ for video in videos:
+ if video['id'] in existing_vid_ids:
+ break
+ number_of_new_videos += 1
is_first_check = cursor.execute('''SELECT time_last_checked FROM subscribed_channels WHERE yt_channel_id=?''', [channel_id]).fetchone()[0] in (None, 0)
time_videos_retrieved = int(time.time())
rows = []
- for video_item in videos:
- if is_first_check or number_of_new_videos > 6:
+ update_rows = []
+ for i, video_item in enumerate(videos):
+ if (is_first_check
+ or number_of_new_videos > 6
+ or i >= number_of_new_videos):
# don't want a crazy ordering on first check or check in a long time, since we're ordering by time_noticed
+ # The last condition is for when deletions by the channel cause
+ # new videos to appear at the end of the backlog.
+ # For instance, if we have 30 vids in the DB, and 1 vid
+ # that we previously saw has since been deleted,
+ # then a video we haven't seen before will appear as the
+ # 30th. Don't want this to be considered a newly noticed
+ # vid which would appear at top of subscriptions feed
time_noticed = video_item['time_published']
else:
time_noticed = time_videos_retrieved
- rows.append((
- video_item['channel_id'],
- video_item['id'],
- video_item['title'],
- video_item['duration'],
- video_item['time_published'],
- video_item['is_time_published_exact'],
- time_noticed,
- video_item['description'],
- ))
+ # videos which need durations updated
+ non_durations = ('upcoming', 'none', 'live', '')
+ v_id = video_item['id']
+ if (existing_durs.get(v_id) is not None
+ and existing_durs[v_id].lower() in non_durations
+ and video_item['duration'] not in non_durations
+ ):
+ update_rows.append((
+ video_item['title'],
+ video_item['duration'],
+ video_item['time_published'],
+ video_item['is_time_published_exact'],
+ video_item['description'],
+ video_item['id'],
+ ))
+ # all other videos
+ else:
+ rows.append((
+ video_item['channel_id'],
+ video_item['id'],
+ video_item['title'],
+ video_item['duration'],
+ video_item['time_published'],
+ video_item['is_time_published_exact'],
+ time_noticed,
+ video_item['description'],
+ ))
cursor.executemany('''INSERT OR IGNORE INTO videos (
sql_channel_id,
@@ -545,6 +698,13 @@ def _get_upstream_videos(channel_id):
description
)
VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?, ?, ?)''', rows)
+ cursor.executemany('''UPDATE videos SET
+ title=?,
+ duration=?,
+ time_published=?,
+ is_time_published_exact=?,
+ description=?
+ WHERE video_id=?''', update_rows)
cursor.execute('''UPDATE subscribed_channels
SET time_last_checked = ?, next_check_time = ?
WHERE yt_channel_id=?''', [int(time.time()), next_check_time, channel_id])
@@ -561,7 +721,6 @@ def _get_upstream_videos(channel_id):
print(str(number_of_new_videos) + ' new videos from ' + channel_status_name)
-
def check_all_channels():
with open_database() as connection:
with connection as cursor:
@@ -595,35 +754,54 @@ def check_specific_channels(channel_ids):
channel_names.update(channel_id_name_list)
check_channels_if_necessary(channel_ids)
-
-
+CHANNEL_ID_RE = re.compile(r'UC[-_\w]{22}')
@yt_app.route('/import_subscriptions', methods=['POST'])
def import_subscriptions():
# check if the post request has the file part
if 'subscriptions_file' not in request.files:
- #flash('No file part')
+ # flash('No file part')
return flask.redirect(util.URL_ORIGIN + request.full_path)
file = request.files['subscriptions_file']
# if user does not select file, browser also
# submits an empty part without filename
if file.filename == '':
- #flash('No selected file')
+ # flash('No selected file')
return flask.redirect(util.URL_ORIGIN + request.full_path)
-
mime_type = file.mimetype
if mime_type == 'application/json':
- file = file.read().decode('utf-8')
+ info = file.read().decode('utf-8')
+ if info == '':
+ return '400 Bad Request: File is empty', 400
try:
- file = json.loads(file)
+ info = json.loads(info)
except json.decoder.JSONDecodeError:
traceback.print_exc()
return '400 Bad Request: Invalid json file', 400
+ channels = []
try:
- channels = ( (item['snippet']['resourceId']['channelId'], item['snippet']['title']) for item in file)
+ if 'app_version_int' in info: # NewPipe Format
+ for item in info['subscriptions']:
+ # Other service, such as SoundCloud
+ if item.get('service_id', 0) != 0:
+ continue
+ channel_url = item['url']
+ channel_id_match = CHANNEL_ID_RE.search(channel_url)
+ if channel_id_match:
+ channel_id = channel_id_match.group(0)
+ else:
+ print('WARNING: Could not find channel id in url',
+ channel_url)
+ continue
+ channels.append((channel_id, item['name']))
+ else: # Old Google Takeout format
+ for item in info:
+ snippet = item['snippet']
+ channel_id = snippet['resourceId']['channelId']
+ channels.append((channel_id, snippet['title']))
except (KeyError, IndexError):
traceback.print_exc()
return '400 Bad Request: Unknown json structure', 400
@@ -637,22 +815,98 @@ def import_subscriptions():
if (outline_element.tag != 'outline') or ('xmlUrl' not in outline_element.attrib):
continue
-
channel_name = outline_element.attrib['text']
channel_rss_url = outline_element.attrib['xmlUrl']
channel_id = channel_rss_url[channel_rss_url.find('channel_id=')+11:].strip()
- channels.append( (channel_id, channel_name) )
+ channels.append((channel_id, channel_name))
except (AssertionError, IndexError, defusedxml.ElementTree.ParseError) as e:
return '400 Bad Request: Unable to read opml xml file, or the file is not the expected format', 400
+ elif mime_type in ('text/csv', 'application/vnd.ms-excel'):
+ content = file.read().decode('utf-8')
+ reader = csv.reader(content.splitlines())
+ channels = []
+ for row in reader:
+ if not row or row[0].lower().strip() == 'channel id':
+ continue
+ elif len(row) > 1 and CHANNEL_ID_RE.fullmatch(row[0].strip()):
+ channels.append( (row[0], row[-1]) )
+ else:
+ print('WARNING: Unknown row format:', row)
else:
- return '400 Bad Request: Unsupported file format: ' + mime_type + '. Only subscription.json files (from Google Takeouts) and XML OPML files exported from Youtube\'s subscription manager page are supported', 400
+ error = 'Unsupported file format: ' + mime_type
+ error += ('. Only subscription.json, subscriptions.csv files'
+ ' (from Google Takeouts)'
+ ' and XML OPML files exported from YouTube\'s'
+ ' subscription manager page are supported')
+ return (flask.render_template('error.html', error_message=error),
+ 400)
_subscribe(channels)
return flask.redirect(util.URL_ORIGIN + '/subscription_manager', 303)
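[Editor's note: a quick illustration of what CHANNEL_ID_RE accepts in the NewPipe and CSV branches above; the id is a real-format example, not taken from this commit.]

# search() pulls the 24-character id out of a full channel URL (NewPipe branch)
m = CHANNEL_ID_RE.search('https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw')
assert m.group(0) == 'UCYO_jab_esuFRV4b17AJtAw'
# fullmatch() in the CSV branch only accepts a bare id
assert CHANNEL_ID_RE.fullmatch('UCYO_jab_esuFRV4b17AJtAw')
assert CHANNEL_ID_RE.fullmatch('channel/UCYO_jab_esuFRV4b17AJtAw') is None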
+@yt_app.route('/export_subscriptions', methods=['POST'])
+def export_subscriptions():
+ include_muted = request.values.get('include_muted') == 'on'
+ with open_database() as connection:
+ with connection as cursor:
+ sub_list = []
+ for channel_name, channel_id, muted in (
+ _get_subscribed_channels(cursor)):
+ if muted and not include_muted:
+ continue
+ if request.values['export_format'] == 'json_google_takeout':
+ sub_list.append({
+ 'kind': 'youtube#subscription',
+ 'snippet': {
+ 'muted': bool(muted),
+ 'resourceId': {
+ 'channelId': channel_id,
+ 'kind': 'youtube#channel',
+ },
+ 'tags': _get_tags(cursor, channel_id),
+ 'title': channel_name,
+ },
+ })
+ elif request.values['export_format'] == 'json_newpipe':
+ sub_list.append({
+ 'service_id': 0,
+ 'url': 'https://www.youtube.com/channel/' + channel_id,
+ 'name': channel_name,
+ })
+ elif request.values['export_format'] == 'opml':
+ sub_list.append({
+ 'channel_name': channel_name,
+ 'channel_id': channel_id,
+ })
+ date_time = time.strftime('%Y%m%d%H%M', time.localtime())
+ if request.values['export_format'] == 'json_google_takeout':
+ r = flask.Response(json.dumps(sub_list), mimetype='text/json')
+ cd = 'attachment; filename="subscriptions_%s.json"' % date_time
+ r.headers['Content-Disposition'] = cd
+ return r
+ elif request.values['export_format'] == 'json_newpipe':
+ r = flask.Response(json.dumps({
+ 'app_version': '0.21.9',
+ 'app_version_int': 975,
+ 'subscriptions': sub_list,
+ }), mimetype='text/json')
+ file_name = 'newpipe_subscriptions_%s_youtube-local.json' % date_time
+ cd = 'attachment; filename="%s"' % file_name
+ r.headers['Content-Disposition'] = cd
+ return r
+ elif request.values['export_format'] == 'opml':
+ r = flask.Response(
+ flask.render_template('subscriptions.xml', sub_list=sub_list),
+ mimetype='text/xml')
+ cd = 'attachment; filename="subscriptions_%s.xml"' % date_time
+ r.headers['Content-Disposition'] = cd
+ return r
+ else:
+ return '400 Bad Request', 400
+
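[Editor's note: the Google Takeout export above emits exactly the structure the json import branch reads back; a hypothetical round trip for one entry.]

entry = {
    'kind': 'youtube#subscription',
    'snippet': {
        'muted': False,
        'resourceId': {'channelId': 'UCYO_jab_esuFRV4b17AJtAw',
                       'kind': 'youtube#channel'},
        'tags': ['math'],
        'title': '3Blue1Brown',
    },
}
# the import branch reduces each entry to (channel_id, title)
snippet = entry['snippet']
assert (snippet['resourceId']['channelId'], snippet['title']) == (
    'UCYO_jab_esuFRV4b17AJtAw', '3Blue1Brown')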
@yt_app.route('/subscription_manager', methods=['GET'])
def get_subscription_manager_page():
@@ -673,7 +927,7 @@ def get_subscription_manager_page():
'tags': [t for t in _get_tags(cursor, channel_id) if t != tag],
})
- tag_groups.append( (tag, sub_list) )
+ tag_groups.append((tag, sub_list))
# Channels with no tags
channel_list = cursor.execute('''SELECT yt_channel_id, channel_name, muted
@@ -693,7 +947,7 @@ def get_subscription_manager_page():
'tags': [],
})
- tag_groups.append( ('No tags', sub_list) )
+ tag_groups.append(('No tags', sub_list))
else:
sub_list = []
for channel_name, channel_id, muted in _get_subscribed_channels(cursor):
@@ -705,20 +959,20 @@ def get_subscription_manager_page():
'tags': _get_tags(cursor, channel_id),
})
-
-
-
if group_by_tags:
- return flask.render_template('subscription_manager.html',
- group_by_tags = True,
- tag_groups = tag_groups,
+ return flask.render_template(
+ 'subscription_manager.html',
+ group_by_tags=True,
+ tag_groups=tag_groups,
)
else:
- return flask.render_template('subscription_manager.html',
- group_by_tags = False,
- sub_list = sub_list,
+ return flask.render_template(
+ 'subscription_manager.html',
+ group_by_tags=False,
+ sub_list=sub_list,
)
+
def list_from_comma_separated_tags(string):
return [tag.strip() for tag in string.split(',') if tag.strip()]
@@ -737,7 +991,7 @@ def post_subscription_manager_page():
_unsubscribe(cursor, request.values.getlist('channel_ids'))
elif action == 'unsubscribe_verify':
unsubscribe_list = _get_channel_names(cursor, request.values.getlist('channel_ids'))
- return flask.render_template('unsubscribe_verify.html', unsubscribe_list = unsubscribe_list)
+ return flask.render_template('unsubscribe_verify.html', unsubscribe_list=unsubscribe_list)
elif action == 'mute':
cursor.executemany('''UPDATE subscribed_channels
@@ -752,6 +1006,7 @@ def post_subscription_manager_page():
return flask.redirect(util.URL_ORIGIN + request.full_path, 303)
+
@yt_app.route('/subscriptions', methods=['GET'])
@yt_app.route('/feed/subscriptions', methods=['GET'])
def get_subscriptions_page():
@@ -764,11 +1019,10 @@ def get_subscriptions_page():
video['thumbnail'] = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video['id'] + '.jpg'
video['type'] = 'video'
video['item_size'] = 'small'
- yt_data_extract.add_extra_html_info(video)
+ util.add_extra_html_info(video)
tags = _get_all_tags(cursor)
-
subscription_list = []
for channel_name, channel_id, muted in _get_subscribed_channels(cursor):
subscription_list.append({
@@ -778,15 +1032,18 @@ def get_subscriptions_page():
'muted': muted,
})
- return flask.render_template('subscriptions.html',
- videos = videos,
- num_pages = math.ceil(number_of_videos_in_db/60),
- parameters_dictionary = request.args,
- tags = tags,
- current_tag = tag,
- subscription_list = subscription_list,
+ return flask.render_template(
+ 'subscriptions.html',
+ header_playlist_names=local_playlist.get_playlist_names(),
+ videos=videos,
+ num_pages=math.ceil(number_of_videos_in_db/60),
+ parameters_dictionary=request.args,
+ tags=tags,
+ current_tag=tag,
+ subscription_list=subscription_list,
)
+
@yt_app.route('/subscriptions', methods=['POST'])
@yt_app.route('/feed/subscriptions', methods=['POST'])
def post_subscriptions_page():
@@ -832,7 +1089,7 @@ def serve_subscription_thumbnail(thumbnail):
f.close()
return flask.Response(image, mimetype='image/jpeg')
- url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
+ url = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
try:
image = util.fetch_url(url, report_text="Saved thumbnail: " + video_id)
except urllib.error.HTTPError as e:
@@ -841,17 +1098,10 @@ def serve_subscription_thumbnail(thumbnail):
try:
f = open(thumbnail_path, 'wb')
except FileNotFoundError:
- os.makedirs(thumbnails_directory, exist_ok = True)
+ os.makedirs(thumbnails_directory, exist_ok=True)
f = open(thumbnail_path, 'wb')
f.write(image)
f.close()
existing_thumbnails.add(video_id)
return flask.Response(image, mimetype='image/jpeg')
-
-
-
-
-
-
-