about | summary | refs | log | tree | commit | diff | stats
path: root/youtube/subscriptions.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube/subscriptions.py')
-rw-r--r-- youtube/subscriptions.py 341
1 files changed, 273 insertions, 68 deletions
diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py
index c18f822..7d3efab 100644
--- a/youtube/subscriptions.py
+++ b/youtube/subscriptions.py
@@ -1,4 +1,4 @@
-from youtube import util, yt_data_extract, channel, local_playlist
+from youtube import util, yt_data_extract, channel, local_playlist, playlist
from youtube import yt_app
import settings
@@ -15,6 +15,8 @@ import math
import secrets
import collections
import calendar # bullshit! https://bugs.python.org/issue6280
+import csv
+import re
import flask
from flask import request
@@ -28,8 +30,7 @@ database_path = os.path.join(settings.data_dir, "subscriptions.sqlite")
def open_database():
- if not os.path.exists(settings.data_dir):
- os.makedirs(settings.data_dir)
+ os.makedirs(settings.data_dir, exist_ok=True)
connection = sqlite3.connect(database_path, check_same_thread=False)
try:
@@ -106,8 +107,7 @@ def _subscribe(channels):
with connection as cursor:
channel_ids_to_check = [channel[0] for channel in channels if not _is_subscribed(cursor, channel[0])]
- rows = ((channel_id, channel_name, 0, 0) for channel_id,
- channel_name in channels)
+ rows = ((channel_id, channel_name, 0, 0) for channel_id, channel_name in channels)
cursor.executemany('''INSERT OR IGNORE INTO subscribed_channels (yt_channel_id, channel_name, time_last_checked, next_check_time)
VALUES (?, ?, ?, ?)''', rows)
@@ -126,7 +126,7 @@ def delete_thumbnails(to_delete):
os.remove(os.path.join(thumbnails_directory, thumbnail))
existing_thumbnails.remove(video_id)
except Exception:
- print('Failed to delete thumbnail: ' + thumbnail)
+ print(f'Failed to delete thumbnail: {thumbnail}')
traceback.print_exc()
@@ -184,7 +184,7 @@ def _get_videos(cursor, number_per_page, offset, tag=None):
'time_published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
'author': db_video[5],
'author_id': db_video[6],
- 'author_url': '/https://www.youtube.com/channel/' + db_video[6],
+ 'author_url': f'/https://www.youtube.com/channel/{db_video[6]}',
})
return videos, pseudo_number_of_videos
@@ -234,8 +234,7 @@ def _get_channel_names(cursor, channel_ids):
return result
-def _channels_with_tag(cursor, tag, order=False, exclude_muted=False,
- include_muted_status=False):
+def _channels_with_tag(cursor, tag, order=False, exclude_muted=False, include_muted_status=False):
''' returns list of (channel_id, channel_name) '''
statement = '''SELECT yt_channel_id, channel_name'''
@@ -293,7 +292,10 @@ def youtube_timestamp_to_posix(dumb_timestamp):
def posix_to_dumbed_down(posix_time):
'''Inverse of youtube_timestamp_to_posix.'''
delta = int(time.time() - posix_time)
- assert delta >= 0
+ # Guard against future timestamps (clock drift) without relying on
+ # `assert` (which is stripped under `python -O`).
+ if delta < 0:
+ delta = 0
if delta == 0:
return '0 seconds ago'
@@ -302,9 +304,9 @@ def posix_to_dumbed_down(posix_time):
if delta >= unit_time:
quantifier = round(delta/unit_time)
if quantifier == 1:
- return '1 ' + unit_name + ' ago'
+ return f'1 {unit_name} ago'
else:
- return str(quantifier) + ' ' + unit_name + 's ago'
+ return f'{quantifier} {unit_name}s ago'
else:
raise Exception()
@@ -361,7 +363,7 @@ def autocheck_dispatcher():
time_until_earliest_job = earliest_job['next_check_time'] - time.time()
if time_until_earliest_job <= -5: # should not happen unless we're running extremely slow
- print('ERROR: autocheck_dispatcher got job scheduled in the past, skipping and rescheduling: ' + earliest_job['channel_id'] + ', ' + earliest_job['channel_name'] + ', ' + str(earliest_job['next_check_time']))
+ print(f'ERROR: autocheck_dispatcher got job scheduled in the past, skipping and rescheduling: {earliest_job["channel_id"]}, {earliest_job["channel_name"]}, {earliest_job["next_check_time"]}')
next_check_time = time.time() + 3600*secrets.randbelow(60)/60
with_open_db(_schedule_checking, earliest_job['channel_id'], next_check_time)
autocheck_jobs[earliest_job_index]['next_check_time'] = next_check_time
@@ -434,7 +436,8 @@ def autocheck_setting_changed(old_value, new_value):
settings.add_setting_changed_hook(
'autocheck_subscriptions',
- autocheck_setting_changed)
+ autocheck_setting_changed
+)
if settings.autocheck_subscriptions:
start_autocheck_system()
# ----------------------------
@@ -448,29 +451,50 @@ def check_channels_if_necessary(channel_ids):
def _get_atoma_feed(channel_id):
- url = 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id
+ url = f'https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}'
try:
return util.fetch_url(url).decode('utf-8')
except util.FetchError as e:
# 404 is expected for terminated channels
if e.code in ('404', '429'):
return ''
+ if e.code == '502':
+ return str(e)
raise
-def _get_channel_tab(channel_id, channel_status_name):
+def _get_channel_videos_first_page(channel_id, channel_status_name):
try:
- return channel.get_channel_tab(channel_id, print_status=False)
+ # First try the playlist method
+ pl_json = playlist.get_videos(
+ 'UU' + channel_id[2:],
+ 1,
+ include_shorts=settings.include_shorts_in_subscriptions,
+ report_text=None
+ )
+ pl_info = yt_data_extract.extract_playlist_info(pl_json)
+ if pl_info.get('items'):
+ pl_info['items'] = pl_info['items'][0:30]
+ return pl_info
+
+ # Try the channel api method
+ channel_json = channel.get_channel_first_page(channel_id=channel_id)
+ channel_info = yt_data_extract.extract_channel_info(
+ json.loads(channel_json), 'videos'
+ )
+ return channel_info
except util.FetchError as e:
if e.code == '429' and settings.route_tor:
- error_message = ('Error checking channel ' + channel_status_name
- + ': Youtube blocked the request because the'
- + ' Tor exit node is overutilized. Try getting a new exit node'
- + ' by using the New Identity button in the Tor Browser.')
+ error_message = (f'Error checking channel {channel_status_name}: '
+ f'YouTube blocked the request because the Tor exit node is overutilized. '
+ f'Try getting a new exit node by using the New Identity button in the Tor Browser.')
if e.ip:
- error_message += ' Exit node IP address: ' + e.ip
+ error_message += f' Exit node IP address: {e.ip}'
print(error_message)
return None
+ elif e.code == '502':
+ print(f'Error checking channel {channel_status_name}: {e}')
+ return None
raise
@@ -480,17 +504,18 @@ def _get_upstream_videos(channel_id):
except KeyError:
channel_status_name = channel_id
- print("Checking channel: " + channel_status_name)
+ print(f"Checking channel: {channel_status_name}")
tasks = (
# channel page, need for video duration
- gevent.spawn(channel.get_channel_first_page, channel_id=channel_id),
+ gevent.spawn(_get_channel_videos_first_page, channel_id,
+ channel_status_name),
# need atoma feed for exact published time
gevent.spawn(_get_atoma_feed, channel_id)
)
gevent.joinall(tasks)
- channel_tab, feed = tasks[0].value, tasks[1].value
+ channel_info, feed = tasks[0].value, tasks[1].value
# extract published times from atoma feed
times_published = {}
@@ -508,7 +533,8 @@ def _get_upstream_videos(channel_id):
return None
root = defusedxml.ElementTree.fromstring(feed)
- assert remove_bullshit(root.tag) == 'feed'
+ if remove_bullshit(root.tag) != 'feed':
+ raise ValueError('Root element is not <feed>')
for entry in root:
if (remove_bullshit(entry.tag) != 'entry'):
continue
@@ -516,23 +542,22 @@ def _get_upstream_videos(channel_id):
# it's yt:videoId in the xml but the yt: is turned into a namespace which is removed by remove_bullshit
video_id_element = find_element(entry, 'videoId')
time_published_element = find_element(entry, 'published')
- assert video_id_element is not None
- assert time_published_element is not None
+ if video_id_element is None or time_published_element is None:
+ raise ValueError('Missing videoId or published element')
time_published = int(calendar.timegm(time.strptime(time_published_element.text, '%Y-%m-%dT%H:%M:%S+00:00')))
times_published[video_id_element.text] = time_published
- except AssertionError:
- print('Failed to read atoma feed for ' + channel_status_name)
+ except ValueError:
+ print(f'Failed to read atoma feed for {channel_status_name}')
traceback.print_exc()
except defusedxml.ElementTree.ParseError:
- print('Failed to read atoma feed for ' + channel_status_name)
+ print(f'Failed to read atoma feed for {channel_status_name}')
- if channel_tab is None: # there was an error
+ if channel_info is None: # there was an error
return
- channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')
if channel_info['error']:
- print('Error checking channel ' + channel_status_name + ': ' + channel_info['error'])
+ print(f'Error checking channel {channel_status_name}: {channel_info["error"]}')
return
videos = channel_info['items']
@@ -545,14 +570,41 @@ def _get_upstream_videos(channel_id):
if video_item['id'] in times_published:
video_item['time_published'] = times_published[video_item['id']]
video_item['is_time_published_exact'] = True
- else:
+ elif video_item.get('time_published'):
video_item['is_time_published_exact'] = False
try:
video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i # subtract a few seconds off the videos so they will be in the right order
- except KeyError:
+ except Exception:
print(video_item)
-
+ else:
+ video_item['is_time_published_exact'] = False
+ video_item['time_published'] = None
video_item['channel_id'] = channel_id
+ if len(videos) > 1:
+ # Go back and fill in any videos that don't have a time published
+ # using the time published of the surrounding ones
+ for i in range(len(videos)-1):
+ if (videos[i+1]['time_published'] is None
+ and videos[i]['time_published'] is not None
+ ):
+ videos[i+1]['time_published'] = videos[i]['time_published'] - 1
+ for i in reversed(range(1,len(videos))):
+ if (videos[i-1]['time_published'] is None
+ and videos[i]['time_published'] is not None
+ ):
+ videos[i-1]['time_published'] = videos[i]['time_published'] + 1
+ # Special case: none of the videos have a time published.
+ # In this case, make something up
+ if videos and videos[0]['time_published'] is None:
+ # Invariant: if the first video has no timestamp, earlier passes
+ # ensure all of them are unset. Don't rely on `assert`.
+ if not all(v['time_published'] is None for v in videos):
+ raise RuntimeError('Inconsistent time_published state')
+ now = time.time()
+ for i in range(len(videos)):
+ # 1 month between videos
+ videos[i]['time_published'] = now - i*3600*24*30
+
if len(videos) == 0:
average_upload_period = 4*7*24*3600 # assume 1 month for channel with no videos
@@ -562,7 +614,7 @@ def _get_upstream_videos(channel_id):
average_upload_period = int((time.time() - videos[4]['time_published'])/5) # equivalent to averaging the time between videos for the last 5 videos
# calculate when to check next for auto checking
- # add some quantization and randomness to make pattern analysis by Youtube slightly harder
+ # add some quantization and randomness to make pattern analysis by YouTube slightly harder
quantized_upload_period = average_upload_period - (average_upload_period % (4*3600)) + 4*3600 # round up to nearest 4 hours
randomized_upload_period = quantized_upload_period*(1 + secrets.randbelow(50)/50*0.5) # randomly between 1x and 1.5x
next_check_delay = randomized_upload_period/10 # check at 10x the channel posting rate. might want to fine tune this number
@@ -571,26 +623,31 @@ def _get_upstream_videos(channel_id):
with open_database() as connection:
with connection as cursor:
- # calculate how many new videos there are
- existing_vids = set(row[0] for row in cursor.execute(
- '''SELECT video_id
+ # Get video ids and duration of existing vids so we
+ # can see how many new ones there are and update
+ # livestreams/premiers
+ existing_vids = list(cursor.execute(
+ '''SELECT video_id, duration
FROM videos
INNER JOIN subscribed_channels
ON videos.sql_channel_id = subscribed_channels.id
WHERE yt_channel_id=?
ORDER BY time_published DESC
LIMIT 30''', [channel_id]).fetchall())
+ existing_vid_ids = set(row[0] for row in existing_vids)
+ existing_durs = dict(existing_vids)
# new videos the channel has uploaded since last time we checked
number_of_new_videos = 0
for video in videos:
- if video['id'] in existing_vids:
+ if video['id'] in existing_vid_ids:
break
number_of_new_videos += 1
is_first_check = cursor.execute('''SELECT time_last_checked FROM subscribed_channels WHERE yt_channel_id=?''', [channel_id]).fetchone()[0] in (None, 0)
time_videos_retrieved = int(time.time())
rows = []
+ update_rows = []
for i, video_item in enumerate(videos):
if (is_first_check
or number_of_new_videos > 6
@@ -606,16 +663,34 @@ def _get_upstream_videos(channel_id):
time_noticed = video_item['time_published']
else:
time_noticed = time_videos_retrieved
- rows.append((
- video_item['channel_id'],
- video_item['id'],
- video_item['title'],
- video_item['duration'],
- video_item['time_published'],
- video_item['is_time_published_exact'],
- time_noticed,
- video_item['description'],
- ))
+
+ # videos which need durations updated
+ non_durations = ('upcoming', 'none', 'live', '')
+ v_id = video_item['id']
+ if (existing_durs.get(v_id) is not None
+ and existing_durs[v_id].lower() in non_durations
+ and video_item['duration'] not in non_durations
+ ):
+ update_rows.append((
+ video_item['title'],
+ video_item['duration'],
+ video_item['time_published'],
+ video_item['is_time_published_exact'],
+ video_item['description'],
+ video_item['id'],
+ ))
+ # all other videos
+ else:
+ rows.append((
+ video_item['channel_id'],
+ video_item['id'],
+ video_item['title'],
+ video_item['duration'],
+ video_item['time_published'],
+ video_item['is_time_published_exact'],
+ time_noticed,
+ video_item['description'],
+ ))
cursor.executemany('''INSERT OR IGNORE INTO videos (
sql_channel_id,
@@ -628,6 +703,13 @@ def _get_upstream_videos(channel_id):
description
)
VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?, ?, ?)''', rows)
+ cursor.executemany('''UPDATE videos SET
+ title=?,
+ duration=?,
+ time_published=?,
+ is_time_published_exact=?,
+ description=?
+ WHERE video_id=?''', update_rows)
cursor.execute('''UPDATE subscribed_channels
SET time_last_checked = ?, next_check_time = ?
WHERE yt_channel_id=?''', [int(time.time()), next_check_time, channel_id])
@@ -677,7 +759,7 @@ def check_specific_channels(channel_ids):
channel_names.update(channel_id_name_list)
check_channels_if_necessary(channel_ids)
-
+CHANNEL_ID_RE = re.compile(r'UC[-_\w]{22}')
@yt_app.route('/import_subscriptions', methods=['POST'])
def import_subscriptions():
@@ -695,15 +777,36 @@ def import_subscriptions():
mime_type = file.mimetype
if mime_type == 'application/json':
- file = file.read().decode('utf-8')
+ info = file.read().decode('utf-8')
+ if info == '':
+ return '400 Bad Request: File is empty', 400
try:
- file = json.loads(file)
+ info = json.loads(info)
except json.decoder.JSONDecodeError:
traceback.print_exc()
return '400 Bad Request: Invalid json file', 400
+ channels = []
try:
- channels = ((item['snippet']['resourceId']['channelId'], item['snippet']['title']) for item in file)
+ if 'app_version_int' in info: # NewPipe Format
+ for item in info['subscriptions']:
+ # Other service, such as SoundCloud
+ if item.get('service_id', 0) != 0:
+ continue
+ channel_url = item['url']
+ channel_id_match = CHANNEL_ID_RE.search(channel_url)
+ if channel_id_match:
+ channel_id = channel_id_match.group(0)
+ else:
+ print('WARNING: Could not find channel id in url',
+ channel_url)
+ continue
+ channels.append((channel_id, item['name']))
+ else: # Old Google Takeout format
+ for item in info:
+ snippet = item['snippet']
+ channel_id = snippet['resourceId']['channelId']
+ channels.append((channel_id, snippet['title']))
except (KeyError, IndexError):
traceback.print_exc()
return '400 Bad Request: Unknown json structure', 400
@@ -711,7 +814,8 @@ def import_subscriptions():
file = file.read().decode('utf-8')
try:
root = defusedxml.ElementTree.fromstring(file)
- assert root.tag == 'opml'
+ if root.tag != 'opml':
+ raise ValueError('Root element is not <opml>')
channels = []
for outline_element in root[0][0]:
if (outline_element.tag != 'outline') or ('xmlUrl' not in outline_element.attrib):
@@ -722,16 +826,94 @@ def import_subscriptions():
channel_id = channel_rss_url[channel_rss_url.find('channel_id=')+11:].strip()
channels.append((channel_id, channel_name))
- except (AssertionError, IndexError, defusedxml.ElementTree.ParseError) as e:
+ except (ValueError, IndexError, defusedxml.ElementTree.ParseError):
return '400 Bad Request: Unable to read opml xml file, or the file is not the expected format', 400
+ elif mime_type in ('text/csv', 'application/vnd.ms-excel'):
+ content = file.read().decode('utf-8')
+ reader = csv.reader(content.splitlines())
+ channels = []
+ for row in reader:
+ if not row or row[0].lower().strip() == 'channel id':
+ continue
+ elif len(row) > 1 and CHANNEL_ID_RE.fullmatch(row[0].strip()):
+ channels.append( (row[0], row[-1]) )
+ else:
+ print('WARNING: Unknown row format:', row)
else:
- return '400 Bad Request: Unsupported file format: ' + mime_type + '. Only subscription.json files (from Google Takeouts) and XML OPML files exported from Youtube\'s subscription manager page are supported', 400
+ error = 'Unsupported file format: ' + mime_type
+ error += (' . Only subscription.json, subscriptions.csv files'
+ ' (from Google Takeouts)'
+ ' and XML OPML files exported from YouTube\'s'
+ ' subscription manager page are supported')
+ return (flask.render_template('error.html', error_message=error),
+ 400)
_subscribe(channels)
return flask.redirect(util.URL_ORIGIN + '/subscription_manager', 303)
+@yt_app.route('/export_subscriptions', methods=['POST'])
+def export_subscriptions():
+ include_muted = request.values.get('include_muted') == 'on'
+ with open_database() as connection:
+ with connection as cursor:
+ sub_list = []
+ for channel_name, channel_id, muted in (
+ _get_subscribed_channels(cursor)):
+ if muted and not include_muted:
+ continue
+ if request.values['export_format'] == 'json_google_takeout':
+ sub_list.append({
+ 'kind': 'youtube#subscription',
+ 'snippet': {
+ 'muted': bool(muted),
+ 'resourceId': {
+ 'channelId': channel_id,
+ 'kind': 'youtube#channel',
+ },
+ 'tags': _get_tags(cursor, channel_id),
+ 'title': channel_name,
+ },
+ })
+ elif request.values['export_format'] == 'json_newpipe':
+ sub_list.append({
+ 'service_id': 0,
+ 'url': 'https://www.youtube.com/channel/' + channel_id,
+ 'name': channel_name,
+ })
+ elif request.values['export_format'] == 'opml':
+ sub_list.append({
+ 'channel_name': channel_name,
+ 'channel_id': channel_id,
+ })
+ date_time = time.strftime('%Y%m%d%H%M', time.localtime())
+ if request.values['export_format'] == 'json_google_takeout':
+ r = flask.Response(json.dumps(sub_list), mimetype='text/json')
+ cd = 'attachment; filename="subscriptions_%s.json"' % date_time
+ r.headers['Content-Disposition'] = cd
+ return r
+ elif request.values['export_format'] == 'json_newpipe':
+ r = flask.Response(json.dumps({
+ 'app_version': '0.21.9',
+ 'app_version_int': 975,
+ 'subscriptions': sub_list,
+ }), mimetype='text/json')
+ file_name = 'newpipe_subscriptions_%s_youtube-local.json' % date_time
+ cd = 'attachment; filename="%s"' % file_name
+ r.headers['Content-Disposition'] = cd
+ return r
+ elif request.values['export_format'] == 'opml':
+ r = flask.Response(
+ flask.render_template('subscriptions.xml', sub_list=sub_list),
+ mimetype='text/xml')
+ cd = 'attachment; filename="subscriptions_%s.xml"' % date_time
+ r.headers['Content-Disposition'] = cd
+ return r
+ else:
+ return '400 Bad Request', 400
+
+
@yt_app.route('/subscription_manager', methods=['GET'])
def get_subscription_manager_page():
group_by_tags = request.args.get('group_by_tags', '0') == '1'
@@ -840,7 +1022,7 @@ def get_subscriptions_page():
tag = request.args.get('tag', None)
videos, number_of_videos_in_db = _get_videos(cursor, 60, (page - 1)*60, tag)
for video in videos:
- video['thumbnail'] = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video['id'] + '.jpg'
+ video['thumbnail'] = f'{util.URL_ORIGIN}/data/subscription_thumbnails/{video["id"]}.jpg'
video['type'] = 'video'
video['item_size'] = 'small'
util.add_extra_html_info(video)
@@ -850,7 +1032,7 @@ def get_subscriptions_page():
subscription_list = []
for channel_name, channel_id, muted in _get_subscribed_channels(cursor):
subscription_list.append({
- 'channel_url': util.URL_ORIGIN + '/channel/' + channel_id,
+ 'channel_url': f'{util.URL_ORIGIN}/channel/{channel_id}',
'channel_name': channel_name,
'channel_id': channel_id,
'muted': muted,
@@ -896,11 +1078,20 @@ def post_subscriptions_page():
return '', 204
+# YouTube video IDs are exactly 11 chars from [A-Za-z0-9_-]. Enforce this
+# before using the value in filesystem paths to prevent path traversal
+# (CWE-22, OWASP A01:2021).
+_VIDEO_ID_RE = re.compile(r'^[A-Za-z0-9_-]{11}$')
+
+
@yt_app.route('/data/subscription_thumbnails/<thumbnail>')
def serve_subscription_thumbnail(thumbnail):
'''Serves thumbnail from disk if it's been saved already. If not, downloads the thumbnail, saves to disk, and serves it.'''
- assert thumbnail[-4:] == '.jpg'
+ if not thumbnail.endswith('.jpg'):
+ flask.abort(400)
video_id = thumbnail[0:-4]
+ if not _VIDEO_ID_RE.match(video_id):
+ flask.abort(400)
thumbnail_path = os.path.join(thumbnails_directory, thumbnail)
if video_id in existing_thumbnails:
@@ -913,12 +1104,26 @@ def serve_subscription_thumbnail(thumbnail):
f.close()
return flask.Response(image, mimetype='image/jpeg')
- url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
- try:
- image = util.fetch_url(url, report_text="Saved thumbnail: " + video_id)
- except urllib.error.HTTPError as e:
- print("Failed to download thumbnail for " + video_id + ": " + str(e))
- abort(e.code)
+ image = None
+ for quality in ('hq720.jpg', 'sddefault.jpg', 'hqdefault.jpg'):
+ url = f"https://i.ytimg.com/vi/{video_id}/{quality}"
+ try:
+ image = util.fetch_url(url, report_text=f"Saved thumbnail: {video_id}")
+ break
+ except util.FetchError as e:
+ if '404' in str(e):
+ continue
+ print(f"Failed to download thumbnail for {video_id}: {e}")
+ flask.abort(500)
+ except urllib.error.HTTPError as e:
+ if e.code == 404:
+ continue
+ print(f"Failed to download thumbnail for {video_id}: {e}")
+ flask.abort(e.code)
+
+ if image is None:
+ flask.abort(404)
+
try:
f = open(thumbnail_path, 'wb')
except FileNotFoundError: