diff options
Diffstat (limited to 'youtube/subscriptions.py')
| -rw-r--r-- | youtube/subscriptions.py | 341 |
1 files changed, 273 insertions, 68 deletions
diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py index c18f822..7d3efab 100644 --- a/youtube/subscriptions.py +++ b/youtube/subscriptions.py @@ -1,4 +1,4 @@ -from youtube import util, yt_data_extract, channel, local_playlist +from youtube import util, yt_data_extract, channel, local_playlist, playlist from youtube import yt_app import settings @@ -15,6 +15,8 @@ import math import secrets import collections import calendar # bullshit! https://bugs.python.org/issue6280 +import csv +import re import flask from flask import request @@ -28,8 +30,7 @@ database_path = os.path.join(settings.data_dir, "subscriptions.sqlite") def open_database(): - if not os.path.exists(settings.data_dir): - os.makedirs(settings.data_dir) + os.makedirs(settings.data_dir, exist_ok=True) connection = sqlite3.connect(database_path, check_same_thread=False) try: @@ -106,8 +107,7 @@ def _subscribe(channels): with connection as cursor: channel_ids_to_check = [channel[0] for channel in channels if not _is_subscribed(cursor, channel[0])] - rows = ((channel_id, channel_name, 0, 0) for channel_id, - channel_name in channels) + rows = ((channel_id, channel_name, 0, 0) for channel_id, channel_name in channels) cursor.executemany('''INSERT OR IGNORE INTO subscribed_channels (yt_channel_id, channel_name, time_last_checked, next_check_time) VALUES (?, ?, ?, ?)''', rows) @@ -126,7 +126,7 @@ def delete_thumbnails(to_delete): os.remove(os.path.join(thumbnails_directory, thumbnail)) existing_thumbnails.remove(video_id) except Exception: - print('Failed to delete thumbnail: ' + thumbnail) + print(f'Failed to delete thumbnail: {thumbnail}') traceback.print_exc() @@ -184,7 +184,7 @@ def _get_videos(cursor, number_per_page, offset, tag=None): 'time_published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]), 'author': db_video[5], 'author_id': db_video[6], - 'author_url': '/https://www.youtube.com/channel/' + db_video[6], + 'author_url': f'/https://www.youtube.com/channel/{db_video[6]}', }) return videos, pseudo_number_of_videos @@ -234,8 +234,7 @@ def _get_channel_names(cursor, channel_ids): return result -def _channels_with_tag(cursor, tag, order=False, exclude_muted=False, - include_muted_status=False): +def _channels_with_tag(cursor, tag, order=False, exclude_muted=False, include_muted_status=False): ''' returns list of (channel_id, channel_name) ''' statement = '''SELECT yt_channel_id, channel_name''' @@ -293,7 +292,10 @@ def youtube_timestamp_to_posix(dumb_timestamp): def posix_to_dumbed_down(posix_time): '''Inverse of youtube_timestamp_to_posix.''' delta = int(time.time() - posix_time) - assert delta >= 0 + # Guard against future timestamps (clock drift) without relying on + # `assert` (which is stripped under `python -O`). + if delta < 0: + delta = 0 if delta == 0: return '0 seconds ago' @@ -302,9 +304,9 @@ def posix_to_dumbed_down(posix_time): if delta >= unit_time: quantifier = round(delta/unit_time) if quantifier == 1: - return '1 ' + unit_name + ' ago' + return f'1 {unit_name} ago' else: - return str(quantifier) + ' ' + unit_name + 's ago' + return f'{quantifier} {unit_name}s ago' else: raise Exception() @@ -361,7 +363,7 @@ def autocheck_dispatcher(): time_until_earliest_job = earliest_job['next_check_time'] - time.time() if time_until_earliest_job <= -5: # should not happen unless we're running extremely slow - print('ERROR: autocheck_dispatcher got job scheduled in the past, skipping and rescheduling: ' + earliest_job['channel_id'] + ', ' + earliest_job['channel_name'] + ', ' + str(earliest_job['next_check_time'])) + print(f'ERROR: autocheck_dispatcher got job scheduled in the past, skipping and rescheduling: {earliest_job["channel_id"]}, {earliest_job["channel_name"]}, {earliest_job["next_check_time"]}') next_check_time = time.time() + 3600*secrets.randbelow(60)/60 with_open_db(_schedule_checking, earliest_job['channel_id'], next_check_time) autocheck_jobs[earliest_job_index]['next_check_time'] = next_check_time @@ -434,7 +436,8 @@ def autocheck_setting_changed(old_value, new_value): settings.add_setting_changed_hook( 'autocheck_subscriptions', - autocheck_setting_changed) + autocheck_setting_changed +) if settings.autocheck_subscriptions: start_autocheck_system() # ---------------------------- @@ -448,29 +451,50 @@ def check_channels_if_necessary(channel_ids): def _get_atoma_feed(channel_id): - url = 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id + url = f'https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}' try: return util.fetch_url(url).decode('utf-8') except util.FetchError as e: # 404 is expected for terminated channels if e.code in ('404', '429'): return '' + if e.code == '502': + return str(e) raise -def _get_channel_tab(channel_id, channel_status_name): +def _get_channel_videos_first_page(channel_id, channel_status_name): try: - return channel.get_channel_tab(channel_id, print_status=False) + # First try the playlist method + pl_json = playlist.get_videos( + 'UU' + channel_id[2:], + 1, + include_shorts=settings.include_shorts_in_subscriptions, + report_text=None + ) + pl_info = yt_data_extract.extract_playlist_info(pl_json) + if pl_info.get('items'): + pl_info['items'] = pl_info['items'][0:30] + return pl_info + + # Try the channel api method + channel_json = channel.get_channel_first_page(channel_id=channel_id) + channel_info = yt_data_extract.extract_channel_info( + json.loads(channel_json), 'videos' + ) + return channel_info except util.FetchError as e: if e.code == '429' and settings.route_tor: - error_message = ('Error checking channel ' + channel_status_name - + ': Youtube blocked the request because the' - + ' Tor exit node is overutilized. Try getting a new exit node' - + ' by using the New Identity button in the Tor Browser.') + error_message = (f'Error checking channel {channel_status_name}: ' + f'YouTube blocked the request because the Tor exit node is overutilized. ' + f'Try getting a new exit node by using the New Identity button in the Tor Browser.') if e.ip: - error_message += ' Exit node IP address: ' + e.ip + error_message += f' Exit node IP address: {e.ip}' print(error_message) return None + elif e.code == '502': + print(f'Error checking channel {channel_status_name}: {e}') + return None raise @@ -480,17 +504,18 @@ def _get_upstream_videos(channel_id): except KeyError: channel_status_name = channel_id - print("Checking channel: " + channel_status_name) + print(f"Checking channel: {channel_status_name}") tasks = ( # channel page, need for video duration - gevent.spawn(channel.get_channel_first_page, channel_id=channel_id), + gevent.spawn(_get_channel_videos_first_page, channel_id, + channel_status_name), # need atoma feed for exact published time gevent.spawn(_get_atoma_feed, channel_id) ) gevent.joinall(tasks) - channel_tab, feed = tasks[0].value, tasks[1].value + channel_info, feed = tasks[0].value, tasks[1].value # extract published times from atoma feed times_published = {} @@ -508,7 +533,8 @@ def _get_upstream_videos(channel_id): return None root = defusedxml.ElementTree.fromstring(feed) - assert remove_bullshit(root.tag) == 'feed' + if remove_bullshit(root.tag) != 'feed': + raise ValueError('Root element is not <feed>') for entry in root: if (remove_bullshit(entry.tag) != 'entry'): continue @@ -516,23 +542,22 @@ def _get_upstream_videos(channel_id): # it's yt:videoId in the xml but the yt: is turned into a namespace which is removed by remove_bullshit video_id_element = find_element(entry, 'videoId') time_published_element = find_element(entry, 'published') - assert video_id_element is not None - assert time_published_element is not None + if video_id_element is None or time_published_element is None: + raise ValueError('Missing videoId or published element') time_published = int(calendar.timegm(time.strptime(time_published_element.text, '%Y-%m-%dT%H:%M:%S+00:00'))) times_published[video_id_element.text] = time_published - except AssertionError: - print('Failed to read atoma feed for ' + channel_status_name) + except ValueError: + print(f'Failed to read atoma feed for {channel_status_name}') traceback.print_exc() except defusedxml.ElementTree.ParseError: - print('Failed to read atoma feed for ' + channel_status_name) + print(f'Failed to read atoma feed for {channel_status_name}') - if channel_tab is None: # there was an error + if channel_info is None: # there was an error return - channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos') if channel_info['error']: - print('Error checking channel ' + channel_status_name + ': ' + channel_info['error']) + print(f'Error checking channel {channel_status_name}: {channel_info["error"]}') return videos = channel_info['items'] @@ -545,14 +570,41 @@ def _get_upstream_videos(channel_id): if video_item['id'] in times_published: video_item['time_published'] = times_published[video_item['id']] video_item['is_time_published_exact'] = True - else: + elif video_item.get('time_published'): video_item['is_time_published_exact'] = False try: video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i # subtract a few seconds off the videos so they will be in the right order - except KeyError: + except Exception: print(video_item) - + else: + video_item['is_time_published_exact'] = False + video_item['time_published'] = None video_item['channel_id'] = channel_id + if len(videos) > 1: + # Go back and fill in any videos that don't have a time published + # using the time published of the surrounding ones + for i in range(len(videos)-1): + if (videos[i+1]['time_published'] is None + and videos[i]['time_published'] is not None + ): + videos[i+1]['time_published'] = videos[i]['time_published'] - 1 + for i in reversed(range(1,len(videos))): + if (videos[i-1]['time_published'] is None + and videos[i]['time_published'] is not None + ): + videos[i-1]['time_published'] = videos[i]['time_published'] + 1 + # Special case: none of the videos have a time published. + # In this case, make something up + if videos and videos[0]['time_published'] is None: + # Invariant: if the first video has no timestamp, earlier passes + # ensure all of them are unset. Don't rely on `assert`. + if not all(v['time_published'] is None for v in videos): + raise RuntimeError('Inconsistent time_published state') + now = time.time() + for i in range(len(videos)): + # 1 month between videos + videos[i]['time_published'] = now - i*3600*24*30 + if len(videos) == 0: average_upload_period = 4*7*24*3600 # assume 1 month for channel with no videos @@ -562,7 +614,7 @@ def _get_upstream_videos(channel_id): average_upload_period = int((time.time() - videos[4]['time_published'])/5) # equivalent to averaging the time between videos for the last 5 videos # calculate when to check next for auto checking - # add some quantization and randomness to make pattern analysis by Youtube slightly harder + # add some quantization and randomness to make pattern analysis by YouTube slightly harder quantized_upload_period = average_upload_period - (average_upload_period % (4*3600)) + 4*3600 # round up to nearest 4 hours randomized_upload_period = quantized_upload_period*(1 + secrets.randbelow(50)/50*0.5) # randomly between 1x and 1.5x next_check_delay = randomized_upload_period/10 # check at 10x the channel posting rate. might want to fine tune this number @@ -571,26 +623,31 @@ def _get_upstream_videos(channel_id): with open_database() as connection: with connection as cursor: - # calculate how many new videos there are - existing_vids = set(row[0] for row in cursor.execute( - '''SELECT video_id + # Get video ids and duration of existing vids so we + # can see how many new ones there are and update + # livestreams/premiers + existing_vids = list(cursor.execute( + '''SELECT video_id, duration FROM videos INNER JOIN subscribed_channels ON videos.sql_channel_id = subscribed_channels.id WHERE yt_channel_id=? ORDER BY time_published DESC LIMIT 30''', [channel_id]).fetchall()) + existing_vid_ids = set(row[0] for row in existing_vids) + existing_durs = dict(existing_vids) # new videos the channel has uploaded since last time we checked number_of_new_videos = 0 for video in videos: - if video['id'] in existing_vids: + if video['id'] in existing_vid_ids: break number_of_new_videos += 1 is_first_check = cursor.execute('''SELECT time_last_checked FROM subscribed_channels WHERE yt_channel_id=?''', [channel_id]).fetchone()[0] in (None, 0) time_videos_retrieved = int(time.time()) rows = [] + update_rows = [] for i, video_item in enumerate(videos): if (is_first_check or number_of_new_videos > 6 @@ -606,16 +663,34 @@ def _get_upstream_videos(channel_id): time_noticed = video_item['time_published'] else: time_noticed = time_videos_retrieved - rows.append(( - video_item['channel_id'], - video_item['id'], - video_item['title'], - video_item['duration'], - video_item['time_published'], - video_item['is_time_published_exact'], - time_noticed, - video_item['description'], - )) + + # videos which need durations updated + non_durations = ('upcoming', 'none', 'live', '') + v_id = video_item['id'] + if (existing_durs.get(v_id) is not None + and existing_durs[v_id].lower() in non_durations + and video_item['duration'] not in non_durations + ): + update_rows.append(( + video_item['title'], + video_item['duration'], + video_item['time_published'], + video_item['is_time_published_exact'], + video_item['description'], + video_item['id'], + )) + # all other videos + else: + rows.append(( + video_item['channel_id'], + video_item['id'], + video_item['title'], + video_item['duration'], + video_item['time_published'], + video_item['is_time_published_exact'], + time_noticed, + video_item['description'], + )) cursor.executemany('''INSERT OR IGNORE INTO videos ( sql_channel_id, @@ -628,6 +703,13 @@ def _get_upstream_videos(channel_id): description ) VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?, ?, ?)''', rows) + cursor.executemany('''UPDATE videos SET + title=?, + duration=?, + time_published=?, + is_time_published_exact=?, + description=? + WHERE video_id=?''', update_rows) cursor.execute('''UPDATE subscribed_channels SET time_last_checked = ?, next_check_time = ? WHERE yt_channel_id=?''', [int(time.time()), next_check_time, channel_id]) @@ -677,7 +759,7 @@ def check_specific_channels(channel_ids): channel_names.update(channel_id_name_list) check_channels_if_necessary(channel_ids) - +CHANNEL_ID_RE = re.compile(r'UC[-_\w]{22}') @yt_app.route('/import_subscriptions', methods=['POST']) def import_subscriptions(): @@ -695,15 +777,36 @@ def import_subscriptions(): mime_type = file.mimetype if mime_type == 'application/json': - file = file.read().decode('utf-8') + info = file.read().decode('utf-8') + if info == '': + return '400 Bad Request: File is empty', 400 try: - file = json.loads(file) + info = json.loads(info) except json.decoder.JSONDecodeError: traceback.print_exc() return '400 Bad Request: Invalid json file', 400 + channels = [] try: - channels = ((item['snippet']['resourceId']['channelId'], item['snippet']['title']) for item in file) + if 'app_version_int' in info: # NewPipe Format + for item in info['subscriptions']: + # Other service, such as SoundCloud + if item.get('service_id', 0) != 0: + continue + channel_url = item['url'] + channel_id_match = CHANNEL_ID_RE.search(channel_url) + if channel_id_match: + channel_id = channel_id_match.group(0) + else: + print('WARNING: Could not find channel id in url', + channel_url) + continue + channels.append((channel_id, item['name'])) + else: # Old Google Takeout format + for item in info: + snippet = item['snippet'] + channel_id = snippet['resourceId']['channelId'] + channels.append((channel_id, snippet['title'])) except (KeyError, IndexError): traceback.print_exc() return '400 Bad Request: Unknown json structure', 400 @@ -711,7 +814,8 @@ def import_subscriptions(): file = file.read().decode('utf-8') try: root = defusedxml.ElementTree.fromstring(file) - assert root.tag == 'opml' + if root.tag != 'opml': + raise ValueError('Root element is not <opml>') channels = [] for outline_element in root[0][0]: if (outline_element.tag != 'outline') or ('xmlUrl' not in outline_element.attrib): @@ -722,16 +826,94 @@ def import_subscriptions(): channel_id = channel_rss_url[channel_rss_url.find('channel_id=')+11:].strip() channels.append((channel_id, channel_name)) - except (AssertionError, IndexError, defusedxml.ElementTree.ParseError) as e: + except (ValueError, IndexError, defusedxml.ElementTree.ParseError): return '400 Bad Request: Unable to read opml xml file, or the file is not the expected format', 400 + elif mime_type in ('text/csv', 'application/vnd.ms-excel'): + content = file.read().decode('utf-8') + reader = csv.reader(content.splitlines()) + channels = [] + for row in reader: + if not row or row[0].lower().strip() == 'channel id': + continue + elif len(row) > 1 and CHANNEL_ID_RE.fullmatch(row[0].strip()): + channels.append( (row[0], row[-1]) ) + else: + print('WARNING: Unknown row format:', row) else: - return '400 Bad Request: Unsupported file format: ' + mime_type + '. Only subscription.json files (from Google Takeouts) and XML OPML files exported from Youtube\'s subscription manager page are supported', 400 + error = 'Unsupported file format: ' + mime_type + error += (' . Only subscription.json, subscriptions.csv files' + ' (from Google Takeouts)' + ' and XML OPML files exported from YouTube\'s' + ' subscription manager page are supported') + return (flask.render_template('error.html', error_message=error), + 400) _subscribe(channels) return flask.redirect(util.URL_ORIGIN + '/subscription_manager', 303) +@yt_app.route('/export_subscriptions', methods=['POST']) +def export_subscriptions(): + include_muted = request.values.get('include_muted') == 'on' + with open_database() as connection: + with connection as cursor: + sub_list = [] + for channel_name, channel_id, muted in ( + _get_subscribed_channels(cursor)): + if muted and not include_muted: + continue + if request.values['export_format'] == 'json_google_takeout': + sub_list.append({ + 'kind': 'youtube#subscription', + 'snippet': { + 'muted': bool(muted), + 'resourceId': { + 'channelId': channel_id, + 'kind': 'youtube#channel', + }, + 'tags': _get_tags(cursor, channel_id), + 'title': channel_name, + }, + }) + elif request.values['export_format'] == 'json_newpipe': + sub_list.append({ + 'service_id': 0, + 'url': 'https://www.youtube.com/channel/' + channel_id, + 'name': channel_name, + }) + elif request.values['export_format'] == 'opml': + sub_list.append({ + 'channel_name': channel_name, + 'channel_id': channel_id, + }) + date_time = time.strftime('%Y%m%d%H%M', time.localtime()) + if request.values['export_format'] == 'json_google_takeout': + r = flask.Response(json.dumps(sub_list), mimetype='text/json') + cd = 'attachment; filename="subscriptions_%s.json"' % date_time + r.headers['Content-Disposition'] = cd + return r + elif request.values['export_format'] == 'json_newpipe': + r = flask.Response(json.dumps({ + 'app_version': '0.21.9', + 'app_version_int': 975, + 'subscriptions': sub_list, + }), mimetype='text/json') + file_name = 'newpipe_subscriptions_%s_youtube-local.json' % date_time + cd = 'attachment; filename="%s"' % file_name + r.headers['Content-Disposition'] = cd + return r + elif request.values['export_format'] == 'opml': + r = flask.Response( + flask.render_template('subscriptions.xml', sub_list=sub_list), + mimetype='text/xml') + cd = 'attachment; filename="subscriptions_%s.xml"' % date_time + r.headers['Content-Disposition'] = cd + return r + else: + return '400 Bad Request', 400 + + @yt_app.route('/subscription_manager', methods=['GET']) def get_subscription_manager_page(): group_by_tags = request.args.get('group_by_tags', '0') == '1' @@ -840,7 +1022,7 @@ def get_subscriptions_page(): tag = request.args.get('tag', None) videos, number_of_videos_in_db = _get_videos(cursor, 60, (page - 1)*60, tag) for video in videos: - video['thumbnail'] = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video['id'] + '.jpg' + video['thumbnail'] = f'{util.URL_ORIGIN}/data/subscription_thumbnails/{video["id"]}.jpg' video['type'] = 'video' video['item_size'] = 'small' util.add_extra_html_info(video) @@ -850,7 +1032,7 @@ def get_subscriptions_page(): subscription_list = [] for channel_name, channel_id, muted in _get_subscribed_channels(cursor): subscription_list.append({ - 'channel_url': util.URL_ORIGIN + '/channel/' + channel_id, + 'channel_url': f'{util.URL_ORIGIN}/channel/{channel_id}', 'channel_name': channel_name, 'channel_id': channel_id, 'muted': muted, @@ -896,11 +1078,20 @@ def post_subscriptions_page(): return '', 204 +# YouTube video IDs are exactly 11 chars from [A-Za-z0-9_-]. Enforce this +# before using the value in filesystem paths to prevent path traversal +# (CWE-22, OWASP A01:2021). +_VIDEO_ID_RE = re.compile(r'^[A-Za-z0-9_-]{11}$') + + @yt_app.route('/data/subscription_thumbnails/<thumbnail>') def serve_subscription_thumbnail(thumbnail): '''Serves thumbnail from disk if it's been saved already. If not, downloads the thumbnail, saves to disk, and serves it.''' - assert thumbnail[-4:] == '.jpg' + if not thumbnail.endswith('.jpg'): + flask.abort(400) video_id = thumbnail[0:-4] + if not _VIDEO_ID_RE.match(video_id): + flask.abort(400) thumbnail_path = os.path.join(thumbnails_directory, thumbnail) if video_id in existing_thumbnails: @@ -913,12 +1104,26 @@ def serve_subscription_thumbnail(thumbnail): f.close() return flask.Response(image, mimetype='image/jpeg') - url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg" - try: - image = util.fetch_url(url, report_text="Saved thumbnail: " + video_id) - except urllib.error.HTTPError as e: - print("Failed to download thumbnail for " + video_id + ": " + str(e)) - abort(e.code) + image = None + for quality in ('hq720.jpg', 'sddefault.jpg', 'hqdefault.jpg'): + url = f"https://i.ytimg.com/vi/{video_id}/{quality}" + try: + image = util.fetch_url(url, report_text=f"Saved thumbnail: {video_id}") + break + except util.FetchError as e: + if '404' in str(e): + continue + print(f"Failed to download thumbnail for {video_id}: {e}") + flask.abort(500) + except urllib.error.HTTPError as e: + if e.code == 404: + continue + print(f"Failed to download thumbnail for {video_id}: {e}") + flask.abort(e.code) + + if image is None: + flask.abort(404) + try: f = open(thumbnail_path, 'wb') except FileNotFoundError: |
