Diffstat (limited to 'youtube/subscriptions.py')
-rw-r--r--  youtube/subscriptions.py | 524
1 file changed, 387 insertions(+), 137 deletions(-)
diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py
index 87e1659..04d3c5a 100644
--- a/youtube/subscriptions.py
+++ b/youtube/subscriptions.py
@@ -1,4 +1,4 @@
-from youtube import util, yt_data_extract, channel
+from youtube import util, yt_data_extract, channel, local_playlist, playlist
 from youtube import yt_app
 import settings
@@ -15,6 +15,8 @@ import math
 import secrets
 import collections
 import calendar # bullshit! https://bugs.python.org/issue6280
+import csv
+import re
 
 import flask
 from flask import request
@@ -26,6 +28,7 @@ thumbnails_directory = os.path.join(settings.data_dir, "subscription_thumbnails"
 database_path = os.path.join(settings.data_dir, "subscriptions.sqlite")
 
 
+
 def open_database():
     if not os.path.exists(settings.data_dir):
         os.makedirs(settings.data_dir)
@@ -74,11 +77,13 @@ def open_database():
     # https://stackoverflow.com/questions/19522505/using-sqlite3-in-python-with-with-keyword
     return contextlib.closing(connection)
 
+
 def with_open_db(function, *args, **kwargs):
     with open_database() as connection:
         with connection as cursor:
             return function(cursor, *args, **kwargs)
 
+
 def _is_subscribed(cursor, channel_id):
     result = cursor.execute('''SELECT EXISTS(
                                    SELECT 1
@@ -88,12 +93,14 @@ def _is_subscribed(cursor, channel_id):
                                )''', [channel_id]).fetchone()
     return bool(result[0])
 
+
 def is_subscribed(channel_id):
     if not os.path.exists(database_path):
         return False
 
     return with_open_db(_is_subscribed, channel_id)
 
+
 def _subscribe(channels):
     ''' channels is a list of (channel_id, channel_name) '''
     channels = list(channels)
@@ -101,7 +108,7 @@ def _subscribe(channels):
         with connection as cursor:
             channel_ids_to_check = [channel[0] for channel in channels if not _is_subscribed(cursor, channel[0])]
 
-            rows = ( (channel_id, channel_name, 0, 0) for channel_id, channel_name in channels)
+            rows = ((channel_id, channel_name, 0, 0) for channel_id, channel_name in channels)
             cursor.executemany('''INSERT OR IGNORE INTO subscribed_channels (yt_channel_id, channel_name, time_last_checked, next_check_time)
                                   VALUES (?, ?, ?, ?)''', rows)
@@ -111,6 +118,7 @@ def _subscribe(channels):
     channel_names.update(channels)
     check_channels_if_necessary(channel_ids_to_check)
 
+
 def delete_thumbnails(to_delete):
     for thumbnail in to_delete:
         try:
@@ -122,6 +130,7 @@ def delete_thumbnails(to_delete):
             print('Failed to delete thumbnail: ' + thumbnail)
             traceback.print_exc()
 
+
 def _unsubscribe(cursor, channel_ids):
     ''' channel_ids is a list of channel_ids '''
     to_delete = []
@@ -138,7 +147,8 @@ def _unsubscribe(cursor, channel_ids):
     gevent.spawn(delete_thumbnails, to_delete)
     cursor.executemany("DELETE FROM subscribed_channels WHERE yt_channel_id=?", ((channel_id, ) for channel_id in channel_ids))
 
-def _get_videos(cursor, number_per_page, offset, tag = None):
+
+def _get_videos(cursor, number_per_page, offset, tag=None):
     '''Returns a full page of videos with an offset, and a value good enough to be used as the total number of videos'''
     # We ask for the next 9 pages from the database
     # Then the actual length of the results tell us if there are more than 9 pages left, and if not, how many there actually are
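The comments above describe the pagination trick: rather than an exact COUNT(*) over the filtered set, the query LIMITs itself to nine pages past the offset, which is enough to render up to nine page links honestly. A minimal sketch of the arithmetic (a stand-in list replaces the real query result):

number_per_page = 60
offset = 120
fetched = list(range(200))[:number_per_page*9]   # stand-in for the LIMITed query result
pseudo_number_of_videos = offset + len(fetched)  # exact when fewer than 9 pages remain
page = fetched[0:number_per_page]                # only the first page is displayed
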
@@ -149,7 +159,7 @@ def _get_videos(cursor, number_per_page, offset, tag = None):
     # We cannot use tricks with the sql id for the video since we frequently have filters and other restrictions in place on the results anyway
     # TODO: This is probably not the ideal solution
     if tag is not None:
-        db_videos = cursor.execute('''SELECT video_id, title, duration, time_published, is_time_published_exact, channel_name
+        db_videos = cursor.execute('''SELECT video_id, title, duration, time_published, is_time_published_exact, channel_name, yt_channel_id
                                       FROM videos
                                       INNER JOIN subscribed_channels on videos.sql_channel_id = subscribed_channels.id
                                       INNER JOIN tag_associations on videos.sql_channel_id = tag_associations.sql_channel_id
@@ -157,7 +167,7 @@ def _get_videos(cursor, number_per_page, offset, tag = None):
                                       ORDER BY time_noticed DESC, time_published DESC
                                       LIMIT ? OFFSET ?''', (tag, number_per_page*9, offset)).fetchall()
     else:
-        db_videos = cursor.execute('''SELECT video_id, title, duration, time_published, is_time_published_exact, channel_name
+        db_videos = cursor.execute('''SELECT video_id, title, duration, time_published, is_time_published_exact, channel_name, yt_channel_id
                                       FROM videos
                                       INNER JOIN subscribed_channels on videos.sql_channel_id = subscribed_channels.id
                                       WHERE muted = 0
@@ -172,15 +182,15 @@ def _get_videos(cursor, number_per_page, offset, tag = None):
             'id': db_video[0],
             'title': db_video[1],
             'duration': db_video[2],
-            'published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
+            'time_published': exact_timestamp(db_video[3]) if db_video[4] else posix_to_dumbed_down(db_video[3]),
             'author': db_video[5],
+            'author_id': db_video[6],
+            'author_url': '/https://www.youtube.com/channel/' + db_video[6],
         })
 
     return videos, pseudo_number_of_videos
 
-
-
 def _get_subscribed_channels(cursor):
     for item in cursor.execute('''SELECT channel_name, yt_channel_id, muted
                                   FROM subscribed_channels
@@ -202,7 +212,6 @@ def _remove_tags(cursor, channel_ids, tags):
         )''', pairs)
 
 
-
 def _get_tags(cursor, channel_id):
     return [row[0] for row in cursor.execute('''SELECT tag
                                                 FROM tag_associations
@@ -210,9 +219,11 @@ def _get_tags(cursor, channel_id):
                                                     SELECT id FROM subscribed_channels WHERE yt_channel_id = ?
                                                 )''', (channel_id,))]
 
+
 def _get_all_tags(cursor):
     return [row[0] for row in cursor.execute('''SELECT DISTINCT tag FROM tag_associations''')]
 
+
 def _get_channel_names(cursor, channel_ids):
     ''' returns list of (channel_id, channel_name) '''
     result = []
@@ -220,7 +231,7 @@ def _get_channel_names(cursor, channel_ids):
         row = cursor.execute('''SELECT channel_name
                                 FROM subscribed_channels
                                 WHERE yt_channel_id = ?''', (channel_id,)).fetchone()
-        result.append( (channel_id, row[0]) )
+        result.append((channel_id, row[0]))
 
     return result
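Most of the helpers above take a cursor as their first argument, so they compose with the with_open_db wrapper defined near the top of the file. For example (the channel id is hypothetical):

tags = with_open_db(_get_all_tags)
names = with_open_db(_get_channel_names, ['UCxxxxxxxxxxxxxxxxxxxxxx'])
# each call opens the database, runs inside a transaction, and closes it
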
@@ -245,12 +256,15 @@ def _channels_with_tag(cursor, tag, order=False, exclude_muted=False, include_mu
 
     return cursor.execute(statement, [tag]).fetchall()
 
+
 def _schedule_checking(cursor, channel_id, next_check_time):
     cursor.execute('''UPDATE subscribed_channels SET next_check_time = ?
                       WHERE yt_channel_id = ?''', [int(next_check_time), channel_id])
 
+
 def _is_muted(cursor, channel_id):
     return bool(cursor.execute('''SELECT muted FROM subscribed_channels WHERE yt_channel_id=?''', [channel_id]).fetchone()[0])
 
+
 units = collections.OrderedDict([
     ('year', 31536000),    # 365*24*3600
     ('month', 2592000),    # 30*24*3600
@@ -260,6 +274,8 @@ units = collections.OrderedDict([
     ('minute', 60),
     ('second', 1),
 ])
+
+
 def youtube_timestamp_to_posix(dumb_timestamp):
     ''' Given a dumbed down timestamp such as 1 year ago, 3 hours ago, approximates the unix time (seconds since 1/1/1970) '''
@@ -273,6 +289,7 @@ def youtube_timestamp_to_posix(dumb_timestamp):
         unit = unit[:-1]    # remove s from end
     return now - quantifier*units[unit]
 
+
 def posix_to_dumbed_down(posix_time):
     '''Inverse of youtube_timestamp_to_posix.'''
     delta = int(time.time() - posix_time)
@@ -291,12 +308,14 @@ def posix_to_dumbed_down(posix_time):
     else:
         raise Exception()
 
+
 def exact_timestamp(posix_time):
     result = time.strftime('%I:%M %p %m/%d/%y', time.localtime(posix_time))
     if result[0] == '0':    # remove 0 infront of hour (like 01:00 PM)
         return result[1:]
     return result
 
+
 try:
     existing_thumbnails = set(os.path.splitext(name)[0] for name in os.listdir(thumbnails_directory))
 except FileNotFoundError:
@@ -312,23 +331,71 @@ checking_channels = set()
 
 # Just to use for printing channel checking status to console without opening database
 channel_names = dict()
 
+
 def check_channel_worker():
     while True:
         channel_id = check_channels_queue.get()
         try:
             _get_upstream_videos(channel_id)
+        except Exception:
+            traceback.print_exc()
         finally:
             checking_channels.remove(channel_id)
 
-for i in range(0,5):
+
+for i in range(0, 5):
     gevent.spawn(check_channel_worker)
 # ----------------------------
 
-
 # --- Auto checking system - Spaghetti code ---
 
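The two conversion helpers above are only inverses up to the coarseness of the units table, so round trips lose precision. A quick illustration (a sketch, not output from a real run):

approx = youtube_timestamp_to_posix('3 hours ago')   # roughly time.time() - 3*3600
posix_to_dumbed_down(approx)                         # -> '3 hours ago' again
exact_timestamp(approx)                              # e.g. '1:05 PM 06/14/21'
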
+def autocheck_dispatcher():
+    '''Scans the auto_check_list. Sleeps until the earliest job is due, then adds that channel to the checking queue above. Can be sent a new job through autocheck_job_application'''
+    while True:
+        if len(autocheck_jobs) == 0:
+            new_job = autocheck_job_application.get()
+            autocheck_jobs.append(new_job)
+        else:
+            earliest_job_index = min(range(0, len(autocheck_jobs)), key=lambda index: autocheck_jobs[index]['next_check_time'])  # https://stackoverflow.com/a/11825864
+            earliest_job = autocheck_jobs[earliest_job_index]
+            time_until_earliest_job = earliest_job['next_check_time'] - time.time()
+
+            if time_until_earliest_job <= -5:  # should not happen unless we're running extremely slow
+                print('ERROR: autocheck_dispatcher got job scheduled in the past, skipping and rescheduling: ' + earliest_job['channel_id'] + ', ' + earliest_job['channel_name'] + ', ' + str(earliest_job['next_check_time']))
+                next_check_time = time.time() + 3600*secrets.randbelow(60)/60
+                with_open_db(_schedule_checking, earliest_job['channel_id'], next_check_time)
+                autocheck_jobs[earliest_job_index]['next_check_time'] = next_check_time
+                continue
+
+            # make sure it's not muted
+            if with_open_db(_is_muted, earliest_job['channel_id']):
+                del autocheck_jobs[earliest_job_index]
+                continue
+
+            if time_until_earliest_job > 0:  # it can become less than zero (in the past) when it's set to go off while the dispatcher is doing something else at that moment
+                try:
+                    new_job = autocheck_job_application.get(timeout=time_until_earliest_job)  # sleep for time_until_earliest_job time, but allow to be interrupted by new jobs
+                except gevent.queue.Empty:  # no new jobs
+                    pass
+                else:  # new job, add it to the list
+                    autocheck_jobs.append(new_job)
+                    continue
+
+            # no new jobs, time to execute the earliest job
+            channel_names[earliest_job['channel_id']] = earliest_job['channel_name']
+            checking_channels.add(earliest_job['channel_id'])
+            check_channels_queue.put(earliest_job['channel_id'])
+            del autocheck_jobs[earliest_job_index]
+
+
+dispatcher_greenlet = None
+
+
+def start_autocheck_system():
+    global autocheck_job_application
+    global autocheck_jobs
+    global dispatcher_greenlet
 
-if settings.autocheck_subscriptions:
     # job application format: dict with keys (channel_id, channel_name, next_check_time)
     autocheck_job_application = gevent.queue.Queue()  # only really meant to hold 1 item, just reusing gevent's wait and timeout machinery
@@ -350,53 +417,30 @@ if settings.autocheck_subscriptions:
                 row = (row[0], row[1], next_check_time)
                 _schedule_checking(cursor, row[0], next_check_time)
             autocheck_jobs.append({'channel_id': row[0], 'channel_name': row[1], 'next_check_time': next_check_time})
+
+    dispatcher_greenlet = gevent.spawn(autocheck_dispatcher)
+
+
+def stop_autocheck_system():
+    if dispatcher_greenlet is not None:
+        dispatcher_greenlet.kill()
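Anything that wants a channel checked on a schedule hands the dispatcher a plain dict through the one-slot queue; the timed get() above doubles as an interruptible sleep. A hypothetical producer (channel id, name, and delay are made up):

autocheck_job_application.put({
    'channel_id': 'UCxxxxxxxxxxxxxxxxxxxxxx',  # hypothetical channel
    'channel_name': 'Example Channel',
    'next_check_time': time.time() + 1800,     # due in 30 minutes
})
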
 
-
-    def autocheck_dispatcher():
-        '''Scans the auto_check_list. Sleeps until the earliest job is due, then adds that channel to the checking queue above. Can be sent a new job through autocheck_job_application'''
-        while True:
-            if len(autocheck_jobs) == 0:
-                new_job = autocheck_job_application.get()
-                autocheck_jobs.append(new_job)
-            else:
-                earliest_job_index = min(range(0, len(autocheck_jobs)), key=lambda index: autocheck_jobs[index]['next_check_time'])  # https://stackoverflow.com/a/11825864
-                earliest_job = autocheck_jobs[earliest_job_index]
-                time_until_earliest_job = earliest_job['next_check_time'] - time.time()
-
-                if time_until_earliest_job <= -5:  # should not happen unless we're running extremely slow
-                    print('ERROR: autocheck_dispatcher got job scheduled in the past, skipping and rescheduling: ' + earliest_job['channel_id'] + ', ' + earliest_job['channel_name'] + ', ' + str(earliest_job['next_check_time']))
-                    next_check_time = time.time() + 3600*secrets.randbelow(60)/60
-                    with_open_db(_schedule_checking, earliest_job['channel_id'], next_check_time)
-                    autocheck_jobs[earliest_job_index]['next_check_time'] = next_check_time
-                    continue
-
-                # make sure it's not muted
-                if with_open_db(_is_muted, earliest_job['channel_id']):
-                    del autocheck_jobs[earliest_job_index]
-                    continue
-
-                if time_until_earliest_job > 0:  # it can become less than zero (in the past) when it's set to go off while the dispatcher is doing something else at that moment
-                    try:
-                        new_job = autocheck_job_application.get(timeout = time_until_earliest_job)  # sleep for time_until_earliest_job time, but allow to be interrupted by new jobs
-                    except gevent.queue.Empty:  # no new jobs
-                        pass
-                    else:  # new job, add it to the list
-                        autocheck_jobs.append(new_job)
-                        continue
-
-                # no new jobs, time to execute the earliest job
-                channel_names[earliest_job['channel_id']] = earliest_job['channel_name']
-                checking_channels.add(earliest_job['channel_id'])
-                check_channels_queue.put(earliest_job['channel_id'])
-                del autocheck_jobs[earliest_job_index]
 
+def autocheck_setting_changed(old_value, new_value):
+    if new_value:
+        start_autocheck_system()
+    else:
+        stop_autocheck_system()
 
-    gevent.spawn(autocheck_dispatcher)
 
+settings.add_setting_changed_hook(
+    'autocheck_subscriptions',
+    autocheck_setting_changed
+)
+if settings.autocheck_subscriptions:
+    start_autocheck_system()
 # ----------------------------
 
-
 def check_channels_if_necessary(channel_ids):
     for channel_id in channel_ids:
         if channel_id not in checking_channels:
@@ -404,6 +448,54 @@ def check_channels_if_necessary(channel_ids):
             check_channels_queue.put(channel_id)
 
 
+def _get_atoma_feed(channel_id):
+    url = 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id
+    try:
+        return util.fetch_url(url).decode('utf-8')
+    except util.FetchError as e:
+        # 404 is expected for terminated channels
+        if e.code in ('404', '429'):
+            return ''
+        if e.code == '502':
+            return str(e)
+        raise
+
+
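_get_atoma_feed wraps YouTube's public per-channel Atom feed (no API key involved); the real code routes the request through util.fetch_url, which also knows about Tor. For illustration only, the equivalent plain-urllib request (channel id made up):

import urllib.request

channel_id = 'UCxxxxxxxxxxxxxxxxxxxxxx'  # hypothetical
url = 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id
with urllib.request.urlopen(url) as response:
    feed_xml = response.read().decode('utf-8')  # Atom XML listing recent uploads
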
+def _get_channel_videos_first_page(channel_id, channel_status_name):
+    try:
+        # First try the playlist method
+        pl_json = playlist.get_videos(
+            'UU' + channel_id[2:],
+            1,
+            include_shorts=settings.include_shorts_in_subscriptions,
+            report_text=None
+        )
+        pl_info = yt_data_extract.extract_playlist_info(pl_json)
+        if pl_info.get('items'):
+            pl_info['items'] = pl_info['items'][0:30]
+            return pl_info
+
+        # Try the channel api method
+        channel_json = channel.get_channel_first_page(channel_id=channel_id)
+        channel_info = yt_data_extract.extract_channel_info(
+            json.loads(channel_json), 'videos'
+        )
+        return channel_info
+    except util.FetchError as e:
+        if e.code == '429' and settings.route_tor:
+            error_message = ('Error checking channel ' +
+                channel_status_name +
+                ': YouTube blocked the request because the' +
+                ' Tor exit node is overutilized. Try getting a new exit node' +
+                ' by using the New Identity button in the Tor Browser.')
+            if e.ip:
+                error_message += ' Exit node IP address: ' + e.ip
+            print(error_message)
+            return None
+        elif e.code == '502':
+            print('Error checking channel', channel_status_name + ':', str(e))
+            return None
+        raise
+
+
 def _get_upstream_videos(channel_id):
     try:
@@ -414,12 +506,15 @@ def _get_upstream_videos(channel_id):
     print("Checking channel: " + channel_status_name)
 
     tasks = (
-        gevent.spawn(channel.get_channel_tab, channel_id, print_status=False), # channel page, need for video duration
-        gevent.spawn(util.fetch_url, 'https://www.youtube.com/feeds/videos.xml?channel_id=' + channel_id) # atoma feed, need for exact published time
+        # channel page, need for video duration
+        gevent.spawn(_get_channel_videos_first_page, channel_id,
+                     channel_status_name),
+        # need atoma feed for exact published time
+        gevent.spawn(_get_atoma_feed, channel_id)
     )
     gevent.joinall(tasks)
-    channel_tab, feed = tasks[0].value, tasks[1].value
+    channel_info, feed = tasks[0].value, tasks[1].value
 
     # extract published times from atoma feed
     times_published = {}
@@ -436,7 +531,7 @@ def _get_upstream_videos(channel_id):
                     return element
             return None
 
-        root = defusedxml.ElementTree.fromstring(feed.decode('utf-8'))
+        root = defusedxml.ElementTree.fromstring(feed)
         assert remove_bullshit(root.tag) == 'feed'
         for entry in root:
             if (remove_bullshit(entry.tag) != 'entry'):
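The feed parsing strips the Atom namespace that ElementTree folds into every tag, then converts the published element with a fixed +00:00 offset. A self-contained sketch of both steps, assuming a minimal feed string:

import calendar
import time
import defusedxml.ElementTree

feed = ('<feed xmlns="http://www.w3.org/2005/Atom">'
        '<entry><published>2021-01-02T03:04:05+00:00</published></entry>'
        '</feed>')

root = defusedxml.ElementTree.fromstring(feed)
for entry in root:
    if entry.tag.rpartition('}')[2] != 'entry':  # namespace stripped off
        continue
    published = entry[0].text
    posix = calendar.timegm(time.strptime(published, '%Y-%m-%dT%H:%M:%S+00:00'))
    # posix == 1609556645 (2021-01-02 03:04:05 UTC)
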
@@ -451,42 +546,70 @@ def _get_upstream_videos(channel_id):
                 time_published = int(calendar.timegm(time.strptime(time_published_element.text, '%Y-%m-%dT%H:%M:%S+00:00')))
                 times_published[video_id_element.text] = time_published
-    except (AssertionError, defusedxml.ElementTree.ParseError) as e:
+    except AssertionError:
         print('Failed to read atoma feed for ' + channel_status_name)
         traceback.print_exc()
+    except defusedxml.ElementTree.ParseError:
+        print('Failed to read atoma feed for ' + channel_status_name)
 
-    channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')
-    if channel_info['errors']:
-        print('Error checking channel ' + channel_status_name + ': ' + ', '.join(channel_info['errors']))
+    if channel_info is None:  # there was an error
+        return
+    if channel_info['error']:
+        print('Error checking channel ' + channel_status_name + ': ' + channel_info['error'])
         return
 
     videos = channel_info['items']
     for i, video_item in enumerate(videos):
-        if 'description' not in video_item:
+        if not video_item.get('description'):
             video_item['description'] = ''
+        else:
+            video_item['description'] = ''.join(run.get('text', '') for run in video_item['description'])
 
         if video_item['id'] in times_published:
             video_item['time_published'] = times_published[video_item['id']]
             video_item['is_time_published_exact'] = True
-        else:
+        elif video_item.get('time_published'):
             video_item['is_time_published_exact'] = False
             try:
-                video_item['time_published'] = youtube_timestamp_to_posix(video_item['published']) - i  # subtract a few seconds off the videos so they will be in the right order
-            except KeyError:
+                video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i  # subtract a few seconds off the videos so they will be in the right order
+            except Exception:
                 print(video_item)
-
+        else:
+            video_item['is_time_published_exact'] = False
+            video_item['time_published'] = None
         video_item['channel_id'] = channel_id
 
+    if len(videos) > 1:
+        # Go back and fill in any videos that don't have a time published
+        # using the time published of the surrounding ones
+        for i in range(len(videos)-1):
+            if (videos[i+1]['time_published'] is None
+                    and videos[i]['time_published'] is not None
+                    ):
+                videos[i+1]['time_published'] = videos[i]['time_published'] - 1
+        for i in reversed(range(1,len(videos))):
+            if (videos[i-1]['time_published'] is None
+                    and videos[i]['time_published'] is not None
+                    ):
+                videos[i-1]['time_published'] = videos[i]['time_published'] + 1
+    # Special case: none of the videos have a time published.
+    # In this case, make something up
+    if videos and videos[0]['time_published'] is None:
+        assert all(v['time_published'] is None for v in videos)
+        now = time.time()
+        for i in range(len(videos)):
+            # 1 month between videos
+            videos[i]['time_published'] = now - i*3600*24*30
 
     if len(videos) == 0:
-        average_upload_period = 4*7*24*3600 # assume 1 month for channel with no videos
+        average_upload_period = 4*7*24*3600  # assume 1 month for channel with no videos
     elif len(videos) < 5:
         average_upload_period = int((time.time() - videos[len(videos)-1]['time_published'])/len(videos))
     else:
         average_upload_period = int((time.time() - videos[4]['time_published'])/5)  # equivalent to averaging the time between videos for the last 5 videos
 
     # calculate when to check next for auto checking
-    # add some quantization and randomness to make pattern analysis by Youtube slightly harder
+    # add some quantization and randomness to make pattern analysis by YouTube slightly harder
     quantized_upload_period = average_upload_period - (average_upload_period % (4*3600)) + 4*3600  # round up to nearest 4 hours
     randomized_upload_period = quantized_upload_period*(1 + secrets.randbelow(50)/50*0.5)  # randomly between 1x and 1.5x
     next_check_delay = randomized_upload_period/10  # check at 10x the channel posting rate. might want to fine tune this number
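Plugging concrete numbers into the three scheduling lines above, with an average upload period of 50 hours (the random draw is fixed here for illustration):

average_upload_period = 50*3600
quantized = average_upload_period - (average_upload_period % (4*3600)) + 4*3600
# -> 52 hours (rounded up to the next 4-hour boundary)
randomized = quantized*(1 + 25/50*0.5)  # randbelow(50) drawn as 25 -> factor 1.25
next_check_delay = randomized/10        # -> 6.5 hours
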
@@ -495,44 +618,74 @@ def _get_upstream_videos(channel_id):
 
     with open_database() as connection:
         with connection as cursor:
-            # calculate how many new videos there are
-            row = cursor.execute('''SELECT video_id
-                                    FROM videos
-                                    INNER JOIN subscribed_channels ON videos.sql_channel_id = subscribed_channels.id
-                                    WHERE yt_channel_id=?
-                                    ORDER BY time_published DESC
-                                    LIMIT 1''', [channel_id]).fetchone()
-            if row is None:
-                number_of_new_videos = len(videos)
-            else:
-                latest_video_id = row[0]
-                index = 0
-                for video in videos:
-                    if video['id'] == latest_video_id:
-                        break
-                    index += 1
-                number_of_new_videos = index
+            # Get video ids and duration of existing vids so we
+            # can see how many new ones there are and update
+            # livestreams/premieres
+            existing_vids = list(cursor.execute(
+                '''SELECT video_id, duration
+                   FROM videos
+                   INNER JOIN subscribed_channels
+                       ON videos.sql_channel_id = subscribed_channels.id
+                   WHERE yt_channel_id=?
+                   ORDER BY time_published DESC
+                   LIMIT 30''', [channel_id]).fetchall())
+            existing_vid_ids = set(row[0] for row in existing_vids)
+            existing_durs = dict(existing_vids)
+
+            # new videos the channel has uploaded since last time we checked
+            number_of_new_videos = 0
+            for video in videos:
+                if video['id'] in existing_vid_ids:
+                    break
+                number_of_new_videos += 1
 
             is_first_check = cursor.execute('''SELECT time_last_checked FROM subscribed_channels WHERE yt_channel_id=?''', [channel_id]).fetchone()[0] in (None, 0)
             time_videos_retrieved = int(time.time())
             rows = []
-            for video_item in videos:
-                if is_first_check or number_of_new_videos > 6:
+            update_rows = []
+            for i, video_item in enumerate(videos):
+                if (is_first_check
+                        or number_of_new_videos > 6
+                        or i >= number_of_new_videos):
                     # don't want a crazy ordering on first check or check in a long time, since we're ordering by time_noticed
+                    # Last condition is for when the channel deleting videos
+                    # causes new videos to appear at the end of the backlog.
+                    # For instance, if we have 30 vids in the DB, and 1 vid
+                    # that we previously saw has since been deleted,
+                    # then a video we haven't seen before will appear as the
+                    # 30th. Don't want this to be considered a newly noticed
+                    # vid which would appear at top of subscriptions feed
                     time_noticed = video_item['time_published']
                 else:
                     time_noticed = time_videos_retrieved
-                rows.append((
-                    video_item['channel_id'],
-                    video_item['id'],
-                    video_item['title'],
-                    video_item['duration'],
-                    video_item['time_published'],
-                    video_item['is_time_published_exact'],
-                    time_noticed,
-                    video_item['description'],
-                ))
+                # videos which need durations updated
+                non_durations = ('upcoming', 'none', 'live', '')
+                v_id = video_item['id']
+                if (existing_durs.get(v_id) is not None
+                        and existing_durs[v_id].lower() in non_durations
+                        and video_item['duration'] not in non_durations
+                        ):
+                    update_rows.append((
+                        video_item['title'],
+                        video_item['duration'],
+                        video_item['time_published'],
+                        video_item['is_time_published_exact'],
+                        video_item['description'],
+                        video_item['id'],
+                    ))
+                # all other videos
+                else:
+                    rows.append((
+                        video_item['channel_id'],
+                        video_item['id'],
+                        video_item['title'],
+                        video_item['duration'],
+                        video_item['time_published'],
+                        video_item['is_time_published_exact'],
+                        time_noticed,
+                        video_item['description'],
+                    ))
 
             cursor.executemany('''INSERT OR IGNORE INTO videos (
                                       sql_channel_id,
@@ -545,6 +698,13 @@ def _get_upstream_videos(channel_id):
                                       description
                                   )
                                   VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?, ?, ?)''', rows)
+            cursor.executemany('''UPDATE videos SET
+                                      title=?,
+                                      duration=?,
+                                      time_published=?,
+                                      is_time_published_exact=?,
+                                      description=?
+                                  WHERE video_id=?''', update_rows)
 
             cursor.execute('''UPDATE subscribed_channels
                               SET time_last_checked = ?, next_check_time = ?
                               WHERE yt_channel_id=?''', [int(time.time()), next_check_time, channel_id])
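The new-video count in this hunk relies on both lists being newest-first: everything fetched before the first already-known id must be new. In isolation (ids hypothetical):

fetched_ids = ['vid9', 'vid8', 'vid7', 'vid6']  # newest first, from the channel
known_ids = {'vid7', 'vid6', 'vid5'}            # newest 30 ids stored in the DB

number_of_new_videos = 0
for vid in fetched_ids:
    if vid in known_ids:
        break               # everything older has been seen before
    number_of_new_videos += 1
# -> 2
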
@@ -561,7 +721,6 @@ def _get_upstream_videos(channel_id):
 
     print(str(number_of_new_videos) + ' new videos from ' + channel_status_name)
 
-
 def check_all_channels():
     with open_database() as connection:
         with connection as cursor:
@@ -595,35 +754,54 @@ def check_specific_channels(channel_ids):
         channel_names.update(channel_id_name_list)
     check_channels_if_necessary(channel_ids)
 
-
-
+
+CHANNEL_ID_RE = re.compile(r'UC[-_\w]{22}')
+
+
 @yt_app.route('/import_subscriptions', methods=['POST'])
 def import_subscriptions():
     # check if the post request has the file part
     if 'subscriptions_file' not in request.files:
-        #flash('No file part')
+        # flash('No file part')
        return flask.redirect(util.URL_ORIGIN + request.full_path)
     file = request.files['subscriptions_file']
     # if user does not select file, browser also
     # submit an empty part without filename
     if file.filename == '':
-        #flash('No selected file')
+        # flash('No selected file')
         return flask.redirect(util.URL_ORIGIN + request.full_path)
 
-
     mime_type = file.mimetype
 
     if mime_type == 'application/json':
-        file = file.read().decode('utf-8')
+        info = file.read().decode('utf-8')
+        if info == '':
+            return '400 Bad Request: File is empty', 400
         try:
-            file = json.loads(file)
+            info = json.loads(info)
         except json.decoder.JSONDecodeError:
             traceback.print_exc()
             return '400 Bad Request: Invalid json file', 400
 
+        channels = []
         try:
-            channels = ( (item['snippet']['resourceId']['channelId'], item['snippet']['title']) for item in file)
+            if 'app_version_int' in info:  # NewPipe Format
+                for item in info['subscriptions']:
+                    # Other service, such as SoundCloud
+                    if item.get('service_id', 0) != 0:
+                        continue
+                    channel_url = item['url']
+                    channel_id_match = CHANNEL_ID_RE.search(channel_url)
+                    if channel_id_match:
+                        channel_id = channel_id_match.group(0)
+                    else:
+                        print('WARNING: Could not find channel id in url',
+                              channel_url)
+                        continue
+                    channels.append((channel_id, item['name']))
+            else:  # Old Google Takeout format
+                for item in info:
+                    snippet = item['snippet']
+                    channel_id = snippet['resourceId']['channelId']
+                    channels.append((channel_id, snippet['title']))
         except (KeyError, IndexError):
             traceback.print_exc()
             return '400 Bad Request: Unknown json structure', 400
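For reference, an abbreviated file of the shape the NewPipe branch parses: the app_version_int key marks the format, and entries with a non-zero service_id (non-YouTube services) are skipped. Values here are illustrative, mirroring the export code later in this patch:

newpipe_subscriptions = {
    "app_version": "0.21.9",
    "app_version_int": 975,
    "subscriptions": [
        {
            "service_id": 0,   # 0 = YouTube
            "url": "https://www.youtube.com/channel/UCxxxxxxxxxxxxxxxxxxxxxx",
            "name": "Example Channel",
        },
    ],
}
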
@@ -637,22 +815,98 @@ def import_subscriptions():
             if (outline_element.tag != 'outline') or ('xmlUrl' not in outline_element.attrib):
                 continue
 
-
             channel_name = outline_element.attrib['text']
             channel_rss_url = outline_element.attrib['xmlUrl']
             channel_id = channel_rss_url[channel_rss_url.find('channel_id=')+11:].strip()
-            channels.append( (channel_id, channel_name) )
+            channels.append((channel_id, channel_name))
     except (AssertionError, IndexError, defusedxml.ElementTree.ParseError) as e:
         return '400 Bad Request: Unable to read opml xml file, or the file is not the expected format', 400
+    elif mime_type in ('text/csv', 'application/vnd.ms-excel'):
+        content = file.read().decode('utf-8')
+        reader = csv.reader(content.splitlines())
+        channels = []
+        for row in reader:
+            if not row or row[0].lower().strip() == 'channel id':
+                continue
+            elif len(row) > 1 and CHANNEL_ID_RE.fullmatch(row[0].strip()):
+                channels.append( (row[0], row[-1]) )
+            else:
+                print('WARNING: Unknown row format:', row)
     else:
-        return '400 Bad Request: Unsupported file format: ' + mime_type + '. Only subscription.json files (from Google Takeouts) and XML OPML files exported from Youtube\'s subscription manager page are supported', 400
+        error = 'Unsupported file format: ' + mime_type
+        error += ('. Only subscription.json, subscriptions.csv files'
+                  ' (from Google Takeout)'
+                  ' and XML OPML files exported from YouTube\'s'
+                  ' subscription manager page are supported')
+        return (flask.render_template('error.html', error_message=error),
+                400)
 
     _subscribe(channels)
 
     return flask.redirect(util.URL_ORIGIN + '/subscription_manager', 303)
 
 
+@yt_app.route('/export_subscriptions', methods=['POST'])
+def export_subscriptions():
+    include_muted = request.values.get('include_muted') == 'on'
+    with open_database() as connection:
+        with connection as cursor:
+            sub_list = []
+            for channel_name, channel_id, muted in (
+                    _get_subscribed_channels(cursor)):
+                if muted and not include_muted:
+                    continue
+                if request.values['export_format'] == 'json_google_takeout':
+                    sub_list.append({
+                        'kind': 'youtube#subscription',
+                        'snippet': {
+                            'muted': bool(muted),
+                            'resourceId': {
+                                'channelId': channel_id,
+                                'kind': 'youtube#channel',
+                            },
+                            'tags': _get_tags(cursor, channel_id),
+                            'title': channel_name,
+                        },
+                    })
+                elif request.values['export_format'] == 'json_newpipe':
+                    sub_list.append({
+                        'service_id': 0,
+                        'url': 'https://www.youtube.com/channel/' + channel_id,
+                        'name': channel_name,
+                    })
+                elif request.values['export_format'] == 'opml':
+                    sub_list.append({
+                        'channel_name': channel_name,
+                        'channel_id': channel_id,
+                    })
+    date_time = time.strftime('%Y%m%d%H%M', time.localtime())
+    if request.values['export_format'] == 'json_google_takeout':
+        r = flask.Response(json.dumps(sub_list), mimetype='text/json')
+        cd = 'attachment; filename="subscriptions_%s.json"' % date_time
+        r.headers['Content-Disposition'] = cd
+        return r
+    elif request.values['export_format'] == 'json_newpipe':
+        r = flask.Response(json.dumps({
+            'app_version': '0.21.9',
+            'app_version_int': 975,
+            'subscriptions': sub_list,
+        }), mimetype='text/json')
+        file_name = 'newpipe_subscriptions_%s_youtube-local.json' % date_time
+        cd = 'attachment; filename="%s"' % file_name
+        r.headers['Content-Disposition'] = cd
+        return r
+    elif request.values['export_format'] == 'opml':
+        r = flask.Response(
+            flask.render_template('subscriptions.xml', sub_list=sub_list),
+            mimetype='text/xml')
+        cd = 'attachment; filename="subscriptions_%s.xml"' % date_time
+        r.headers['Content-Disposition'] = cd
+        return r
+    else:
+        return '400 Bad Request', 400
+
+
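The CSV branch above expects Google Takeout's subscriptions.csv layout as of this change: channel id in the first column, channel title in the last, with a header row recognized by its first cell. An illustrative file (contents made up):

takeout_csv = (
    'Channel Id,Channel Url,Channel Title\n'
    'UCxxxxxxxxxxxxxxxxxxxxxx,'
    'http://www.youtube.com/channel/UCxxxxxxxxxxxxxxxxxxxxxx,'
    'Example Channel\n'
)
# row[0] must fullmatch CHANNEL_ID_RE; row[-1] becomes the stored name
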
@@ -673,7 +927,7 @@ def get_subscription_manager_page():
                     'tags': [t for t in _get_tags(cursor, channel_id) if t != tag],
                 })
 
-            tag_groups.append( (tag, sub_list) )
+            tag_groups.append((tag, sub_list))
 
         # Channels with no tags
         channel_list = cursor.execute('''SELECT yt_channel_id, channel_name, muted
@@ -693,7 +947,7 @@ def get_subscription_manager_page():
                 'tags': [],
             })
 
-        tag_groups.append( ('No tags', sub_list) )
+        tag_groups.append(('No tags', sub_list))
     else:
         sub_list = []
         for channel_name, channel_id, muted in _get_subscribed_channels(cursor):
@@ -705,20 +959,20 @@ def get_subscription_manager_page():
                 'tags': _get_tags(cursor, channel_id),
             })
 
-
-
     if group_by_tags:
-        return flask.render_template('subscription_manager.html',
-            group_by_tags = True,
-            tag_groups = tag_groups,
+        return flask.render_template(
+            'subscription_manager.html',
+            group_by_tags=True,
+            tag_groups=tag_groups,
         )
     else:
-        return flask.render_template('subscription_manager.html',
-            group_by_tags = False,
-            sub_list = sub_list,
+        return flask.render_template(
+            'subscription_manager.html',
+            group_by_tags=False,
+            sub_list=sub_list,
         )
 
+
 def list_from_comma_separated_tags(string):
     return [tag.strip() for tag in string.split(',') if tag.strip()]
 
@@ -737,7 +991,7 @@ def post_subscription_manager_page():
             _unsubscribe(cursor, request.values.getlist('channel_ids'))
         elif action == 'unsubscribe_verify':
             unsubscribe_list = _get_channel_names(cursor, request.values.getlist('channel_ids'))
-            return flask.render_template('unsubscribe_verify.html', unsubscribe_list = unsubscribe_list)
+            return flask.render_template('unsubscribe_verify.html', unsubscribe_list=unsubscribe_list)
 
         elif action == 'mute':
             cursor.executemany('''UPDATE subscribed_channels
@@ -752,6 +1006,7 @@ def post_subscription_manager_page():
 
     return flask.redirect(util.URL_ORIGIN + request.full_path, 303)
 
+
 @yt_app.route('/subscriptions', methods=['GET'])
 @yt_app.route('/feed/subscriptions', methods=['GET'])
 def get_subscriptions_page():
@@ -764,11 +1019,10 @@ def get_subscriptions_page():
             video['thumbnail'] = util.URL_ORIGIN + '/data/subscription_thumbnails/' + video['id'] + '.jpg'
             video['type'] = 'video'
             video['item_size'] = 'small'
-            yt_data_extract.add_extra_html_info(video)
+            util.add_extra_html_info(video)
 
         tags = _get_all_tags(cursor)
 
-
         subscription_list = []
         for channel_name, channel_id, muted in _get_subscribed_channels(cursor):
             subscription_list.append({
@@ -778,15 +1032,18 @@ def get_subscriptions_page():
                 'muted': muted,
             })
 
-    return flask.render_template('subscriptions.html',
-        videos = videos,
-        num_pages = math.ceil(number_of_videos_in_db/60),
-        parameters_dictionary = request.args,
-        tags = tags,
-        current_tag = tag,
-        subscription_list = subscription_list,
+    return flask.render_template(
+        'subscriptions.html',
+        header_playlist_names=local_playlist.get_playlist_names(),
+        videos=videos,
+        num_pages=math.ceil(number_of_videos_in_db/60),
+        parameters_dictionary=request.args,
+        tags=tags,
+        current_tag=tag,
+        subscription_list=subscription_list,
     )
 
+
 @yt_app.route('/subscriptions', methods=['POST'])
 @yt_app.route('/feed/subscriptions', methods=['POST'])
 def post_subscriptions_page():
@@ -832,7 +1089,7 @@ def serve_subscription_thumbnail(thumbnail):
             f.close()
             return flask.Response(image, mimetype='image/jpeg')
 
-    url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
+    url = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
     try:
         image = util.fetch_url(url, report_text="Saved thumbnail: " + video_id)
     except urllib.error.HTTPError as e:
@@ -841,17 +1098,10 @@ def serve_subscription_thumbnail(thumbnail):
     try:
         f = open(thumbnail_path, 'wb')
     except FileNotFoundError:
-        os.makedirs(thumbnails_directory, exist_ok = True)
+        os.makedirs(thumbnails_directory, exist_ok=True)
         f = open(thumbnail_path, 'wb')
     f.write(image)
     f.close()
     existing_thumbnails.add(video_id)
 
     return flask.Response(image, mimetype='image/jpeg')
-
-
-
-
-
-
-
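The thumbnail fetch above moves from mqdefault to hqdefault; both are standard i.ytimg.com variant names rather than anything defined by this codebase. A small helper for reference, with the variant list stated as an assumption about YouTube's CDN:

def thumbnail_url(video_id, variant='hqdefault'):
    # common variants, smallest to largest: default, mqdefault,
    # hqdefault, sddefault, maxresdefault (larger ones may 404)
    return f'https://i.ytimg.com/vi/{video_id}/{variant}.jpg'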