1 files changed, 213 insertions, 37 deletions
diff --git a/youtube/subscriptions.py b/youtube/subscriptions.py
index c18f822..04d3c5a 100644
--- a/youtube/subscriptions.py
+++ b/youtube/subscriptions.py
@@ -1,4 +1,4 @@
-from youtube import util, yt_data_extract, channel, local_playlist
+from youtube import util, yt_data_extract, channel, local_playlist, playlist
 from youtube import yt_app
 import settings
 
@@ -15,6 +15,8 @@ import math
 import secrets
 import collections
 import calendar # bullshit! https://bugs.python.org/issue6280
+import csv
+import re
 
 import flask
 from flask import request
@@ -106,8 +108,7 @@ def _subscribe(channels):
         with connection as cursor:
             channel_ids_to_check = [channel[0] for channel in channels if not _is_subscribed(cursor, channel[0])]
 
-            rows = ((channel_id, channel_name, 0, 0) for channel_id,
-                    channel_name in channels)
+            rows = ((channel_id, channel_name, 0, 0) for channel_id, channel_name in channels)
             cursor.executemany('''INSERT OR IGNORE INTO subscribed_channels (yt_channel_id, channel_name, time_last_checked, next_check_time)
                                   VALUES (?, ?, ?, ?)''', rows)
 
@@ -234,8 +235,7 @@ def _get_channel_names(cursor, channel_ids):
     return result
 
 
-def _channels_with_tag(cursor, tag, order=False, exclude_muted=False,
-                       include_muted_status=False):
+def _channels_with_tag(cursor, tag, order=False, exclude_muted=False, include_muted_status=False):
     ''' returns list of (channel_id, channel_name) '''
 
     statement = '''SELECT yt_channel_id, channel_name'''
@@ -434,7 +434,8 @@ def autocheck_setting_changed(old_value, new_value):
 
 settings.add_setting_changed_hook(
     'autocheck_subscriptions',
-    autocheck_setting_changed)
+    autocheck_setting_changed
+)
 if settings.autocheck_subscriptions:
     start_autocheck_system()
 # ----------------------------
@@ -455,22 +456,44 @@ def _get_atoma_feed(channel_id):
         # 404 is expected for terminated channels
         if e.code in ('404', '429'):
             return ''
+        if e.code == '502':
+            return str(e)
         raise
 
 
-def _get_channel_tab(channel_id, channel_status_name):
+def _get_channel_videos_first_page(channel_id, channel_status_name):
     try:
-        return channel.get_channel_tab(channel_id, print_status=False)
+        # First try fetching the playlist whose id is 'UU' + channel_id[2:]
+        pl_json = playlist.get_videos(
+            'UU' + channel_id[2:],
+            1,
+            include_shorts=settings.include_shorts_in_subscriptions,
+            report_text=None
+        )
+        pl_info = yt_data_extract.extract_playlist_info(pl_json)
+        if pl_info.get('items'):
+            pl_info['items'] = pl_info['items'][0:30]  # keep at most 30 items
+            return pl_info
+
+        # Playlist gave no items: fall back to the channel api method
+        channel_json = channel.get_channel_first_page(channel_id=channel_id)
+        channel_info = yt_data_extract.extract_channel_info(
+            json.loads(channel_json), 'videos'
+        )
+        return channel_info
     except util.FetchError as e:
         if e.code == '429' and settings.route_tor:
             error_message = ('Error checking channel ' + channel_status_name
-                + ': Youtube blocked the request because the'
+                + ': YouTube blocked the request because the'
                 + ' Tor exit node is overutilized. Try getting a new exit node'
                 + ' by using the New Identity button in the Tor Browser.')
             if e.ip:
                 error_message += ' Exit node IP address: ' + e.ip
             print(error_message)
             return None
+        elif e.code == '502':  # report and return None instead of raising
+            print('Error checking channel', channel_status_name + ':', str(e))
+            return None
         raise
 
 
@@ -484,13 +507,14 @@ def _get_upstream_videos(channel_id):
 
     tasks = (
         # channel page, need for video duration
-        gevent.spawn(channel.get_channel_first_page, channel_id=channel_id),
+        gevent.spawn(_get_channel_videos_first_page, channel_id,
+                     channel_status_name),
         # need atoma feed for exact published time
         gevent.spawn(_get_atoma_feed, channel_id)
     )
     gevent.joinall(tasks)
 
-    channel_tab, feed = tasks[0].value, tasks[1].value
+    channel_info, feed = tasks[0].value, tasks[1].value
 
     # extract published times from atoma feed
     times_published = {}
@@ -528,9 +552,8 @@ def _get_upstream_videos(channel_id):
     except defusedxml.ElementTree.ParseError:
         print('Failed to read atoma feed for ' + channel_status_name)
 
-    if channel_tab is None: # there was an error
+    if channel_info is None: # there was an error
         return
-    channel_info = yt_data_extract.extract_channel_info(json.loads(channel_tab), 'videos')
     if channel_info['error']:
         print('Error checking channel ' + channel_status_name + ': ' + channel_info['error'])
         return
@@ -545,14 +568,38 @@ def _get_upstream_videos(channel_id):
         if video_item['id'] in times_published:
             video_item['time_published'] = times_published[video_item['id']]
             video_item['is_time_published_exact'] = True
-        else:
+        elif video_item.get('time_published'):
             video_item['is_time_published_exact'] = False
             try:
                 video_item['time_published'] = youtube_timestamp_to_posix(video_item['time_published']) - i  # subtract a few seconds off the videos so they will be in the right order
-            except KeyError:
+            except Exception:
                 print(video_item)
-
+        else:
+            video_item['is_time_published_exact'] = False
+            video_item['time_published'] = None
         video_item['channel_id'] = channel_id
+    if len(videos) > 1:
+        # Go back and fill in any videos that don't have a time published
+        # using the time published of the surrounding ones
+        for i in range(len(videos)-1):
+            if (videos[i+1]['time_published'] is None
+                and videos[i]['time_published'] is not None
+            ):
+                videos[i+1]['time_published'] = videos[i]['time_published'] - 1
+        for i in reversed(range(1,len(videos))):
+            if (videos[i-1]['time_published'] is None
+                and videos[i]['time_published'] is not None
+            ):
+                videos[i-1]['time_published'] = videos[i]['time_published'] + 1
+    # Special case: none of the videos have a time published.
+    # In this case, make something up
+    if videos and videos[0]['time_published'] is None:
+        assert all(v['time_published'] is None for v in videos)
+        now = time.time()
+        for i in range(len(videos)):
+            # 1 month between videos
+            videos[i]['time_published'] = now - i*3600*24*30
+
 
     if len(videos) == 0:
         average_upload_period = 4*7*24*3600  # assume 1 month for channel with no videos
@@ -562,7 +609,7 @@ def _get_upstream_videos(channel_id):
         average_upload_period = int((time.time() - videos[4]['time_published'])/5) # equivalent to averaging the time between videos for the last 5 videos
 
     # calculate when to check next for auto checking
-    # add some quantization and randomness to make pattern analysis by Youtube slightly harder
+    # add some quantization and randomness to make pattern analysis by YouTube slightly harder
     quantized_upload_period = average_upload_period - (average_upload_period % (4*3600)) + 4*3600   # round up to nearest 4 hours
     randomized_upload_period = quantized_upload_period*(1 + secrets.randbelow(50)/50*0.5) # randomly between 1x and 1.5x
     next_check_delay = randomized_upload_period/10    # check at 10x the channel posting rate. might want to fine tune this number
@@ -571,26 +618,31 @@ def _get_upstream_videos(channel_id):
     with open_database() as connection:
         with connection as cursor:
 
-            # calculate how many new videos there are
-            existing_vids = set(row[0] for row in cursor.execute(
-                '''SELECT video_id
+            # Get video ids and durations of existing vids so we
+            # can see how many new ones there are and update
+            # livestreams/premieres
+            existing_vids = list(cursor.execute(
+                '''SELECT video_id, duration
                    FROM videos
                    INNER JOIN subscribed_channels
                        ON videos.sql_channel_id = subscribed_channels.id
                    WHERE yt_channel_id=?
                    ORDER BY time_published DESC
                    LIMIT 30''', [channel_id]).fetchall())
+            existing_vid_ids = set(row[0] for row in existing_vids)
+            existing_durs = dict(existing_vids)
 
             # new videos the channel has uploaded since last time we checked
             number_of_new_videos = 0
             for video in videos:
-                if video['id'] in existing_vids:
+                if video['id'] in existing_vid_ids:
                     break
                 number_of_new_videos += 1
 
             is_first_check = cursor.execute('''SELECT time_last_checked FROM subscribed_channels WHERE yt_channel_id=?''', [channel_id]).fetchone()[0] in (None, 0)
             time_videos_retrieved = int(time.time())
             rows = []
+            update_rows = []
             for i, video_item in enumerate(videos):
                 if (is_first_check
                         or number_of_new_videos > 6
@@ -606,16 +658,34 @@ def _get_upstream_videos(channel_id):
                     time_noticed = video_item['time_published']
                 else:
                     time_noticed = time_videos_retrieved
-                rows.append((
-                    video_item['channel_id'],
-                    video_item['id'],
-                    video_item['title'],
-                    video_item['duration'],
-                    video_item['time_published'],
-                    video_item['is_time_published_exact'],
-                    time_noticed,
-                    video_item['description'],
-                ))
+
+                # videos which need durations updated
+                non_durations = ('upcoming', 'none', 'live', '')
+                v_id = video_item['id']
+                if (existing_durs.get(v_id) is not None
+                    and existing_durs[v_id].lower() in non_durations
+                    and video_item['duration'] not in non_durations
+                ):
+                    update_rows.append((
+                        video_item['title'],
+                        video_item['duration'],
+                        video_item['time_published'],
+                        video_item['is_time_published_exact'],
+                        video_item['description'],
+                        video_item['id'],
+                    ))
+                # all other videos
+                else:
+                    rows.append((
+                        video_item['channel_id'],
+                        video_item['id'],
+                        video_item['title'],
+                        video_item['duration'],
+                        video_item['time_published'],
+                        video_item['is_time_published_exact'],
+                        time_noticed,
+                        video_item['description'],
+                    ))
 
             cursor.executemany('''INSERT OR IGNORE INTO videos (
                                       sql_channel_id,
@@ -628,6 +698,13 @@ def _get_upstream_videos(channel_id):
                                       description
                                   )
                                   VALUES ((SELECT id FROM subscribed_channels WHERE yt_channel_id=?), ?, ?, ?, ?, ?, ?, ?)''', rows)
+            cursor.executemany('''UPDATE videos SET
+                                      title=?,
+                                      duration=?,
+                                      time_published=?,
+                                      is_time_published_exact=?,
+                                      description=?
+                                  WHERE video_id=?''', update_rows)
             cursor.execute('''UPDATE subscribed_channels
                               SET time_last_checked = ?, next_check_time = ?
                               WHERE yt_channel_id=?''', [int(time.time()), next_check_time, channel_id])
@@ -677,7 +754,7 @@ def check_specific_channels(channel_ids):
     channel_names.update(channel_id_name_list)
     check_channels_if_necessary(channel_ids)
 
-
+CHANNEL_ID_RE = re.compile(r'UC[-_\w]{22}')  # 'UC' + 22 word chars or '-'
 @yt_app.route('/import_subscriptions', methods=['POST'])
 def import_subscriptions():
 
@@ -695,15 +772,36 @@ def import_subscriptions():
     mime_type = file.mimetype
 
     if mime_type == 'application/json':
-        file = file.read().decode('utf-8')
+        info = file.read().decode('utf-8')
+        if info == '':
+            return '400 Bad Request: File is empty', 400
         try:
-            file = json.loads(file)
+            info = json.loads(info)
         except json.decoder.JSONDecodeError:
             traceback.print_exc()
             return '400 Bad Request: Invalid json file', 400
 
+        channels = []
         try:
-            channels = ((item['snippet']['resourceId']['channelId'], item['snippet']['title']) for item in file)
+            if 'app_version_int' in info:   # NewPipe Format
+                for item in info['subscriptions']:
+                    # Other service, such as SoundCloud
+                    if item.get('service_id', 0) != 0:
+                        continue
+                    channel_url = item['url']
+                    channel_id_match = CHANNEL_ID_RE.search(channel_url)
+                    if channel_id_match:
+                        channel_id = channel_id_match.group(0)
+                    else:
+                        print('WARNING: Could not find channel id in url',
+                              channel_url)
+                        continue
+                    channels.append((channel_id, item['name']))
+            else:   # Old Google Takeout format
+                for item in info:
+                    snippet = item['snippet']
+                    channel_id = snippet['resourceId']['channelId']
+                    channels.append((channel_id, snippet['title']))
         except (KeyError, IndexError):
             traceback.print_exc()
             return '400 Bad Request: Unknown json structure', 400
@@ -724,14 +822,92 @@ def import_subscriptions():
 
         except (AssertionError, IndexError, defusedxml.ElementTree.ParseError) as e:
             return '400 Bad Request: Unable to read opml xml file, or the file is not the expected format', 400
+    elif mime_type in ('text/csv', 'application/vnd.ms-excel'):
+        channels = []  # 'utf-8-sig' below also accepts a leading BOM
+        for row in csv.reader(file.read().decode('utf-8-sig').splitlines()):
+            # Strip once so the id that gets validated is the id stored
+            channel_id = row[0].strip() if row else ''
+            if not row or channel_id.lower() == 'channel id':
+                continue
+            elif len(row) > 1 and CHANNEL_ID_RE.fullmatch(channel_id):
+                channels.append((channel_id, row[-1]))
+            else:
+                print('WARNING: Unknown row format:', row)
     else:
-            return '400 Bad Request: Unsupported file format: ' + mime_type + '. Only subscription.json files (from Google Takeouts) and XML OPML files exported from Youtube\'s subscription manager page are supported', 400
+        error = 'Unsupported file format: ' + mime_type
+        error += (' . Only subscription.json, subscriptions.csv files'
+                  ' (from Google Takeouts)'
+                  ' and XML OPML files exported from YouTube\'s'
+                  ' subscription manager page are supported')
+        return (flask.render_template('error.html', error_message=error),
+                400)
 
     _subscribe(channels)
 
     return flask.redirect(util.URL_ORIGIN + '/subscription_manager', 303)
 
 
+@yt_app.route('/export_subscriptions', methods=['POST'])
+def export_subscriptions():
+    '''Send the subscription list to the client as a downloadable file.
+
+    export_format picks the layout; include_muted='on' keeps muted channels.
+    '''
+    export_format = request.values['export_format']
+    if export_format not in ('json_google_takeout', 'json_newpipe', 'opml'):
+        return '400 Bad Request', 400   # reject before touching the database
+    include_muted = request.values.get('include_muted') == 'on'
+    sub_list = []
+    with open_database() as connection:
+        with connection as cursor:
+            for channel_name, channel_id, muted in _get_subscribed_channels(cursor):
+                if muted and not include_muted:
+                    continue
+                if export_format == 'json_google_takeout':
+                    sub_list.append({
+                        'kind': 'youtube#subscription',
+                        'snippet': {
+                            'muted': bool(muted),
+                            'resourceId': {
+                                'channelId': channel_id,
+                                'kind': 'youtube#channel',
+                            },
+                            'tags': _get_tags(cursor, channel_id),
+                            'title': channel_name,
+                        },
+                    })
+                elif export_format == 'json_newpipe':
+                    sub_list.append({
+                        'service_id': 0,
+                        'url': 'https://www.youtube.com/channel/' + channel_id,
+                        'name': channel_name,
+                    })
+                else:   # opml: rows for the subscriptions.xml template
+                    sub_list.append({
+                        'channel_name': channel_name,
+                        'channel_id': channel_id,
+                    })
+    date_time = time.strftime('%Y%m%d%H%M', time.localtime())
+    mimetype = 'application/json'   # registered JSON type (was 'text/json')
+    if export_format == 'json_google_takeout':
+        body = json.dumps(sub_list)
+        file_name = 'subscriptions_%s.json' % date_time
+    elif export_format == 'json_newpipe':
+        body = json.dumps({
+            'app_version': '0.21.9',
+            'app_version_int': 975,
+            'subscriptions': sub_list,
+        })
+        file_name = 'newpipe_subscriptions_%s_youtube-local.json' % date_time
+    else:
+        body = flask.render_template('subscriptions.xml', sub_list=sub_list)
+        mimetype = 'text/xml'
+        file_name = 'subscriptions_%s.xml' % date_time
+    r = flask.Response(body, mimetype=mimetype)
+    r.headers['Content-Disposition'] = 'attachment; filename="%s"' % file_name
+    return r
+
+
 @yt_app.route('/subscription_manager', methods=['GET'])
 def get_subscription_manager_page():
     group_by_tags = request.args.get('group_by_tags', '0') == '1'
@@ -913,7 +1089,7 @@ def serve_subscription_thumbnail(thumbnail):
             f.close()
             return flask.Response(image, mimetype='image/jpeg')
 
-    url = "https://i.ytimg.com/vi/" + video_id + "/mqdefault.jpg"
+    url = f"https://i.ytimg.com/vi/{video_id}/hqdefault.jpg"
     try:
         image = util.fetch_url(url, report_text="Saved thumbnail: " + video_id)
     except urllib.error.HTTPError as e: