Keep download archive in memory for better performance

The old behavior was to open and scan the entire archive file for every single video download. This resulted in horrible performance for archives of any remotely large size, especially since all new video IDs are appended to the end of the archive. For anyone who uses the archive feature to maintain archives of entire video playlists or channels, this meant that all such lists with newer downloads would have to scan close to the end of the archive file before the potential download was rejected. For archives with tens of thousands of lines, this easily resulted in millions of line reads and checks over the course of scanning a single channel or playlist that had been seen previously.

The new behavior in this commit is to preload the archive file into a binary search tree and scan the tree instead of constantly scanning the file on disk for every video. When a new download is appended to the archive file, it is also added to this tree. The performance is massively better using this strategy over the more "naive" line-by-line archive file parsing strategy.

The only negative consequence of this change is that the archive in memory will not be synchronized with the archive file on disk. Running multiple instances of the program at the same time that all use the same archive file may result in duplicate archive entries or duplicated downloads. This is unlikely to be a serious issue for the vast majority of users. If the instances are not likely to try to download identical video IDs then this should not be a problem anyway; for example, having two instances pull two completely different YouTube channels at once should be fine.

Signed-off-by: Jody Bruchon <jody@jodybruchon.com>
author: Jody Bruchon <jody@jodybruchon.com> 2020-09-17 14:22:07 -0400
committer: Jody Bruchon <jody@jodybruchon.com> 2020-09-17 14:22:07 -0400
commit: ecdec1913fbe350b4dddd8b459b0f43464a8c5bc (patch)
tree: cb8bd05d03e672843427d4a0c8151ccb47907cff
parent: 7ac0ba50ce2e60f9105f94f1c66e7d8fa9b6b692 (diff)
download: hypervideo-pre-ecdec1913fbe350b4dddd8b459b0f43464a8c5bc.tar.lz
hypervideo-pre-ecdec1913fbe350b4dddd8b459b0f43464a8c5bc.tar.xz
hypervideo-pre-ecdec1913fbe350b4dddd8b459b0f43464a8c5bc.zip
1 files changed, 57 insertions, 10 deletions
diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py
index 1e2070f8c..d2d100850 100644
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@@ -113,6 +113,43 @@ from .version import __version__
 if compat_os_name == 'nt':
     import ctypes
 
+class ArchiveTree(object):
+    def __init__(self, line):
+        self.left = None
+        self.right = None
+        self.line = line
+
+    def at_insert(self, line):
+        print("at_insert: ", line)
+        if self.line:
+            if line < self.line:
+                if self.left is None:
+                    self.left = ArchiveTree(line)
+                else:
+                    self.left.at_insert(line)
+            elif line > self.line:
+                if self.right is None:
+                    self.right = ArchiveTree(line)
+                else:
+                    self.right.at_insert(line)
+        else:
+            self.line = line
+
+    def at_exist(self, line):
+        print("at_exist: ", line)
+        if self.line is None:
+            return False
+        if line < self.line:
+            if self.left is None:
+                return False
+            return self.left.at_exist(line)
+        elif line > self.line:
+            if self.right is None:
+                return False
+            return self.right.at_exist(line)
+        else:
+            return True
+
 
 class YoutubeDL(object):
     """YoutubeDL class.
@@ -359,6 +396,21 @@ class YoutubeDL(object):
         }
         self.params.update(params)
         self.cache = Cache(self)
+        self.archive = ArchiveTree(None)
+
+        """Preload the archive, if any is specified"""
+        def preload_download_archive(self):
+            fn = self.params.get('download_archive')
+            if fn is None:
+                return False
+            try:
+                with locked_file(fn, 'r', encoding='utf-8') as archive_file:
+                    for line in archive_file:
+                        self.archive.at_insert(line.strip())
+            except IOError as ioe:
+                if ioe.errno != errno.ENOENT:
+                    raise
+            return True
 
         def check_deprecated(param, option, suggestion):
             if self.params.get(param) is not None:
@@ -367,6 +419,8 @@ class YoutubeDL(object):
                 return True
             return False
 
+        preload_download_archive(self)
+
         if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
             if self.params.get('geo_verification_proxy') is None:
                 self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
@@ -722,7 +776,7 @@ class YoutubeDL(object):
             return None
 
     def _match_entry(self, info_dict, incomplete):
-        """ Returns None iff the file should be downloaded """
+        """ Returns None if the file should be downloaded """
 
         video_title = info_dict.get('title', info_dict.get('id', 'video'))
         if 'title' in info_dict:
@@ -2142,15 +2196,7 @@ class YoutubeDL(object):
         if not vid_id:
             return False  # Incomplete video information
 
-        try:
-            with locked_file(fn, 'r', encoding='utf-8') as archive_file:
-                for line in archive_file:
-                    if line.strip() == vid_id:
-                        return True
-        except IOError as ioe:
-            if ioe.errno != errno.ENOENT:
-                raise
-        return False
+        return self.archive.at_exist(vid_id)
 
     def record_download_archive(self, info_dict):
         fn = self.params.get('download_archive')
@@ -2160,6 +2206,7 @@ class YoutubeDL(object):
         assert vid_id
         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
             archive_file.write(vid_id + '\n')
+        self.archive.at_insert(vid_id)
 
     @staticmethod
     def format_resolution(format, default='unknown'):
author	Jody Bruchon <jody@jodybruchon.com>	2020-09-17 14:22:07 -0400
committer	Jody Bruchon <jody@jodybruchon.com>	2020-09-17 14:22:07 -0400
commit	ecdec1913fbe350b4dddd8b459b0f43464a8c5bc (patch)
tree	cb8bd05d03e672843427d4a0c8151ccb47907cff
parent	7ac0ba50ce2e60f9105f94f1c66e7d8fa9b6b692 (diff)
download	hypervideo-pre-ecdec1913fbe350b4dddd8b459b0f43464a8c5bc.tar.lz hypervideo-pre-ecdec1913fbe350b4dddd8b459b0f43464a8c5bc.tar.xz hypervideo-pre-ecdec1913fbe350b4dddd8b459b0f43464a8c5bc.zip