Add Search plugin: indexedsearch

author: Jesús <heckyel@hyperbola.info> 2022-02-28 10:20:36 +0800
committer: Jesús <heckyel@hyperbola.info> 2022-02-28 10:20:36 +0800
commit: 51b14efa4807dcfb1ce5c35556166ae17acac154 (patch)
tree: 7ddb997bfe6f03dbf7934f3fb59bc44ed3f42d95 /mediagoblin/plugins/indexedsearch/backends
parent: 1079d1cee4a1389a6b697ae7e08a1c6835adcd52 (diff)
download: mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.tar.lz
mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.tar.xz
mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.zip
2 files changed, 236 insertions, 0 deletions
diff --git a/mediagoblin/plugins/indexedsearch/backends/__init__.py b/mediagoblin/plugins/indexedsearch/backends/__init__.py
new file mode 100644
index 00000000..3dd2b5d6
--- /dev/null
+++ b/mediagoblin/plugins/indexedsearch/backends/__init__.py
@@ -0,0 +1,69 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import logging
+
+_log = logging.getLogger(__name__)
+
+
+class MediaNotProcessedError(Exception):
+    """Raised for a media entry whose state is not 'processed'."""
+    pass
+
+
+class BaseEngine(object):
+    """Interface for search backends, plus the shared document builder."""
+
+    def add_media_entry(self, media):
+        """Add a media entry to the index."""
+        raise NotImplementedError
+
+    def remove_media_entry(self, media_entry_id):
+        """Remove the media entry with the given id from the index."""
+        raise NotImplementedError
+
+    def update_index(self):
+        """Update the index to make it consistent with the database."""
+        raise NotImplementedError
+
+    def get_doc_for_media_entry(self, media):
+        """Build the dict of index fields for a media entry.
+
+        Args:
+            media: A MediaEntry for indexing.
+
+        Returns:
+            A dict of field values; 'user' is set only if there is an actor.
+
+        Raises:
+            MediaNotProcessedError: if media.state is not 'processed'.
+        """
+        # Pass the id as a logging argument so formatting happens lazily.
+        _log.info("Indexing: %d", media.id)
+        if media.state != 'processed':
+            _log.info('Ignoring: not yet processed')
+            raise MediaNotProcessedError()
+
+        tags = ' '.join(tag['name'] for tag in media.tags)
+        comments = '\n'.join(c.content for c in media.get_comments())
+        # collections = u','.join([col.title for col in media.collections])
+        doc = {'title': media.title, 'description': media.description,
+               'media_id': media.id, 'time': media.updated,
+               'tag': tags, 'comment': comments}
+        actor = media.get_actor
+        if actor:
+            doc['user'] = actor.username
+        return doc
diff --git a/mediagoblin/plugins/indexedsearch/backends/whoosh.py b/mediagoblin/plugins/indexedsearch/backends/whoosh.py
new file mode 100644
index 00000000..afab3885
--- /dev/null
+++ b/mediagoblin/plugins/indexedsearch/backends/whoosh.py
@@ -0,0 +1,167 @@
+import os
+import logging
+
+import whoosh.index
+import whoosh.fields
+import whoosh.writing
+import whoosh.qparser
+
+from mediagoblin.db.models import MediaEntry
+from mediagoblin.plugins.indexedsearch.backends import (
+    BaseEngine, MediaNotProcessedError
+)
+
+_log = logging.getLogger(__name__)
+INDEX_NAME = 'media_entries'  # indexname passed to every whoosh.index call
+DEFAULT_SEARCH_FIELDS = ['title', 'description', 'tag', 'comment']
+
+
+class MediaEntrySchema(whoosh.fields.SchemaClass):
+    """Whoosh schema for MediaEntry objects; field names match the keys of
+    the document dict built by BaseEngine.get_doc_for_media_entry."""
+    media_id = whoosh.fields.NUMERIC(signed=False, unique=True, stored=True)
+    title = whoosh.fields.TEXT
+    description = whoosh.fields.TEXT
+    tag = whoosh.fields.KEYWORD
+    # collection = whoosh.fields.KEYWORD(commas=True)
+    time = whoosh.fields.DATETIME(stored=True)  # read back by update_index
+    user = whoosh.fields.TEXT
+    comment = whoosh.fields.TEXT
+
+
+class Engine(BaseEngine):
+    """Search backend that keeps media entries in a Whoosh index on disk."""
+
+    def __init__(self, **connection_options):
+        """Open the index in INDEX_DIR, creating it if it does not exist.
+
+        Args:
+            connection_options: backend settings; only 'INDEX_DIR' is read.
+        """
+        self.index_dir = connection_options.get('INDEX_DIR')
+
+        try:
+            self.index = whoosh.index.open_dir(self.index_dir,
+                                               indexname=INDEX_NAME)
+        except whoosh.index.EmptyIndexError:
+            self.maybe_create_index()
+
+    def update_index(self):
+        """Make the index consistent with the database.
+
+        Removes media entries from the index that aren't in the database.
+        Indexes media entries that are in the database but not in the index.
+        Re-indexes media entries that have been updated since they were last
+        indexed.
+        """
+        _log.info("Updating index ")
+
+        # One query up front instead of one lookup per indexed document.
+        media_by_id = {media.id: media for media in MediaEntry.query.all()}
+        # Ids whose indexed document is still current.
+        up_to_date = set()
+
+        with self.index.searcher() as searcher:
+            with whoosh.writing.AsyncWriter(self.index) as writer:
+                for fields in searcher.all_stored_fields():
+                    media_id = fields['media_id']
+                    media = media_by_id.get(media_id)
+                    if media is None:
+                        # Deleted from the database since it was indexed.
+                        self.remove_media_entry(media_id, writer)
+                    elif media.updated > fields['time']:
+                        # Changed since it was indexed: drop the stale
+                        # document here, it is re-added below.
+                        writer.delete_by_term('media_id', media_id)
+                    else:
+                        up_to_date.add(media_id)
+
+                for media_id, media in media_by_id.items():
+                    if media_id not in up_to_date:
+                        # Either changed or never indexed. So index it!
+                        self.add_media_entry(media, writer)
+
+    def add_media_entry(self, media, writer=None):
+        """Add a media entry to the index.
+
+        Entries that are not processed yet are skipped silently.
+
+        Args:
+            media: a media entry for indexing.
+            writer: a whoosh writer to index the media entry with. If given,
+                committing is left to the caller; otherwise a temporary
+                writer is created and committed here.
+        """
+        # Build the document before opening a writer, so an unprocessed
+        # entry can never leave an uncommitted writer (and its lock) behind.
+        try:
+            doc = self.get_doc_for_media_entry(media)
+        except MediaNotProcessedError:
+            return
+
+        if writer is not None:
+            writer.update_document(**doc)
+            return
+
+        # Leaving the with block commits, or cancels if indexing raised.
+        with whoosh.writing.AsyncWriter(self.index) as own_writer:
+            own_writer.update_document(**doc)
+
+    def maybe_create_index(self):
+        """Ensure that self.index_dir contains the plugin's index.
+
+        Creates the directory and/or the index when missing, then stores the
+        opened index on self.index.
+        """
+        if not os.path.exists(self.index_dir):
+            _log.info("Index directory doesn't exist: %s", self.index_dir)
+            # makedirs, so a missing parent directory is not an error.
+            os.makedirs(self.index_dir)
+            new_index_required = True
+        elif not whoosh.index.exists_in(self.index_dir, INDEX_NAME):
+            _log.info("Index doesn't exist in %s", self.index_dir)
+            new_index_required = True
+        else:
+            new_index_required = False
+
+        if new_index_required:
+            _log.info("Creating new index in %s", self.index_dir)
+            self.index = whoosh.index.create_in(self.index_dir,
+                                                schema=MediaEntrySchema(),
+                                                indexname=INDEX_NAME)
+        else:
+            _log.info("Using existing index in %s", self.index_dir)
+            self.index = whoosh.index.open_dir(self.index_dir,
+                                               indexname=INDEX_NAME)
+
+    def remove_media_entry(self, media_entry_id, writer=None):
+        """Remove a media entry from the index.
+
+        Args:
+            media_entry_id: id of the media entry to be removed.
+            writer: a whoosh writer for removing the media entry. If given,
+                committing is left to the caller; otherwise a temporary
+                writer is created and committed here.
+        """
+        _log.info("Deleting media entry with id: %d", media_entry_id)
+
+        if writer is not None:
+            writer.delete_by_term('media_id', media_entry_id)
+            return
+
+        # Leaving the with block commits, or cancels if the delete raised.
+        with whoosh.writing.AsyncWriter(self.index) as own_writer:
+            own_writer.delete_by_term('media_id', media_entry_id)
+
+    def search(self, query):
+        """Return the media ids of the entries matching a query string.
+
+        The query is parsed against DEFAULT_SEARCH_FIELDS.
+        NOTE(review): searcher.search() is called without a limit, so only
+        whoosh's default number of top hits is returned - confirm intended.
+        """
+        parser = whoosh.qparser.MultifieldParser(DEFAULT_SEARCH_FIELDS,
+                                                 self.index.schema)
+        with self.index.searcher() as searcher:
+            results = searcher.search(parser.parse(query))
+            return [result['media_id'] for result in results]
author	Jesús <heckyel@hyperbola.info>	2022-02-28 10:20:36 +0800
committer	Jesús <heckyel@hyperbola.info>	2022-02-28 10:20:36 +0800
commit	51b14efa4807dcfb1ce5c35556166ae17acac154 (patch)
tree	7ddb997bfe6f03dbf7934f3fb59bc44ed3f42d95 /mediagoblin/plugins/indexedsearch/backends
parent	1079d1cee4a1389a6b697ae7e08a1c6835adcd52 (diff)
download	mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.tar.lz mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.tar.xz mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.zip