aboutsummaryrefslogtreecommitdiffstats
path: root/mediagoblin/plugins/indexedsearch/backends
diff options
context:
space:
mode:
author Jesús <heckyel@hyperbola.info> 2022-02-28 10:20:36 +0800
committer Jesús <heckyel@hyperbola.info> 2022-02-28 10:20:36 +0800
commit 51b14efa4807dcfb1ce5c35556166ae17acac154 (patch)
tree 7ddb997bfe6f03dbf7934f3fb59bc44ed3f42d95 /mediagoblin/plugins/indexedsearch/backends
parent 1079d1cee4a1389a6b697ae7e08a1c6835adcd52 (diff)
download mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.tar.lz
mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.tar.xz
mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.zip
Add Search plugin: indexedsearch
Diffstat (limited to 'mediagoblin/plugins/indexedsearch/backends')
-rw-r--r-- mediagoblin/plugins/indexedsearch/backends/__init__.py 69
-rw-r--r-- mediagoblin/plugins/indexedsearch/backends/whoosh.py 167
2 files changed, 236 insertions, 0 deletions
diff --git a/mediagoblin/plugins/indexedsearch/backends/__init__.py b/mediagoblin/plugins/indexedsearch/backends/__init__.py
new file mode 100644
index 00000000..3dd2b5d6
--- /dev/null
+++ b/mediagoblin/plugins/indexedsearch/backends/__init__.py
@@ -0,0 +1,69 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import logging
+
+# Logger named after this module; BaseEngine reports indexing progress on it.
+_log = logging.getLogger(__name__)
+
+
+class MediaNotProcessedError(Exception):
+    """Raised when a media entry is not marked as processed."""
+
+
+class BaseEngine(object):
+    """Common interface and helpers for search index backends.
+
+    ``add_media_entry``, ``remove_media_entry`` and ``update_index`` only
+    raise ``NotImplementedError`` here; a backend must override them.
+    ``get_doc_for_media_entry`` is a shared helper that turns a media entry
+    into a plain dict of indexable fields.
+    """
+
+    def add_media_entry(self, media):
+        """Add a media entry to the index (must be overridden)."""
+        raise NotImplementedError
+
+    def remove_media_entry(self, media_entry_id):
+        """Remove a media entry from the index by id (must be overridden)."""
+        raise NotImplementedError
+
+    def update_index(self):
+        """Update the index to make it consistent with the database."""
+        raise NotImplementedError
+
+    def get_doc_for_media_entry(self, media):
+        """Creates a document suitable for indexing.
+
+        Args:
+            media: A MediaEntry for indexing.
+
+        Returns:
+            A dict with the keys ``title``, ``description``, ``media_id``,
+            ``time``, ``tag`` and ``comment``, plus ``user`` when the media
+            entry has an actor.
+
+        Raises:
+            MediaNotProcessedError: if ``media.state`` is not ``'processed'``.
+
+        """
+        # Pass the id as a logging argument instead of formatting eagerly:
+        # the message is only built if the record is emitted, and an
+        # unexpected id value cannot raise from inside the log call.
+        _log.info("Indexing: %d", media.id)
+
+        if media.state != 'processed':
+            _log.info('Ignoring: not yet processed')
+            raise MediaNotProcessedError()
+
+        # collections = u','.join([col.title for col in media.collections])
+        doc = {
+            'title': media.title,
+            'description': media.description,
+            'media_id': media.id,
+            'time': media.updated,
+            # All tag names flattened into one space-separated string.
+            'tag': ' '.join(tag['name'] for tag in media.tags),
+            # All comment bodies, one per line.
+            'comment': '\n'.join(comment.content
+                                 for comment in media.get_comments()),
+        }
+
+        # The user field is optional: it is only set when an actor exists.
+        actor = media.get_actor
+        if actor:
+            doc['user'] = actor.username
+
+        return doc
diff --git a/mediagoblin/plugins/indexedsearch/backends/whoosh.py b/mediagoblin/plugins/indexedsearch/backends/whoosh.py
new file mode 100644
index 00000000..afab3885
--- /dev/null
+++ b/mediagoblin/plugins/indexedsearch/backends/whoosh.py
@@ -0,0 +1,167 @@
+import os
+import logging
+
+import whoosh.index
+import whoosh.fields
+import whoosh.writing
+import whoosh.qparser
+
+from mediagoblin.db.models import MediaEntry
+from mediagoblin.plugins.indexedsearch.backends import (
+ BaseEngine, MediaNotProcessedError
+)
+
+# Logger named after this module.
+_log = logging.getLogger(__name__)
+# Name under which Engine creates and opens the media-entry index.
+INDEX_NAME = 'media_entries'
+# Schema fields that Engine.search() parses queries against.
+DEFAULT_SEARCH_FIELDS = ['title', 'description', 'tag', 'comment']
+
+
+class MediaEntrySchema(whoosh.fields.SchemaClass):
+    """Whoosh schema for MediaEntry objects.
+
+    The field names correspond to the keys of the document dict built by
+    ``BaseEngine.get_doc_for_media_entry``.
+    """
+    # Id of the media entry. Stored so that Engine.update_index() and
+    # Engine.search() can read it back from the index.
+    media_id = whoosh.fields.NUMERIC(signed=False, unique=True, stored=True)
+    title = whoosh.fields.TEXT
+    description = whoosh.fields.TEXT
+    # Tags arrive as a single space-joined string.
+    # NOTE(review): a tag name containing a space would presumably be split
+    # into several keywords -- confirm against the KEYWORD field defaults.
+    tag = whoosh.fields.KEYWORD
+    # collection = whoosh.fields.KEYWORD(commas=True)
+    # Value of the entry's ``updated`` attribute at indexing time. Stored so
+    # that Engine.update_index() can detect entries changed since then.
+    time = whoosh.fields.DATETIME(stored=True)
+    user = whoosh.fields.TEXT
+    comment = whoosh.fields.TEXT
+
+
+class Engine(BaseEngine):
+    """Search backend that keeps media entries in an on-disk whoosh index."""
+
+    def __init__(self, **connection_options):
+        """Open the index, creating it when it does not exist yet.
+
+        Args:
+            **connection_options: backend settings. Only ``INDEX_DIR``, the
+                directory that holds the index, is read here.
+        """
+        self.index_dir = connection_options.get('INDEX_DIR')
+
+        try:
+            self.index = whoosh.index.open_dir(self.index_dir,
+                                               indexname=INDEX_NAME)
+        except whoosh.index.EmptyIndexError:
+            self.maybe_create_index()
+
+    def update_index(self):
+        """Make the index consistent with the database.
+
+        Removes media entries from the index that aren't in the database.
+        Indexes media entries that are in the database but not in the index.
+        Re-indexes media entries that have been updated since they were last
+        indexed.
+        """
+        _log.info("Updating index ")
+
+        # Load every media entry once, keyed by id, instead of issuing one
+        # database lookup per indexed document.
+        media_by_id = {media.id: media for media in MediaEntry.query.all()}
+        # Ids of all media entries currently present in the index.
+        indexed_media = set()
+        # Ids of indexed entries that are stale and must be re-indexed.
+        to_index = set()
+
+        with self.index.searcher() as searcher:
+            with whoosh.writing.AsyncWriter(self.index) as writer:
+                # First pass: walk what the index already contains.
+                for fields in searcher.all_stored_fields():
+                    media_id = fields['media_id']
+                    indexed_media.add(media_id)
+
+                    media = media_by_id.get(media_id)
+                    if media is None:
+                        # Deleted from the database since it was indexed.
+                        self.remove_media_entry(media_id, writer)
+                    elif media.updated > fields['time']:
+                        # Changed since it was indexed: drop the old
+                        # document and queue the entry for re-indexing.
+                        writer.delete_by_term('media_id', media_id)
+                        to_index.add(media_id)
+
+                # Second pass: index entries that changed or were never
+                # indexed before.
+                for media_id, media in media_by_id.items():
+                    if media_id in to_index or media_id not in indexed_media:
+                        self.add_media_entry(media, writer)
+
+    def add_media_entry(self, media, writer=None):
+        """Adds a media entry to the index using a writer.
+
+        If a writer is given, the document is handed to it and committing is
+        left to the caller. Otherwise a writer is created for this single
+        operation and committed before returning.
+
+        Media entries that are not yet processed are skipped silently.
+
+        Args:
+            media: a media entry for indexing.
+            writer: a whoosh writer to index the media entry.
+        """
+        # Build the document before creating any writer, so that an
+        # unprocessed entry never leaves an uncommitted writer behind.
+        try:
+            doc = self.get_doc_for_media_entry(media)
+        except MediaNotProcessedError:
+            return
+
+        if writer is not None:
+            writer.update_document(**doc)
+            return
+
+        # Own writer: the context manager finishes it even if the update
+        # raises.
+        with whoosh.writing.AsyncWriter(self.index) as own_writer:
+            own_writer.update_document(**doc)
+
+    def maybe_create_index(self):
+        """Ensure that the index directory contains the plugin's index.
+
+        If the directory or the index inside it doesn't exist, it is
+        created; otherwise the existing index is opened. Either way the
+        result is stored in ``self.index``.
+
+        """
+        new_index_required = False
+        # If the directory doesn't exist, or the index doesn't exist in the
+        # directory, then a new index will be made.
+        if not os.path.exists(self.index_dir):
+            _log.info("Index directory doesn't exist: %s", self.index_dir)
+            # makedirs also creates missing parent directories.
+            os.makedirs(self.index_dir)
+            new_index_required = True
+        elif not whoosh.index.exists_in(self.index_dir, INDEX_NAME):
+            _log.info("Index doesn't exist in %s", self.index_dir)
+            new_index_required = True
+
+        if new_index_required:
+            _log.info("Creating new index in %s", self.index_dir)
+            self.index = whoosh.index.create_in(self.index_dir,
+                                                schema=MediaEntrySchema(),
+                                                indexname=INDEX_NAME)
+        else:
+            _log.info("Using existing index in %s", self.index_dir)
+            self.index = whoosh.index.open_dir(self.index_dir,
+                                               indexname=INDEX_NAME)
+
+    def remove_media_entry(self, media_entry_id, writer=None):
+        """Remove a media entry from the index using a writer.
+
+        If a writer is given, the deletion is handed to it and committing is
+        left to the caller. Otherwise a writer is created for this single
+        operation and committed before returning.
+
+        Args:
+            media_entry_id: id of the media entry to be removed.
+            writer: a whoosh writer for removing the media entry.
+        """
+        _log.info("Deleting media entry with id: %d", media_entry_id)
+
+        if writer is not None:
+            writer.delete_by_term('media_id', media_entry_id)
+            return
+
+        # Own writer: the context manager finishes it even if the delete
+        # raises.
+        with whoosh.writing.AsyncWriter(self.index) as own_writer:
+            own_writer.delete_by_term('media_id', media_entry_id)
+
+    def search(self, query):
+        """Return the ids of media entries matching a query string.
+
+        The query is parsed against DEFAULT_SEARCH_FIELDS.
+
+        Args:
+            query: the user's query text.
+
+        Returns:
+            A list with the ``media_id`` of each result.
+        """
+        parser = whoosh.qparser.MultifieldParser(DEFAULT_SEARCH_FIELDS,
+                                                 self.index.schema)
+        parsed_query = parser.parse(query)
+        with self.index.searcher() as searcher:
+            # NOTE(review): no ``limit`` is passed, so whoosh's default cap
+            # on the number of results applies -- confirm this is intended.
+            results = searcher.search(parsed_query)
+            # Read the ids while the searcher is still open.
+            return [result['media_id'] for result in results]