diff options
author | Jesús <heckyel@hyperbola.info> | 2022-02-28 10:20:36 +0800 |
---|---|---|
committer | Jesús <heckyel@hyperbola.info> | 2022-02-28 10:20:36 +0800 |
commit | 51b14efa4807dcfb1ce5c35556166ae17acac154 (patch) | |
tree | 7ddb997bfe6f03dbf7934f3fb59bc44ed3f42d95 /mediagoblin/plugins/indexedsearch/backends | |
parent | 1079d1cee4a1389a6b697ae7e08a1c6835adcd52 (diff) | |
download | mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.tar.lz mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.tar.xz mediagoblin-51b14efa4807dcfb1ce5c35556166ae17acac154.zip |
Add Search plugin: indexedsearch
Diffstat (limited to 'mediagoblin/plugins/indexedsearch/backends')
-rw-r--r-- | mediagoblin/plugins/indexedsearch/backends/__init__.py | 69 | ||||
-rw-r--r-- | mediagoblin/plugins/indexedsearch/backends/whoosh.py | 167 |
2 files changed, 236 insertions, 0 deletions
diff --git a/mediagoblin/plugins/indexedsearch/backends/__init__.py b/mediagoblin/plugins/indexedsearch/backends/__init__.py
new file mode 100644
index 00000000..3dd2b5d6
--- /dev/null
+++ b/mediagoblin/plugins/indexedsearch/backends/__init__.py
@@ -0,0 +1,69 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import logging
+
+_log = logging.getLogger(__name__)
+
+
+class MediaNotProcessedError(Exception):
+    """Error indicating that a media entry is not marked as processed."""
+    pass
+
+
+class BaseEngine(object):
+
+    def add_media_entry(self, media):
+        raise NotImplementedError
+
+    def remove_media_entry(self, media_entry_id):
+        raise NotImplementedError
+
+    def update_index(self):
+        """Update the index to make it consistent with the database."""
+        raise NotImplementedError
+
+    def get_doc_for_media_entry(self, media):
+        """Creates a document suitable for indexing.
+
+        If the media entry is not processed then a MediaNotProcessedError
+        will be raised.
+
+        Args:
+            media: A MediaEntry for indexing.
+
+        """
+        _log.info("Indexing: %d" % media.id)
+
+        if media.state != 'processed':
+            _log.info('Ignoring: not yet processed')
+            raise MediaNotProcessedError()
+
+        tags = ' '.join([tag['name'] for tag in media.tags])
+        comments = '\n'.join([comment.content
+                              for comment in media.get_comments()])
+        # collections = u','.join([col.title for col in media.collections])
+        doc = {'title': media.title,
+               'description': media.description,
+               'media_id': media.id,
+               'time': media.updated,
+               'tag': tags,
+               'comment': comments}
+
+        if media.get_actor:
+            doc['user'] = media.get_actor.username
+
+        return doc
diff --git a/mediagoblin/plugins/indexedsearch/backends/whoosh.py b/mediagoblin/plugins/indexedsearch/backends/whoosh.py
new file mode 100644
index 00000000..afab3885
--- /dev/null
+++ b/mediagoblin/plugins/indexedsearch/backends/whoosh.py
@@ -0,0 +1,167 @@
+import os
+import logging
+
+import whoosh.index
+import whoosh.fields
+import whoosh.writing
+import whoosh.qparser
+
+from mediagoblin.db.models import MediaEntry
+from mediagoblin.plugins.indexedsearch.backends import (
+    BaseEngine, MediaNotProcessedError
+)
+
+_log = logging.getLogger(__name__)
+INDEX_NAME = 'media_entries'
+DEFAULT_SEARCH_FIELDS = ['title', 'description', 'tag', 'comment']
+
+
+class MediaEntrySchema(whoosh.fields.SchemaClass):
+    """ Whoosh schema for MediaEntry objects.
+    """
+    media_id = whoosh.fields.NUMERIC(signed=False, unique=True, stored=True)
+    title = whoosh.fields.TEXT
+    description = whoosh.fields.TEXT
+    tag = whoosh.fields.KEYWORD
+    # collection = whoosh.fields.KEYWORD(commas=True)
+    time = whoosh.fields.DATETIME(stored=True)
+    user = whoosh.fields.TEXT
+    comment = whoosh.fields.TEXT
+
+
+class Engine(BaseEngine):
+
+    def __init__(self, **connection_options):
+        self.index_dir = connection_options.get('INDEX_DIR')
+
+        try:
+            self.index = whoosh.index.open_dir(self.index_dir,
+                                               indexname=INDEX_NAME)
+        except whoosh.index.EmptyIndexError:
+            self.maybe_create_index()
+
+    def update_index(self):
+        """ Make an index consistent with the database.
+
+        Removes media entries from the index that aren't in the database.
+        Indexes media entries that are in the database but not in the index.
+        Re-indexes media entries that have been updated since they were last
+        indexed.
+
+        Args:
+            dirname: directory containing the index.
+        """
+        _log.info("Updating index ")
+
+        # The set of all media in the index
+        indexed_media = set()
+        # The set of all media we need to re-index
+        to_index = set()
+
+        with self.index.searcher() as searcher:
+            with whoosh.writing.AsyncWriter(self.index) as writer:
+                # Loop over the stored fields in the index
+                for fields in searcher.all_stored_fields():
+                    media_id = fields['media_id']
+                    indexed_media.add(media_id)
+
+                    media = MediaEntry.query.filter_by(id=media_id).first()
+                    if not media:
+                        # This entry has been deleted since it was indexed
+                        self.remove_media_entry(media_id, writer)
+                    else:
+                        # Check if this file was changed since it
+                        # was indexed
+                        indexed_time = fields['time']
+                        last_updated = media.updated
+                        if last_updated > indexed_time:
+                            # The file has changed, delete it and add it to the
+                            # list of files to reindex
+                            writer.delete_by_term('media_id', media_id)
+                            to_index.add(media_id)
+
+                for media in MediaEntry.query.all():
+                    if media.id in to_index or media.id not in indexed_media:
+                        # This is either a entry that's changed, or a new entry
+                        # that wasn't indexed before. So index it!
+                        self.add_media_entry(media, writer)
+
+    def add_media_entry(self, media, writer=None):
+        """Adds a media entry to the index using a writer.
+
+        Adds a media entry to the index using a writer. If a writer is given
+        then the operation won't be committed to the index.
+
+        Args:
+            media: a media entry for indexing.
+            writer: a whoosh writer to index the media entry.
+        """
+        commit = False
+
+        if not writer:
+            writer = whoosh.writing.AsyncWriter(self.index)
+            commit = True
+        try:
+            writer.update_document(**self.get_doc_for_media_entry(media))
+
+            if commit:
+                writer.commit()
+
+        except MediaNotProcessedError:
+            pass
+
+    def maybe_create_index(self):
+        """Ensure that a given directory contains the plugin's index.
+
+        If the index doesn't exist in the directory, then it will be created.
+
+        """
+        new_index_required = False
+        # If the directory doesn't exist, or the index doesn't exist in the
+        # directory, then a new index will be made.
+        if not os.path.exists(self.index_dir):
+            _log.info("Index directory doesn't exist: " + self.index_dir)
+            os.mkdir(self.index_dir)
+            new_index_required = True
+        elif not whoosh.index.exists_in(self.index_dir, INDEX_NAME):
+            _log.info("Index doesn't exist in " + self.index_dir)
+            new_index_required = True
+
+        if new_index_required:
+            _log.info("Creating new index in " + self.index_dir)
+            self.index = whoosh.index.create_in(self.index_dir,
+                                                schema=MediaEntrySchema(),
+                                                indexname=INDEX_NAME)
+        else:
+            _log.info("Using existing index in " + self.index_dir)
+            self.index = whoosh.index.open_dir(self.index_dir,
+                                               indexname=INDEX_NAME)
+
+    def remove_media_entry(self, media_entry_id, writer=None):
+        """Remove a media entry from the index using a writer.
+
+        Removes a media entry from the index using a writer. If a writer is
+        given then the operation won't be committed to the index.
+
+        Args:
+            media_entry_id: id of the media entry to be removed.
+            writer: a whoosh writer for removing the media entry.
+        """
+        commit = False
+
+        if not writer:
+            writer = whoosh.writing.AsyncWriter(self.index)
+            commit = True
+
+        _log.info("Deleting media entry with id: %d" % media_entry_id)
+        writer.delete_by_term('media_id', media_entry_id)
+
+        if commit:
+            writer.commit()
+
+    def search(self, query):
+        with self.index.searcher() as searcher:
+            query_string = whoosh.qparser.MultifieldParser(
+                DEFAULT_SEARCH_FIELDS, self.index.schema).parse(query)
+            results = searcher.search(query_string)
+            return [result['media_id'] for result in results]