aboutsummaryrefslogtreecommitdiffstats
path: root/mediagoblin/tools/text.py
diff options
context:
space:
mode:
Diffstat (limited to 'mediagoblin/tools/text.py')
-rw-r--r--mediagoblin/tools/text.py117
1 files changed, 117 insertions, 0 deletions
diff --git a/mediagoblin/tools/text.py b/mediagoblin/tools/text.py
new file mode 100644
index 00000000..de4bb281
--- /dev/null
+++ b/mediagoblin/tools/text.py
@@ -0,0 +1,117 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011 MediaGoblin contributors. See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import wtforms
+import markdown
+from lxml.html.clean import Cleaner
+
+from mediagoblin import mg_globals
+from mediagoblin.tools import url
+
+# A deliberately strict lxml.html Cleaner shared by this module: every
+# boolean cleaning switch set below is turned on, and only a short list
+# of basic formatting tags is put on the allow list.
+HTML_CLEANER = Cleaner(
+    # Cleaner switches, all enabled
+    scripts=True,
+    javascript=True,
+    comments=True,
+    style=True,
+    links=True,
+    page_structure=True,
+    processing_instructions=True,
+    embedded=True,
+    frames=True,
+    forms=True,
+    annoying_tags=True,
+    # The only tags on the allow list
+    allow_tags=[
+        'div', 'b', 'i', 'em', 'strong',
+        'p', 'ul', 'ol', 'li', 'a', 'br',
+    ],
+    remove_unknown_tags=False,  # can't be used with allow_tags
+    safe_attrs_only=True,
+    add_nofollow=True,  # for now
+    # Both whitelists are empty: no hosts and no tags
+    host_whitelist=(),
+    whitelist_tags=set(),
+)
+
+def clean_html(html):
+    """
+    Run ``html`` through HTML_CLEANER and return the cleaned markup.
+
+    Falsy input (such as an empty string) is never handed to the cleaner,
+    because its clean_html barfs on an empty string; u'' is returned
+    instead.
+    """
+    return HTML_CLEANER.clean_html(html) if html else u''
+
+def convert_to_tag_list_of_dicts(tag_string):
+    """
+    Turn an incoming string of user tags into a list of tag dicts.
+
+    Leading and trailing whitespace is removed and every internal run of
+    whitespace is collapsed to a single space. The result is split on
+    ``mg_globals.app_config['tags_delimiter']``; empty pieces and repeated
+    names are dropped, keeping the first occurrence and the input order.
+
+    Returns a list of ``{'name': ..., 'slug': ...}`` dicts, where the slug
+    is ``url.slugify(name)``. A falsy ``tag_string`` gives an empty list.
+    """
+    if not tag_string:
+        return []
+
+    # split() with no argument already ignores leading/trailing whitespace
+    # and treats any run of whitespace as one separator.
+    normalized = u' '.join(tag_string.split())
+    delimiter = mg_globals.app_config['tags_delimiter']
+
+    taglist = []
+    # Names collected so far; a set keeps the duplicate check cheap
+    # instead of rebuilding a list of names for every tag.
+    seen_names = set()
+    for raw_tag in normalized.split(delimiter):
+        name = raw_tag.strip()
+
+        # Ignore empty or duplicate tags
+        if not name or name in seen_names:
+            continue
+
+        seen_names.add(name)
+        taglist.append({'name': name, 'slug': url.slugify(name)})
+    return taglist
+
+def media_tags_as_string(media_entry_tags):
+    """
+    Join the names of a media item's tags into a single string.
+
+    ``media_entry_tags`` is a list of dicts that each carry a 'name' key;
+    the names are joined with ``mg_globals.app_config['tags_delimiter']``.
+    A falsy value (for example an empty list) gives an empty string.
+
+    This is the opposite of convert_to_tag_list_of_dicts
+    """
+    if not media_entry_tags:
+        return ''
+
+    delimiter = mg_globals.app_config['tags_delimiter']
+    return delimiter.join([entry['name'] for entry in media_entry_tags])
+
+# Error message template used by tag_length_validator: the first %s is
+# filled with the maximum tag length, the second with the comma-separated
+# names of the tags that are too long.
+TOO_LONG_TAG_WARNING = (
+    u'Tags must be shorter than %s characters. Tags that are too long: %s')
+
+def tag_length_validator(form, field):
+    """
+    Form validator that rejects tags exceeding the maximum tag length.
+
+    ``field.data`` is parsed with convert_to_tag_list_of_dicts and every
+    tag name longer than ``mg_globals.app_config['tags_max_length']`` is
+    collected. ``form`` is accepted but not used.
+
+    Raises wtforms.ValidationError, built from TOO_LONG_TAG_WARNING, that
+    names the limit and the offending tags when at least one is too long.
+    """
+    overlong_names = []
+    for entry in convert_to_tag_list_of_dicts(field.data):
+        name = entry['name']
+        # NOTE(review): a tag of exactly the maximum length passes this
+        # check, although the message says tags must be "shorter than"
+        # the limit -- confirm which one is intended.
+        if len(name) > mg_globals.app_config['tags_max_length']:
+            overlong_names.append(name)
+
+    if not overlong_names:
+        return
+
+    message = TOO_LONG_TAG_WARNING % (
+        mg_globals.app_config['tags_max_length'],
+        ', '.join(overlong_names))
+    raise wtforms.ValidationError(message)
+
+
+# Module-wide Markdown converter, created once at import time and used by
+# cleaned_markdown_conversion below. safe_mode='escape' presumably makes
+# Markdown escape raw HTML found in the input -- confirm against the
+# Python-Markdown version in use.
+# NOTE(review): later Python-Markdown releases deprecated and then removed
+# the ``safe_mode`` option -- verify before upgrading the dependency.
+MARKDOWN_INSTANCE = markdown.Markdown(safe_mode='escape')
+
+def cleaned_markdown_conversion(text):
+    """
+    Take a block of text, run it through MarkDown, and clean its HTML.
+
+    Returns u'' for falsy ``text``; otherwise the output of
+    MARKDOWN_INSTANCE.convert(text) passed through clean_html.
+    """
+    # Markdown will do nothing with and clean_html can do nothing with
+    # an empty string :)
+    if not text:
+        return u''
+
+    # MARKDOWN_INSTANCE is shared by every call. Reset it first so that
+    # state kept from an earlier document (such as link reference
+    # definitions) cannot carry over into this conversion.
+    MARKDOWN_INSTANCE.reset()
+    return clean_html(MARKDOWN_INSTANCE.convert(text))