diff options
-rw-r--r-- | mediagoblin/tests/test_util.py | 19 | ||||
-rw-r--r-- | mediagoblin/util.py | 27 |
2 files changed, 46 insertions, 0 deletions
diff --git a/mediagoblin/tests/test_util.py b/mediagoblin/tests/test_util.py index 7b00a074..75e28aca 100644 --- a/mediagoblin/tests/test_util.py +++ b/mediagoblin/tests/test_util.py @@ -103,3 +103,22 @@ def test_locale_to_lower_lower(): # crazy renditions. Useful? assert util.locale_to_lower_lower('en-US') == 'en-us' assert util.locale_to_lower_lower('en_us') == 'en-us' + + +def test_html_cleaner(): + # Remove images + result = util.clean_html( + '<p>Hi everybody! ' + '<img src="http://example.org/huge-purple-barney.png" /></p>\n' + '<p>:)</p>') + assert result == ( + '<div>' + '<p>Hi everybody! </p>\n' + '<p>:)</p>' + '</div>') + + # Remove evil javascript + result = util.clean_html( + '<p><a href="javascript:nasty_surprise">innocent link!</a></p>') + assert result == ( + '<p><a href="">innocent link!</a></p>') diff --git a/mediagoblin/util.py b/mediagoblin/util.py index f29f8570..fc380f41 100644 --- a/mediagoblin/util.py +++ b/mediagoblin/util.py @@ -30,6 +30,7 @@ import jinja2 import translitcodec from paste.deploy.loadwsgi import NicerConfigParser from webob import Response, exc +from lxml.html.clean import Cleaner from mediagoblin import mg_globals from mediagoblin.db.util import ObjectId @@ -373,6 +374,32 @@ def read_config_file(conf_file): return mgoblin_conf +# A super strict version of the lxml.html cleaner class +HTML_CLEANER = Cleaner( + scripts=True, + javascript=True, + comments=True, + style=True, + links=True, + page_structure=True, + processing_instructions=True, + embedded=True, + frames=True, + forms=True, + annoying_tags=True, + allow_tags=[ + 'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br'], + remove_unknown_tags=False, # can't be used with allow_tags + safe_attrs_only=True, + add_nofollow=True, # for now + host_whitelist=(), + whitelist_tags=set([])) + + +def clean_html(html): + return HTML_CLEANER.clean_html(html) + + SETUP_GETTEXTS = {} def setup_gettext(locale): |