about | summary | refs | log | tree | commit | diff | stats
path: root/mediagoblin
diff options
context:
space:
mode:
Diffstat (limited to 'mediagoblin')
-rw-r--r-- mediagoblin/process_media/__init__.py | 18
-rw-r--r-- mediagoblin/tests/test_util.py | 19
-rw-r--r-- mediagoblin/util.py | 27
3 files changed, 56 insertions, 8 deletions
diff --git a/mediagoblin/process_media/__init__.py b/mediagoblin/process_media/__init__.py
index f37bf080..0dce1418 100644
--- a/mediagoblin/process_media/__init__.py
+++ b/mediagoblin/process_media/__init__.py
@@ -24,6 +24,13 @@ from mediagoblin import mg_globals as mgg
THUMB_SIZE = 200, 200
+def create_pub_filepath(entry, filename):
+ return mgg.public_store.get_unique_filepath(
+ ['media_entries',
+ unicode(entry['_id']),
+ filename])
+
+
@task
def process_media_initial(media_id):
workbench = mgg.workbench_manager.create_workbench()
@@ -45,10 +52,7 @@ def process_media_initial(media_id):
if thumb.mode != "RGB":
thumb = thumb.convert("RGB")
- thumb_filepath = mgg.public_store.get_unique_filepath(
- ['media_entries',
- unicode(entry['_id']),
- 'thumbnail.jpg'])
+ thumb_filepath = create_pub_filepath(entry, 'thumbnail.jpg')
thumb_file = mgg.public_store.get_file(thumb_filepath, 'w')
with thumb_file:
@@ -59,15 +63,13 @@ def process_media_initial(media_id):
queued_file = file(queued_filename, 'rb')
with queued_file:
- main_filepath = mgg.public_store.get_unique_filepath(
- ['media_entries',
- unicode(entry['_id']),
- queued_filepath[-1]])
+ main_filepath = create_pub_filepath(entry, queued_filepath[-1])
with mgg.public_store.get_file(main_filepath, 'wb') as main_file:
main_file.write(queued_file.read())
mgg.queue_store.delete_file(queued_filepath)
+ entry['queued_media_file'] = []
media_files_dict = entry.setdefault('media_files', {})
media_files_dict['thumb'] = thumb_filepath
media_files_dict['main'] = main_filepath
diff --git a/mediagoblin/tests/test_util.py b/mediagoblin/tests/test_util.py
index 7b00a074..75e28aca 100644
--- a/mediagoblin/tests/test_util.py
+++ b/mediagoblin/tests/test_util.py
@@ -103,3 +103,22 @@ def test_locale_to_lower_lower():
# crazy renditions. Useful?
assert util.locale_to_lower_lower('en-US') == 'en-us'
assert util.locale_to_lower_lower('en_us') == 'en-us'
+
+
+def test_html_cleaner():
+ # Remove images
+ result = util.clean_html(
+ '<p>Hi everybody! '
+ '<img src="http://example.org/huge-purple-barney.png" /></p>\n'
+ '<p>:)</p>')
+ assert result == (
+ '<div>'
+ '<p>Hi everybody! </p>\n'
+ '<p>:)</p>'
+ '</div>')
+
+ # Remove evil javascript
+ result = util.clean_html(
+ '<p><a href="javascript:nasty_surprise">innocent link!</a></p>')
+ assert result == (
+ '<p><a href="">innocent link!</a></p>')
diff --git a/mediagoblin/util.py b/mediagoblin/util.py
index f29f8570..fc380f41 100644
--- a/mediagoblin/util.py
+++ b/mediagoblin/util.py
@@ -30,6 +30,7 @@ import jinja2
import translitcodec
from paste.deploy.loadwsgi import NicerConfigParser
from webob import Response, exc
+from lxml.html.clean import Cleaner
from mediagoblin import mg_globals
from mediagoblin.db.util import ObjectId
@@ -373,6 +374,32 @@ def read_config_file(conf_file):
return mgoblin_conf
+# A super strict version of the lxml.html cleaner class
+HTML_CLEANER = Cleaner(
+ scripts=True,
+ javascript=True,
+ comments=True,
+ style=True,
+ links=True,
+ page_structure=True,
+ processing_instructions=True,
+ embedded=True,
+ frames=True,
+ forms=True,
+ annoying_tags=True,
+ allow_tags=[
+ 'div', 'b', 'i', 'em', 'strong', 'p', 'ul', 'ol', 'li', 'a', 'br'],
+ remove_unknown_tags=False, # can't be used with allow_tags
+ safe_attrs_only=True,
+ add_nofollow=True, # for now
+ host_whitelist=(),
+ whitelist_tags=set([]))
+
+
+def clean_html(html):
+ return HTML_CLEANER.clean_html(html)
+
+
SETUP_GETTEXTS = {}
def setup_gettext(locale):