aboutsummaryrefslogtreecommitdiffstats
path: root/python/defusedxml/lxml.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/defusedxml/lxml.py')
-rw-r--r--python/defusedxml/lxml.py153
1 files changed, 153 insertions, 0 deletions
diff --git a/python/defusedxml/lxml.py b/python/defusedxml/lxml.py
new file mode 100644
index 0000000..7f3ee0b
--- /dev/null
+++ b/python/defusedxml/lxml.py
@@ -0,0 +1,153 @@
+# defusedxml
+#
+# Copyright (c) 2013 by Christian Heimes <christian@python.org>
+# Licensed to PSF under a Contributor Agreement.
+# See http://www.python.org/psf/license for licensing details.
+"""Example code for lxml.etree protection
+
+The code has NO protection against decompression bombs.
+"""
+from __future__ import print_function, absolute_import
+
+import threading
+from lxml import etree as _etree
+
+from .common import DTDForbidden, EntitiesForbidden, NotSupportedError
+
+LXML3 = _etree.LXML_VERSION[0] >= 3
+
+__origin__ = "lxml.etree"
+
+tostring = _etree.tostring
+
+
+class RestrictedElement(_etree.ElementBase):
+ """A restricted Element class that filters out instances of some classes
+ """
+ __slots__ = ()
+ # blacklist = (etree._Entity, etree._ProcessingInstruction, etree._Comment)
+ blacklist = _etree._Entity
+
+ def _filter(self, iterator):
+ blacklist = self.blacklist
+ for child in iterator:
+ if isinstance(child, blacklist):
+ continue
+ yield child
+
+ def __iter__(self):
+ iterator = super(RestrictedElement, self).__iter__()
+ return self._filter(iterator)
+
+ def iterchildren(self, tag=None, reversed=False):
+ iterator = super(RestrictedElement, self).iterchildren(
+ tag=tag, reversed=reversed)
+ return self._filter(iterator)
+
+ def iter(self, tag=None, *tags):
+ iterator = super(RestrictedElement, self).iter(tag=tag, *tags)
+ return self._filter(iterator)
+
+ def iterdescendants(self, tag=None, *tags):
+ iterator = super(RestrictedElement,
+ self).iterdescendants(tag=tag, *tags)
+ return self._filter(iterator)
+
+ def itersiblings(self, tag=None, preceding=False):
+ iterator = super(RestrictedElement, self).itersiblings(
+ tag=tag, preceding=preceding)
+ return self._filter(iterator)
+
+ def getchildren(self):
+ iterator = super(RestrictedElement, self).__iter__()
+ return list(self._filter(iterator))
+
+ def getiterator(self, tag=None):
+ iterator = super(RestrictedElement, self).getiterator(tag)
+ return self._filter(iterator)
+
+
+class GlobalParserTLS(threading.local):
+ """Thread local context for custom parser instances
+ """
+ parser_config = {
+ 'resolve_entities': False,
+ # 'remove_comments': True,
+ # 'remove_pis': True,
+ }
+
+ element_class = RestrictedElement
+
+ def createDefaultParser(self):
+ parser = _etree.XMLParser(**self.parser_config)
+ element_class = self.element_class
+ if self.element_class is not None:
+ lookup = _etree.ElementDefaultClassLookup(element=element_class)
+ parser.set_element_class_lookup(lookup)
+ return parser
+
+ def setDefaultParser(self, parser):
+ self._default_parser = parser
+
+ def getDefaultParser(self):
+ parser = getattr(self, "_default_parser", None)
+ if parser is None:
+ parser = self.createDefaultParser()
+ self.setDefaultParser(parser)
+ return parser
+
+
+_parser_tls = GlobalParserTLS()
+getDefaultParser = _parser_tls.getDefaultParser
+
+
+def check_docinfo(elementtree, forbid_dtd=False, forbid_entities=True):
+ """Check docinfo of an element tree for DTD and entity declarations
+
+ The check for entity declarations needs lxml 3 or newer. lxml 2.x does
+ not support dtd.iterentities().
+ """
+ docinfo = elementtree.docinfo
+ if docinfo.doctype:
+ if forbid_dtd:
+ raise DTDForbidden(docinfo.doctype,
+ docinfo.system_url,
+ docinfo.public_id)
+ if forbid_entities and not LXML3:
+ # lxml < 3 has no iterentities()
+ raise NotSupportedError("Unable to check for entity declarations "
+ "in lxml 2.x")
+
+ if forbid_entities:
+ for dtd in docinfo.internalDTD, docinfo.externalDTD:
+ if dtd is None:
+ continue
+ for entity in dtd.iterentities():
+ raise EntitiesForbidden(entity.name, entity.content, None,
+ None, None, None)
+
+
+def parse(source, parser=None, base_url=None, forbid_dtd=False,
+ forbid_entities=True):
+ if parser is None:
+ parser = getDefaultParser()
+ elementtree = _etree.parse(source, parser, base_url=base_url)
+ check_docinfo(elementtree, forbid_dtd, forbid_entities)
+ return elementtree
+
+
+def fromstring(text, parser=None, base_url=None, forbid_dtd=False,
+ forbid_entities=True):
+ if parser is None:
+ parser = getDefaultParser()
+ rootelement = _etree.fromstring(text, parser, base_url=base_url)
+ elementtree = rootelement.getroottree()
+ check_docinfo(elementtree, forbid_dtd, forbid_entities)
+ return rootelement
+
+
+XML = fromstring
+
+
+def iterparse(*args, **kwargs):
+ raise NotSupportedError("defused lxml.etree.iterparse not available")