1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
|
# defusedxml
#
# Copyright (c) 2013 by Christian Heimes <christian@python.org>
# Licensed to PSF under a Contributor Agreement.
# See http://www.python.org/psf/license for licensing details.
"""Example code for lxml.etree protection
The code has NO protection against decompression bombs.
"""
from __future__ import print_function, absolute_import
import threading
from lxml import etree as _etree
from .common import DTDForbidden, EntitiesForbidden, NotSupportedError
LXML3 = _etree.LXML_VERSION[0] >= 3
__origin__ = "lxml.etree"
tostring = _etree.tostring
class RestrictedElement(_etree.ElementBase):
"""A restricted Element class that filters out instances of some classes
"""
__slots__ = ()
# blacklist = (etree._Entity, etree._ProcessingInstruction, etree._Comment)
blacklist = _etree._Entity
def _filter(self, iterator):
blacklist = self.blacklist
for child in iterator:
if isinstance(child, blacklist):
continue
yield child
def __iter__(self):
iterator = super(RestrictedElement, self).__iter__()
return self._filter(iterator)
def iterchildren(self, tag=None, reversed=False):
iterator = super(RestrictedElement, self).iterchildren(
tag=tag, reversed=reversed)
return self._filter(iterator)
def iter(self, tag=None, *tags):
iterator = super(RestrictedElement, self).iter(tag=tag, *tags)
return self._filter(iterator)
def iterdescendants(self, tag=None, *tags):
iterator = super(RestrictedElement,
self).iterdescendants(tag=tag, *tags)
return self._filter(iterator)
def itersiblings(self, tag=None, preceding=False):
iterator = super(RestrictedElement, self).itersiblings(
tag=tag, preceding=preceding)
return self._filter(iterator)
def getchildren(self):
iterator = super(RestrictedElement, self).__iter__()
return list(self._filter(iterator))
def getiterator(self, tag=None):
iterator = super(RestrictedElement, self).getiterator(tag)
return self._filter(iterator)
class GlobalParserTLS(threading.local):
"""Thread local context for custom parser instances
"""
parser_config = {
'resolve_entities': False,
# 'remove_comments': True,
# 'remove_pis': True,
}
element_class = RestrictedElement
def createDefaultParser(self):
parser = _etree.XMLParser(**self.parser_config)
element_class = self.element_class
if self.element_class is not None:
lookup = _etree.ElementDefaultClassLookup(element=element_class)
parser.set_element_class_lookup(lookup)
return parser
def setDefaultParser(self, parser):
self._default_parser = parser
def getDefaultParser(self):
parser = getattr(self, "_default_parser", None)
if parser is None:
parser = self.createDefaultParser()
self.setDefaultParser(parser)
return parser
_parser_tls = GlobalParserTLS()
getDefaultParser = _parser_tls.getDefaultParser
def check_docinfo(elementtree, forbid_dtd=False, forbid_entities=True):
"""Check docinfo of an element tree for DTD and entity declarations
The check for entity declarations needs lxml 3 or newer. lxml 2.x does
not support dtd.iterentities().
"""
docinfo = elementtree.docinfo
if docinfo.doctype:
if forbid_dtd:
raise DTDForbidden(docinfo.doctype,
docinfo.system_url,
docinfo.public_id)
if forbid_entities and not LXML3:
# lxml < 3 has no iterentities()
raise NotSupportedError("Unable to check for entity declarations "
"in lxml 2.x")
if forbid_entities:
for dtd in docinfo.internalDTD, docinfo.externalDTD:
if dtd is None:
continue
for entity in dtd.iterentities():
raise EntitiesForbidden(entity.name, entity.content, None,
None, None, None)
def parse(source, parser=None, base_url=None, forbid_dtd=False,
forbid_entities=True):
if parser is None:
parser = getDefaultParser()
elementtree = _etree.parse(source, parser, base_url=base_url)
check_docinfo(elementtree, forbid_dtd, forbid_entities)
return elementtree
def fromstring(text, parser=None, base_url=None, forbid_dtd=False,
forbid_entities=True):
if parser is None:
parser = getDefaultParser()
rootelement = _etree.fromstring(text, parser, base_url=base_url)
elementtree = rootelement.getroottree()
check_docinfo(elementtree, forbid_dtd, forbid_entities)
return rootelement
XML = fromstring
def iterparse(*args, **kwargs):
raise NotSupportedError("defused lxml.etree.iterparse not available")
|