22from xml .sax .saxutils import escape , unescape
33from tokenizer import HTMLTokenizer
44
class HTMLSanitizerMixin:
    """Sanitization of XHTML+MathML+SVG and of inline style attributes.

    NOTE(review): in the full source this class also defines the whitelist
    class attributes (acceptable_elements, allowed_attributes,
    attr_val_is_uri, allowed_protocols, ...) which are elided in this
    excerpt; sanitize_token reads them via self.
    """

    # sanitize_html('<script> do_nasty_stuff() </script>')
    #  => &lt;script> do_nasty_stuff() &lt;/script>
    # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #  => <a>Click here for $100</a>
    def sanitize_token(self, token):
        """Sanitize one tokenizer token.

        Returns the (possibly modified) token, or None when the token should
        be dropped from the stream entirely (comments).

        * Allowed elements keep only whitelisted attributes; URI-valued
          attributes are dropped unless their scheme is whitelisted, and
          inline ``style`` is passed through ``self.sanitize_css``.
        * Disallowed elements are re-emitted as escaped character data so the
          markup is rendered as text instead of being parsed.
        """
        if token["type"] in ("StartTag", "EndTag", "EmptyTag"):
            if token["name"] in self.allowed_elements:
                if "data" in token:
                    # Iterate reversed so that, for a duplicated attribute,
                    # the FIRST occurrence wins when folded into a dict.
                    attrs = dict((name, val) for name, val in token["data"][::-1]
                                 if name in self.allowed_attributes)
                    for attr in self.attr_val_is_uri:
                        if attr not in attrs:
                            continue
                        # Strip control / whitespace characters that could be
                        # used to smuggle a scheme past the check
                        # (e.g. "java\0script:"), then compare case-insensitively.
                        val_unescaped = re.sub(r"[`\000-\040\177-\240\s]+", '',
                                               unescape(attrs[attr])).lower()
                        if (re.match(r"^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                                val_unescaped.split(':')[0] not in self.allowed_protocols):
                            del attrs[attr]
                    if 'style' in attrs:
                        attrs['style'] = self.sanitize_css(attrs['style'])
                    token["data"] = [[name, val] for name, val in attrs.items()]
                return token
            else:
                # Disallowed element: turn the tag back into literal text so a
                # downstream serializer escapes it instead of rendering it.
                if token["type"] == "EndTag":
                    token["data"] = "</%s>" % token["name"]
                elif token["data"]:
                    attrs = ''.join([' %s="%s"' % (k, escape(v))
                                     for k, v in token["data"]])
                    token["data"] = "<%s%s>" % (token["name"], attrs)
                else:
                    token["data"] = "<%s>" % token["name"]
                if token["type"] == "EmptyTag":
                    token["data"] = token["data"][:-1] + "/>"
                token["type"] = "Characters"
                del token["name"]
                return token
        elif token["type"] == "Comment":
            # Drop comments entirely: explicit None instead of falling off the
            # end of the function (comments can carry IE conditional payloads).
            return None
        else:
            # Characters, SpaceCharacters, etc. pass through untouched.
            return token
165164
166165 def sanitize_css (self , style ):
167166 # disallow urls
@@ -187,3 +186,9 @@ def sanitize_css(self, style):
187186 clean .append (prop + ': ' + value + ';' )
188187
189188 return ' ' .join (clean )
189+
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    """Tokenizer whose output stream is filtered through sanitize_token."""

    def __iter__(self):
        raw_stream = HTMLTokenizer.__iter__(self)
        for raw in raw_stream:
            cleaned = self.sanitize_token(raw)
            if cleaned:
                # sanitize_token returned a kept (possibly rewritten) token;
                # dropped tokens (comments) come back as None and are skipped.
                yield cleaned
0 commit comments