77import gettext
88_ = gettext .gettext
99
10- from constants import voidElements , booleanAttributes , spaceCharacters , entities
10+ from constants import voidElements , booleanAttributes , spaceCharacters
1111
1212spaceCharacters = u"" .join (spaceCharacters )
1313
14- default_entity_map = {}
15- for k , v in entities .items ():
16- if v != "&" and default_entity_map .get (v ) != k .lower ():
17- # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
18- default_entity_map [v ] = k
19-
2014try :
2115 from codecs import register_error , xmlcharrefreplace_errors
2216except ImportError :
2317 unicode_encode_errors = "strict"
2418else :
2519 unicode_encode_errors = "htmlentityreplace"
2620
21+ from constants import entities
22+
23+ encode_entity_map = {}
24+ for k , v in entities .items ():
25+ if v != "&" and encode_entity_map .get (v ) != k .lower ():
26+ # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
27+ encode_entity_map [v ] = k
28+
2729 def htmlentityreplace_errors (ex ):
2830 if isinstance (ex , UnicodeEncodeError ):
2931 res = []
3032 for c in ex .object [ex .start :ex .end ]:
31- c = default_entity_map .get (c )
33+ c = encode_entity_map .get (c )
3234 if c :
35+ res .append ("&" )
3336 res .append (c )
37+ res .append (";" )
3438 else :
3539 res .append (c .encode (ex .encoding , "xmlcharrefreplace" ))
3640 return (u"" .join (res ), ex .end )
@@ -39,7 +43,7 @@ def htmlentityreplace_errors(ex):
3943
4044 register_error (unicode_encode_errors , htmlentityreplace_errors )
4145
42- del register_error , xmlcharrefreplace_errors
46+ del register_error
4347
4448def _slide (iterator ):
4549 previous = None
@@ -64,18 +68,22 @@ class HTMLSerializer(object):
6468
6569 strip_whitespace = False
6670
71+ inject_meta_charset = True
72+
6773 def __init__ (self , ** kwargs ):
6874 for attr in ("quote_attr_values" , "quote_char" , "use_best_quote_char" ,
6975 "minimize_boolean_attributes" , "use_trailing_solidus" ,
7076 "space_before_trailing_solidus" , "omit_optional_tags" ,
71- "strip_whitespace" ):
77+ "strip_whitespace" , "inject_meta_charset" ):
7278 if attr in kwargs :
7379 setattr (self , attr , kwargs [attr ])
7480 self .errors = []
7581
7682 def serialize (self , treewalker , encoding = None ):
7783 in_cdata = False
7884 self .errors = []
85+ if encoding and self .inject_meta_charset :
86+ treewalker = self .filter_inject_meta_charset (treewalker , encoding )
7987 if self .strip_whitespace :
8088 treewalker = self .filter_whitespace (treewalker )
8189 if self .omit_optional_tags :
@@ -196,7 +204,7 @@ def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
196204 if self .strict :
197205 raise SerializeError
198206
199- def filter_inject_meta_charset (self , treewalker ):
207+ def filter_inject_meta_charset (self , treewalker , encoding ):
200208 done = False
201209 for token in treewalker :
202210 if not done and token ["type" ] == "StartTag" \
@@ -206,8 +214,7 @@ def filter_inject_meta_charset(self, treewalker):
206214 yield token
207215
208216 def filter_whitespace (self , treewalker ):
209- # TODO
210- return treewalker
217+ raise NotImplementedError
211218
212219 def filter_optional_tags (self , treewalker ):
213220 for token , next in _slide (treewalker ):
@@ -254,7 +261,7 @@ def is_optional_start(self, tagname, next):
254261 if type == "StartTag" :
255262 # XXX: we do not look at the preceding event, so instead we never
256263 # omit the colgroup element's end tag when it is immediately
257- # followed by another colgroup element. See _is_optional_end .
264+ # followed by another colgroup element. See is_optional_end .
258265 return next ["name" ] == "col"
259266 else :
260267 return False
@@ -266,11 +273,10 @@ def is_optional_start(self, tagname, next):
266273 if type == "StartTag" :
267274 # XXX: we do not look at the preceding event, so instead we never
268275 # omit the thead and tfoot elements' end tag when they are
269- # immediately followed by a tbody element. See _is_optional_end .
276+ # immediately followed by a tbody element. See is_optional_end .
270277 return next ["name" ] == 'tr'
271278 else :
272279 return False
273- # TODO
274280 return False
275281
276282 def is_optional_end (self , tagname , next ):
@@ -328,7 +334,7 @@ def is_optional_end(self, tagname, next):
328334 return False
329335 elif type == "StartTag" :
330336 # XXX: we also look for an immediately following colgroup
331- # element. See _is_optional_start .
337+ # element. See is_optional_start .
332338 return next ["name" ] != 'colgroup'
333339 else :
334340 return True
@@ -342,7 +348,7 @@ def is_optional_end(self, tagname, next):
342348 # is immediately followed by a tbody element, or if there is no
343349 # more content in the parent element.
344350 # XXX: we never omit the end tag when the following element is
345- # a tbody. See _is_optional_start .
351+ # a tbody. See is_optional_start .
346352 if type == "StartTag" :
347353 return next ["name" ] == 'tfoot'
348354 elif tagname == 'tbody' :
@@ -354,7 +360,7 @@ def is_optional_end(self, tagname, next):
354360 # is immediately followed by a tbody element, or if there is no
355361 # more content in the parent element.
356362 # XXX: we never omit the end tag when the following element is
357- # a tbody. See _is_optional_start .
363+ # a tbody. See is_optional_start .
358364 return type == "EndTag" or type is None
359365 elif tagname in ('td' , 'th' ):
360366 # A td element's end tag may be omitted if the td element is
@@ -367,7 +373,6 @@ def is_optional_end(self, tagname, next):
367373 return next ["name" ] in ('td' , 'th' )
368374 else :
369375 return type == "EndTag" or type is None
370- # TODO
371376 return False
372377
373378def SerializeError (Exception ):
0 commit comments