Skip to content

Commit d73de9c

Browse files
committed
Plugged-in inject_meta_charset in HTMLSerializer
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40635
1 parent 9c6f0f8 commit d73de9c

1 file changed

Lines changed: 25 additions & 20 deletions

File tree

src/serializer.py

Lines changed: 25 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -7,30 +7,34 @@
77
import gettext
88
_ = gettext.gettext
99

10-
from constants import voidElements, booleanAttributes, spaceCharacters, entities
10+
from constants import voidElements, booleanAttributes, spaceCharacters
1111

1212
spaceCharacters = u"".join(spaceCharacters)
1313

14-
default_entity_map = {}
15-
for k, v in entities.items():
16-
if v != "&" and default_entity_map.get(v) != k.lower():
17-
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
18-
default_entity_map[v] = k
19-
2014
try:
2115
from codecs import register_error, xmlcharrefreplace_errors
2216
except ImportError:
2317
unicode_encode_errors = "strict"
2418
else:
2519
unicode_encode_errors = "htmlentityreplace"
2620

21+
from constants import entities
22+
23+
encode_entity_map = {}
24+
for k, v in entities.items():
25+
if v != "&" and encode_entity_map.get(v) != k.lower():
26+
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
27+
encode_entity_map[v] = k
28+
2729
def htmlentityreplace_errors(ex):
2830
if isinstance(ex, UnicodeEncodeError):
2931
res = []
3032
for c in ex.object[ex.start:ex.end]:
31-
c = default_entity_map.get(c)
33+
c = encode_entity_map.get(c)
3234
if c:
35+
res.append("&")
3336
res.append(c)
37+
res.append(";")
3438
else:
3539
res.append(c.encode(ex.encoding, "xmlcharrefreplace"))
3640
return (u"".join(res), ex.end)
@@ -39,7 +43,7 @@ def htmlentityreplace_errors(ex):
3943

4044
register_error(unicode_encode_errors, htmlentityreplace_errors)
4145

42-
del register_error, xmlcharrefreplace_errors
46+
del register_error
4347

4448
def _slide(iterator):
4549
previous = None
@@ -64,18 +68,22 @@ class HTMLSerializer(object):
6468

6569
strip_whitespace = False
6670

71+
inject_meta_charset = True
72+
6773
def __init__(self, **kwargs):
6874
for attr in ("quote_attr_values", "quote_char", "use_best_quote_char",
6975
"minimize_boolean_attributes", "use_trailing_solidus",
7076
"space_before_trailing_solidus", "omit_optional_tags",
71-
"strip_whitespace"):
77+
"strip_whitespace", "inject_meta_charset"):
7278
if attr in kwargs:
7379
setattr(self, attr, kwargs[attr])
7480
self.errors = []
7581

7682
def serialize(self, treewalker, encoding=None):
7783
in_cdata = False
7884
self.errors = []
85+
if encoding and self.inject_meta_charset:
86+
treewalker = self.filter_inject_meta_charset(treewalker, encoding)
7987
if self.strip_whitespace:
8088
treewalker = self.filter_whitespace(treewalker)
8189
if self.omit_optional_tags:
@@ -196,7 +204,7 @@ def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
196204
if self.strict:
197205
raise SerializeError
198206

199-
def filter_inject_meta_charset(self, treewalker):
207+
def filter_inject_meta_charset(self, treewalker, encoding):
200208
done = False
201209
for token in treewalker:
202210
if not done and token["type"] == "StartTag" \
@@ -206,8 +214,7 @@ def filter_inject_meta_charset(self, treewalker):
206214
yield token
207215

208216
def filter_whitespace(self, treewalker):
209-
# TODO
210-
return treewalker
217+
raise NotImplementedError
211218

212219
def filter_optional_tags(self, treewalker):
213220
for token, next in _slide(treewalker):
@@ -254,7 +261,7 @@ def is_optional_start(self, tagname, next):
254261
if type == "StartTag":
255262
# XXX: we do not look at the preceding event, so instead we never
256263
# omit the colgroup element's end tag when it is immediately
257-
# followed by another colgroup element. See _is_optional_end.
264+
# followed by another colgroup element. See is_optional_end.
258265
return next["name"] == "col"
259266
else:
260267
return False
@@ -266,11 +273,10 @@ def is_optional_start(self, tagname, next):
266273
if type == "StartTag":
267274
# XXX: we do not look at the preceding event, so instead we never
268275
# omit the thead and tfoot elements' end tag when they are
269-
# immediately followed by a tbody element. See _is_optional_end.
276+
# immediately followed by a tbody element. See is_optional_end.
270277
return next["name"] == 'tr'
271278
else:
272279
return False
273-
# TODO
274280
return False
275281

276282
def is_optional_end(self, tagname, next):
@@ -328,7 +334,7 @@ def is_optional_end(self, tagname, next):
328334
return False
329335
elif type == "StartTag":
330336
# XXX: we also look for an immediately following colgroup
331-
# element. See _is_optional_start.
337+
# element. See is_optional_start.
332338
return next["name"] != 'colgroup'
333339
else:
334340
return True
@@ -342,7 +348,7 @@ def is_optional_end(self, tagname, next):
342348
# is immediately followed by a tbody element, or if there is no
343349
# more content in the parent element.
344350
# XXX: we never omit the end tag when the following element is
345-
# a tbody. See _is_optional_start.
351+
# a tbody. See is_optional_start.
346352
if type == "StartTag":
347353
return next["name"] == 'tfoot'
348354
elif tagname == 'tbody':
@@ -354,7 +360,7 @@ def is_optional_end(self, tagname, next):
354360
# is immediately followed by a tbody element, or if there is no
355361
# more content in the parent element.
356362
# XXX: we never omit the end tag when the following element is
357-
# a tbody. See _is_optional_start.
363+
# a tbody. See is_optional_start.
358364
return type == "EndTag" or type is None
359365
elif tagname in ('td', 'th'):
360366
# A td element's end tag may be omitted if the td element is
@@ -367,7 +373,6 @@ def is_optional_end(self, tagname, next):
367373
return next["name"] in ('td', 'th')
368374
else:
369375
return type == "EndTag" or type is None
370-
# TODO
371376
return False
372377

373378
def SerializeError(Exception):

0 commit comments

Comments
 (0)