Skip to content

Commit c98097f

Browse files
committed
get optionaltags tests working again
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40655
1 parent 7e8123a commit c98097f

File tree

2 files changed

+177
-168
lines changed

2 files changed

+177
-168
lines changed

src/serializer.py

Lines changed: 177 additions & 167 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,182 @@ def htmlentityreplace_errors(exc):
4545

4646
del register_error
4747

48-
def _slide(iterator):
49-
previous = None
50-
for token in iterator:
51-
if previous is not None:
52-
yield previous, token
53-
previous = token
54-
yield previous, None
48+
class OptionalTagFilter:
49+
def __init__(self, source):
50+
self.source = source
51+
52+
def slider(self):
53+
previous1 = previous2 = None
54+
for token in self.source:
55+
if previous1 is not None:
56+
yield previous2, previous1, token
57+
previous2 = previous1
58+
previous1 = token
59+
yield previous2, previous1, None
60+
61+
def __iter__(self):
62+
for previous, token, next in self.slider():
63+
type = token["type"]
64+
if type == "StartTag":
65+
if token["data"] or not self.is_optional_start(token["name"], previous, next):
66+
yield token
67+
elif type == "EndTag":
68+
if not self.is_optional_end(token["name"], next):
69+
yield token
70+
else:
71+
yield token
72+
73+
def is_optional_start(self, tagname, previous, next):
74+
type = next and next["type"] or None
75+
if tagname in 'html':
76+
# An html element's start tag may be omitted if the first thing
77+
# inside the html element is not a space character or a comment.
78+
return type not in ("Comment", "SpaceCharacters")
79+
elif tagname == 'head':
80+
# A head element's start tag may be omitted if the first thing
81+
# inside the head element is an element.
82+
return type == "StartTag"
83+
elif tagname == 'body':
84+
# A body element's start tag may be omitted if the first thing
85+
# inside the body element is not a space character or a comment,
86+
# except if the first thing inside the body element is a script
87+
# or style element and the node immediately preceding the body
88+
# element is a head element whose end tag has been omitted.
89+
if type in ("Comment", "SpaceCharacters"):
90+
return False
91+
elif type == "StartTag":
92+
# XXX: we do not look at the preceding event, so we never omit
93+
# the body element's start tag if it's followed by a script or
94+
# a style element.
95+
return next["name"] not in ('script', 'style')
96+
else:
97+
return True
98+
elif tagname == 'colgroup':
99+
# A colgroup element's start tag may be omitted if the first thing
100+
# inside the colgroup element is a col element, and if the element
101+
# is not immediately preceded by another colgroup element whose
102+
# end tag has been omitted.
103+
if type == "StartTag":
104+
# XXX: we do not look at the preceding event, so instead we never
105+
# omit the colgroup element's end tag when it is immediately
106+
# followed by another colgroup element. See is_optional_end.
107+
return next["name"] == "col"
108+
else:
109+
return False
110+
elif tagname == 'tbody':
111+
# A tbody element's start tag may be omitted if the first thing
112+
# inside the tbody element is a tr element, and if the element is
113+
# not immediately preceded by a tbody, thead, or tfoot element
114+
# whose end tag has been omitted.
115+
if type == "StartTag":
116+
# XXX: is_optional_end may omit the thead and tfoot elements' end tag when they are
117+
# immediately followed by a tbody element. See is_optional_end.
118+
if previous and previous['type'] == 'EndTag' and \
119+
previous['name'] in ('tbody','thead','tfoot'):
120+
return False
121+
return next["name"] == 'tr'
122+
else:
123+
return False
124+
return False
125+
126+
def is_optional_end(self, tagname, next):
127+
type = next and next["type"] or None
128+
if tagname in ('html', 'head', 'body'):
129+
# An html element's end tag may be omitted if the html element
130+
# is not immediately followed by a space character or a comment.
131+
return type not in ("Comment", "SpaceCharacters")
132+
elif tagname in ('li', 'optgroup', 'option', 'tr'):
133+
# A li element's end tag may be omitted if the li element is
134+
# immediately followed by another li element or if there is
135+
# no more content in the parent element.
136+
# An optgroup element's end tag may be omitted if the optgroup
137+
# element is immediately followed by another optgroup element,
138+
# or if there is no more content in the parent element.
139+
# An option element's end tag may be omitted if the option
140+
# element is immediately followed by another option element,
141+
# or if there is no more content in the parent element.
142+
# A tr element's end tag may be omitted if the tr element is
143+
# immediately followed by another tr element, or if there is
144+
# no more content in the parent element.
145+
if type == "StartTag":
146+
return next["name"] == tagname
147+
else:
148+
return type == "EndTag" or type is None
149+
elif tagname in ('dt', 'dd'):
150+
# A dt element's end tag may be omitted if the dt element is
151+
# immediately followed by another dt element or a dd element.
152+
# A dd element's end tag may be omitted if the dd element is
153+
# immediately followed by another dd element or a dt element,
154+
# or if there is no more content in the parent element.
155+
if type == "StartTag":
156+
return next["name"] in ('dt', 'dd')
157+
elif tagname == 'dd':
158+
return type == "EndTag" or type is None
159+
else:
160+
return False
161+
elif tagname == 'p':
162+
# A p element's end tag may be omitted if the p element is
163+
# immediately followed by an address, blockquote, dl, fieldset,
164+
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
165+
# or ul element, or if there is no more content in the parent
166+
# element.
167+
if type == "StartTag":
168+
return next["name"] in ('address', 'blockquote', \
169+
'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
170+
'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
171+
else:
172+
return type == "EndTag" or type is None
173+
elif tagname == 'colgroup':
174+
# A colgroup element's end tag may be omitted if the colgroup
175+
# element is not immediately followed by a space character or
176+
# a comment.
177+
if type in ("Comment", "SpaceCharacters"):
178+
return False
179+
elif type == "StartTag":
180+
# XXX: we also look for an immediately following colgroup
181+
# element. See is_optional_start.
182+
return next["name"] != 'colgroup'
183+
else:
184+
return True
185+
elif tagname in ('thead', 'tbody'):
186+
# A thead element's end tag may be omitted if the thead element
187+
# is immediately followed by a tbody or tfoot element.
188+
# A tbody element's end tag may be omitted if the tbody element
189+
# is immediately followed by a tbody or tfoot element, or if
190+
# there is no more content in the parent element.
191+
# A tfoot element's end tag may be omitted if the tfoot element
192+
# is immediately followed by a tbody element, or if there is no
193+
# more content in the parent element.
194+
# XXX: the end tag is also omitted when the following element is
194+
# a tbody; is_optional_start then keeps that tbody's start tag.
196+
if type == "StartTag":
197+
return next["name"] in ['tbody', 'tfoot']
198+
elif tagname == 'tbody':
199+
return type == "EndTag" or type is None
200+
else:
201+
return False
202+
elif tagname == 'tfoot':
203+
# A tfoot element's end tag may be omitted if the tfoot element
204+
# is immediately followed by a tbody element, or if there is no
205+
# more content in the parent element.
206+
# XXX: the end tag is also omitted when the following element is
206+
# a tbody; is_optional_start then keeps that tbody's start tag.
208+
if type == "StartTag":
209+
return next["name"] == 'tbody'
210+
else:
211+
return type == "EndTag" or type is None
212+
elif tagname in ('td', 'th'):
213+
# A td element's end tag may be omitted if the td element is
214+
# immediately followed by a td or th element, or if there is
215+
# no more content in the parent element.
216+
# A th element's end tag may be omitted if the th element is
217+
# immediately followed by a td or th element, or if there is
218+
# no more content in the parent element.
219+
if type == "StartTag":
220+
return next["name"] in ('td', 'th')
221+
else:
222+
return type == "EndTag" or type is None
223+
return False
55224

56225
class HTMLSerializer(object):
57226
cdata_elements = frozenset(("style", "script", "xmp", "iframe", "noembed", "noframes", "noscript"))
@@ -89,7 +258,7 @@ def serialize(self, treewalker, encoding=None):
89258
if self.strip_whitespace:
90259
treewalker = self.filter_whitespace(treewalker)
91260
if self.omit_optional_tags:
92-
treewalker = self.filter_optional_tags(treewalker)
261+
treewalker = OptionalTagFilter(treewalker)
93262
for token in treewalker:
94263
type = token["type"]
95264
if type == "Doctype":
@@ -218,165 +387,6 @@ def filter_inject_meta_charset(self, treewalker, encoding):
218387
def filter_whitespace(self, treewalker):
219388
raise NotImplementedError
220389

221-
def filter_optional_tags(self, treewalker):
222-
for token, next in _slide(treewalker):
223-
type = token["type"]
224-
if type == "StartTag":
225-
if token["data"] or not self.is_optional_start(token["name"], next):
226-
yield token
227-
elif type == "EndTag":
228-
if not self.is_optional_end(token["name"], next):
229-
yield token
230-
else:
231-
yield token
232-
233-
def is_optional_start(self, tagname, next):
234-
type = next and next["type"] or None
235-
if tagname in 'html':
236-
# An html element's start tag may be omitted if the first thing
237-
# inside the html element is not a space character or a comment.
238-
return type not in ("Comment", "SpaceCharacters")
239-
elif tagname == 'head':
240-
# A head element's start tag may be omitted if the first thing
241-
# inside the head element is an element.
242-
return type == "StartTag"
243-
elif tagname == 'body':
244-
# A body element's start tag may be omitted if the first thing
245-
# inside the body element is not a space character or a comment,
246-
# except if the first thing inside the body element is a script
247-
# or style element and the node immediately preceding the body
248-
# element is a head element whose end tag has been omitted.
249-
if type in ("Comment", "SpaceCharacters"):
250-
return False
251-
elif type == "StartTag":
252-
# XXX: we do not look at the preceding event, so we never omit
253-
# the body element's start tag if it's followed by a script or
254-
# a style element.
255-
return next["name"] not in ('script', 'style')
256-
else:
257-
return True
258-
elif tagname == 'colgroup':
259-
# A colgroup element's start tag may be omitted if the first thing
260-
# inside the colgroup element is a col element, and if the element
261-
# is not immediately preceded by another colgroup element whose
262-
# end tag has been omitted.
263-
if type == "StartTag":
264-
# XXX: we do not look at the preceding event, so instead we never
265-
# omit the colgroup element's end tag when it is immediately
266-
# followed by another colgroup element. See is_optional_end.
267-
return next["name"] == "col"
268-
else:
269-
return False
270-
elif tagname == 'tbody':
271-
# A tbody element's start tag may be omitted if the first thing
272-
# inside the tbody element is a tr element, and if the element is
273-
# not immediately preceded by a tbody, thead, or tfoot element
274-
# whose end tag has been omitted.
275-
if type == "StartTag":
276-
# XXX: we do not look at the preceding event, so instead we never
277-
# omit the thead and tfoot elements' end tag when they are
278-
# immediately followed by a tbody element. See is_optional_end.
279-
return next["name"] == 'tr'
280-
else:
281-
return False
282-
return False
283-
284-
def is_optional_end(self, tagname, next):
285-
type = next and next["type"] or None
286-
if tagname in ('html', 'head', 'body'):
287-
# An html element's end tag may be omitted if the html element
288-
# is not immediately followed by a space character or a comment.
289-
return type not in ("Comment", "SpaceCharacters")
290-
elif tagname in ('li', 'optgroup', 'option', 'tr'):
291-
# A li element's end tag may be omitted if the li element is
292-
# immediately followed by another li element or if there is
293-
# no more content in the parent element.
294-
# An optgroup element's end tag may be omitted if the optgroup
295-
# element is immediately followed by another optgroup element,
296-
# or if there is no more content in the parent element.
297-
# An option element's end tag may be omitted if the option
298-
# element is immediately followed by another option element,
299-
# or if there is no more content in the parent element.
300-
# A tr element's end tag may be omitted if the tr element is
301-
# immediately followed by another tr element, or if there is
302-
# no more content in the parent element.
303-
if type == "StartTag":
304-
return next["name"] == tagname
305-
else:
306-
return type == "EndTag" or type is None
307-
elif tagname in ('dt', 'dd'):
308-
# A dt element's end tag may be omitted if the dt element is
309-
# immediately followed by another dt element or a dd element.
310-
# A dd element's end tag may be omitted if the dd element is
311-
# immediately followed by another dd element or a dt element,
312-
# or if there is no more content in the parent element.
313-
if type == "StartTag":
314-
return next["name"] in ('dt', 'dd')
315-
elif tagname == 'dd':
316-
return type == "EndTag" or type is None
317-
else:
318-
return False
319-
elif tagname == 'p':
320-
# A p element's end tag may be omitted if the p element is
321-
# immediately followed by an address, blockquote, dl, fieldset,
322-
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
323-
# or ul element, or if there is no more content in the parent
324-
# element.
325-
if type == "StartTag":
326-
return next["name"] in ('address', 'blockquote', \
327-
'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
328-
'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
329-
else:
330-
return type == "EndTag" or type is None
331-
elif tagname == 'colgroup':
332-
# A colgroup element's end tag may be omitted if the colgroup
333-
# element is not immediately followed by a space character or
334-
# a comment.
335-
if type in ("Comment", "SpaceCharacters"):
336-
return False
337-
elif type == "StartTag":
338-
# XXX: we also look for an immediately following colgroup
339-
# element. See is_optional_start.
340-
return next["name"] != 'colgroup'
341-
else:
342-
return True
343-
elif tagname in ('thead', 'tbody'):
344-
# A thead element's end tag may be omitted if the thead element
345-
# is immediately followed by a tbody or tfoot element.
346-
# A tbody element's end tag may be omitted if the tbody element
347-
# is immediately followed by a tbody or tfoot element, or if
348-
# there is no more content in the parent element.
349-
# A tfoot element's end tag may be omitted if the tfoot element
350-
# is immediately followed by a tbody element, or if there is no
351-
# more content in the parent element.
352-
# XXX: we never omit the end tag when the following element is
353-
# a tbody. See is_optional_start.
354-
if type == "StartTag":
355-
return next["name"] == 'tfoot'
356-
elif tagname == 'tbody':
357-
return type == "EndTag" or type is None
358-
else:
359-
return False
360-
elif tagname == 'tfoot':
361-
# A tfoot element's end tag may be omitted if the tfoot element
362-
# is immediately followed by a tbody element, or if there is no
363-
# more content in the parent element.
364-
# XXX: we never omit the end tag when the following element is
365-
# a tbody. See is_optional_start.
366-
return type == "EndTag" or type is None
367-
elif tagname in ('td', 'th'):
368-
# A td element's end tag may be omitted if the td element is
369-
# immediately followed by a td or th element, or if there is
370-
# no more content in the parent element.
371-
# A th element's end tag may be omitted if the th element is
372-
# immediately followed by a td or th element, or if there is
373-
# no more content in the parent element.
374-
if type == "StartTag":
375-
return next["name"] in ('td', 'th')
376-
else:
377-
return type == "EndTag" or type is None
378-
return False
379-
380390
def SerializeError(Exception):
381391
"""Error in serialized tree"""
382392
pass

tests/test_serializer.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ def serialize_html(self, input, options):
7777

7878
def test_serializer():
7979
for filename in glob.glob('serializer/*.test'):
80-
if filename.find('optionaltags')>=0: continue # TODO
8180
tests = simplejson.load(file(filename))
8281
for test in tests['tests']:
8382
yield test

0 commit comments

Comments
 (0)