@@ -45,13 +45,182 @@ def htmlentityreplace_errors(exc):
4545
4646 del register_error
4747
48- def _slide (iterator ):
49- previous = None
50- for token in iterator :
51- if previous is not None :
52- yield previous , token
53- previous = token
54- yield previous , None
class OptionalTagFilter(object):
    """Iterable wrapper that drops start/end tags HTML allows to be omitted.

    ``source`` is an iterable of token dictionaries.  Each token has a
    ``"type"`` key; ``"StartTag"`` tokens also carry ``"name"`` and
    ``"data"`` (the attributes), ``"EndTag"`` tokens carry ``"name"``.
    Iterating over the filter yields the same tokens minus the tags that
    ``is_optional_start`` / ``is_optional_end`` report as omissible.
    """

    def __init__(self, source):
        self.source = source

    def slider(self):
        """Yield ``(previous, current, next)`` triples over the source.

        ``previous`` is ``None`` for the first token and ``next`` is
        ``None`` for the last one.  An empty source yields nothing.
        """
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        # Flush the last token only if the source was not empty; otherwise
        # __iter__ would try to index a None token.
        if previous1 is not None:
            yield previous2, previous1, None

    def __iter__(self):
        for previous, token, following in self.slider():
            token_type = token["type"]
            if token_type == "StartTag":
                # A start tag carrying attributes can never be omitted.
                if token["data"] or not self.is_optional_start(
                        token["name"], previous, following):
                    yield token
            elif token_type == "EndTag":
                if not self.is_optional_end(token["name"], following):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        """Return True if the start tag of *tagname* may be omitted.

        *previous* and *next* are the tokens immediately before and after
        the start tag in the source stream (``None`` at either end).
        """
        next_type = None
        if next:
            next_type = next["type"] or None
        if tagname == 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            return next_type == "StartTag"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceded by another colgroup element whose
            # end tag has been omitted.
            if next_type == "StartTag":
                # XXX: we do not look at the preceding event, so instead we
                # never omit the colgroup element's end tag when it is
                # immediately followed by another colgroup element.
                # See is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if next_type == "StartTag":
                # The end tag of a preceding tbody/thead/tfoot may have been
                # omitted (see is_optional_end), so to stay safe the start
                # tag is kept whenever such an end tag comes right before it.
                if previous and previous['type'] == 'EndTag' and \
                        previous['name'] in ('tbody', 'thead', 'tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        """Return True if the end tag of *tagname* may be omitted.

        *next* is the token immediately following the end tag in the source
        stream, or ``None`` when the end tag is the last token.
        """
        next_type = None
        if next:
            next_type = next["type"] or None
        # "No more content in the parent element": the parent is being
        # closed right away, or the stream ends.
        parent_ends = next_type == "EndTag" or next_type is None
        if tagname in ('html', 'head', 'body'):
            # An html, head or body element's end tag may be omitted if the
            # element is not immediately followed by a space character or a
            # comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'option', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] == tagname
            else:
                return parent_ends
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return parent_ends
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, blockquote, dl, fieldset,
            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
            # or ul element, or if there is no more content in the parent
            # element.
            if next_type == "StartTag":
                return next["name"] in (
                    'address', 'blockquote', 'dl', 'fieldset', 'form',
                    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'menu',
                    'ol', 'p', 'pre', 'table', 'ul')
            else:
                return parent_ends
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # When the following element is a tbody, its start tag is kept
            # by is_optional_start, so omitting this end tag is safe.
            if next_type == "StartTag":
                return next["name"] in ('tbody', 'tfoot')
            elif tagname == 'tbody':
                return parent_ends
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # The following tbody's start tag is kept by is_optional_start.
            if next_type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return parent_ends
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return parent_ends
        return False
55224
56225class HTMLSerializer (object ):
57226 cdata_elements = frozenset (("style" , "script" , "xmp" , "iframe" , "noembed" , "noframes" , "noscript" ))
@@ -89,7 +258,7 @@ def serialize(self, treewalker, encoding=None):
89258 if self .strip_whitespace :
90259 treewalker = self .filter_whitespace (treewalker )
91260 if self .omit_optional_tags :
92- treewalker = self . filter_optional_tags (treewalker )
261+ treewalker = OptionalTagFilter (treewalker )
93262 for token in treewalker :
94263 type = token ["type" ]
95264 if type == "Doctype" :
@@ -218,165 +387,6 @@ def filter_inject_meta_charset(self, treewalker, encoding):
def filter_whitespace(self, treewalker):
    """Whitespace-stripping filter; not implemented.

    Always raises ``NotImplementedError``, whatever *treewalker* is.
    """
    raise NotImplementedError
220389
def filter_optional_tags(self, treewalker):
    """Yield the tokens of *treewalker*, skipping omissible tags.

    Each token is examined together with the token that follows it (as
    produced by ``_slide``).  Start tags without attributes are dropped
    when ``is_optional_start`` allows it, end tags are dropped when
    ``is_optional_end`` allows it, and every other token passes through.
    """
    for current, following in _slide(treewalker):
        kind = current["type"]
        if kind == "StartTag":
            # Attributes ("data") force the start tag to be kept.
            if not current["data"] and \
                    self.is_optional_start(current["name"], following):
                continue
        elif kind == "EndTag":
            if self.is_optional_end(current["name"], following):
                continue
        yield current
232-
def is_optional_start(self, tagname, next):
    """Return True if the start tag of *tagname* may be omitted.

    *next* is the token that immediately follows the start tag in the
    stream (a dictionary with a ``"type"`` key and, for start tags, a
    ``"name"`` key), or ``None`` when there is no following token.
    """
    next_type = None
    if next:
        next_type = next["type"] or None
    # Exact comparison: the previous ``tagname in 'html'`` was a substring
    # test and also matched names such as '', 'h' or 'tm'.
    if tagname == 'html':
        # An html element's start tag may be omitted if the first thing
        # inside the html element is not a space character or a comment.
        return next_type not in ("Comment", "SpaceCharacters")
    elif tagname == 'head':
        # A head element's start tag may be omitted if the first thing
        # inside the head element is an element.
        return next_type == "StartTag"
    elif tagname == 'body':
        # A body element's start tag may be omitted if the first thing
        # inside the body element is not a space character or a comment,
        # except if the first thing inside the body element is a script
        # or style element and the node immediately preceding the body
        # element is a head element whose end tag has been omitted.
        if next_type in ("Comment", "SpaceCharacters"):
            return False
        elif next_type == "StartTag":
            # XXX: we do not look at the preceding event, so we never omit
            # the body element's start tag if it's followed by a script or
            # a style element.
            return next["name"] not in ('script', 'style')
        else:
            return True
    elif tagname == 'colgroup':
        # A colgroup element's start tag may be omitted if the first thing
        # inside the colgroup element is a col element, and if the element
        # is not immediately preceded by another colgroup element whose
        # end tag has been omitted.
        if next_type == "StartTag":
            # XXX: we do not look at the preceding event, so instead we never
            # omit the colgroup element's end tag when it is immediately
            # followed by another colgroup element. See is_optional_end.
            return next["name"] == "col"
        else:
            return False
    elif tagname == 'tbody':
        # A tbody element's start tag may be omitted if the first thing
        # inside the tbody element is a tr element, and if the element is
        # not immediately preceded by a tbody, thead, or tfoot element
        # whose end tag has been omitted.
        if next_type == "StartTag":
            # XXX: we do not look at the preceding event, so instead we never
            # omit the thead and tfoot elements' end tag when they are
            # immediately followed by a tbody element. See is_optional_end.
            return next["name"] == 'tr'
        else:
            return False
    return False
283-
def is_optional_end(self, tagname, next):
    """Return True if the end tag of *tagname* may be omitted.

    *next* is the token right after the end tag (a dictionary with a
    ``"type"`` key and, for start tags, a ``"name"`` key), or ``None``
    when nothing follows.
    """
    if next:
        following_kind = next["type"] or None
    else:
        following_kind = None

    # Facts about the following token shared by most of the rules below.
    starts_element = following_kind == "StartTag"
    parent_done = following_kind == "EndTag" or following_kind is None
    not_space_or_comment = following_kind not in ("Comment", "SpaceCharacters")

    if tagname in ('html', 'head', 'body'):
        # The end tag may be omitted if the element is not immediately
        # followed by a space character or a comment.
        return not_space_or_comment

    if tagname in ('li', 'optgroup', 'option', 'tr'):
        # The end tag may be omitted if the element is immediately
        # followed by another element of the same kind, or if there is
        # no more content in the parent element.
        if starts_element:
            return next["name"] == tagname
        return parent_done

    if tagname in ('dt', 'dd'):
        # dt: omissible when immediately followed by a dt or dd element.
        # dd: omissible when immediately followed by a dd or dt element,
        # or if there is no more content in the parent element.
        if starts_element:
            return next["name"] in ('dt', 'dd')
        return tagname == 'dd' and parent_done

    if tagname == 'p':
        # Omissible when immediately followed by one of the elements
        # listed below, or if there is no more content in the parent
        # element.
        if starts_element:
            return next["name"] in (
                'address', 'blockquote', 'dl', 'fieldset', 'form',
                'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'menu',
                'ol', 'p', 'pre', 'table', 'ul')
        return parent_done

    if tagname == 'colgroup':
        # Omissible when not immediately followed by a space character
        # or a comment.
        # XXX: we also look for an immediately following colgroup
        # element. See is_optional_start.
        if starts_element:
            return next["name"] != 'colgroup'
        return not_space_or_comment

    if tagname in ('thead', 'tbody'):
        # thead: omissible when immediately followed by a tbody or tfoot
        # element.
        # tbody: omissible when immediately followed by a tbody or tfoot
        # element, or if there is no more content in the parent element.
        # XXX: we never omit the end tag when the following element is
        # a tbody. See is_optional_start.
        if starts_element:
            return next["name"] == 'tfoot'
        return tagname == 'tbody' and parent_done

    if tagname == 'tfoot':
        # Omissible when immediately followed by a tbody element, or if
        # there is no more content in the parent element.
        # XXX: we never omit the end tag when the following element is
        # a tbody. See is_optional_start.
        return parent_done

    if tagname in ('td', 'th'):
        # Omissible when immediately followed by a td or th element, or
        # if there is no more content in the parent element.
        if starts_element:
            return next["name"] in ('td', 'th')
        return parent_done

    return False
379-
class SerializeError(Exception):
    """Error in serialized tree"""
    # Declared with ``class`` (it was mistakenly a ``def``) so that it can
    # actually be raised and caught as an exception.
    pass
0 commit comments