99>>> p.parse('<!doctype html>\n <html foo=bar></html>')
1010<<class 'html5lib.treebuilders.simpletree.Document'> None>
1111>>> p.errors
12- [((2, 14), 'unrecognized -attribute', {'attributeName': u'foo', 'tagName': u'html'})]
12+ [((2, 14), 'unknown -attribute', {'attributeName': u'foo', 'tagName': u'html'})]
1313"""
1414
15+ try :
16+ frozenset
17+ except NameError :
18+ # Import from the sets module for python 2.3
19+ from sets import Set as set
20+ from sets import ImmutableSet as frozenset
1521import _base
1622from html5lib .constants import E
1723from html5lib import tokenizer
1824import gettext
1925_ = gettext .gettext
2026
# Register the error codes emitted by the conformance checker together with
# their (translatable) message templates.  The %(...)s placeholders are
# presumably filled from the "datavars" dict of the matching ParseError token.
E.update({
    "unknown-start-tag":
        _(u"Unknown start tag <%(tagName)s>"),
    "unknown-attribute":
        _(u"Unknown '%(attributeName)s' attribute on <%(tagName)s>"),
    "missing-required-attribute":
        _(u"Missing required '%(attributeName)s' attribute on <%(tagName)s>"),
    "unknown-input-type":
        _(u"Unknown value for input type: '%(inputType)s'"),
    "attribute-not-allowed-on-this-input-type":
        _(u"'%(attributeName)s' attribute is not allowed on <input type='%(inputType)s'>"),
})
2739
# Attribute names accepted on every element, whatever its tag name:
# first the general-purpose attributes, then the event handler attributes.
# XXX lang in HTML only, xml:lang in XHTML only
globalAttributes = frozenset((
    'class',
    'contenteditable',
    'contextmenu',
    'dir',
    'draggable',
    'id',
    'irrelevant',
    'lang',
    'ref',
    'tabindex',
    'template',
    'title',
) + (
    'onabort',
    'onbeforeunload',
    'onblur',
    'onchange',
    'onclick',
    'oncontextmenu',
    'ondblclick',
    'ondrag',
    'ondragend',
    'ondragenter',
    'ondragleave',
    'ondragover',
    'ondragstart',
    'ondrop',
    'onerror',
    'onfocus',
    'onkeydown',
    'onkeypress',
    'onkeyup',
    'onload',
    'onmessage',
    'onmousedown',
    'onmousemove',
    'onmouseout',
    'onmouseover',
    'onmouseup',
    'onmousewheel',
    'onresize',
    'onscroll',
    'onselect',
    'onsubmit',
    'onunload',
))
3749
# Maps every recognized element name to the set of element-specific
# attributes it accepts (global attributes are listed separately in
# globalAttributes).  An element missing from this map is reported as an
# unknown start tag by checkUnknownStartTag.

# Elements that accept no attributes beyond the global ones.
allowedAttributeMap = dict.fromkeys([
    'head', 'title', 'body', 'section', 'nav', 'article', 'aside',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'header', 'footer', 'address', 'p', 'hr', 'br', 'dialog', 'pre',
    'ul', 'dl', 'dt', 'dd',
    'cite', 'em', 'strong', 'small', 'm', 'dfn', 'abbr',
    'code', 'var', 'samp', 'kbd', 'sup', 'sub', 'span', 'i', 'b', 'bdo',
    'figure', 'map', 'table', 'caption',
    'tbody', 'thead', 'tfoot', 'tr',
    'noscript', 'noembed',
    'datatemplate', 'rule', 'nest', 'legend', 'div',
], frozenset([]))

# Elements that define attributes of their own.
allowedAttributeMap.update({
    'html': frozenset(['xmlns']),
    'base': frozenset(['href', 'target']),
    'link': frozenset(['href', 'rel', 'media', 'hreflang', 'type']),
    # XXX charset in HTML only
    'meta': frozenset(['name', 'http-equiv', 'content', 'charset']),
    'style': frozenset(['media', 'type', 'scoped']),
    'blockquote': frozenset(['cite']),
    'ol': frozenset(['start']),
    # XXX depends on parent
    'li': frozenset(['value']),
    'a': frozenset(['href', 'target', 'ping', 'rel', 'media', 'hreflang',
                    'type']),
    'q': frozenset(['cite']),
    'time': frozenset(['datetime']),
    'meter': frozenset(['value', 'min', 'low', 'high', 'max', 'optimum']),
    'progress': frozenset(['value', 'max']),
    'ins': frozenset(['cite', 'datetime']),
    'del': frozenset(['cite', 'datetime']),
    # XXX ismap depends on parent
    'img': frozenset(['alt', 'src', 'usemap', 'ismap', 'height', 'width']),
    'iframe': frozenset(['src']),
    # <embed> handled separately
    'object': frozenset(['data', 'type', 'usemap', 'height', 'width']),
    'param': frozenset(['name', 'value']),
    'video': frozenset(['src', 'autoplay', 'start', 'loopstart', 'loopend',
                        'end', 'loopcount', 'controls']),
    'audio': frozenset(['src', 'autoplay', 'start', 'loopstart', 'loopend',
                        'end', 'loopcount', 'controls']),
    'source': frozenset(['src', 'type', 'media']),
    'canvas': frozenset(['height', 'width']),
    'area': frozenset(['alt', 'coords', 'shape', 'href', 'target', 'ping',
                       'rel', 'media', 'hreflang', 'type']),
    # XXX only if element contains no <col> elements
    'colgroup': frozenset(['span']),
    'col': frozenset(['span']),
    'td': frozenset(['colspan', 'rowspan']),
    'th': frozenset(['colspan', 'rowspan', 'scope']),
    # 'form': frozenset(('action', 'method', 'enctype', 'accept', 'name', 'onsubmit',
    #                    'onreset', 'accept-charset', 'data', 'replace')),
    # all possible <input> attributes are listed here but <input> is really handled separately
    'input': frozenset(['accept', 'accesskey', 'action', 'alt',
                        'autocomplete', 'autofocus', 'checked', 'disabled',
                        'enctype', 'form', 'inputmode', 'list', 'maxlength',
                        'method', 'min', 'max', 'name', 'pattern', 'step',
                        'readonly', 'replace', 'required', 'size', 'src',
                        'tabindex', 'target', 'template', 'value']),
    # 'button': frozenset(('name', 'value', 'type', 'disabled', 'form', 'autofocus')),
    # 'select': frozenset(('name', 'size', 'multiple', 'disabled', 'data', 'accesskey',
    #                      'form', 'autofocus')),
    # 'optgroup': frozenset(('disabled', 'label', 'form', 'autofocus')),
    # 'option': frozenset(('selected', 'disabled', 'label', 'value', 'form', 'autofocus')),
    # 'textarea': frozenset(('name', 'rows', 'cols', 'disabled', 'readonly', 'required',
    #                        'form', 'autofocus', 'wrap', 'accept')),
    # 'label': frozenset(('for', 'accesskey', 'form')),
    # 'fieldset': frozenset(('disabled', 'form')),
    # 'output': frozenset(('form', 'name', 'for', 'onforminput', 'onformchange')),
    # 'datalist': frozenset(('data',)),
    # XXX repetition model for repeating form controls
    'script': frozenset(['src', 'defer', 'async', 'type']),
    'event-source': frozenset(['src']),
    'details': frozenset(['open']),
    'datagrid': frozenset(['multiple', 'disabled']),
    'command': frozenset(['type', 'label', 'icon', 'hidden', 'disabled',
                          'checked', 'radiogroup', 'default']),
    'menu': frozenset(['type', 'label', 'autosubmit']),
    'font': frozenset(['style']),
})
80165
# Attributes that must be present on a start tag, keyed by element name.
# Elements without an entry have no required attributes.
requiredAttributeMap = {
    'bdo': frozenset(['dir']),
    'embed': frozenset(['src']),
    'img': frozenset(['src']),
    'link': frozenset(['href', 'rel']),
    'map': frozenset(['id']),
    # XXX one of 'data' or 'type' is required
    'object': frozenset([]),
    'param': frozenset(['name', 'value']),
    'source': frozenset(['src']),
}
176+
# For each known value of <input type=...>, the attributes that may be used
# together with that type.  Types whose attribute sets are identical are
# registered as a group further down.
inputTypeAllowedAttributeMap = {
    'text': frozenset([
        'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
        'inputmode', 'list', 'maxlength', 'name', 'pattern', 'readonly',
        'required', 'size', 'tabindex', 'value']),
    'password': frozenset([
        'accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
        'inputmode', 'maxlength', 'name', 'pattern', 'readonly',
        'required', 'size', 'tabindex', 'value']),
    'submit': frozenset([
        'accesskey', 'action', 'autofocus', 'disabled', 'enctype', 'form',
        'method', 'name', 'replace', 'tabindex', 'target', 'value']),
    'add': frozenset([
        'accesskey', 'autofocus', 'disabled', 'form', 'name', 'tabindex',
        'template', 'value']),
    'file': frozenset([
        'accept', 'accesskey', 'autofocus', 'disabled', 'form', 'min',
        'max', 'name', 'required', 'tabindex']),
    'hidden': frozenset(['disabled', 'form', 'name', 'value']),
    'image': frozenset([
        'accesskey', 'action', 'alt', 'autofocus', 'disabled', 'enctype',
        'form', 'method', 'name', 'replace', 'src', 'tabindex', 'target']),
}

# Checkable controls.
inputTypeAllowedAttributeMap.update(dict.fromkeys(
    ['checkbox', 'radio'],
    frozenset(['accesskey', 'autofocus', 'checked', 'disabled', 'form',
               'name', 'required', 'tabindex', 'value'])))

# Plain buttons and the repetition-model buttons other than 'add'.
inputTypeAllowedAttributeMap.update(dict.fromkeys(
    ['button', 'reset', 'remove', 'move-up', 'move-down'],
    frozenset(['accesskey', 'autofocus', 'disabled', 'form', 'name',
               'tabindex', 'value'])))

# Date/time and numeric controls.
inputTypeAllowedAttributeMap.update(dict.fromkeys(
    ['datetime', 'datetime-local', 'date', 'month', 'week', 'time',
     'number', 'range'],
    frozenset(['accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
               'list', 'min', 'max', 'name', 'step', 'readonly', 'required',
               'tabindex', 'value'])))

# Text-like controls that take no 'size'.
inputTypeAllowedAttributeMap.update(dict.fromkeys(
    ['email', 'url'],
    frozenset(['accesskey', 'autocomplete', 'autofocus', 'disabled', 'form',
               'inputmode', 'list', 'maxlength', 'name', 'pattern',
               'readonly', 'required', 'tabindex', 'value'])))
91203
92204class HTMLConformanceChecker (_base .Filter ):
@@ -96,31 +208,76 @@ def __init__(self, stream, encoding, parseMeta, **kwargs):
96208
def __iter__(self):
    """Yield every token of the wrapped stream, preceded by its errors.

    For each token the most specific validator available on this object
    is used: ``validate<Type><Name>`` (for example
    ``validateStartTagInput``) if it exists, otherwise ``validate<Type>``.
    Whatever tokens the validator produces are yielded first, then the
    original token itself.
    """
    for token in _base.Filter.__iter__(self):
        # Tokens lacking a type or a name get "-" as a placeholder, which
        # never matches a validator method name.
        tokenType = token.get("type", "-")
        tokenName = token.get("name", "-").capitalize()
        validator = getattr(
            self, "validate%s%s" % (tokenType, tokenName), None)
        if not validator:
            # No element-specific validator: use the per-type one, if any.
            validator = getattr(self, "validate%s" % tokenType, None)
        if validator:
            for error in validator(token) or []:
                yield error
        yield token
221+
def validateStartTag(self, token):
    """Run the generic start-tag checks and yield the errors they report.

    The checks run in a fixed order: unknown tag name, missing required
    attributes, then unrecognized attributes.
    """
    checks = (self.checkUnknownStartTag,
              self.checkStartTagRequiredAttributes,
              self.checkStartTagUnknownAttributes)
    for check in checks:
        # A check may return None instead of an iterable of error tokens.
        for error in check(token) or []:
            yield error
226+
def validateStartTagEmbed(self, token):
    """Validate an <embed> start tag: only required attributes are checked.

    The spec says <embed> accepts "any attributes w/o namespace", so
    checkStartTagUnknownAttributes is deliberately not called here.
    """
    requiredAttributeErrors = self.checkStartTagRequiredAttributes(token)
    for error in requiredAttributeErrors or []:
        yield error
231+
def validateStartTagInput(self, token):
    """Validate the attributes of an <input> start tag against its type.

    Attribute names are lower-cased first; the input type defaults to
    "text" when no ``type`` attribute is given.  Yields ParseError tokens:

    * "unknown-input-type" when the type is not a key of
      inputTypeAllowedAttributeMap;
    * "unknown-attribute" for an attribute that is neither an <input>
      attribute nor a global attribute;
    * "attribute-not-allowed-on-this-input-type" for an <input> attribute
      that is not permitted for the effective type.
    """
    attrDict = dict([(name.lower(), value) for name, value in token["data"]])
    inputType = attrDict.get("type", "text")
    if inputType not in inputTypeAllowedAttributeMap:
        yield {"type": "ParseError",
               "data": "unknown-input-type",
               # The message template for this code interpolates
               # %(inputType)s; "attrValue" is kept so that existing
               # consumers of the old key keep working.
               "datavars": {"attrValue": inputType,
                            "inputType": inputType}}
    allowedAttributes = inputTypeAllowedAttributeMap.get(inputType, [])
    inputAttributes = allowedAttributeMap['input']
    for attrName, attrValue in attrDict.items():
        if attrName == "type":
            # The type attribute selects the rule set; it is not itself
            # subject to the per-type restrictions.
            continue
        if attrName not in inputAttributes:
            # Global attributes (class, id, ...) are valid on every
            # element, so only truly unrecognized names are reported.
            if attrName not in globalAttributes:
                yield {"type": "ParseError",
                       "data": "unknown-attribute",
                       "datavars": {"tagName": "input",
                                    "attributeName": attrName}}
        elif attrName not in allowedAttributes:
            yield {"type": "ParseError",
                   "data": "attribute-not-allowed-on-this-input-type",
                   "datavars": {"attributeName": attrName,
                                "inputType": inputType}}
251+
def checkUnknownStartTag(self, token):
    """Yield an "unknown-start-tag" error if the tag name is not recognized.

    A tag is recognized when its lower-cased name is a key of
    allowedAttributeMap.
    """
    tagName = token["name"].lower()
    if tagName in allowedAttributeMap:
        return
    yield {"type": "ParseError",
           "data": "unknown-start-tag",
           "datavars": {"tagName": tagName}}
259+
def checkStartTagRequiredAttributes(self, token):
    """Yield a "missing-required-attribute" error per absent attribute.

    The attributes required for the (lower-cased) tag name come from
    requiredAttributeMap; tags without an entry, or with an empty entry,
    produce no errors.  Attribute names present on the token are
    lower-cased before comparison, matching how the unknown-attribute
    checks treat them.
    """
    name = token["name"].lower()
    requiredAttributes = requiredAttributeMap.get(name)
    if not requiredAttributes:
        return
    attrsPresent = frozenset([attrName.lower()
                              for attrName, attrValue in token["data"]])
    for attrName in requiredAttributes:
        if attrName not in attrsPresent:
            yield {"type": "ParseError",
                   "data": "missing-required-attribute",
                   "datavars": {"tagName": name,
                                "attributeName": attrName}}
272+
def checkStartTagUnknownAttributes(self, token):
    """Yield an "unknown-attribute" error per unrecognized attribute.

    An attribute is recognized when its lower-cased name is either a
    global attribute or listed for the (lower-cased) tag name in
    allowedAttributeMap.  The error reports the attribute name exactly as
    it appears on the token.
    """
    tagName = token["name"].lower()
    elementAttributes = allowedAttributeMap.get(tagName, frozenset(()))
    recognized = globalAttributes | elementAttributes
    for attrName, unusedValue in token["data"]:
        if attrName.lower() in recognized:
            continue
        yield {"type": "ParseError",
               "data": "unknown-attribute",
               "datavars": {"tagName": tagName,
                            "attributeName": attrName}}
283+
0 commit comments