Skip to content

Commit e2fd652

Browse files
committed
Move case folding into the tokenizer. We now fail 4 tests, although some of those failures are bugs in the test harness.
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40892
1 parent 1edb2e8 commit e2fd652

7 files changed

Lines changed: 73 additions & 60 deletions

File tree

src/html5lib/html5parser.py

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ class HTMLParser(object):
3232
"""HTML parser. Generates a tree structure from a stream of (possibly
3333
malformed) HTML"""
3434

35-
def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer):
35+
def __init__(self, strict = False, tree=simpletree.TreeBuilder,
36+
tokenizer=tokenizer.HTMLTokenizer):
3637
"""
3738
strict - raise an exception when a parse error is encountered
3839
@@ -73,14 +74,14 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokeni
7374
}
7475

7576
def _parse(self, stream, innerHTML=False, container="div",
76-
encoding=None):
77+
encoding=None, **kwargs):
7778

7879
self.tree.reset()
7980
self.firstStartTag = False
8081
self.errors = []
8182

82-
self.tokenizer = self.tokenizer_class(stream, encoding,
83-
parseMeta=not innerHTML)
83+
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
84+
parseMeta=not innerHTML, **kwargs)
8485

8586
if innerHTML:
8687
self.innerHTML = container.lower()
@@ -176,26 +177,11 @@ def normalizeToken(self, token):
176177
token["type"] = "StartTag"
177178

178179
if token["type"] == "StartTag":
179-
token["name"] = token["name"].translate(asciiUpper2Lower)
180-
181-
# We need to remove the duplicate attributes and convert attributes
182-
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
183-
184-
# AT When Python 2.4 is widespread we should use
185-
# dict(reversed(token.data))
186-
if token["data"]:
187-
token["data"] = dict([(attr.translate(asciiUpper2Lower), value)
188-
for attr,value in token["data"][::-1]])
189-
else:
190-
token["data"] = {}
191-
192-
elif token["type"] == "EndTag":
193-
if token["data"]:
194-
self.parseError(_("End tag contains unexpected attributes."))
195-
token["name"] = token["name"].lower()
180+
token["data"] = dict(token["data"][::-1])
196181

197182
return token
198183

184+
199185
def resetInsertionMode(self):
200186
# The name of this method is mostly historical. (It's also used in the
201187
# specification.)

src/html5lib/liberalxmlparser.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,28 +27,21 @@ class XMLParser(html5parser.HTMLParser):
2727

2828
def __init__(self, *args, **kwargs):
2929
html5parser.HTMLParser.__init__(self, *args, **kwargs)
30+
3031
self.phases["initial"] = XmlRootPhase(self, self.tree)
3132

3233
def normalizeToken(self, token):
33-
if token["type"] == "StartTag" or token["type"] == "EmptyTag":
34-
# We need to remove the duplicate attributes and convert attributes
35-
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
3634

37-
# AT When Python 2.4 is widespread we should use
38-
# dict(reversed(token.data))
35+
if token["type"] in ("StartTag", "EmptyTag"):
3936
token["data"] = dict(token["data"][::-1])
4037

41-
# For EmptyTags, process both a Start and an End tag
42-
if token["type"] == "EmptyTag":
43-
save = self.tokenizer.contentModelFlag
44-
self.phase.processStartTag(token["name"], token["data"])
45-
self.tokenizer.contentModelFlag = save
46-
token["data"] = {}
47-
token["type"] = "EndTag"
48-
49-
elif token["type"] == "EndTag":
50-
if token["data"]:
51-
self.parseError(_("End tag contains unexpected attributes."))
38+
# For EmptyTags, process both a Start and an End tag
39+
if token["type"] == "EmptyTag":
40+
save = self.tokenizer.contentModelFlag
41+
self.phase.processStartTag(token["name"], token["data"])
42+
self.tokenizer.contentModelFlag = save
43+
token["data"] = {}
44+
token["type"] = "EndTag"
5245

5346
elif token["type"] == "Characters":
5447
# un-escape rcdataElements (e.g. style, script)
@@ -64,6 +57,13 @@ def normalizeToken(self, token):
6457

6558
return token
6659

60+
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
61+
**kwargs):
62+
63+
html5parser.HTMLParser._parse(self, stream, innerHTML, container,
64+
encoding, lowercaseElementName=False,
65+
lowercaseAttrName=False)
66+
6767
class XHTMLParser(XMLParser):
6868
""" liberal XMTHML parser """
6969

src/html5lib/sanitizer.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from xml.sax.saxutils import escape, unescape
33
from tokenizer import HTMLTokenizer
44

5-
class HTMLSanitizerMixin:
5+
class HTMLSanitizerMixin(object):
66
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
77

88
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -188,7 +188,15 @@ def sanitize_css(self, style):
188188
return ' '.join(clean)
189189

190190
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
191+
def __init__(self, stream, encoding=None, parseMeta=True,
192+
lowercaseElementName=False, lowercaseAttrName=False):
193+
#Change case matching defaults as we only output lowercase html anyway
194+
#This solution doesn't seem ideal...
195+
HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
196+
lowercaseElementName, lowercaseAttrName)
197+
191198
def __iter__(self):
192199
for token in HTMLTokenizer.__iter__(self):
193200
token = self.sanitize_token(token)
194-
if token: yield token
201+
if token:
202+
yield token

src/html5lib/tokenizer.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,14 @@ class HTMLTokenizer(object):
3232

3333
# XXX need to fix documentation
3434

35-
def __init__(self, stream, encoding=None, parseMeta=True):
35+
def __init__(self, stream, encoding=None, parseMeta=True,
36+
lowercaseElementName=True, lowercaseAttrName=True,):
3637
self.stream = HTMLInputStream(stream, encoding, parseMeta)
37-
38+
39+
#Perform case conversions?
40+
self.lowercaseElementName = lowercaseElementName
41+
self.lowercaseAttrName = lowercaseAttrName
42+
3843
self.states = {
3944
"data":self.dataState,
4045
"entityData":self.entityDataState,
@@ -272,9 +277,15 @@ def emitCurrentToken(self):
272277
the state to "data" because that's what's needed after a token has been
273278
emitted.
274279
"""
275-
280+
token = self.currentToken
276281
# Add token to the queue to be yielded
277-
self.tokenQueue.append(self.currentToken)
282+
if (token["type"] in ("StartTag", "EndTag", "EmptyTag")):
283+
if self.lowercaseElementName:
284+
token["name"] = token["name"].translate(asciiUpper2Lower)
285+
if token["type"] == "EndTag" and token["data"]:
286+
self.tokenQueue.append({"type":"ParseError",
287+
"data":_("End tag contains unexpected attributes.")})
288+
self.tokenQueue.append(token)
278289
self.state = self.states["data"]
279290

280291

@@ -511,10 +522,14 @@ def attributeNameState(self):
511522
# Attributes are not dropped at this stage. That happens when the
512523
# start tag token is emitted so values can still be safely appended
513524
# to attributes, but we do want to report the parse error in time.
525+
if self.lowercaseAttrName:
526+
self.currentToken["data"][-1][0] = (
527+
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
514528
for name, value in self.currentToken["data"][:-1]:
515529
if self.currentToken["data"][-1][0] == name:
516530
self.tokenQueue.append({"type": "ParseError", "data":
517531
_("Dropped duplicate attribute on tag.")})
532+
break
518533
# XXX Fix for above XXX
519534
if emitToken:
520535
self.emitCurrentToken()

src/html5lib/treebuilders/simpletree.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -112,9 +112,9 @@ def __init__(self, name):
112112

113113
def __unicode__(self):
114114
if self.name:
115-
return "<!DOCTYPE %s>" % self.name
115+
return u"<!DOCTYPE %s>" % self.name
116116
else:
117-
return "<!DOCTYPE>"
117+
return u"<!DOCTYPE>"
118118

119119
toxml = __unicode__
120120

@@ -128,7 +128,7 @@ def __init__(self, value):
128128
self.value = value
129129

130130
def __unicode__(self):
131-
return "\"%s\"" % self.value
131+
return u"\"%s\"" % self.value
132132

133133
def toxml(self):
134134
return escape(self.value)
@@ -142,20 +142,20 @@ def __init__(self, name):
142142
self.attributes = {}
143143

144144
def __unicode__(self):
145-
return "<%s>" % self.name
145+
return u"<%s>" % self.name
146146

147147
def toxml(self):
148148
result = '<' + self.name
149149
if self.attributes:
150150
for name,value in self.attributes.iteritems():
151-
result += ' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
151+
result += u' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
152152
if self.childNodes:
153153
result += '>'
154154
for child in self.childNodes:
155155
result += child.toxml()
156-
result += '</%s>' % self.name
156+
result += u'</%s>' % self.name
157157
else:
158-
result += '/>'
158+
result += u'/>'
159159
return result
160160

161161
def hilite(self):

tests/test_sanitizer.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ def buildTestSuite():
7777

7878
return unittest.TestLoader().loadTestsFromTestCase(SanitizeTest)
7979

80+
def sanitize_html(stream):
81+
return ''.join([token.toxml() for token in
82+
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
83+
parseFragment(stream).childNodes])
84+
8085
def main():
8186
buildTestSuite()
8287
unittest.main()

tests/test_tokenizer.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,14 @@ def processDoctype(self, token):
2929
self.outputTokens.append([u"DOCTYPE", token["name"], token["publicId"], token["systemId"], token["correct"]])
3030

3131
def processStartTag(self, token):
32-
self.outputTokens.append([u"StartTag", token["name"], token["data"]])
32+
self.outputTokens.append([u"StartTag", token["name"], dict(token["data"][::-1])])
3333

3434
def processEmptyTag(self, token):
3535
if token["name"] not in constants.voidElements:
3636
self.outputTokens.append(u"ParseError")
37-
self.outputTokens.append([u"StartTag", token["name"], token["data"]])
37+
self.outputTokens.append([u"StartTag", token["name"], dict(token["data"][::-1])])
3838

3939
def processEndTag(self, token):
40-
if token["data"]:
41-
self.processParseError(None)
4240
self.outputTokens.append([u"EndTag", token["name"]])
4341

4442
def processComment(self, token):
@@ -55,7 +53,7 @@ def processEOF(self, token):
5553
pass
5654

5755
def processParseError(self, token):
58-
self.outputTokens.append(u"ParseError")
56+
self.outputTokens.append([u"ParseError", token["data"]])
5957

6058
def concatenateCharacterTokens(tokens):
6159
outputTokens = []
@@ -73,9 +71,10 @@ def concatenateCharacterTokens(tokens):
7371
def normalizeTokens(tokens):
7472
""" convert array of attributes to a dictionary """
7573
# TODO: convert tests to reflect arrays
76-
for token in tokens:
77-
if token[0] == 'StartTag':
78-
token[2] = dict(token[2][::-1])
74+
for i, token in enumerate(tokens):
75+
if token[0] == u'ParseError':
76+
tokens[i] = token[0]
77+
#token[2] = dict(token[2][::-1])
7978
return tokens
8079

8180
def tokensMatch(expectedTokens, recievedTokens):
@@ -102,14 +101,14 @@ def runTokenizerTest(self, test):
102101
test['lastStartTag'] = None
103102
parser = TokenizerTestParser(test['contentModelFlag'],
104103
test['lastStartTag'])
105-
106-
tokens = normalizeTokens(parser.parse(test['input']))
104+
tokens = parser.parse(test['input'])
107105
tokens = concatenateCharacterTokens(tokens)
108106
errorMsg = "\n".join(["\n\nContent Model Flag:",
109107
test['contentModelFlag'] ,
110108
"\nInput:", str(test['input']),
111109
"\nExpected:", str(output),
112110
"\nRecieved:", str(tokens)])
111+
tokens = normalizeTokens(tokens)
113112
self.assertEquals(tokensMatch(tokens, output), True, errorMsg)
114113

115114
def buildTestSuite():

0 commit comments

Comments
 (0)