Skip to content

Commit c8d2382

Browse files
committed
implement DOCTYPE sniffing, but do nothing with it...
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40749
1 parent bec1488 commit c8d2382

2 files changed

Lines changed: 101 additions & 3 deletions

File tree

src/html5parser.py

Lines changed: 92 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -319,11 +319,100 @@ def processComment(self, data):
319319
self.tree.insertComment(data, self.tree.document)
320320

321321
def processDoctype(self, name, publicId, systemId, correct):
322-
if name.translate(asciiUpper2Lower) != "html" or publicId != None or\
322+
nameLower = name.translate(asciiUpper2Lower)
323+
if nameLower != "html" or publicId != None or\
323324
systemId != None:
324325
self.parser.parseError(_("Erroneous DOCTYPE."))
325-
# XXX need to check quirks mode here
326+
# XXX need to update DOCTYPE tokens
326327
self.tree.insertDoctype(name)
328+
329+
if publicId == None:
330+
publicId = ""
331+
if publicId != "":
332+
publicId = publicId.translate(asciiUpper2Lower)
333+
334+
if nameLower != "html":
335+
# XXX quirks mode
336+
pass
337+
else:
338+
if publicId in\
339+
("+//silmaril//dtd html pro v0r11 19970101//en",
340+
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
341+
"-//as//dtd html 3.0 aswedit + extensions//en",
342+
"-//ietf//dtd html 2.0 level 1//en",
343+
"-//ietf//dtd html 2.0 level 2//en",
344+
"-//ietf//dtd html 2.0 strict level 1//en",
345+
"-//ietf//dtd html 2.0 strict level 2//en",
346+
"-//ietf//dtd html 2.0 strict//en",
347+
"-//ietf//dtd html 2.0//en",
348+
"-//ietf//dtd html 2.1e//en",
349+
"-//ietf//dtd html 3.0//en",
350+
"-//ietf//dtd html 3.0//en//",
351+
"-//ietf//dtd html 3.2 final//en",
352+
"-//ietf//dtd html 3.2//en",
353+
"-//ietf//dtd html 3//en",
354+
"-//ietf//dtd html level 0//en",
355+
"-//ietf//dtd html level 0//en//2.0",
356+
"-//ietf//dtd html level 1//en",
357+
"-//ietf//dtd html level 1//en//2.0",
358+
"-//ietf//dtd html level 2//en",
359+
"-//ietf//dtd html level 2//en//2.0",
360+
"-//ietf//dtd html level 3//en",
361+
"-//ietf//dtd html level 3//en//3.0",
362+
"-//ietf//dtd html strict level 0//en",
363+
"-//ietf//dtd html strict level 0//en//2.0",
364+
"-//ietf//dtd html strict level 1//en",
365+
"-//ietf//dtd html strict level 1//en//2.0",
366+
"-//ietf//dtd html strict level 2//en",
367+
"-//ietf//dtd html strict level 2//en//2.0",
368+
"-//ietf//dtd html strict level 3//en",
369+
"-//ietf//dtd html strict level 3//en//3.0",
370+
"-//ietf//dtd html strict//en",
371+
"-//ietf//dtd html strict//en//2.0",
372+
"-//ietf//dtd html strict//en//3.0",
373+
"-//ietf//dtd html//en",
374+
"-//ietf//dtd html//en//2.0",
375+
"-//ietf//dtd html//en//3.0",
376+
"-//metrius//dtd metrius presentational//en",
377+
"-//microsoft//dtd internet explorer 2.0 html strict//en",
378+
"-//microsoft//dtd internet explorer 2.0 html//en",
379+
"-//microsoft//dtd internet explorer 2.0 tables//en",
380+
"-//microsoft//dtd internet explorer 3.0 html strict//en",
381+
"-//microsoft//dtd internet explorer 3.0 html//en",
382+
"-//microsoft//dtd internet explorer 3.0 tables//en",
383+
"-//netscape comm. corp.//dtd html//en",
384+
"-//netscape comm. corp.//dtd strict html//en",
385+
"-//o'reilly and associates//dtd html 2.0//en",
386+
"-//o'reilly and associates//dtd html extended 1.0//en",
387+
"-//spyglass//dtd html 2.0 extended//en",
388+
"-//sq//dtd html 2.0 hotmetal + extensions//en",
389+
"-//sun microsystems corp.//dtd hotjava html//en",
390+
"-//sun microsystems corp.//dtd hotjava strict html//en",
391+
"-//w3c//dtd html 3 1995-03-24//en",
392+
"-//w3c//dtd html 3.2 draft//en",
393+
"-//w3c//dtd html 3.2 final//en",
394+
"-//w3c//dtd html 3.2//en",
395+
"-//w3c//dtd html 3.2s draft//en",
396+
"-//w3c//dtd html 4.0 frameset//en",
397+
"-//w3c//dtd html 4.0 transitional//en",
398+
"-//w3c//dtd html experimental 19960712//en",
399+
"-//w3c//dtd html experimental 970421//en",
400+
"-//w3c//dtd w3 html//en",
401+
"-//w3o//dtd w3 html 3.0//en",
402+
"-//w3o//dtd w3 html 3.0//en//",
403+
"-//w3o//dtd w3 html strict 3.0//en//",
404+
"-//webtechs//dtd mozilla html 2.0//en",
405+
"-//webtechs//dtd mozilla html//en",
406+
"-/w3c/dtd html 4.0 transitional/en",
407+
"html")\
408+
or (publicId in\
409+
("-//w3c//dtd html 4.01 frameset//EN",
410+
"-//w3c//dtd html 4.01 transitional//EN") and systemId == None)\
411+
or (systemId != None and\
412+
systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
413+
#XXX quirks mode
414+
pass
415+
327416
self.parser.phase = self.parser.phases["rootElement"]
328417

329418
def processSpaceCharacters(self, data):
@@ -1731,7 +1820,7 @@ class AfterBodyPhase(Phase):
17311820
def __init__(self, parser, tree):
17321821
Phase.__init__(self, parser, tree)
17331822

1734-
# XXX We should prolly add a handler for "html" here as well...
1823+
# XXX We should prolly add a handler for "html" here as well...
17351824
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
17361825
self.endTagHandler.default = self.endTagOther
17371826

src/tokenizer.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -755,8 +755,10 @@ def beforeDoctypePublicIdentifierState(self):
755755
if data in spaceCharacters:
756756
pass
757757
elif data == "\"":
758+
self.currentToken["publicId"] = ""
758759
self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
759760
elif data == "'":
761+
self.currentToken["publicId"] = ""
760762
self.state = self.states["doctypePublicIdentifierSingleQuoted"]
761763
elif data == ">":
762764
self.tokenQueue.append({"type": "ParseError", "data":
@@ -809,8 +811,10 @@ def afterDoctypePublicIdentifierState(self):
809811
if data in spaceCharacters:
810812
pass
811813
elif data == "\"":
814+
self.currentToken["systemId"] = ""
812815
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
813816
elif data == "'":
817+
self.currentToken["systemId"] = ""
814818
self.state = self.states["doctypeSystemIdentifierSinglequoted"]
815819
elif data == ">":
816820
self.tokenQueue.append(self.currentToken)
@@ -831,10 +835,15 @@ def beforeDoctypeSystemIdentifierState(self):
831835
if data in spaceCharacters:
832836
pass
833837
elif data == "\"":
838+
self.currentToken["systemId"] = ""
834839
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
835840
elif data == "'":
841+
self.currentToken["systemId"] = ""
836842
self.state = self.states["doctypeSystemIdentifierSinglequoted"]
837843
elif data == ">":
844+
self.tokenQueue.append({"type": "ParseError", "data":
845+
_("Unexpected character in DOCTYPE.")})
846+
self.currentToken["correct"] = False
838847
self.tokenQueue.append(self.currentToken)
839848
self.state = self.states["data"]
840849
elif data == EOF:

0 commit comments

Comments
 (0)