1818from constants import scopingElements , formattingElements , specialElements
1919from constants import headingElements , tableInsertModeElements
2020from constants import cdataElements , rcdataElements , voidElements
21+ from constants import tokenTypes , ReparseException
2122from constants import tokenTypes , namespaces
2223
def parse(doc, treebuilder="simpletree", encoding=None):
    """Parse ``doc`` with a fresh HTMLParser and return the parse result.

    doc -- the input handed unchanged to ``HTMLParser.parse``.
    treebuilder -- name passed to ``treebuilders.getTreeBuilder`` to select
        the tree builder the parser is constructed with
        (default ``"simpletree"``).
    encoding -- forwarded to ``HTMLParser.parse`` as the ``encoding``
        keyword; ``None`` by default.
    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb)
    return p.parse(doc, encoding=encoding)
2728
@@ -79,19 +80,30 @@ def __init__(self, tree = simpletree.TreeBuilder,
7980
8081 def _parse (self , stream , innerHTML = False , container = "div" ,
8182 encoding = None , parseMeta = True , useChardet = True , ** kwargs ):
82-
83+
84+ self .innerHTMLMode = innerHTML
85+ self .container = container
86+ self .tokenizer = self .tokenizer_class (stream , encoding = encoding ,
87+ parseMeta = parseMeta ,
88+ useChardet = useChardet , ** kwargs )
89+ self .reset ()
90+
91+ while True :
92+ try :
93+ self .mainLoop ()
94+ break
95+ except ReparseException , e :
96+ self .reset ()
97+
98+ def reset (self ):
8399 self .tree .reset ()
84100 self .firstStartTag = False
85101 self .errors = []
86102 # "quirks" / "limited quirks" / "no quirks"
87103 self .compatMode = "no quirks"
88104
89- self .tokenizer = self .tokenizer_class (stream , encoding = encoding ,
90- parseMeta = parseMeta ,
91- useChardet = useChardet , ** kwargs )
92-
93- if innerHTML :
94- self .innerHTML = container .lower ()
105+ if self .innerHTMLMode :
106+ self .innerHTML = self .container .lower ()
95107
96108 if self .innerHTML in cdataElements :
97109 self .tokenizer .contentModelFlag = tokenizer .contentModelFlags ["RCDATA" ]
@@ -114,6 +126,19 @@ def _parse(self, stream, innerHTML=False, container="div",
114126 self .secondaryPhase = None
115127
116128 self .beforeRCDataPhase = None
129+
130+ def mainLoop (self ):
131+ (CharactersToken ,
132+ SpaceCharactersToken ,
133+ StartTagToken ,
134+ EndTagToken ,
135+ CommentToken ,
136+ DoctypeToken ) = (tokenTypes ["Characters" ],
137+ tokenTypes ["SpaceCharacters" ],
138+ tokenTypes ["StartTag" ],
139+ tokenTypes ["EndTag" ],
140+ tokenTypes ["Comment" ],
141+ tokenTypes ["Doctype" ])
117142
118143 CharactersToken = tokenTypes ["Characters" ]
119144 SpaceCharactersToken = tokenTypes ["SpaceCharacters" ]
@@ -124,6 +149,8 @@ def _parse(self, stream, innerHTML=False, container="div",
124149
125150
126151 for token in self .normalizedTokens ():
152+ #print self.phase.__class__.__name__
153+ #print token
127154 type = token ["type" ]
128155 if type == CharactersToken :
129156 self .phase .processCharacters (token )
@@ -378,18 +405,6 @@ def __init__(self, parser, tree):
378405
def processEOF(self):
    """Handle end of input for this phase.

    This implementation always raises NotImplementedError.
    NOTE(review): presumably each concrete phase overrides this method --
    confirm against the subclasses.
    """
    raise NotImplementedError
393408
394409 def processComment (self , token ):
395410 # For most phases the following is correct. Where it's not it will be
@@ -702,10 +717,10 @@ def startTagMeta(self, token):
702717 attributes = token ["data" ]
703718 if self .parser .tokenizer .stream .charEncoding [1 ] == "tentative" :
704719 if "charset" in attributes :
705- codec = inputstream .codecName (attributes ["charset" ])
706- self .parser .tokenizer .stream .changeEncoding (codec )
720+ self .parser .tokenizer .stream .changeEncoding (attributes ["charset" ])
707721 elif "content" in attributes :
708- data = inputstream .EncodingBytes (attributes ["content" ])
722+ data = inputstream .EncodingBytes (
723+ attributes ["content" ].encode (self .parser .tokenizer .stream .charEncoding [0 ]))
709724 parser = inputstream .ContentAttrParser (data )
710725 codec = parser .parse ()
711726 self .parser .tokenizer .stream .changeEncoding (codec )
0 commit comments