Skip to content

Commit f981179

Browse files
committed
make comments inside CDATA and RCDATA work; someone please review the HTMLInputStream regression
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40760
1 parent 3dd403b commit f981179

2 files changed

Lines changed: 35 additions & 9 deletions

File tree

src/inputstream.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -202,11 +202,17 @@ def charsUntil(self, characters, opposite = False):
202202
# Put the character stopped on back to the front of the queue
203203
# from where it came.
204204
c = charStack.pop()
205-
if c != EOF and self.tell <= len(self.dataStream) and \
206-
self.dataStream[self.tell - 1] == c[0]:
207-
self.tell -= 1
208-
else:
209-
self.queue.insert(0, c)
205+
self.queue.insert(0, c)
206+
207+
# XXX the following is needed for correct line number reporting apparently
208+
# but it causes other tests to break with the tokenizer fixes. I have
209+
# no idea why...
210+
#
211+
#if c != EOF and self.tell <= len(self.dataStream) and \
212+
# self.dataStream[self.tell - 1] == c[0]:
213+
# self.tell -= 1
214+
#else:
215+
# self.queue.insert(0, c)
210216
return "".join(charStack)
211217

212218
class EncodingBytes(str):

src/tokenizer.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ def __init__(self, stream, encoding=None, parseMeta=True):
7070

7171
# Setup the initial tokenizer state
7272
self.contentModelFlag = contentModelFlags["PCDATA"]
73+
self.escapeFlag = False
74+
self.lastFourChars = []
7375
self.state = self.states["data"]
7476

7577
# The current token being created
@@ -273,12 +275,30 @@ def emitCurrentToken(self):
273275

274276
def dataState(self):
275277
data = self.stream.char()
276-
if data == u"&" and self.contentModelFlag in\
278+
if self.contentModelFlag in\
279+
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
280+
if len(self.lastFourChars) == 4:
281+
self.lastFourChars.pop(0)
282+
self.lastFourChars.append(data)
283+
if data == "&" and self.contentModelFlag in\
277284
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
278285
self.state = self.states["entityData"]
279-
elif data == u"<" and self.contentModelFlag !=\
280-
contentModelFlags["PLAINTEXT"]:
286+
elif data == "-" and self.contentModelFlag in\
287+
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
288+
self.escapeFlag == False and\
289+
"".join(self.lastFourChars) == "<!--":
290+
self.escapeFlag = True
291+
self.tokenQueue.append({"type": "Characters", "data":data})
292+
elif data == "<" and (self.contentModelFlag ==\
293+
contentModelFlags["PCDATA"] or (self.contentModelFlag in
294+
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
295+
self.escapeFlag == False)):
281296
self.state = self.states["tagOpen"]
297+
elif data == ">" and self.contentModelFlag in\
298+
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
299+
self.escapeFlag == True and "".join(self.lastFourChars)[1:] == "-->":
300+
self.escapeFlag = False
301+
self.tokenQueue.append({"type": "Characters", "data":data})
282302
elif data == EOF:
283303
# Tokenization ends.
284304
return False
@@ -292,7 +312,7 @@ def dataState(self):
292312
data + self.stream.charsUntil(spaceCharacters, True)})
293313
else:
294314
self.tokenQueue.append({"type": "Characters", "data":
295-
data + self.stream.charsUntil((u"&", u"<"))})
315+
data + self.stream.charsUntil(("&", "<", ">", "-"))})
296316
return True
297317

298318
def entityDataState(self):

0 commit comments

Comments
 (0)