Skip to content

Commit 1fe38fc

Browse files
author
lantis63
committed
Some cleanup in inputstream and tokenizer; set the tokenizer to use inputstream. Also moved all the parts of the tokenizer concerning method calls to the parser into one area, to make it easier if we decide to change the tokenizer to an iterable. Also, test_tokenizer was reporting 'expected' and 'received' swapped.
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40224
1 parent 261179b commit 1fe38fc

4 files changed

Lines changed: 252 additions & 192 deletions

File tree

constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
except:
66
pass
77

8+
EOF = None
9+
810
contentModelFlags = {
911
"PCDATA":0,
1012
"RCDATA":1,

inputstream.py

Lines changed: 63 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import codecs, StringIO
22

3+
from constants import EOF
4+
35
class HTMLInputStream(object):
46
""" Provides a unicode stream of characters to the HTMLTokenizer.
57
@@ -48,11 +50,8 @@ def __init__(self, source, encoding=None):
4850

4951
# Read bytes from stream decoding them into Unicode
5052
unicodeStream = self.rawStream.read().decode(self.charEncoding, 'replace')
51-
# Normalize new lines
52-
unicodeStream = unicodeStream.replace(u"\r\n", u"\n")
53-
unicodeStream = unicodeStream.replace(u"\r", u"\n")
54-
# Replace null bytes
55-
unicodeStream = unicodeStream.replace(u"\x00", u"\uFFFD")
53+
54+
unicodeStream = self.normalizeStream(unicodeStream)
5655

5756
# If encoding was determined from a BOM remove it from the stream
5857
if detectedEncoding:
@@ -113,6 +112,21 @@ def detectEncoding(self):
113112

114113
return encoding
115114

115+
def normalizeStream(self, stream):
116+
# Count U+FFFD replacement characters in case we need to switch encoding
117+
if self.charEncoding == "cp1252" and stream.count(u"\uFFFD"):
118+
self.incompatibleEncoding = True
119+
else:
120+
self.incompatibleEncoding = False
121+
122+
# Normalize new lines
123+
stream = stream.replace(u"\r\n", u"\n")
124+
stream = stream.replace(u"\r", u"\n")
125+
# Replace null bytes
126+
stream = stream.replace(u"\x00", u"\uFFFD")
127+
128+
return stream
129+
116130
def declareEncoding(self, encoding):
117131
"""Report the encoding declared by the meta element
118132
@@ -123,7 +137,22 @@ def declareEncoding(self, encoding):
123137
and non-US-ASCII characters have been seen, return True indicating
124138
parsing will have to begin again.
125139
"""
126-
pass
140+
# Only change encoding if we are using the default encoding
141+
if self.allowEncodingOverride:
142+
self.charEncoding = encoding
143+
# If there was incompatible characters found in the first encoding
144+
# we have to reencode the entire stream and start again
145+
if self.incompatibleEncoding:
146+
self.reset()
147+
self.dataStream = StringIO.StringIO(self.normalizeStream(
148+
self.dataStream.read(-1).decode(self.charEncoding, 'replace')
149+
))
150+
return True
151+
else:
152+
# Just decode the bytes from now on
153+
self.dataStream = codecs.EncodedFile(self.dataStream,
154+
self.charEncoding, 'replace')
155+
return False
127156

128157
def position(self):
129158
""" Returns (line, col) position in the stream
@@ -138,35 +167,50 @@ def position(self):
138167
col = tell-self.newLines[line-1]-1
139168
return (line, col)
140169

170+
def reset(self):
171+
""" Resets the position in the stream back to the start
172+
"""
173+
self.dataStream.seek(0)
174+
141175
def read(self, size=1):
142-
""" Read at most size characters from the stream
176+
""" Reads size characters from the stream or EOF if EOF is reached.
143177
"""
144-
char = self.dataStream.read(size)
145-
return char
178+
return self.dataStream.read(size) or EOF
146179

147180
def readMany(self, size):
148-
""" Reads multiple characters from the stream returning a list
181+
""" Returns a list of size characters from the stream
182+
and adds an EOF marker if the EOF is reached.
149183
"""
150-
charStack = []
151-
charStack.append(list(self.dataStream.read(size)))
152-
184+
charStack = list(self.dataStream.read(size)) or EOF
153185
if len(charStack) < size:
154-
charStack.append(None)
155-
186+
charStack.append(EOF)
156187
return charStack
157188

158189
def readUntil(self, charList):
159190
""" Returns a list of characters from the stream until a character
160191
in charList is found or EOF is reached
161192
"""
162-
charStack = [self.dataStream.read(1) or None]
163-
while charStack[-1] and charStack[-1] not in charList:
164-
charStack.append(self.dataStream.read(1) or None)
193+
charList = set(charList)
194+
charList.add(EOF)
195+
196+
charStack = [self.read(1)]
197+
while charStack[-1] not in charList:
198+
charStack.append(self.read(1))
199+
200+
return charStack
201+
202+
def readWhile(self, charList):
203+
""" Returns a list of characters from the stream until a character
204+
not in charList is found or EOF is reached
205+
"""
206+
charStack = [self.read(1)]
207+
while charStack[-1] in charList:
208+
charStack.append(self.read(1))
209+
165210
return charStack
166211

167212
if __name__ == "__main__":
168213
try:
169-
# Hard coded file name for now, this will need to be fixed later
170214
stream = HTMLInputStream("tests/utf-8-bom.html")
171215

172216
c = stream.read(1)

tests/test_tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ def runTokenizerTest(self, input, output):
8181
parser = TokenizerTestParser()
8282
tokens = parser.parse(StringIO.StringIO(input))
8383
tokens = concatenateCharacterTokens(tokens)
84-
errorMsg = "\n".join(["\n\nExpected:", str(tokens), "\nRecieved:",
85-
str(output)])
84+
errorMsg = "\n".join(["\n\nExpected:", str(output), "\nRecieved:",
85+
str(tokens)])
8686
self.assertTrue(tokensMatch(tokens, output), errorMsg)
8787

8888

0 commit comments

Comments
 (0)