Skip to content

Commit 3391e3e

Browse files
committed
Progress towards streaming
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40674
1 parent 8c9eb12 commit 3391e3e

File tree

2 files changed

+39
-37
lines changed

2 files changed

+39
-37
lines changed

src/inputstream.py

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
3131
3232
"""
3333
# List of where new lines occur
34-
self.newLines = []
34+
self.newLines = [0]
3535

36-
# Raw Stream
36+
# Raw Stream
3737
self.rawStream = self.openStream(source)
3838

3939
# Encoding Information
@@ -47,15 +47,18 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
4747
if encoding is None or not isValidEncoding(encoding):
4848
encoding = self.detectEncoding(parseMeta, chardet)
4949
self.charEncoding = encoding
50+
self.win1252 = False
5051

5152
# Read bytes from stream decoding them into Unicode
52-
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
53-
54-
# Normalize newlines and null characters
55-
uString = re.sub('\r\n?', '\n', uString)
56-
uString = re.sub('\x00', u'\uFFFD', uString)
53+
uString = self.rawStream.read()
5754

5855
# Convert the unicode string into a list to be used as the data stream
56+
if self.charEncoding == 'windows-1252':
57+
self.win1252 = True
58+
else:
59+
self.win1252 = False
60+
uString = uString.decode(self.charEncoding, 'replace')
61+
5962
self.dataStream = uString
6063

6164
self.queue = []
@@ -148,20 +151,8 @@ def detectEncodingMeta(self):
148151
self.rawStream.seek(0)
149152
return parser.getEncoding()
150153

151-
def determineNewLines(self):
152-
# Looks through the stream to find where new lines occur so
153-
# the position method can tell where it is.
154-
self.newLines.append(0)
155-
for i in xrange(len(self.dataStream)):
156-
if self.dataStream[i] == u"\n":
157-
self.newLines.append(i)
158-
159154
def position(self):
160155
"""Returns (line, col) of the current position in the stream."""
161-
# Generate list of new lines first time around
162-
if not self.newLines:
163-
self.determineNewLines()
164-
165156
line = 0
166157
tell = self.tell
167158
for pos in self.newLines:
@@ -184,8 +175,22 @@ def char(self):
184175
return self.queue.pop(0)
185176
else:
186177
try:
178+
c = self.dataStream[self.tell]
187179
self.tell += 1
188-
return self.dataStream[self.tell - 1]
180+
if self.win1252 and c >= '\x80': c=c.decode('windows-1252')
181+
182+
# Normalize newlines and null characters
183+
if c == '\x00': c = u'\uFFFD'
184+
if c == '\r':
185+
if self.tell < len(self.dataStream) and \
186+
self.dataStream[self.tell] == '\n':
187+
self.tell += 1
188+
c = '\n'
189+
190+
# record where newlines occur so that the position method
191+
# can tell where it is
192+
if c == '\n': self.newLines.append(self.tell - 1)
193+
return c
189194
except:
190195
return EOF
191196

@@ -196,22 +201,17 @@ def charsUntil(self, characters, opposite = False):
196201
"""
197202
charStack = [self.char()]
198203

199-
# First from the queue
200-
while charStack[-1] and (charStack[-1] in characters) == opposite \
201-
and self.queue:
202-
charStack.append(self.queue.pop(0))
203-
204-
# Then the rest
205204
while charStack[-1] and (charStack[-1] in characters) == opposite:
206-
try:
207-
self.tell += 1
208-
charStack.append(self.dataStream[self.tell - 1])
209-
except:
210-
charStack.append(EOF)
205+
charStack.append(self.char())
211206

212207
# Put the character stopped on back to the front of the queue
213208
# from where it came.
214-
self.queue.insert(0, charStack.pop())
209+
c = charStack.pop()
210+
if c != EOF and self.tell <= len(self.dataStream) and \
211+
self.dataStream[self.tell - 1] == c[0]:
212+
self.tell -= 1
213+
else:
214+
self.queue.insert(0, c)
215215
return "".join(charStack)
216216

217217
class EncodingBytes(str):

tests/test_stream.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,11 @@ def test_char_utf8(self):
3131
self.assertEquals(stream.char(), u'\u2018')
3232

3333
def test_char_win1252(self):
34-
stream = HTMLInputStream(u'\u2018'.encode('windows-1252'))
34+
stream = HTMLInputStream(u"\xa9\xf1\u2019".encode('windows-1252'))
3535
self.assertEquals(stream.charEncoding, 'windows-1252')
36-
self.assertEquals(stream.char(), u'\u2018')
36+
self.assertEquals(stream.char(), u"\xa9")
37+
self.assertEquals(stream.char(), u"\xf1")
38+
self.assertEquals(stream.char(), u"\u2019")
3739

3840
def test_bom(self):
3941
stream = HTMLInputStream(codecs.BOM_UTF8 + "'")
@@ -50,11 +52,11 @@ def test_newlines(self):
5052
self.assertEquals(stream.tell, 0)
5153
self.assertEquals(stream.charsUntil('c'),u"a\nbb\n")
5254
self.assertEquals(stream.tell, 6)
53-
self.assertEquals(stream.position(), (3,1))
55+
self.assertEquals(stream.position(), (3,0))
5456
self.assertEquals(stream.charsUntil('x'),u"ccc\ndddd")
5557
self.assertEquals(stream.tell, 14)
56-
self.assertEquals(stream.position(), (4,5))
57-
self.assertEquals(stream.newLines, [0,1,4,8])
58+
self.assertEquals(stream.position(), (4,4))
59+
self.assertEquals(stream.newLines, [0,1,5,9])
5860

5961
def buildTestSuite():
6062
return unittest.defaultTestLoader.loadTestsFromName(__name__)

0 commit comments

Comments
 (0)