@@ -47,24 +47,13 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
4747 if encoding is None or not isValidEncoding (encoding ):
4848 encoding = self .detectEncoding (parseMeta , chardet )
4949 self .charEncoding = encoding
50- self .win1252 = False
5150
52- # Read bytes from stream decoding them into Unicode
53- uString = self .rawStream .read ()
54-
55- # Convert the unicode string into a list to be used as the data stream
56- if self .charEncoding == 'windows-1252' :
57- self .win1252 = True
58- else :
59- self .win1252 = False
60- uString = uString .decode (self .charEncoding , 'replace' )
61-
62- self .dataStream = uString
51+ self .dataStream = codecs .getreader (self .charEncoding )(self .rawStream , 'replace' )
6352
6453 self .queue = []
6554
66- # Reset position in the list to read from
67- self .tell = 0
55+ self . line = self . col = 0
56+ self .lineLengths = []
6857
6958 def openStream (self , source ):
7059 """Produces a file object from source.
@@ -76,8 +65,8 @@ def openStream(self, source):
7665 if hasattr (source , 'read' ):
7766 stream = source
7867 else :
79- # Otherwise treat source as a string and convert to a file object
80- if isinstance (source , unicode ):
68+ # Otherwise treat source as a string and convert to a file object
69+ if isinstance (source , unicode ):
8170 source = source .encode ('utf-8' )
8271 import cStringIO
8372 stream = cStringIO .StringIO (str (source ))
@@ -154,15 +143,15 @@ def detectEncodingMeta(self):
154143
155144 def position (self ):
156145 """Returns (line, col) of the current position in the stream."""
157- line = 0
158- tell = self .tell
159- for pos in self .newLines :
160- if pos < tell :
161- line += 1
146+ line , col = self .line , self .col
147+ for c in self .queue [::- 1 ]:
148+ if c == '\n ' :
149+ line -= 1
150+ assert col == 0
151+ col = self .lineLengths [line ]
162152 else :
163- break
164- col = tell - self .newLines [line - 1 ] - 1
165- return (line , col )
153+ col -= 1
154+ return (line + 1 , col )
166155
167156 def char (self ):
168157 """ Read one character from the stream or queue if available. Return
@@ -171,26 +160,28 @@ def char(self):
171160 if self .queue :
172161 return self .queue .pop (0 )
173162 else :
174- try :
175- c = self .dataStream [self .tell ]
176- self .tell += 1
177- if self .win1252 and c >= '\x80 ' : c = c .decode ('windows-1252' )
178-
179- # Normalize newlines and null characters
180- if c == '\x00 ' : c = u'\uFFFD '
181- if c == '\r ' :
182- if self .tell < len (self .dataStream ) and \
183- self .dataStream [self .tell ] == '\n ' :
184- self .tell += 1
185- c = '\n '
186-
187- # record where newlines occur so that the position method
188- # can tell where it is
189- if c == '\n ' : self .newLines .append (self .tell - 1 )
190- return unicode (c )
191- except :
163+ c = self .dataStream .read (1 , 1 )
164+ if not c :
165+ self .col += 1
192166 return EOF
193167
168+ # Normalize newlines and null characters
169+ if c == '\x00 ' : c = u'\uFFFD '
170+ if c == '\r ' :
171+ c = self .dataStream .read (1 , 1 )
172+ if c != '\n ' :
173+ self .queue .insert (0 , unicode (c ))
174+ c = '\n '
175+
176+ # update position in stream
177+ if c == '\n ' :
178+ self .lineLengths .append (self .col )
179+ self .line += 1
180+ self .col = 0
181+ else :
182+ self .col += 1
183+ return unicode (c )
184+
194185 def charsUntil (self , characters , opposite = False ):
195186 """ Returns a string of characters from the stream up to but not
196187 including any character in characters or EOF. characters can be
@@ -204,12 +195,19 @@ def charsUntil(self, characters, opposite = False):
204195 # Put the character stopped on back to the front of the queue
205196 # from where it came.
206197 c = charStack .pop ()
207- if c != EOF and self .tell > 0 and not self .queue and \
208- self .dataStream [self .tell - 1 ] == c [0 ]:
209- self .tell -= 1
210- else :
211- self .queue .insert (0 , c )
212- return "" .join (charStack )
198+ if c != EOF :
199+ self .queue .insert (0 , c )
200+
201+ # XXX the following is needed for correct line number reporting, apparently,
202+ # but it causes other tests to break with the fixes in the tokenizer. I have
203+ # no idea why...
204+ #
205+ #if c != EOF and self.tell <= len(self.dataStream) and \
206+ # self.dataStream[self.tell - 1] == c[0]:
207+ # self.tell -= 1
208+ #else:
209+ # self.queue.insert(0, c)
210+ return u"" .join (charStack )
213211
214212class EncodingBytes (str ):
215213 """String-like object with an associated position and various extra methods
0 commit comments