@@ -31,9 +31,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
3131
3232 """
3333 # List of where new lines occur
34- self .newLines = []
34+ self .newLines = [0 ]
3535
36- # Raw Stream
36+ # Raw Stream
3737 self .rawStream = self .openStream (source )
3838
3939 # Encoding Information
@@ -47,15 +47,18 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
4747 if encoding is None or not isValidEncoding (encoding ):
4848 encoding = self .detectEncoding (parseMeta , chardet )
4949 self .charEncoding = encoding
50+ self .win1252 = False
5051
5152 # Read bytes from stream decoding them into Unicode
52- uString = self .rawStream .read ().decode (self .charEncoding , 'replace' )
53-
54- # Normalize newlines and null characters
55- uString = re .sub ('\r \n ?' , '\n ' , uString )
56- uString = re .sub ('\x00 ' , u'\uFFFD ' , uString )
53+ uString = self .rawStream .read ()
5754
5855 # Convert the unicode string into a list to be used as the data stream
56+ if self .charEncoding == 'windows-1252' :
57+ self .win1252 = True
58+ else :
59+ self .win1252 = False
60+ uString = uString .decode (self .charEncoding , 'replace' )
61+
5962 self .dataStream = uString
6063
6164 self .queue = []
@@ -148,20 +151,8 @@ def detectEncodingMeta(self):
148151 self .rawStream .seek (0 )
149152 return parser .getEncoding ()
150153
151- def determineNewLines (self ):
152- # Looks through the stream to find where new lines occur so
153- # the position method can tell where it is.
154- self .newLines .append (0 )
155- for i in xrange (len (self .dataStream )):
156- if self .dataStream [i ] == u"\n " :
157- self .newLines .append (i )
158-
159154 def position (self ):
160155 """Returns (line, col) of the current position in the stream."""
161- # Generate list of new lines first time around
162- if not self .newLines :
163- self .determineNewLines ()
164-
165156 line = 0
166157 tell = self .tell
167158 for pos in self .newLines :
@@ -184,8 +175,22 @@ def char(self):
184175 return self .queue .pop (0 )
185176 else :
186177 try :
178+ c = self .dataStream [self .tell ]
187179 self .tell += 1
188- return self .dataStream [self .tell - 1 ]
180+ if self .win1252 and c >= '\x80 ' : c = c .decode ('windows-1252' )
181+
182+ # Normalize newlines and null characters
183+ if c == '\x00 ' : c = u'\uFFFD '
184+ if c == '\r ' :
185+ if self .tell < len (self .dataStream ) and \
186+ self .dataStream [self .tell ] == '\n ' :
187+ self .tell += 1
188+ c = '\n '
189+
190+ # record where newlines occur so that the position method
191+ # can tell where it is
192+ if c == '\n ' : self .newLines .append (self .tell - 1 )
193+ return c
189194 except :
190195 return EOF
191196
@@ -196,22 +201,17 @@ def charsUntil(self, characters, opposite = False):
196201 """
197202 charStack = [self .char ()]
198203
199- # First from the queue
200- while charStack [- 1 ] and (charStack [- 1 ] in characters ) == opposite \
201- and self .queue :
202- charStack .append (self .queue .pop (0 ))
203-
204- # Then the rest
205204 while charStack [- 1 ] and (charStack [- 1 ] in characters ) == opposite :
206- try :
207- self .tell += 1
208- charStack .append (self .dataStream [self .tell - 1 ])
209- except :
210- charStack .append (EOF )
205+ charStack .append (self .char ())
211206
212207 # Put the character stopped on back to the front of the queue
213208 # from where it came.
214- self .queue .insert (0 , charStack .pop ())
209+ c = charStack .pop ()
210+ if c != EOF and self .tell <= len (self .dataStream ) and \
211+ self .dataStream [self .tell - 1 ] == c [0 ]:
212+ self .tell -= 1
213+ else :
214+ self .queue .insert (0 , c )
215215 return "" .join (charStack )
216216
217217class EncodingBytes (str ):
0 commit comments