@@ -55,13 +55,17 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
5555 if self .charEncoding is None or not isValidEncoding (self .charEncoding ):
5656 self .charEncoding = self .detectEncoding (parseMeta , chardet )
5757
58- self .dataStream = codecs .getreader (self .charEncoding )(self .rawStream , 'replace' )
58+ self .dataStream = codecs .getreader (self .charEncoding )(self .rawStream ,
59+ 'replace' )
5960
6061 self .queue = []
6162 self .errors = []
6263
6364 self .line = self .col = 0
6465 self .lineLengths = []
66+
67+ #Flag to indicate we may have a CR LF pair split across a data chunk boundary
68+ self ._lastChunkEndsWithCR = False
6569
6670 def openStream (self , source ):
6771 """Produces a file object from source.
@@ -199,64 +203,47 @@ def detectEncodingMeta(self):
199203 def position (self ):
200204 """Returns (line, col) of the current position in the stream."""
201205 line , col = self .line , self .col
202- for c in self .queue [::- 1 ]:
203- if c == '\n ' :
204- line -= 1
205- assert col == 0
206- col = self .lineLengths [line ]
207- else :
208- col -= 1
209206 return (line + 1 , col )
210207
211208 def char (self ):
212209 """ Read one character from the stream or queue if available. Return
213210 EOF when EOF is reached.
214211 """
215- if self .queue :
216- char = self .queue .pop (0 )
217- if char == "\n " :
218- self .lineLengths .append (self .col )
219- self .line += 1
220- self .col = 0
221- return char
212+ if not self .queue :
213+ self .readChunk ()
214+ #If we still don't have a character we have reached EOF
215+ if not self .queue :
216+ return EOF
217+
218+ char = self .queue .pop (0 )
219+
220+ # update position in stream
221+ if char == '\n ' :
222+ self .lineLengths .append (self .col )
223+ self .line += 1
224+ self .col = 0
222225 else :
223- c = self .readChar ()
224- if c is EOF :
225- return c
226-
227- if c == '\r ' :
228- #XXX This isn't right in the case with multiple CR in a row
229- #also recursing here isn't ideal + not sure what happens to input position
230- c = self .readChar ()
231- if c is not EOF and c not in ('\n ' , '\r ' ):
232- self .queue .insert (0 , unicode (c ))
233- elif c == '\r ' :
234- self .queue .insert (0 , u'\n ' )
235- c = '\n '
236-
237- # update position in stream
238- if c == '\n ' :
239- self .lineLengths .append (self .col )
240- self .line += 1
241- self .col = 0
242- else :
243- self .col += 1
244- return unicode (c )
245-
246- def readChar (self ):
247- """Read the next character from the datastream and normalize for null
248- but not for CR"""
249- c = self .dataStream .read (1 , 1 )
250- if not c :
251226 self .col += 1
252- return EOF
253-
254- # Normalize newlines and null characters
255- if c == '\x00 ' :
227+ return char
228+
229+ def readChunk (self , chunkSize = 1024 ):
230+ data = self .dataStream .read (1024 )
231+ if not data :
232+ return
233+ #Replace null characters
234+ for i in xrange (data .count (u"\u0000 " )):
256235 self .errors .append (_ ('null character found in input stream, '
257- 'replaced with U+FFFD' ))
258- c = u'\uFFFD '
259- return c
236+ 'replaced with U+FFFD' ))
237+ data = data .replace (u"\u0000 " , u"\ufffd " )
238+ #Check for CR LF broken across chunks
239+ if (self ._lastChunkEndsWithCR and data [0 ] == "\n " ):
240+ data = data [1 :]
241+ self ._lastChunkEndsWithCR = data [- 1 ] == "\r "
242+ data = data .replace ("\r \n " , "\n " )
243+ data = data .replace ("\r " , "\n " )
244+
245+ data = unicode (data )
246+ self .queue .extend ([char for char in data ])
260247
261248 def charsUntil (self , characters , opposite = False ):
262249 """ Returns a string of characters from the stream up to but not
@@ -272,13 +259,20 @@ def charsUntil(self, characters, opposite = False):
272259 # from where it came.
273260 c = charStack .pop ()
274261 if c != EOF :
275- self .queue . insert ( 0 , c )
262+ self .unget ( c )
276263
277264 return u"" .join (charStack )
278265
279266 def unget (self , chars ):
280267 if chars :
281268 self .queue = list (chars ) + self .queue
269+ #Alter the current line, col position
270+ for c in chars [::- 1 ]:
271+ if c == '\n ' :
272+ self .line -= 1
273+ self .col = self .lineLengths [self .line ]
274+ else :
275+ self .col -= 1
282276
283277class EncodingBytes (str ):
284278 """String-like object with an associated position and various extra methods
0 commit comments