@@ -178,8 +178,8 @@ def reset(self):
178178 # number of columns in the last line of the previous chunk
179179 self .prevNumCols = 0
180180
181- #Flag to indicate we may have a CR LF broken across a data chunk
182- self ._lastChunkEndsWithCR = False
181+ #Trailing CR or lead surrogate held back from the previous chunk (None when nothing is pending)
182+ self ._bufferedCharacter = None
183183
184184 def openStream (self , source ):
185185 """Produces a file object from source.
@@ -344,23 +344,30 @@ def readChunk(self, chunkSize=None):
344344 self .chunkOffset = 0
345345
346346 data = self .dataStream .read (chunkSize )
347-
348- if not data :
347+
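348+ #Rejoin a CR or lead surrogate held back from the previous chunk. NOTE(review): that character precedes the new data, so it should be prepended (buffered + data), not appended as below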
349+ if self ._bufferedCharacter :
350+ if data :
351+ data = data + self ._bufferedCharacter
352+ else :
353+ data = self ._bufferedCharacter
354+ self ._bufferedCharacter = None
355+ elif not data :
356+ # We have no more data, bye-bye stream
349357 return False
350358
359+ if len (data ) > 1 :
360+ lastv = ord (data [- 1 ])
361+ if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF :
362+ self ._bufferedCharacter = data [- 1 ]
363+ data = data [:- 1 ]
364+
351365 self .reportCharacterErrors (data )
352366
353367 # Replace invalid characters
354368 data = data .replace (u"\u0000 " , u"\ufffd " )
355369 data = self .replaceCharactersRegexp .sub (u"\ufffd " , data )
356-
357- #Check for CR LF broken across chunks
358- if (self ._lastChunkEndsWithCR and data [0 ] == u"\n " ):
359- data = data [1 :]
360- # Stop if the chunk is now empty
361- if not data :
362- return False
363- self ._lastChunkEndsWithCR = data [- 1 ] == u"\r "
370+
364371 data = data .replace (u"\r \n " , u"\n " )
365372 data = data .replace (u"\r " , u"\n " )
366373
@@ -400,8 +407,6 @@ def characterErrorsUCS2(self, data):
400407 else :
401408 skip = False
402409 self .errors .append ("invalid-codepoint" )
403- #This is still wrong if it is possible for a surrogate pair to break a
404- #chunk boundary
405410
406411 def charsUntil (self , characters , opposite = False ):
407412 """ Returns a string of characters from the stream up to but not
0 commit comments