@@ -134,8 +134,10 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
134134 #Craziness
135135 if len (u"\U0010FFFF " ) == 1 :
136136 self .reportCharacterErrors = self .characterErrorsUCS4
137+ self .replaceCharactersRegexp = re .compile (u"[\uD800 -\uDFFF ]" )
137138 else :
138139 self .reportCharacterErrors = self .characterErrorsUCS2
140+ self .replaceCharactersRegexp = re .compile (u"([\uD800 -\uDBFF ](?![\uDC00 -\uDFFF ])|(?<![\uD800 -\uDBFF ])[\uDC00 -\uDFFF ])" )
139141
140142 # List of where new lines occur
141143 self .newLines = [0 ]
@@ -159,6 +161,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
159161 if (self .charEncoding [0 ] is None ):
160162 self .charEncoding = self .detectEncoding (parseMeta , chardet )
161163
164+
162165 self .reset ()
163166
164167 def reset (self ):
@@ -175,8 +178,8 @@ def reset(self):
175178 # number of columns in the last line of the previous chunk
176179 self .prevNumCols = 0
177180
178- #Flag to indicate we may have a CR LF broken across a data chunk
179- self ._lastChunkEndsWithCR = False
181+ #Deal with CR LF and surrogates split over chunk boundaries
182+ self ._bufferedCharacter = None
180183
181184 def openStream (self , source ):
182185 """Produces a file object from source.
@@ -341,20 +344,27 @@ def readChunk(self, chunkSize=None):
341344 self .chunkOffset = 0
342345
343346 data = self .dataStream .read (chunkSize )
344-
345- if not data :
347+
348+ #Deal with CR LF and surrogates broken across chunks
349+ if self ._bufferedCharacter :
350+ data = self ._bufferedCharacter + data
351+ self ._bufferedCharacter = None
352+ elif not data :
353+ # We have no more data, bye-bye stream
346354 return False
347355
356+ if len (data ) > 1 :
357+ lastv = ord (data [- 1 ])
358+ if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF :
359+ self ._bufferedCharacter = data [- 1 ]
360+ data = data [:- 1 ]
361+
348362 self .reportCharacterErrors (data )
349-
363+
364+ # Replace invalid characters
350365 data = data .replace (u"\u0000 " , u"\ufffd " )
351- #Check for CR LF broken across chunks
352- if (self ._lastChunkEndsWithCR and data [0 ] == u"\n " ):
353- data = data [1 :]
354- # Stop if the chunk is now empty
355- if not data :
356- return False
357- self ._lastChunkEndsWithCR = data [- 1 ] == u"\r "
366+ data = self .replaceCharactersRegexp .sub (u"\ufffd " , data )
367+
358368 data = data .replace (u"\r \n " , u"\n " )
359369 data = data .replace (u"\r " , u"\n " )
360370
@@ -394,8 +404,6 @@ def characterErrorsUCS2(self, data):
394404 else :
395405 skip = False
396406 self .errors .append ("invalid-codepoint" )
397- #This is still wrong if it is possible for a surrogate pair to break a
398- #chunk boundary
399407
400408 def charsUntil (self , characters , opposite = False ):
401409 """ Returns a string of characters from the stream up to but not
0 commit comments