@@ -362,7 +362,7 @@ def readChunk(self, chunkSize=None):
362362 self .reportCharacterErrors (data )
363363
364364 # Replace invalid characters
365- data = data . replace ( u" \u0000 " , u" \ufffd " )
365+ # Note U+0000 is dealt with in the tokenizer
366366 data = self .replaceCharactersRegexp .sub (u"\ufffd " , data )
367367
368368 data = data .replace (u"\r \n " , u"\n " )
@@ -374,16 +374,12 @@ def readChunk(self, chunkSize=None):
374374 return True
375375
def characterErrorsUCS4(self, data):
    """Record an "invalid-codepoint" error for each invalid match in data.

    One entry is appended to ``self.errors`` per non-overlapping match
    of ``invalid_unicode_re`` in ``data``.  U+0000 is deliberately not
    counted here; per the note in ``readChunk`` it is dealt with in the
    tokenizer.
    """
    # Iterate the matches lazily (as characterErrorsUCS2 does) instead of
    # materialising a findall() list only to count it; this also avoids
    # the Python-2-only xrange() and the unused loop index.
    for _match in invalid_unicode_re.finditer(data):
        self.errors.append("invalid-codepoint")
381379
382380 def characterErrorsUCS2 (self , data ):
383381 #Someone picked the wrong compile option
384382 #You lose
385- for i in xrange (data .count (u"\u0000 " )):
386- self .errors .append ("null-character" )
387383 skip = False
388384 import sys
389385 for match in invalid_unicode_re .finditer (data ):
@@ -452,24 +448,9 @@ def charsUntil(self, characters, opposite = False):
452448 r = u"" .join (rv )
453449 return r
454450
455- def charsUntilEOF (self ):
456- """ Returns a string of characters from the stream up to EOF."""
457-
458- rv = []
459-
460- while True :
461- rv .append (self .chunk [self .chunkOffset :])
462- if not self .readChunk ():
463- # Reached EOF
464- break
465-
466- r = u"" .join (rv )
467- return r
468-
469451 def unget (self , char ):
470452 # Only one character is allowed to be ungotten at once - it must
471453 # be consumed again before any further call to unget
472-
473454 if char is not None :
474455 if self .chunkOffset == 0 :
475456 # unget is called quite rarely, so it's a good idea to do
0 commit comments