11import codecs , StringIO
22
3+ from constants import EOF
4+
35class HTMLInputStream (object ):
46 """ Provides a unicode stream of characters to the HTMLTokenizer.
57
@@ -48,11 +50,8 @@ def __init__(self, source, encoding=None):
4850
4951 # Read bytes from stream decoding them into Unicode
5052 unicodeStream = self .rawStream .read ().decode (self .charEncoding , 'replace' )
51- # Normalize new lines
52- unicodeStream = unicodeStream .replace (u"\r \n " , u"\n " )
53- unicodeStream = unicodeStream .replace (u"\r " , u"\n " )
54- # Replace null bytes
55- unicodeStream = unicodeStream .replace (u"\x00 " , u"\uFFFD " )
53+
54+ unicodeStream = self .normalizeStream (unicodeStream )
5655
5756 # If encoding was determined from a BOM remove it from the stream
5857 if detectedEncoding :
@@ -113,6 +112,21 @@ def detectEncoding(self):
113112
114113 return encoding
115114
def normalizeStream(self, stream):
    """Normalise a decoded unicode string before it is tokenized.

    Side effect: sets ``self.incompatibleEncoding`` to True when the
    current ``self.charEncoding`` is "cp1252" and the decoded text
    already contains U+FFFD replacement characters, otherwise to False.

    Returns the text with "\\r\\n" and lone "\\r" turned into "\\n" and
    every U+0000 turned into U+FFFD.
    """
    # Check for U+FFFD *before* replacing NULs below, so that only
    # replacement characters already present in the decoded text are
    # counted, not the ones this method introduces itself.
    self.incompatibleEncoding = (
        self.charEncoding == "cp1252" and u"\uFFFD" in stream
    )

    # Normalize new lines; CRLF must be handled first so it collapses to
    # one "\n" instead of two.
    stream = stream.replace(u"\r\n", u"\n")
    stream = stream.replace(u"\r", u"\n")
    # Replace null bytes
    stream = stream.replace(u"\x00", u"\uFFFD")

    return stream
129+
116130 def declareEncoding (self , encoding ):
117131 """Report the encoding declared by the meta element
118132
@@ -123,7 +137,22 @@ def declareEncoding(self, encoding):
123137 and non-US-ASCII characters have been seen, return True indicating
124138 parsing will have to begin again.
125139 """
126- pass
140+ # Only change encoding if we are using the default encoding
141+ if self .allowEncodingOverride :
142+ self .charEncoding = encoding
143+ # If there was incompatible characters found in the first encoding
144+ # we have to reencode the entire stream and start again
145+ if self .incompatibleEncoding :
146+ self .reset ()
147+ self .dataStream = StringIO .StringIO (self .normalizeStream (
148+ self .dataStream .read (- 1 ).decode (self .charEncoding , 'replace' )
149+ ))
150+ return True
151+ else :
152+ # Just decode the bytes from now on
153+ self .dataStream = codecs .EncodedFile (self .dataStream ,
154+ self .charEncoding , 'replace' )
155+ return False
127156
128157 def position (self ):
129158 """ Returns (line, col) position in the stream
@@ -138,35 +167,50 @@ def position(self):
138167 col = tell - self .newLines [line - 1 ]- 1
139168 return (line , col )
140169
def reset(self):
    """Rewind ``self.dataStream`` so the next read starts at offset 0."""
    stream = self.dataStream
    stream.seek(0)
174+
def read(self, size=1):
    """Read up to ``size`` characters from ``self.dataStream``.

    Returns whatever the underlying stream's ``read(size)`` produced, or
    the ``EOF`` marker when that result is empty.
    """
    chars = self.dataStream.read(size)
    if not chars:
        # An empty read means the stream has nothing left.
        return EOF
    return chars
146179
def readMany(self, size):
    """Return a list of up to ``size`` characters from ``self.dataStream``.

    When fewer than ``size`` characters could be read, the ``EOF`` marker
    is appended as the last list element. The result is always a list.
    """
    # Always start from a real list: falling back to the bare EOF marker
    # here would break the len()/append() calls below on an empty read.
    charStack = list(self.dataStream.read(size))
    if len(charStack) < size:
        charStack.append(EOF)
    return charStack
157188
def readUntil(self, charList):
    """Collect characters one at a time until a stop character is read.

    Reading stops after the first character that is in ``charList`` or
    is the ``EOF`` marker; that terminating item is the last element of
    the returned list.
    """
    stopChars = set(charList)
    stopChars.add(EOF)

    charStack = []
    while True:
        char = self.read(1)
        charStack.append(char)
        if char in stopChars:
            return charStack
201+
def readWhile(self, charList):
    """Collect characters one at a time while they are in ``charList``.

    Reading stops after the first character that is not in ``charList``
    or when ``self.read`` returns the ``EOF`` marker; that terminating
    item is the last element of the returned list.
    """
    charStack = [self.read(1)]
    # Test for EOF explicitly before the membership check: it guarantees
    # termination even if EOF is a member of charList, and avoids a
    # membership test of a non-string marker against a str charList.
    while charStack[-1] is not EOF and charStack[-1] in charList:
        charStack.append(self.read(1))

    return charStack
166211
167212if __name__ == "__main__" :
168213 try :
169- # Hard coded file name for now, this will need to be fixed later
170214 stream = HTMLInputStream ("tests/utf-8-bom.html" )
171215
172216 c = stream .read (1 )
0 commit comments