|
1 | 1 | import codecs |
2 | 2 |
|
3 | | -from utils.utils import openStream |
4 | | - |
class HTMLInputStream(object):
    """ Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    The underlying stream must be seekable and must yield bytes; it is
    decoded one character at a time, with undecodable byte sequences replaced
    by U+FFFD.
    """

    # Byte order marks and the codec each one selects.  The UTF-32 marks come
    # first because the UTF-32-LE mark (FF FE 00 00) begins with the UTF-16-LE
    # mark (FF FE); testing the shorter mark first would misdetect UTF-32-LE.
    _BOM_ENCODINGS = (
        (codecs.BOM_UTF32_LE, 'utf-32-le'),
        (codecs.BOM_UTF32_BE, 'utf-32-be'),
        (codecs.BOM_UTF8, 'utf-8'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
    )

    def __init__(self, stream, encoding=None):
        """ Initialise the HTMLInputReader.

        The stream can either be a seekable file-object, a filename or a
        string holding the document itself.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)
        """

        # Position Statistics
        self.line = 1
        self.col = 0

        # Encoding Information
        self.charEncoding = encoding

        # Original (byte) Stream
        self.stream = self.openStream(stream)

        # Try to detect the encoding of the stream by looking for a BOM and
        # remember how many bytes the BOM occupies so it can be skipped
        encoding, self._bomLength = self._detectBOM()

        # Store whether we need to skip the BOM in future
        self.skipBOM = bool(encoding)

        # If an encoding was specified or detected from the BOM don't allow
        # the encoding to be changed further into the stream
        self.allowEncodingOverride = not (self.charEncoding or encoding)

        # If an encoding wasn't specified, use the encoding detected from the
        # BOM, if present, otherwise use the default encoding
        if not self.charEncoding:
            self.charEncoding = encoding or "cp1252"

        # Decoding reader providing unicode characters; byte sequences that
        # cannot be decoded are replaced with U+FFFD.  (codecs.EncodedFile
        # is not suitable here: it transcodes bytes to bytes.)
        self.encodedStream = codecs.getreader(self.charEncoding)(
            self.stream, errors='replace')

        self.seek(0)

    def openStream(self, stream):
        """ Produces a seekable byte stream from the stream argument.

        An object that already has a seek method is returned unchanged.
        Otherwise stream is first tried as a file name (opened in binary
        mode) and, if that fails, treated as the document itself.

        Unicode text is encoded with the encoding given to the constructor
        if there is one, otherwise as UTF-8 preceded by a BOM so that it is
        detected and decoded losslessly.

        Returns a file-like object.
        """
        # Already a file-like object?
        if hasattr(stream, 'seek'):
            return stream

        # Try opening stream as a file name.  Only the failures that mean
        # "this is not an openable file name" are swallowed; anything else
        # is a real error and propagates.
        try:
            return open(stream, "rb")
        except (IOError, OSError, TypeError, ValueError):
            pass

        # Otherwise treat stream as a string and convert to a file-like object
        from io import BytesIO

        if isinstance(stream, bytes):
            data = stream
        else:
            unicodeType = type(u"")
            if isinstance(stream, unicodeType):
                text = stream
            else:
                text = str(stream)
            if isinstance(text, unicodeType):
                declared = getattr(self, "charEncoding", None)
                if declared:
                    data = text.encode(declared, "replace")
                else:
                    data = codecs.BOM_UTF8 + text.encode("utf-8")
            else:
                # str() already produced a byte string (Python 2)
                data = text
        return BytesIO(data)

    def _detectBOM(self):
        """ Looks for a byte order mark at the start of the stream.

        Returns a (encoding, length) tuple: the codec name and the size of
        the BOM in bytes, or (None, 0) when the stream has no known BOM.
        The stream is left positioned at its beginning.
        """
        self.stream.seek(0)
        head = self.stream.read(4)
        self.stream.seek(0)

        for bom, name in self._BOM_ENCODINGS:
            if head.startswith(bom):
                return name, len(bom)
        return None, 0

    def detectEncoding(self):
        """ Attempts to detect the character encoding of the stream.

        If an encoding can be determined from the BOM return the name of the
        encoding otherwise return None
        """
        return self._detectBOM()[0]

    def declareEncoding(self, encoding):
        """Report the encoding declared by the meta element

        If the encoding is currently only guessed, then this
        will read subsequent characters in that encoding.

        If the encoding is not compatible with the guessed encoding
        and non-US-ASCII characters have been seen, parsing will
        have to begin again.

        Not implemented yet: the declaration is currently ignored.
        """
        pass

    def _readChar(self):
        """ Reads one character from the decoded stream and normalizes it.

        U+0000 becomes U+FFFD, and both a lone CR and a CR LF pair become
        a single LF.  Returns an empty string at EOF.  Line and column
        counters are not touched here.
        """
        char = self.encodedStream.read(1)
        if char == u"\x00":
            return u"\uFFFD"
        if char == u"\r":
            # Swallow the LF of a CR LF pair; if the next character is
            # anything else, step back so it is delivered by the next read.
            position = self.tell()
            if self.encodedStream.read(1) != u"\n":
                self.encodedStream.seek(position)
            return u"\n"
        return char

    def read(self, size=1, stopAt=None):
        """ Read at most size characters from the stream stopping when
        encountering a character in stopAt if supplied.

        stopAt can be any iterable object such as a string, list or tuple.
        When a non-empty stopAt is given, size is ignored and reading
        continues until a character in stopAt has been read (it is included
        in the result) or EOF is reached.

        Returns a string from the stream with null bytes and new lines
        normalized; the string is empty at EOF.
        """
        charStack = []

        while stopAt or len(charStack) < size:
            char = self._readChar()
            if not char:
                # EOF: stop instead of spinning or padding with empty strings
                break
            charStack.append(char)
            if stopAt and char in stopAt:
                break

        # Keep track of line and column count
        for c in charStack:
            if c == u"\n":
                self.line += 1
                self.col = 0
            else:
                self.col += 1

        # Return normalized stream
        return u"".join(charStack)

    def seek(self, offset, whence=0):
        """ Proxy method for seeking within the input stream.

        offset is a byte offset into the underlying stream.  Seeking to the
        very beginning skips over the BOM when one was detected.
        """

        # XXX TODO: Still need to find a way to track line and col after
        # seeking. Seeking back is easy but going forward will need reading
        # characters up to that point

        self.encodedStream.seek(offset, whence)
        # Skip over the whole BOM (2 to 4 bytes) if needed
        if self.skipBOM and not self.tell():
            self.encodedStream.seek(self._bomLength)

    def tell(self):
        """ Returns the stream's current position as a byte offset into the
        underlying stream
        """
        return self.encodedStream.tell()

    def readUntil(self, charList):
        """ Returns a string of characters from the stream until a character
        in charList is found or EOF is reached.

        The matching character is included in the result.  An empty
        charList falls back to reading a single character, as read() does.
        """
        return self.read(stopAt=charList)

    def lookAhead(self, amount):
        """ Returns the amount of characters specified without moving
        forward within the stream.

        The stream position and the line and column counters are restored
        after the characters have been read.
        """
        position = self.tell()
        line, col = self.line, self.col

        string = self.read(amount)

        # Return to the saved byte offset; a relative seek by the number of
        # characters would be wrong for multi-byte encodings and CR LF pairs.
        self.encodedStream.seek(position)
        self.line, self.col = line, col
        return string
127 | 184 |
|
if __name__ == "__main__":
    # Demo: print every character of a test document, one per line.
    # Hard coded file name for now, this will need to be fixed later
    try:
        # Open the file here rather than handing the name to HTMLInputStream:
        # the stream class falls back to treating an unopenable name as the
        # document text, which would hide a missing file.
        htmlFile = open("tests/utf-8-bom.html", "rb")
    except IOError:
        print("The file does not exist.")
    else:
        try:
            stream = HTMLInputStream(htmlFile)

            char = stream.read(1)
            while char:
                print(char)
                char = stream.read(1)
            print("EOF")
        finally:
            htmlFile.close()
0 commit comments