Skip to content

Commit b5abd6e

Browse files
committed
Initial proof of concept
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%4055
1 parent a305b49 commit b5abd6e

1 file changed

Lines changed: 118 additions & 0 deletions

File tree

inputstream.py

Lines changed: 118 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,118 @@
1+
class HTMLInputStream(object):
    """For reading data from an input stream

    This deals with character encoding issues automatically.

    The encoding is taken from a byte order mark (BOM) when the file
    starts with one; otherwise Windows-1252 is assumed.  Only the first
    line of the file is read and decoded at construction time.
    """

    def __init__(self, file):
        """Wrap `file` and decode its first line.

        `file` must be a seekable file object that returns raw bytes
        (i.e. opened in binary mode and not wrapped by a codec).
        """
        self.__file = file
        self.__line = 1  # Current line number
        self.__col = 0  # Current column number

        self.__charEncoding = self.__detectBOM(file)

        if self.__charEncoding:
            # The encoding is known from the BOM, don't allow later
            # declarations from the meta element to override this.
            self.__allowEncodingOverride = False
        else:
            self.__allowEncodingOverride = True
            self.__charEncoding = "cp1252"  # default to Windows-1252

        # Read the first line.
        # NOTE(review): readline() splits on the byte 0x0A, so for
        # UTF-16/UTF-32 input containing a newline the line is cut in the
        # middle of a code unit and decoding fails.  Confirm how multi-byte
        # encodings should be read before relying on them.
        self.__srcLine = self.__file.readline().decode(self.__charEncoding)

        # Strip the BOM, if present.  Only one leading U+FEFF is removed so
        # that a genuine ZERO WIDTH NO-BREAK SPACE following it survives.
        if self.__srcLine.startswith(u"\uFEFF"):
            self.__srcLine = self.__srcLine[1:]

    # private function
    def __detectBOM(self, fp):
        """Attempts to detect the character encoding of the html file
        given by a file object fp. fp must not be a codec wrapped file
        object!

        The return value can be:
            - if detection of the BOM succeeds, the codec name of the
              corresponding unicode charset is returned

            - if BOM detection fails, None is returned.

        The file position is left at the beginning of the file in both
        cases.  Files shorter than four bytes (including empty files) are
        handled and simply match fewer patterns.
        """
        # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841

        ### detection using BOM

        ## the BOMs we know, by their pattern
        bomDict = {  # bytepattern : name
            (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
            (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
            (0xFE, 0xFF, None, None): "utf_16_be",
            (0xFF, 0xFE, None, None): "utf_16_le",
            (0xEF, 0xBB, 0xBF, None): "utf_8",
        }

        ## go to beginning of file and get the first 4 bytes
        fp.seek(0)
        # bytearray yields integers for a byte string on Python 2 and 3.
        head = list(bytearray(fp.read(4)))
        # Pad with None so that a short file still unpacks into four slots.
        head.extend([None] * (4 - len(head)))
        (byte1, byte2, byte3, byte4) = head

        ## try bom detection using 4 bytes, 3 bytes, or 2 bytes
        bomDetection = (bomDict.get((byte1, byte2, byte3, byte4))
                        or bomDict.get((byte1, byte2, byte3, None))
                        or bomDict.get((byte1, byte2, None, None)))

        # Return to the beginning of the file whether or not a BOM was
        # found; the BOM itself is stripped after decoding.
        fp.seek(0)
        return bomDetection

    def consumeChar(self):
        """Return the character at the current column and advance by one.

        Raises IndexError when the column is past the end of the
        currently loaded line.
        """
        char = self.__srcLine[self.__col]
        self.__col += 1
        return char

    def unconsumeChar(self):
        """Step back one column so the last character can be read again.

        Does nothing at column 0; a negative column would make
        consumeChar index from the end of the line.
        """
        if self.__col > 0:
            self.__col -= 1

    def getLine(self):
        """Return the current line number (starting at 1)."""
        return self.__line

    def getCol(self):
        """Return the current column number (starting at 0)."""
        return self.__col

    def declareEncoding(self, encoding):
        """Report the encoding declared by the meta element

        If the encoding is currently only guessed, then this
        will read subsequent characters in that encoding.

        If the encoding is not compatible with the guessed encoding
        and non-US-ASCII characters have been seen, parsing will
        have to begin again.

        Not implemented yet: the call currently has no effect.
        """
        pass
98+
99+
if __name__ == "__main__":
    # Small manual demo: read a few characters, step back two, read again.
    # Hard coded file name for now, this will need to be fixed later
    try:
        # Binary mode: BOM detection and decoding need the raw bytes, so no
        # text-mode newline translation may be applied.
        htmlFile = open("tests/utf-8-bom.html", "rb")
    except IOError:
        # Only a failure to open the file is reported this way; errors
        # while reading or decoding are left to propagate.
        print("The file does not exist.")
    else:
        try:
            stream = HTMLInputStream(htmlFile)

            print(stream.consumeChar())
            print(stream.consumeChar())
            print(stream.consumeChar())
            print(stream.consumeChar())

            print("unconsuming 2 characters and printing again")
            stream.unconsumeChar()
            stream.unconsumeChar()
            print(stream.consumeChar())
            print(stream.consumeChar())
        finally:
            # Close the file even if the stream raised part-way through.
            htmlFile.close()

0 commit comments

Comments
 (0)