Skip to content

Commit 1f51326

Browse files
author
lantis63
committed
Changed the HTMLInputStream quite a bit and added a few more methods for it. Now with read(size), readUntil and lookAhead
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%4093
1 parent e8b6031 commit 1f51326

File tree

2 files changed

+162
-138
lines changed

2 files changed

+162
-138
lines changed

inputstream.py

Lines changed: 162 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1,146 +1,196 @@
11
import codecs
22

3-
from utils.utils import openStream
4-
53
class HTMLInputStream(object):
6-
"""For reading data from an input stream
7-
8-
This deals with character encoding issues automatically.
9-
10-
This keeps track of the current line and column number in the file
11-
automatically, as you consume and unconsume characters.
4+
""" Provides a unicode stream of characters to the HTMLTokenizer.
5+
6+
This class takes care of character encoding and removing or replacing
7+
incorrect byte-sequences and also provides column and line tracking.
128
"""
13-
14-
def __init__(self, stream, encoding = None):
9+
10+
def __init__(self, stream, encoding=None):
1511
""" Initialise the HTMLInputReader.
16-
12+
1713
The stream can either be a file-object, filename, url or string
18-
14+
1915
The optional encoding parameter must be a string that indicates
2016
the encoding. If specified, that encoding will be used,
2117
regardless of any BOM or later declaration (such as in a meta
2218
element)
2319
"""
24-
25-
self.line = 1 # Current line number
26-
self.col = 0 # Current column number
27-
self.lineBreaks = [0]
28-
29-
# Keep a reference to the unencoded file object so that a new
30-
# EncodedFile can be created later if the encoding is declared
31-
# in a meta element
32-
self.file = openStream(stream)
33-
34-
skipBOM = False
35-
self.charEncoding = self.detectBOM(self.file)
36-
if self.charEncoding:
37-
# The encoding is known from the BOM, don't allow later
38-
# declarations from the meta element to override this.
39-
skipBOM = True
20+
21+
# Position Statistics
22+
self.line = 1
23+
self.col = 0
24+
25+
# Encoding Information
26+
self.charEncoding = encoding
27+
28+
# Original Stream
29+
self.stream = self.openStream(stream)
30+
31+
# Try to detect the encoding of the stream by looking for a BOM
32+
encoding = self.detectEncoding()
33+
34+
# Store whether we need to skip the BOM in future
35+
if encoding:
36+
self.skipBOM = True
37+
else:
38+
self.skipBOM = False
39+
40+
# If an encoding was specified or detected from the BOM don't allow
41+
# the encoding to be changed further into the stream
42+
if self.charEncoding or encoding:
4043
self.allowEncodingOverride = False
4144
else:
42-
# Using the default encoding, don't allow later
43-
# declarations from the meta element to override this.
4445
self.allowEncodingOverride = True
45-
self.charEncoding = "cp1252" # default to Windows-1252
46-
47-
self.encodedFile = codecs.EncodedFile(self.file, self.charEncoding)
48-
if skipBOM:
49-
self.encodedFile.read(1)
50-
51-
def detectBOM(self, fp):
52-
""" Attempts to detect the character encoding of the html file
53-
given by a file object fp. fp must not be a codec wrapped file
54-
object!
55-
56-
The return value can be:
57-
- if detection of the BOM succeeds, the codec name of the
58-
corresponding unicode charset is returned
59-
60-
- if BOM detection fails, None is returned.
46+
47+
# If an encoding wasn't specified, use the encoding detected from the
48+
# BOM, if present, otherwise use the default encoding
49+
if not self.charEncoding:
50+
self.charEncoding = encoding or "cp1252"
51+
52+
# Encoded file stream providing Unicode characters replacing characters
53+
# unable to be encoded with the Unicode replacement character
54+
self.encodedStream = codecs.EncodedFile(self.stream, self.charEncoding,
55+
errors='replace')
56+
57+
self.seek(0)
58+
59+
def openStream(self, stream):
60+
""" Opens stream first trying the native open function, if that
61+
fails try to open as a URL and finally treating stream as a string.
62+
63+
Returns a file-like object.
6164
"""
62-
# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/363841
63-
64-
### detection using BOM
65-
66-
## the BOMs we know, by their pattern
67-
bomDict = { # bytepattern : name
68-
(0x00, 0x00, 0xFE, 0xFF) : "utf_32_be",
69-
(0xFF, 0xFE, 0x00, 0x00) : "utf_32_le",
70-
(0xFE, 0xFF, None, None) : "utf_16_be",
71-
(0xFF, 0xFE, None, None) : "utf_16_le",
72-
(0xEF, 0xBB, 0xBF, None) : "utf_8",
73-
}
74-
75-
## go to beginning of file and get the first 4 bytes
76-
fp.seek(0)
77-
(byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
78-
79-
## try bom detection using 4 bytes, 3 bytes, or 2 bytes
80-
bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
81-
if not bomDetection :
82-
bomDetection = bomDict.get((byte1, byte2, byte3, None))
83-
if not bomDetection :
84-
bomDetection = bomDict.get((byte1, byte2, None, None))
85-
86-
## if BOM detected, we're done :-)
87-
fp.seek(0)
88-
if bomDetection :
89-
return bomDetection
90-
return None
91-
92-
def consumeChar(self):
93-
char = unicode(self.encodedFile.read(1), self.charEncoding)
94-
if char == "\n":
95-
# Move to next line and reset column count
96-
self.line += 1
97-
self.col = 0
98-
self.lineBreaks.append(self.encodedFile.tell())
99-
else:
100-
# Just increment the column counter
101-
self.col += 1
102-
return char or None
103-
104-
def unconsumeChar(self):
105-
"""Unconsume the previous character by seeking backwards thorough
106-
the file.
65+
# Already a file-like object?
66+
if hasattr(stream, 'seek'):
67+
return stream
68+
69+
# Try opening stream normally
70+
try:
71+
return open(stream)
72+
except: pass
73+
74+
# Otherwise treat stream as a string and convert to a file-like object
75+
import StringIO as StringIO
76+
return StringIO.StringIO(str(stream))
77+
78+
def detectEncoding(self):
79+
""" Attempts to detect the character encoding of the stream.
80+
81+
If an encoding can be determined from the BOM return the name of the
82+
encoding otherwise return None
10783
"""
108-
self.encodedFile.seek(-1, 1)
109-
if self.encodedFile.tell()+1 == self.lineBreaks[-1]:
110-
self.line -= 1
111-
self.lineBreaks.pop()
112-
self.col = self.encodedFile.tell()-self.lineBreaks[-1]
113-
else:
114-
self.col -= 1
115-
84+
85+
bomDict = {
86+
codecs.BOM_UTF8: 'utf-8',
87+
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
88+
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
89+
}
90+
91+
# Go to beginning of file and read in 4 bytes
92+
self.stream.seek(0)
93+
string = self.stream.read(4)
94+
95+
# Try detecting the BOM using bytes from the string
96+
encoding = bomDict.get(string[:3]) # UTF-8
97+
if not encoding:
98+
encoding = bomDict.get(string[:2]) # UTF-16
99+
if not encoding:
100+
encoding = bomDict.get(string) # UTF-32
101+
102+
# Go back to the beginning of the file
103+
self.stream.seek(0)
104+
105+
return encoding
106+
116107
def declareEncoding(self, encoding):
117108
"""Report the encoding declared by the meta element
118109
119110
If the encoding is currently only guessed, then this
120111
will read subsequent characters in that encoding.
121-
112+
122113
If the encoding is not compatible with the guessed encoding
123114
and non-US-ASCII characters have been seen, parsing will
124115
have to begin again.
125116
"""
126117
pass
118+
119+
def read(self, size=1, stopAt=None):
120+
""" Read at most size characters from the stream stopping when
121+
encountering a character in stopAt if supplied.
122+
123+
stopAt can be any iterable object such as a string, list or tuple.
124+
125+
Returns a string from the stream with null bytes and new lines
126+
normalized
127+
"""
128+
charStack = []
129+
130+
while (len(charStack) < size) or stopAt:
131+
charStack.append(self.encodedStream.read(1))
132+
if charStack[-1] == u"\x00":
133+
charStack[-1] = u"\uFFFD"
134+
elif charStack[-1] == u"\r":
135+
if self.lookAhead(1) == u"\n":
136+
charStack.pop()
137+
else:
138+
charStack[-1] = u"\n"
139+
if stopAt and charStack and charStack[-1] in stopAt:
140+
break
141+
142+
# Keep track of line and column count
143+
for c in charStack:
144+
if c == u"\n":
145+
self.line += 1
146+
self.col = 0
147+
else:
148+
self.col += 1
149+
150+
# Return normalized stream
151+
return "".join(charStack)
152+
153+
def seek(self, offset, whence=0):
154+
""" Proxy method for seeking within the input stream.
155+
"""
156+
157+
# XXX TODO: Still need to find a way to track line and col after
158+
# seeking. Seeking back is easy but going forward will need reading
159+
# characters up to that point
160+
161+
self.encodedStream.seek(offset, whence)
162+
# Skip over the BOM if needed
163+
if not self.tell() and self.skipBOM:
164+
self.encodedStream.read(1)
165+
166+
def tell(self):
167+
""" Returns the stream's current position
168+
"""
169+
return self.encodedStream.tell()
170+
171+
def readUntil(self, charList):
172+
""" Returns a string of characters from the stream until a character
173+
in charList is found or EOF is reached
174+
"""
175+
return self.read(stopAt=charList)
176+
177+
def lookAhead(self, amount):
178+
""" Returns the amount of characters specified without moving
179+
forward within the stream.
180+
"""
181+
string = self.read(amount)
182+
self.seek(-len(string), 1)
183+
return string
127184

128185
if __name__ == "__main__":
129186
try:
130187
# Hard coded file name for now, this will need to be fixed later
131-
htmlFile = open("tests/utf-8-bom.html", "rU")
132-
stream = HTMLInputStream(htmlFile)
133-
134-
char = stream.consumeChar()
188+
stream = HTMLInputStream("tests/utf-8-bom.html")
189+
190+
char = stream.read(1)
135191
while char:
136-
line = stream.line
137-
col = stream.col
138-
if char == "\n":
139-
print "LF (%d, %d)" % (line, col)
140-
else:
141-
print "%s (%d, %d)" % (char, line, col)
142-
char = stream.consumeChar()
192+
print char
193+
char = stream.read(1)
143194
print "EOF"
144-
htmlFile.close()
145195
except IOError:
146196
print "The file does not exist."

utils/utils.py

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -33,29 +33,3 @@ def __getitem__(self, key):
3333
return self.defaultValue
3434
else:
3535
raise
36-
37-
def openStream(stream):
38-
""" Opens stream first trying the native open functino, if that
39-
fails try to open as a URL and finally treating stream as a string.
40-
41-
Returns a file-like object.
42-
"""
43-
# Already a file-like object?
44-
if hasattr(stream, 'tell'):
45-
return stream
46-
47-
# Try opening stream normally
48-
try:
49-
return open(stream)
50-
except: pass
51-
52-
# Try opening stream as a URL and storing the bytes returned so
53-
# they can be turned into a file-like object below
54-
try:
55-
import urllib
56-
stream = urllib.urlopen(stream).read(-1)
57-
except: pass
58-
59-
# Treat source as a string and make it into a file-like object
60-
import cStringIO as StringIO
61-
return StringIO.StringIO(str(stream))

0 commit comments

Comments
 (0)