Skip to content

Commit f8bfb92

Browse files
committed
Every place you can specify a stream, you should be able to specify an
external encoding.
--HG--
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40448
1 parent 432a2a8 commit f8bfb92

3 files changed

Lines changed: 11 additions & 6 deletions

File tree

src/html5parser.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,18 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder):
7070
"trailingEnd": TrailingEndPhase(self, self.tree)
7171
}
7272

73-
def parse(self, stream, innerHTML=False):
73+
def parse(self, stream, encoding=None, innerHTML=False):
7474
"""Parse a HTML document into a well-formed tree
7575
7676
stream - a filelike object or string containing the HTML to be parsed
7777
7878
innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
7979
is not yet supported)
80+
81+
The optional encoding parameter must be a string that indicates
82+
the encoding. If specified, that encoding will be used,
83+
regardless of any BOM or later declaration (such as in a meta
84+
element)
8085
"""
8186

8287
self.tree.reset()
@@ -91,7 +96,7 @@ def parse(self, stream, innerHTML=False):
9196
# assertions
9297
self.innerHTML = innerHTML
9398

94-
self.tokenizer = tokenizer.HTMLTokenizer(stream)
99+
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding)
95100

96101
# XXX This is temporary for the moment so there isn't any other
97102
# changes needed for the parser to work with the iterable tokenizer

src/tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ class HTMLTokenizer(object):
3232

3333
# XXX need to fix documentation
3434

35-
def __init__(self, stream):
36-
self.stream = HTMLInputStream(stream)
35+
def __init__(self, stream, encoding=None):
36+
self.stream = HTMLInputStream(stream, encoding)
3737

3838
self.states = {
3939
"data":self.dataState,

tests/test_tokenizer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ def __init__(self, contentModelFlag, lastStartTag=None):
2020
self._contentModelFlag = constants.contentModelFlags[contentModelFlag]
2121
self._lastStartTag = lastStartTag
2222

23-
def parse(self, stream, innerHTML=False):
24-
tokenizer = self.tokenizer(stream)
23+
def parse(self, stream, encoding=None, innerHTML=False):
24+
tokenizer = self.tokenizer(stream, encoding)
2525
self.outputTokens = []
2626

2727
tokenizer.contentModelFlag = self._contentModelFlag

0 commit comments

Comments
 (0)