Skip to content

Commit 8a1f1cc

Browse files
committed
Move set of void elements to constants file
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%4082
1 parent 13d6dd6 commit 8a1f1cc

2 files changed

Lines changed: 7 additions & 329 deletions

File tree

constants.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232

3333
headingElements = frozenset(("h1", "h2", "h3", "h4", "h5", "h6"))
3434

35+
#What about event-source and command?
36+
# Tag names the tokenizer tests the current token's name against when it
# meets a solidus inside a tag. frozenset() takes a single iterable, so the
# names must be wrapped in one tuple rather than passed as separate
# positional arguments (which raises TypeError at import time).
voidElements = frozenset((
    "base", "link", "meta", "hr", "br", "img", "embed",
    "param", "area", "col", "input",
))
38+
3539
entitiesWindows1252 = {
3640
128: 8364, # 0x80 0x20AC EURO SIGN
3741
129: 65533, # 0x81 UNDEFINED

tokenizer.py

Lines changed: 3 additions & 329 deletions
Original file line numberDiff line numberDiff line change
@@ -1,317 +1,7 @@
1-
21
import string
32

4-
contentModelFlags = {
5-
"PCDATA":0,
6-
"RCDATA":1,
7-
"CDATA":2,
8-
"PLAINTEXT":3
9-
}
10-
11-
spaceCharacters = (
12-
u"\t",
13-
u"\n",
14-
u"\u000B",
15-
u"\u000C",
16-
u" "
17-
)
18-
19-
entitiesWindows1252 = {
20-
128: 8364, # 0x80 0x20AC EURO SIGN
21-
129: 65533, # 0x81 UNDEFINED
22-
130: 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
23-
131: 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
24-
132: 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
25-
133: 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
26-
134: 8224, # 0x86 0x2020 DAGGER
27-
135: 8225, # 0x87 0x2021 DOUBLE DAGGER
28-
136: 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
29-
137: 8240, # 0x89 0x2030 PER MILLE SIGN
30-
138: 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
31-
139: 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
32-
140: 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
33-
141: 65533, # 0x8D UNDEFINED
34-
142: 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
35-
143: 65533, # 0x8F UNDEFINED
36-
144: 65533, # 0x90 UNDEFINED
37-
145: 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
38-
146: 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
39-
147: 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
40-
148: 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
41-
149: 8226, # 0x95 0x2022 BULLET
42-
150: 8211, # 0x96 0x2013 EN DASH
43-
151: 8212, # 0x97 0x2014 EM DASH
44-
152: 732, # 0x98 0x02DC SMALL TILDE
45-
153: 8482, # 0x99 0x2122 TRADE MARK SIGN
46-
154: 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
47-
155: 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
48-
156: 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
49-
157: 65533, # 0x9D UNDEFINED
50-
158: 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
51-
159: 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
52-
}
53-
54-
entities = {
55-
"AElig": u"\u00C6",
56-
"Aacute": u"\u00C1",
57-
"Acirc": u"\u00C2",
58-
"Agrave": u"\u00C0",
59-
"Alpha": u"\u0391",
60-
"Aring": u"\u00C5",
61-
"Atilde": u"\u00C3",
62-
"Auml": u"\u00C4",
63-
"Beta": u"\u0392",
64-
"Ccedil": u"\u00C7",
65-
"Chi": u"\u03A7",
66-
"Dagger": u"\u2021",
67-
"Delta": u"\u0394",
68-
"ETH": u"\u00D0",
69-
"Eacute": u"\u00C9",
70-
"Ecirc": u"\u00CA",
71-
"Egrave": u"\u00C8",
72-
"Epsilon": u"\u0395",
73-
"Eta": u"\u0397",
74-
"Euml": u"\u00CB",
75-
"Gamma": u"\u0393",
76-
"Iacute": u"\u00CD",
77-
"Icirc": u"\u00CE",
78-
"Igrave": u"\u00CC",
79-
"Iota": u"\u0399",
80-
"Iuml": u"\u00CF",
81-
"Kappa": u"\u039A",
82-
"Lambda": u"\u039B",
83-
"Mu": u"\u039C",
84-
"Ntilde": u"\u00D1",
85-
"Nu": u"\u039D",
86-
"OElig": u"\u0152",
87-
"Oacute": u"\u00D3",
88-
"Ocirc": u"\u00D4",
89-
"Ograve": u"\u00D2",
90-
"Omega": u"\u03A9",
91-
"Omicron": u"\u039F",
92-
"Oslash": u"\u00D8",
93-
"Otilde": u"\u00D5",
94-
"Ouml": u"\u00D6",
95-
"Phi": u"\u03A6",
96-
"Pi": u"\u03A0",
97-
"Prime": u"\u2033",
98-
"Psi": u"\u03A8",
99-
"Rho": u"\u03A1",
100-
"Scaron": u"\u0160",
101-
"Sigma": u"\u03A3",
102-
"THORN": u"\u00DE",
103-
"Tau": u"\u03A4",
104-
"Theta": u"\u0398",
105-
"Uacute": u"\u00DA",
106-
"Ucirc": u"\u00DB",
107-
"Ugrave": u"\u00D9",
108-
"Upsilon": u"\u03A5",
109-
"Uuml": u"\u00DC",
110-
"Xi": u"\u039E",
111-
"Yacute": u"\u00DD",
112-
"Yuml": u"\u0178",
113-
"Zeta": u"\u0396",
114-
"aacute": u"\u00E1",
115-
"acirc": u"\u00E2",
116-
"acute": u"\u00B4",
117-
"aelig": u"\u00E6",
118-
"agrave": u"\u00E0",
119-
"alefsym": u"\u2135",
120-
"alpha": u"\u03B1",
121-
"amp": u"\u0026",
122-
"AMP": u"\u0026",
123-
"and": u"\u2227",
124-
"ang": u"\u2220",
125-
"apos": u"\u0027",
126-
"aring": u"\u00E5",
127-
"asymp": u"\u2248",
128-
"atilde": u"\u00E3",
129-
"auml": u"\u00E4",
130-
"bdquo": u"\u201E",
131-
"beta": u"\u03B2",
132-
"brvbar": u"\u00A6",
133-
"bull": u"\u2022",
134-
"cap": u"\u2229",
135-
"ccedil": u"\u00E7",
136-
"cedil": u"\u00B8",
137-
"cent": u"\u00A2",
138-
"chi": u"\u03C7",
139-
"circ": u"\u02C6",
140-
"clubs": u"\u2663",
141-
"cong": u"\u2245",
142-
"copy": u"\u00A9",
143-
"COPY": u"\u00A9",
144-
"crarr": u"\u21B5",
145-
"cup": u"\u222A",
146-
"curren": u"\u00A4",
147-
"dArr": u"\u21D3",
148-
"dagger": u"\u2020",
149-
"darr": u"\u2193",
150-
"deg": u"\u00B0",
151-
"delta": u"\u03B4",
152-
"diams": u"\u2666",
153-
"divide": u"\u00F7",
154-
"eacute": u"\u00E9",
155-
"ecirc": u"\u00EA",
156-
"egrave": u"\u00E8",
157-
"empty": u"\u2205",
158-
"emsp": u"\u2003",
159-
"ensp": u"\u2002",
160-
"epsilon": u"\u03B5",
161-
"equiv": u"\u2261",
162-
"eta": u"\u03B7",
163-
"eth": u"\u00F0",
164-
"euml": u"\u00EB",
165-
"euro": u"\u20AC",
166-
"exist": u"\u2203",
167-
"fnof": u"\u0192",
168-
"forall": u"\u2200",
169-
"frac12": u"\u00BD",
170-
"frac14": u"\u00BC",
171-
"frac34": u"\u00BE",
172-
"frasl": u"\u2044",
173-
"gamma": u"\u03B3",
174-
"ge": u"\u2265",
175-
"gt": u"\u003E",
176-
"GT": u"\u003E",
177-
"hArr": u"\u21D4",
178-
"harr": u"\u2194",
179-
"hearts": u"\u2665",
180-
"hellip": u"\u2026",
181-
"iacute": u"\u00ED",
182-
"icirc": u"\u00EE",
183-
"iexcl": u"\u00A1",
184-
"igrave": u"\u00EC",
185-
"image": u"\u2111",
186-
"infin": u"\u221E",
187-
"int": u"\u222B",
188-
"iota": u"\u03B9",
189-
"iquest": u"\u00BF",
190-
"isin": u"\u2208",
191-
"iuml": u"\u00EF",
192-
"kappa": u"\u03BA",
193-
"lArr": u"\u21D0",
194-
"lambda": u"\u03BB",
195-
"lang": u"\u2329",
196-
"laquo": u"\u00AB",
197-
"larr": u"\u2190",
198-
"lceil": u"\u2308",
199-
"ldquo": u"\u201C",
200-
"le": u"\u2264",
201-
"lfloor": u"\u230A",
202-
"lowast": u"\u2217",
203-
"loz": u"\u25CA",
204-
"lrm": u"\u200E",
205-
"lsaquo": u"\u2039",
206-
"lsquo": u"\u2018",
207-
"lt": u"\u003C",
208-
"LT": u"\u003C",
209-
"macr": u"\u00AF",
210-
"mdash": u"\u2014",
211-
"micro": u"\u00B5",
212-
"middot": u"\u00B7",
213-
"minus": u"\u2212",
214-
"mu": u"\u03BC",
215-
"nabla": u"\u2207",
216-
"nbsp": u"\u00A0",
217-
"ndash": u"\u2013",
218-
"ne": u"\u2260",
219-
"ni": u"\u220B",
220-
"not": u"\u00AC",
221-
"notin": u"\u2209",
222-
"nsub": u"\u2284",
223-
"ntilde": u"\u00F1",
224-
"nu": u"\u03BD",
225-
"oacute": u"\u00F3",
226-
"ocirc": u"\u00F4",
227-
"oelig": u"\u0153",
228-
"ograve": u"\u00F2",
229-
"oline": u"\u203E",
230-
"omega": u"\u03C9",
231-
"omicron": u"\u03BF",
232-
"oplus": u"\u2295",
233-
"or": u"\u2228",
234-
"ordf": u"\u00AA",
235-
"ordm": u"\u00BA",
236-
"oslash": u"\u00F8",
237-
"otilde": u"\u00F5",
238-
"otimes": u"\u2297",
239-
"ouml": u"\u00F6",
240-
"para": u"\u00B6",
241-
"part": u"\u2202",
242-
"permil": u"\u2030",
243-
"perp": u"\u22A5",
244-
"phi": u"\u03C6",
245-
"pi": u"\u03C0",
246-
"piv": u"\u03D6",
247-
"plusmn": u"\u00B1",
248-
"pound": u"\u00A3",
249-
"prime": u"\u2032",
250-
"prod": u"\u220F",
251-
"prop": u"\u221D",
252-
"psi": u"\u03C8",
253-
"quot": u"\u0022",
254-
"QUOT": u"\u0022",
255-
"rArr": u"\u21D2",
256-
"radic": u"\u221A",
257-
"rang": u"\u232A",
258-
"raquo": u"\u00BB",
259-
"rarr": u"\u2192",
260-
"rceil": u"\u2309",
261-
"rdquo": u"\u201D",
262-
"real": u"\u211C",
263-
"reg": u"\u00AE",
264-
"REG": u"\u00AE",
265-
"rfloor": u"\u230B",
266-
"rho": u"\u03C1",
267-
"rlm": u"\u200F",
268-
"rsaquo": u"\u203A",
269-
"rsquo": u"\u2019",
270-
"sbquo": u"\u201A",
271-
"scaron": u"\u0161",
272-
"sdot": u"\u22C5",
273-
"sect": u"\u00A7",
274-
"shy": u"\u00AD",
275-
"sigma": u"\u03C3",
276-
"sigmaf": u"\u03C2",
277-
"sim": u"\u223C",
278-
"spades": u"\u2660",
279-
"sub": u"\u2282",
280-
"sube": u"\u2286",
281-
"sum": u"\u2211",
282-
"sup": u"\u2283",
283-
"sup1": u"\u00B9",
284-
"sup2": u"\u00B2",
285-
"sup3": u"\u00B3",
286-
"supe": u"\u2287",
287-
"szlig": u"\u00DF",
288-
"tau": u"\u03C4",
289-
"there4": u"\u2234",
290-
"theta": u"\u03B8",
291-
"thetasym": u"\u03D1",
292-
"thinsp": u"\u2009",
293-
"thorn": u"\u00FE",
294-
"tilde": u"\u02DC",
295-
"times": u"\u00D7",
296-
"trade": u"\u2122",
297-
"uArr": u"\u21D1",
298-
"uacute": u"\u00FA",
299-
"uarr": u"\u2191",
300-
"ucirc": u"\u00FB",
301-
"ugrave": u"\u00F9",
302-
"uml": u"\u00A8",
303-
"upsih": u"\u03D2",
304-
"upsilon": u"\u03C5",
305-
"uuml": u"\u00FC",
306-
"weierp": u"\u2118",
307-
"xi": u"\u03BE",
308-
"yacute": u"\u00FD",
309-
"yen": u"\u00A5",
310-
"yuml": u"\u00FF",
311-
"zeta": u"\u03B6",
312-
"zwj": u"\u200D",
313-
"zwnj": u"\u200C"
314-
}
3+
from constants import contentModelFlags, spaceCharacters
4+
from constants import entitiesWindows1252, entities, voidElements
3155

3166
# Data representing the end of the input stream
3177
EOF = None
@@ -412,22 +102,6 @@ def __init__(self, parser):
412102
"bogusDoctype":self.bogusDoctypeState
413103
}
414104

415-
self.voidElements = (
416-
# XXX This list doesn't include <event-source> and <command> yet.
417-
# AT Make this a "global" variable?
418-
"base",
419-
"link",
420-
"meta",
421-
"hr",
422-
"br",
423-
"img",
424-
"embed",
425-
"param",
426-
"area",
427-
"col",
428-
"input"
429-
)
430-
431105
# Setup the initial tokenizer state
432106
self.contentModelFlag = contentModelFlags['PCDATA']
433107
self.state = self.states['data']
@@ -478,7 +152,7 @@ def processSolidusInTag(self):
478152
# throwing an atheist parse error.
479153
data = self.consumeChar()
480154

481-
if self.currentToken.name in self.voidElements and data == u">":
155+
if self.currentToken.name in voidElements and data == u">":
482156
self.parser.atheistParseError()
483157
else:
484158
self.parser.parseError()

0 commit comments

Comments
 (0)