|
1 | | - |
2 | 1 | import string |
3 | 2 |
|
4 | | -contentModelFlags = { |
5 | | - "PCDATA":0, |
6 | | - "RCDATA":1, |
7 | | - "CDATA":2, |
8 | | - "PLAINTEXT":3 |
9 | | -} |
10 | | - |
11 | | -spaceCharacters = ( |
12 | | - u"\t", |
13 | | - u"\n", |
14 | | - u"\u000B", |
15 | | - u"\u000C", |
16 | | - u" " |
17 | | -) |
18 | | - |
19 | | -entitiesWindows1252 = { |
20 | | - 128: 8364, # 0x80 0x20AC EURO SIGN |
21 | | - 129: 65533, # 0x81 UNDEFINED |
22 | | - 130: 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK |
23 | | - 131: 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK |
24 | | - 132: 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK |
25 | | - 133: 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS |
26 | | - 134: 8224, # 0x86 0x2020 DAGGER |
27 | | - 135: 8225, # 0x87 0x2021 DOUBLE DAGGER |
28 | | - 136: 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT |
29 | | - 137: 8240, # 0x89 0x2030 PER MILLE SIGN |
30 | | - 138: 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON |
31 | | - 139: 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK |
32 | | - 140: 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE |
33 | | - 141: 65533, # 0x8D UNDEFINED |
34 | | - 142: 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON |
35 | | - 143: 65533, # 0x8F UNDEFINED |
36 | | - 144: 65533, # 0x90 UNDEFINED |
37 | | - 145: 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK |
38 | | - 146: 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK |
39 | | - 147: 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK |
40 | | - 148: 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK |
41 | | - 149: 8226, # 0x95 0x2022 BULLET |
42 | | - 150: 8211, # 0x96 0x2013 EN DASH |
43 | | - 151: 8212, # 0x97 0x2014 EM DASH |
44 | | - 152: 732, # 0x98 0x02DC SMALL TILDE |
45 | | - 153: 8482, # 0x99 0x2122 TRADE MARK SIGN |
46 | | - 154: 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON |
47 | | - 155: 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK |
48 | | - 156: 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE |
49 | | - 157: 65533, # 0x9D UNDEFINED |
50 | | - 158: 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON |
51 | | - 159: 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS |
52 | | -} |
53 | | - |
54 | | -entities = { |
55 | | - "AElig": u"\u00C6", |
56 | | - "Aacute": u"\u00C1", |
57 | | - "Acirc": u"\u00C2", |
58 | | - "Agrave": u"\u00C0", |
59 | | - "Alpha": u"\u0391", |
60 | | - "Aring": u"\u00C5", |
61 | | - "Atilde": u"\u00C3", |
62 | | - "Auml": u"\u00C4", |
63 | | - "Beta": u"\u0392", |
64 | | - "Ccedil": u"\u00C7", |
65 | | - "Chi": u"\u03A7", |
66 | | - "Dagger": u"\u2021", |
67 | | - "Delta": u"\u0394", |
68 | | - "ETH": u"\u00D0", |
69 | | - "Eacute": u"\u00C9", |
70 | | - "Ecirc": u"\u00CA", |
71 | | - "Egrave": u"\u00C8", |
72 | | - "Epsilon": u"\u0395", |
73 | | - "Eta": u"\u0397", |
74 | | - "Euml": u"\u00CB", |
75 | | - "Gamma": u"\u0393", |
76 | | - "Iacute": u"\u00CD", |
77 | | - "Icirc": u"\u00CE", |
78 | | - "Igrave": u"\u00CC", |
79 | | - "Iota": u"\u0399", |
80 | | - "Iuml": u"\u00CF", |
81 | | - "Kappa": u"\u039A", |
82 | | - "Lambda": u"\u039B", |
83 | | - "Mu": u"\u039C", |
84 | | - "Ntilde": u"\u00D1", |
85 | | - "Nu": u"\u039D", |
86 | | - "OElig": u"\u0152", |
87 | | - "Oacute": u"\u00D3", |
88 | | - "Ocirc": u"\u00D4", |
89 | | - "Ograve": u"\u00D2", |
90 | | - "Omega": u"\u03A9", |
91 | | - "Omicron": u"\u039F", |
92 | | - "Oslash": u"\u00D8", |
93 | | - "Otilde": u"\u00D5", |
94 | | - "Ouml": u"\u00D6", |
95 | | - "Phi": u"\u03A6", |
96 | | - "Pi": u"\u03A0", |
97 | | - "Prime": u"\u2033", |
98 | | - "Psi": u"\u03A8", |
99 | | - "Rho": u"\u03A1", |
100 | | - "Scaron": u"\u0160", |
101 | | - "Sigma": u"\u03A3", |
102 | | - "THORN": u"\u00DE", |
103 | | - "Tau": u"\u03A4", |
104 | | - "Theta": u"\u0398", |
105 | | - "Uacute": u"\u00DA", |
106 | | - "Ucirc": u"\u00DB", |
107 | | - "Ugrave": u"\u00D9", |
108 | | - "Upsilon": u"\u03A5", |
109 | | - "Uuml": u"\u00DC", |
110 | | - "Xi": u"\u039E", |
111 | | - "Yacute": u"\u00DD", |
112 | | - "Yuml": u"\u0178", |
113 | | - "Zeta": u"\u0396", |
114 | | - "aacute": u"\u00E1", |
115 | | - "acirc": u"\u00E2", |
116 | | - "acute": u"\u00B4", |
117 | | - "aelig": u"\u00E6", |
118 | | - "agrave": u"\u00E0", |
119 | | - "alefsym": u"\u2135", |
120 | | - "alpha": u"\u03B1", |
121 | | - "amp": u"\u0026", |
122 | | - "AMP": u"\u0026", |
123 | | - "and": u"\u2227", |
124 | | - "ang": u"\u2220", |
125 | | - "apos": u"\u0027", |
126 | | - "aring": u"\u00E5", |
127 | | - "asymp": u"\u2248", |
128 | | - "atilde": u"\u00E3", |
129 | | - "auml": u"\u00E4", |
130 | | - "bdquo": u"\u201E", |
131 | | - "beta": u"\u03B2", |
132 | | - "brvbar": u"\u00A6", |
133 | | - "bull": u"\u2022", |
134 | | - "cap": u"\u2229", |
135 | | - "ccedil": u"\u00E7", |
136 | | - "cedil": u"\u00B8", |
137 | | - "cent": u"\u00A2", |
138 | | - "chi": u"\u03C7", |
139 | | - "circ": u"\u02C6", |
140 | | - "clubs": u"\u2663", |
141 | | - "cong": u"\u2245", |
142 | | - "copy": u"\u00A9", |
143 | | - "COPY": u"\u00A9", |
144 | | - "crarr": u"\u21B5", |
145 | | - "cup": u"\u222A", |
146 | | - "curren": u"\u00A4", |
147 | | - "dArr": u"\u21D3", |
148 | | - "dagger": u"\u2020", |
149 | | - "darr": u"\u2193", |
150 | | - "deg": u"\u00B0", |
151 | | - "delta": u"\u03B4", |
152 | | - "diams": u"\u2666", |
153 | | - "divide": u"\u00F7", |
154 | | - "eacute": u"\u00E9", |
155 | | - "ecirc": u"\u00EA", |
156 | | - "egrave": u"\u00E8", |
157 | | - "empty": u"\u2205", |
158 | | - "emsp": u"\u2003", |
159 | | - "ensp": u"\u2002", |
160 | | - "epsilon": u"\u03B5", |
161 | | - "equiv": u"\u2261", |
162 | | - "eta": u"\u03B7", |
163 | | - "eth": u"\u00F0", |
164 | | - "euml": u"\u00EB", |
165 | | - "euro": u"\u20AC", |
166 | | - "exist": u"\u2203", |
167 | | - "fnof": u"\u0192", |
168 | | - "forall": u"\u2200", |
169 | | - "frac12": u"\u00BD", |
170 | | - "frac14": u"\u00BC", |
171 | | - "frac34": u"\u00BE", |
172 | | - "frasl": u"\u2044", |
173 | | - "gamma": u"\u03B3", |
174 | | - "ge": u"\u2265", |
175 | | - "gt": u"\u003E", |
176 | | - "GT": u"\u003E", |
177 | | - "hArr": u"\u21D4", |
178 | | - "harr": u"\u2194", |
179 | | - "hearts": u"\u2665", |
180 | | - "hellip": u"\u2026", |
181 | | - "iacute": u"\u00ED", |
182 | | - "icirc": u"\u00EE", |
183 | | - "iexcl": u"\u00A1", |
184 | | - "igrave": u"\u00EC", |
185 | | - "image": u"\u2111", |
186 | | - "infin": u"\u221E", |
187 | | - "int": u"\u222B", |
188 | | - "iota": u"\u03B9", |
189 | | - "iquest": u"\u00BF", |
190 | | - "isin": u"\u2208", |
191 | | - "iuml": u"\u00EF", |
192 | | - "kappa": u"\u03BA", |
193 | | - "lArr": u"\u21D0", |
194 | | - "lambda": u"\u03BB", |
195 | | - "lang": u"\u2329", |
196 | | - "laquo": u"\u00AB", |
197 | | - "larr": u"\u2190", |
198 | | - "lceil": u"\u2308", |
199 | | - "ldquo": u"\u201C", |
200 | | - "le": u"\u2264", |
201 | | - "lfloor": u"\u230A", |
202 | | - "lowast": u"\u2217", |
203 | | - "loz": u"\u25CA", |
204 | | - "lrm": u"\u200E", |
205 | | - "lsaquo": u"\u2039", |
206 | | - "lsquo": u"\u2018", |
207 | | - "lt": u"\u003C", |
208 | | - "LT": u"\u003C", |
209 | | - "macr": u"\u00AF", |
210 | | - "mdash": u"\u2014", |
211 | | - "micro": u"\u00B5", |
212 | | - "middot": u"\u00B7", |
213 | | - "minus": u"\u2212", |
214 | | - "mu": u"\u03BC", |
215 | | - "nabla": u"\u2207", |
216 | | - "nbsp": u"\u00A0", |
217 | | - "ndash": u"\u2013", |
218 | | - "ne": u"\u2260", |
219 | | - "ni": u"\u220B", |
220 | | - "not": u"\u00AC", |
221 | | - "notin": u"\u2209", |
222 | | - "nsub": u"\u2284", |
223 | | - "ntilde": u"\u00F1", |
224 | | - "nu": u"\u03BD", |
225 | | - "oacute": u"\u00F3", |
226 | | - "ocirc": u"\u00F4", |
227 | | - "oelig": u"\u0153", |
228 | | - "ograve": u"\u00F2", |
229 | | - "oline": u"\u203E", |
230 | | - "omega": u"\u03C9", |
231 | | - "omicron": u"\u03BF", |
232 | | - "oplus": u"\u2295", |
233 | | - "or": u"\u2228", |
234 | | - "ordf": u"\u00AA", |
235 | | - "ordm": u"\u00BA", |
236 | | - "oslash": u"\u00F8", |
237 | | - "otilde": u"\u00F5", |
238 | | - "otimes": u"\u2297", |
239 | | - "ouml": u"\u00F6", |
240 | | - "para": u"\u00B6", |
241 | | - "part": u"\u2202", |
242 | | - "permil": u"\u2030", |
243 | | - "perp": u"\u22A5", |
244 | | - "phi": u"\u03C6", |
245 | | - "pi": u"\u03C0", |
246 | | - "piv": u"\u03D6", |
247 | | - "plusmn": u"\u00B1", |
248 | | - "pound": u"\u00A3", |
249 | | - "prime": u"\u2032", |
250 | | - "prod": u"\u220F", |
251 | | - "prop": u"\u221D", |
252 | | - "psi": u"\u03C8", |
253 | | - "quot": u"\u0022", |
254 | | - "QUOT": u"\u0022", |
255 | | - "rArr": u"\u21D2", |
256 | | - "radic": u"\u221A", |
257 | | - "rang": u"\u232A", |
258 | | - "raquo": u"\u00BB", |
259 | | - "rarr": u"\u2192", |
260 | | - "rceil": u"\u2309", |
261 | | - "rdquo": u"\u201D", |
262 | | - "real": u"\u211C", |
263 | | - "reg": u"\u00AE", |
264 | | - "REG": u"\u00AE", |
265 | | - "rfloor": u"\u230B", |
266 | | - "rho": u"\u03C1", |
267 | | - "rlm": u"\u200F", |
268 | | - "rsaquo": u"\u203A", |
269 | | - "rsquo": u"\u2019", |
270 | | - "sbquo": u"\u201A", |
271 | | - "scaron": u"\u0161", |
272 | | - "sdot": u"\u22C5", |
273 | | - "sect": u"\u00A7", |
274 | | - "shy": u"\u00AD", |
275 | | - "sigma": u"\u03C3", |
276 | | - "sigmaf": u"\u03C2", |
277 | | - "sim": u"\u223C", |
278 | | - "spades": u"\u2660", |
279 | | - "sub": u"\u2282", |
280 | | - "sube": u"\u2286", |
281 | | - "sum": u"\u2211", |
282 | | - "sup": u"\u2283", |
283 | | - "sup1": u"\u00B9", |
284 | | - "sup2": u"\u00B2", |
285 | | - "sup3": u"\u00B3", |
286 | | - "supe": u"\u2287", |
287 | | - "szlig": u"\u00DF", |
288 | | - "tau": u"\u03C4", |
289 | | - "there4": u"\u2234", |
290 | | - "theta": u"\u03B8", |
291 | | - "thetasym": u"\u03D1", |
292 | | - "thinsp": u"\u2009", |
293 | | - "thorn": u"\u00FE", |
294 | | - "tilde": u"\u02DC", |
295 | | - "times": u"\u00D7", |
296 | | - "trade": u"\u2122", |
297 | | - "uArr": u"\u21D1", |
298 | | - "uacute": u"\u00FA", |
299 | | - "uarr": u"\u2191", |
300 | | - "ucirc": u"\u00FB", |
301 | | - "ugrave": u"\u00F9", |
302 | | - "uml": u"\u00A8", |
303 | | - "upsih": u"\u03D2", |
304 | | - "upsilon": u"\u03C5", |
305 | | - "uuml": u"\u00FC", |
306 | | - "weierp": u"\u2118", |
307 | | - "xi": u"\u03BE", |
308 | | - "yacute": u"\u00FD", |
309 | | - "yen": u"\u00A5", |
310 | | - "yuml": u"\u00FF", |
311 | | - "zeta": u"\u03B6", |
312 | | - "zwj": u"\u200D", |
313 | | - "zwnj": u"\u200C" |
314 | | -} |
| 3 | +from constants import contentModelFlags, spaceCharacters |
| 4 | +from constants import entitiesWindows1252, entities, voidElements |
315 | 5 |
|
316 | 6 | # Data representing the end of the input stream |
317 | 7 | EOF = None |
@@ -412,22 +102,6 @@ def __init__(self, parser): |
412 | 102 | "bogusDoctype":self.bogusDoctypeState |
413 | 103 | } |
414 | 104 |
|
415 | | - self.voidElements = ( |
416 | | - # XXX This list doesn't include <event-source> and <command> yet. |
417 | | - # AT Make this a "global" variable? |
418 | | - "base", |
419 | | - "link", |
420 | | - "meta", |
421 | | - "hr", |
422 | | - "br", |
423 | | - "img", |
424 | | - "embed", |
425 | | - "param", |
426 | | - "area", |
427 | | - "col", |
428 | | - "input" |
429 | | - ) |
430 | | - |
431 | 105 | # Setup the initial tokenizer state |
432 | 106 | self.contentModelFlag = contentModelFlags['PCDATA'] |
433 | 107 | self.state = self.states['data'] |
@@ -478,7 +152,7 @@ def processSolidusInTag(self): |
478 | 152 | # throwing an atheist parse error. |
479 | 153 | data = self.consumeChar() |
480 | 154 |
|
481 | | - if self.currentToken.name in self.voidElements and data == u">": |
| 155 | + if self.currentToken.name in voidElements and data == u">": |
482 | 156 | self.parser.atheistParseError() |
483 | 157 | else: |
484 | 158 | self.parser.parseError() |
|
0 commit comments