Skip to content

Commit b916ab5

Browse files
committed
make entity cleanup a bit faster (hopefully!)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%4090
1 parent dd7dfad commit b916ab5

2 files changed

Lines changed: 40 additions & 39 deletions

File tree

constants.py

Lines changed: 34 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -133,40 +133,40 @@
133133
"input"
134134
))
135135

136-
entitiesWindows1252 = {
137-
128: 8364, # 0x80 0x20AC EURO SIGN
138-
129: 65533, # 0x81 UNDEFINED
139-
130: 8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
140-
131: 402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
141-
132: 8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
142-
133: 8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
143-
134: 8224, # 0x86 0x2020 DAGGER
144-
135: 8225, # 0x87 0x2021 DOUBLE DAGGER
145-
136: 710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
146-
137: 8240, # 0x89 0x2030 PER MILLE SIGN
147-
138: 352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
148-
139: 8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
149-
140: 338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
150-
141: 65533, # 0x8D UNDEFINED
151-
142: 381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
152-
143: 65533, # 0x8F UNDEFINED
153-
144: 65533, # 0x90 UNDEFINED
154-
145: 8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
155-
146: 8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
156-
147: 8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
157-
148: 8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
158-
149: 8226, # 0x95 0x2022 BULLET
159-
150: 8211, # 0x96 0x2013 EN DASH
160-
151: 8212, # 0x97 0x2014 EM DASH
161-
152: 732, # 0x98 0x02DC SMALL TILDE
162-
153: 8482, # 0x99 0x2122 TRADE MARK SIGN
163-
154: 353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
164-
155: 8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
165-
156: 339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
166-
157: 65533, # 0x9D UNDEFINED
167-
158: 382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
168-
159: 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
169-
}
136+
# Replacement code points for numeric character references in the
# 0x80-0x9F range (the "windows trick"). The list is positional: index 0
# holds the entry for 0x80 (128) and index 31 the entry for 0x9F (159), so
# the entry for code point c (128 <= c <= 159) lives at index c - 128.
# Positions whose trailing comment says UNDEFINED hold 65533 (0xFFFD).
entitiesWindows1252 = [
137+
8364, # 0x80 0x20AC EURO SIGN
138+
65533, # 0x81 UNDEFINED
139+
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
140+
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
141+
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
142+
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
143+
8224, # 0x86 0x2020 DAGGER
144+
8225, # 0x87 0x2021 DOUBLE DAGGER
145+
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
146+
8240, # 0x89 0x2030 PER MILLE SIGN
147+
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
148+
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
149+
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
150+
65533, # 0x8D UNDEFINED
151+
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
152+
65533, # 0x8F UNDEFINED
153+
65533, # 0x90 UNDEFINED
154+
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
155+
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
156+
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
157+
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
158+
8226, # 0x95 0x2022 BULLET
159+
8211, # 0x96 0x2013 EN DASH
160+
8212, # 0x97 0x2014 EM DASH
161+
732, # 0x98 0x02DC SMALL TILDE
162+
8482, # 0x99 0x2122 TRADE MARK SIGN
163+
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
164+
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
165+
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
166+
65533, # 0x9D UNDEFINED
167+
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
168+
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
169+
]
170170

171171
entities = {
172172
"AElig": u"\u00C6",

tokenizer.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -189,12 +189,13 @@ def consumeNumberEntity(self, isHex):
189189
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
190190
# smaller) we need to do the "windows trick".
191191
if 127 < charAsInt < 160:
192-
charAsInt = entitiesWindows1252[charAsInt]
193-
192+
# entitiesWindows1252 is a list whose index 0 is the entry for 0x80
# (128), so subtract the base from the code point. The reversed form
# `128 - charAsInt` yields 0, -1, ..., -31, which wraps around from the
# end of the list and returns the wrong entry for everything but 128.
charAsInt = entitiesWindows1252[charAsInt - 128]
193+
194+
# 0 is not a good number.
195+
if charAsInt == 0:
196+
charAsInt = 65533
197+
194198
try:
195-
# XXX This is wrong. This doesn't take "windows-1252 entities" into
196-
# account.
197-
198199
# XXX We should have a separate function that does "int" to
199200
# "unicodestring" conversion since this doesn't always work
200201
# according to hsivonen. Also, unichr has a limitation of 65535

0 commit comments

Comments
 (0)