Skip to content

Commit 0ac30f8

Browse files
committed
Enhance the punycode decoder so that it can decode
unicode objects. Fix the idna codec and the tests.
1 parent 1f05a3b commit 0ac30f8

3 files changed

Lines changed: 136 additions & 130 deletions

File tree

Lib/encodings/idna.py

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
88

99
# IDNA section 5
10-
ace_prefix = "xn--"
10+
ace_prefix = b"xn--"
11+
sace_prefix = "xn--"
1112

1213
# This assumes query strings, so AllowUnassigned is true
1314
def nameprep(label):
@@ -87,7 +88,7 @@ def ToASCII(label):
8788
raise UnicodeError("label empty or too long")
8889

8990
# Step 5: Check ACE prefix
90-
if label.startswith(ace_prefix):
91+
if label.startswith(sace_prefix):
9192
raise UnicodeError("Label starts with ACE prefix")
9293

9394
# Step 6: Encode with PUNYCODE
@@ -134,7 +135,7 @@ def ToUnicode(label):
134135

135136
# Step 7: Compare the result of step 6 with the one of step 3
136137
# label2 will already be in lower case.
137-
if label.lower() != label2:
138+
if str(label, "ascii").lower() != str(label2, "ascii"):
138139
raise UnicodeError("IDNA does not round-trip", label, label2)
139140

140141
# Step 8: return the result of step 5
@@ -143,7 +144,7 @@ def ToUnicode(label):
143144
### Codec APIs
144145

145146
class Codec(codecs.Codec):
146-
def encode(self,input,errors='strict'):
147+
def encode(self, input, errors='strict'):
147148

148149
if errors != 'strict':
149150
# IDNA is quite clear that implementations must be strict
@@ -152,19 +153,21 @@ def encode(self,input,errors='strict'):
152153
if not input:
153154
return b"", 0
154155

155-
result = []
156+
result = b""
156157
labels = dots.split(input)
157-
if labels and len(labels[-1])==0:
158+
if labels and not labels[-1]:
158159
trailing_dot = b'.'
159160
del labels[-1]
160161
else:
161162
trailing_dot = b''
162163
for label in labels:
163-
result.append(ToASCII(label))
164-
# Join with U+002E
165-
return b".".join(result)+trailing_dot, len(input)
164+
if result:
165+
# Join with U+002E
166+
result.extend(b'.')
167+
result.extend(ToASCII(label))
168+
return result+trailing_dot, len(input)
166169

167-
def decode(self,input,errors='strict'):
170+
def decode(self, input, errors='strict'):
168171

169172
if errors != 'strict':
170173
raise UnicodeError("Unsupported error handling "+errors)
@@ -199,30 +202,31 @@ def _buffer_encode(self, input, errors, final):
199202
raise UnicodeError("unsupported error handling "+errors)
200203

201204
if not input:
202-
return ("", 0)
205+
return (b'', 0)
203206

204207
labels = dots.split(input)
205-
trailing_dot = ''
208+
trailing_dot = b''
206209
if labels:
207210
if not labels[-1]:
208-
trailing_dot = '.'
211+
trailing_dot = b'.'
209212
del labels[-1]
210213
elif not final:
211214
# Keep potentially unfinished label until the next call
212215
del labels[-1]
213216
if labels:
214-
trailing_dot = '.'
217+
trailing_dot = b'.'
215218

216-
result = []
219+
result = b""
217220
size = 0
218221
for label in labels:
219-
result.append(ToASCII(label))
220222
if size:
223+
# Join with U+002E
224+
result.extend(b'.')
221225
size += 1
226+
result.extend(ToASCII(label))
222227
size += len(label)
223228

224-
# Join with U+002E
225-
result = ".".join(result) + trailing_dot
229+
result += trailing_dot
226230
size += len(trailing_dot)
227231
return (result, size)
228232

@@ -239,8 +243,7 @@ def _buffer_decode(self, input, errors, final):
239243
labels = dots.split(input)
240244
else:
241245
# Must be ASCII string
242-
input = str(input)
243-
str(input, "ascii")
246+
input = str(input, "ascii")
244247
labels = input.split(".")
245248

246249
trailing_dot = ''

Lib/encodings/punycode.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,8 @@ def insertion_sort(base, extended, errors):
181181
return base
182182

183183
def punycode_decode(text, errors):
184+
if isinstance(text, str):
185+
text = text.encode("ascii")
184186
pos = text.rfind(b"-")
185187
if pos == -1:
186188
base = ""
@@ -194,11 +196,11 @@ def punycode_decode(text, errors):
194196

195197
class Codec(codecs.Codec):
196198

197-
def encode(self,input,errors='strict'):
199+
def encode(self, input, errors='strict'):
198200
res = punycode_encode(input)
199201
return res, len(input)
200202

201-
def decode(self,input,errors='strict'):
203+
def decode(self, input, errors='strict'):
202204
if errors not in ('strict', 'replace', 'ignore'):
203205
raise UnicodeError, "Unsupported error handling "+errors
204206
res = punycode_decode(input, errors)

0 commit comments

Comments
 (0)