Enhance the punycode decoder so that it can decode unicode objects.

doerwalter · doerwalter · commit 0ac30f82fe1b · 2007-05-11T10:32:57.000Z

Fix the idna codec and the tests.
diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
@@ -7,7 +7,8 @@
 dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
 
 # IDNA section 5
-ace_prefix = "xn--"
+ace_prefix = b"xn--"
+sace_prefix = "xn--"
 
 # This assumes query strings, so AllowUnassigned is true
 def nameprep(label):
@@ -87,7 +88,7 @@ def ToASCII(label):
         raise UnicodeError("label empty or too long")
 
     # Step 5: Check ACE prefix
-    if label.startswith(ace_prefix):
+    if label.startswith(sace_prefix):
         raise UnicodeError("Label starts with ACE prefix")
 
     # Step 6: Encode with PUNYCODE
@@ -134,7 +135,7 @@ def ToUnicode(label):
 
     # Step 7: Compare the result of step 6 with the one of step 3
     # label2 will already be in lower case.
-    if label.lower() != label2:
+    if str(label, "ascii").lower() != str(label2, "ascii"):
         raise UnicodeError("IDNA does not round-trip", label, label2)
 
     # Step 8: return the result of step 5
@@ -143,7 +144,7 @@ def ToUnicode(label):
 ### Codec APIs
 
 class Codec(codecs.Codec):
-    def encode(self,input,errors='strict'):
+    def encode(self, input, errors='strict'):
 
         if errors != 'strict':
             # IDNA is quite clear that implementations must be strict
@@ -152,19 +153,21 @@ def encode(self,input,errors='strict'):
         if not input:
             return b"", 0
 
-        result = []
+        result = b""
         labels = dots.split(input)
-        if labels and len(labels[-1])==0:
+        if labels and not labels[-1]:
             trailing_dot = b'.'
             del labels[-1]
         else:
             trailing_dot = b''
         for label in labels:
-            result.append(ToASCII(label))
-        # Join with U+002E
-        return b".".join(result)+trailing_dot, len(input)
+            if result:
+                # Join with U+002E
+                result.extend(b'.')
+            result.extend(ToASCII(label))
+        return result+trailing_dot, len(input)
 
-    def decode(self,input,errors='strict'):
+    def decode(self, input, errors='strict'):
 
         if errors != 'strict':
             raise UnicodeError("Unsupported error handling "+errors)
@@ -199,30 +202,31 @@ def _buffer_encode(self, input, errors, final):
             raise UnicodeError("unsupported error handling "+errors)
 
         if not input:
-            return ("", 0)
+            return (b'', 0)
 
         labels = dots.split(input)
-        trailing_dot = ''
+        trailing_dot = b''
         if labels:
             if not labels[-1]:
-                trailing_dot = '.'
+                trailing_dot = b'.'
                 del labels[-1]
             elif not final:
                 # Keep potentially unfinished label until the next call
                 del labels[-1]
                 if labels:
-                    trailing_dot = '.'
+                    trailing_dot = b'.'
 
-        result = []
+        result = b""
         size = 0
         for label in labels:
-            result.append(ToASCII(label))
             if size:
+                # Join with U+002E
+                result.extend(b'.')
                 size += 1
+            result.extend(ToASCII(label))
             size += len(label)
 
-        # Join with U+002E
-        result = ".".join(result) + trailing_dot
+        result += trailing_dot
         size += len(trailing_dot)
         return (result, size)
 
@@ -239,8 +243,7 @@ def _buffer_decode(self, input, errors, final):
             labels = dots.split(input)
         else:
             # Must be ASCII string
-            input = str(input)
-            str(input, "ascii")
+            input = str(input, "ascii")
             labels = input.split(".")
 
         trailing_dot = ''
diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py
@@ -181,6 +181,8 @@ def insertion_sort(base, extended, errors):
     return base
 
 def punycode_decode(text, errors):
+    if isinstance(text, str):
+        text = text.encode("ascii")
     pos = text.rfind(b"-")
     if pos == -1:
         base = ""
@@ -194,11 +196,11 @@ def punycode_decode(text, errors):
 
 class Codec(codecs.Codec):
 
-    def encode(self,input,errors='strict'):
+    def encode(self, input, errors='strict'):
         res = punycode_encode(input)
         return res, len(input)
 
-    def decode(self,input,errors='strict'):
+    def decode(self, input, errors='strict'):
         if errors not in ('strict', 'replace', 'ignore'):
             raise UnicodeError, "Unsupported error handling "+errors
         res = punycode_decode(input, errors)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py