From 6fe0ab3784fa68686c1d7242fa25450da3d886e0 Mon Sep 17 00:00:00 2001 From: Vincent Gao Date: Tue, 16 Jun 2026 00:50:02 +0200 Subject: [PATCH] Fix Base45 dropping trailing bytes on non-ASCII input The Base45 encoder and decoder iterated with range(0, len(text), step) while indexing into t = b(text). codext converts a bytes input to str (UTF-8) before the codec runs, so for any non-ASCII content b(text) is longer than text and len(text) stops the loop early, silently dropping the trailing byte(s). For example encode(b'\xcf\xb1\x1b') returned 'OBQ' instead of 'OBQR0' and the value no longer round-tripped. Iterate over len(t) (the actual byte sequence) instead. Output now matches RFC 9285 and the reference base45 implementation, and encoding round-trips for arbitrary byte input. --- src/codext/base/base45.py | 7 +++++-- tests/test_base.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/codext/base/base45.py b/src/codext/base/base45.py index 272c3e9..1590460 100644 --- a/src/codext/base/base45.py +++ b/src/codext/base/base45.py @@ -34,7 +34,10 @@ def base45_encode(mode): b45 = _get_charset(B45, mode) def encode(text, errors="strict"): t, s = b(text), "" - for i in range(0, len(text), 2): + # iterate over the byte sequence (t), not len(text): when the input + # holds non-ASCII characters, b(text) is longer than text and using + # len(text) silently drops the trailing bytes + for i in range(0, len(t), 2): n = 256 * __ord(t[i]) try: n += __ord(t[i+1]) @@ -54,7 +57,7 @@ def base45_decode(mode): def decode(text, errors="strict"): t, s = b(text), "" ehandler = handle_error("base45", errors, decode=True) - for i in range(0, len(text), 3): + for i in range(0, len(t), 3): try: n = b45[__chr(t[i])] except KeyError: diff --git a/tests/test_base.py b/tests/test_base.py index a37d1a6..193b173 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -211,6 +211,19 @@ def test_codec_base100(self): self.assertRaises(ValueError, codecs.decode, 
b(B100)[1:], "base100") self.assertIsNotNone(codecs.decode(b(B100) + b"\n", "base100", "ignore")) + def test_codec_base45(self): + # RFC 9285 test vectors + for s, b45 in [("AB", "BB8"), ("Hello!!", "%69 VD92EX0"), ("base-45", "UJCLQE7W581")]: + self.assertEqual(codecs.encode(s, "base45"), b45) + self.assertEqual(codecs.encode(b(s), "base45"), b(b45)) + self.assertEqual(codecs.decode(b45, "base45"), s) + self.assertEqual(codecs.decode(b(b45), "base45"), b(s)) + # trailing bytes must not be dropped when the input contains non-ASCII characters (byte length, not str length, drives the loop) + self.assertEqual(codecs.encode(b"\xcf\xb1\x1b", "base45"), b"OBQR0") + self.assertEqual(codecs.decode(b"OBQR0", "base45"), b"\xcf\xb1\x1b") + for data in [b"\xff\xfe", b"hello", b"\x00", b"\x80\x81\x82\x83\x84"]: + self.assertEqual(codecs.decode(codecs.encode(data, "base45"), "base45"), data) + def test_codec_base_generic(self): for n in range(2, 255): bn = "base{}_generic".format(n)