Skip to content

Commit 3d9541f

Browse files
committed
Refactored base encodings
1 parent 9119479 commit 3d9541f

12 files changed

Lines changed: 744 additions & 133 deletions

File tree

codext/VERSION.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.0.3
1+
1.1.0

codext/_base.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# -*- coding: UTF-8 -*-
2+
"""Generic baseN functions.
3+
4+
"""
5+
from math import log
6+
from six import integer_types, string_types
7+
from string import printable
8+
from types import FunctionType
9+
10+
from .__common__ import *
11+
12+
13+
# generic base en/decoding functions
14+
class BaseError(ValueError):
    """Root exception of the generic base-N codecs."""
16+
17+
18+
class BaseDecodeError(BaseError):
    """Exception type dedicated to base-N decoding failures."""
20+
21+
22+
class BaseEncodeError(BaseError):
    """Exception type dedicated to base-N encoding failures."""
24+
25+
26+
def _generate_charset(n):
    """
    Generate a characters set.

    Sizes up to the length of ``string.printable`` yield a prefix of it ;
    larger sizes (strictly below 256) yield the characters with code points
    0 to n-1.

    :param n: size of charset
    :return:  string of n characters
    :raises ValueError: if n is lower than 2 or greater than 255
    """
    # derive the threshold from the constant itself instead of hard-coding it
    n_printable = len(printable)
    if 1 < n <= n_printable:
        return printable[:n]
    if n_printable < n < 256:
        return "".join(chr(i) for i in range(n))
    raise ValueError("Bad size of character set")
37+
38+
39+
def _get_charset(charset, p=""):
    """
    Characters set selection function. It allows to define charsets in many
     different ways.

    :param charset: charset object, can be a string (the charset itself), a
                     function (that chooses the right charset depending on the
                     input parameter) or a dictionary (either by exact key or by
                     pattern matching)
    :param p:       the parameter for choosing the charset
    :return:        the selected charset
    :raises ValueError: if no charset can be selected from the descriptor
    """
    # case 1: charset is a function, so return its result
    if isinstance(charset, FunctionType):
        return charset(p)
    # case 2: charset is a string, so return it
    if isinstance(charset, string_types):
        return charset
    # anything else than a dictionary is not a valid descriptor
    if not isinstance(charset, dict):
        raise ValueError("Bad charset descriptor")
    # case 3: charset is a dict with keys '' and 'inv', typically for a charset
    #          using lowercase and uppercase characters that can be inverted ;
    #          keys are compared as a set so that the order in which the dict
    #          was filled does not matter
    if set(charset) == {"", "inv"}:
        return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""]
    # case 4: charset is a dict, but not with the specific keys '' and 'inv', so
    #          consider it as pattern-charset pairs
    # try to handle [p]arameter as a simple key
    try:
        return charset[p]
    except KeyError:
        pass
    # or handle [p]arameter as a pattern
    default, n = None, None
    for pattern, cset in charset.items():
        n = len(cset)
        if pattern == "":
            default = cset
            continue
        if re.match(pattern, p):
            return cset
    # special case: the given [p]arameter can be the charset itself if
    #                it has the right length (n is the length of the last
    #                charset seen while iterating)
    p = re.sub(r"^[-_]+", "", p)
    if len(p) == n:
        return p
    # or simply rely on key ''
    if default is not None:
        return default
    raise ValueError("Bad charset descriptor")
86+
87+
88+
def base_encode(input, charset, errors="strict", exc=BaseEncodeError):
    """
    Base-10 to base-N encoding.

    :param input:   input (str or int) to be encoded ; a non-integer input is
                     first converted to an integer with s2i
    :param charset: base-N characters set
    :param errors:  errors handling marker (not used while encoding)
    :param exc:     exception to be raised in case of error
    :return:        encoded string ; empty when the integer value is not
                     strictly positive
    """
    i = input if isinstance(input, integer_types) else s2i(input)
    n = len(charset)
    # with less than 2 symbols, the division loop below would either never end
    #  (n == 1) or divide by zero (n == 0)
    if i > 0 and n < 2:
        raise exc("Bad charset ; at least 2 characters are required")
    # collect the digits from the least significant one, then reverse once
    digits = []
    while i > 0:
        i, c = divmod(i, n)
        digits.append(charset[c])
    return "".join(reversed(digits))
104+
105+
106+
def base_decode(input, charset, errors="strict", exc=BaseDecodeError):
    """
    Base-N to base-10 decoding.

    :param input:   input to be decoded
    :param charset: base-N characters set
    :param errors:  errors handling marker ; "strict" raises on a character
                     that is not in the charset, "ignore" and "replace" both
                     skip it
    :param exc:     exception to be raised in case of error
    :return:        decoded string, i.e. the accumulated integer re-encoded
                     with the 256 characters of code points 0 to 255
    :raises ValueError: if the errors handling marker is not supported
    """
    i, n = 0, len(charset)
    for k, c in enumerate(input):
        try:
            # Horner's scheme: shift the accumulated value by one base-N digit
            i = i * n + charset.index(c)
        except ValueError:
            if errors == "strict":
                raise exc("'base' codec can't decode character '{}' in position"
                          " {}".format(c, k))
            elif errors in ["ignore", "replace"]:
                continue
            else:
                raise ValueError("Unsupported error handling {}".format(errors))
    return base_encode(i, [chr(j) for j in range(256)], errors, exc)
128+
129+
130+
def base(charset, pattern=None, pow2=False,
         encode_template=base_encode, decode_template=base_decode):
    """
    Base-N codec factory. It registers a codec named "base[N]" through add.

    :param charset:         charset selection function ; can also be an integer
                             (size of a charset to be generated) or any
                             descriptor handled by _get_charset
    :param pattern:         matching pattern for the codec name (first capturing
                             group is used as the parameter for selecting the
                             charset) ; defaults to "base[N]"
    :param pow2:            whether the base codec's N is a power of 2
    :param encode_template: function(input, charset, errors) used for encoding
    :param decode_template: function(input, charset, errors) used for decoding
    :raises BaseError: if the charset is empty or, when pow2 is set, if its
                        size is not a power of 2
    """
    is_n = isinstance(charset, int)
    n = len(_generate_charset(charset) if is_n else _get_charset(charset))
    if n < 1:
        raise BaseError("Bad charset ; empty character set")
    # exact integer test ; comparing log(n, 2) with its truncation relies on
    #  floating-point equality, which is not reliable
    if pow2 and (n & (n - 1)) != 0:
        raise BaseError("Bad charset ; {} is not a power of 2".format(n))

    def encode(param=""):
        a = _generate_charset(n) if is_n else _get_charset(charset, param)

        def _encode(input, errors="strict"):
            return encode_template(input, a, errors), len(input)
        return _encode

    def decode(param=""):
        a = _generate_charset(n) if is_n else _get_charset(charset, param)

        def _decode(input, errors="strict"):
            return decode_template(input, a, errors), len(input)
        return _decode

    if pattern is None:
        pattern = "base{}".format(n)
    add("base{}".format(n), encode, decode, pattern)

codext/_base2n.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# -*- coding: UTF-8 -*-
2+
"""BaseN functions with N a power of 2.
3+
4+
"""
5+
from math import ceil, log
6+
7+
from .__common__ import *
8+
from ._base import base, _get_charset, BaseError
9+
10+
11+
# base en/decoding functions for N a power of 2
12+
class Base2NError(BaseError):
    """Root exception of the base-N codecs for N a power of 2."""
14+
15+
16+
class Base2NDecodeError(Base2NError):
    """Decoding failure of a base-N codec for N a power of 2.

    Derives from Base2NError (itself a BaseError) so that catching either of
    them also catches this exception.
    """
18+
19+
20+
class Base2NEncodeError(Base2NError):
    """Encoding failure of a base-N codec for N a power of 2.

    Derives from Base2NError (itself a BaseError) so that catching either of
    them also catches this exception.
    """
22+
23+
24+
def base2n(charset, pattern=None):
    """
    Codec factory for base-N with N a power of 2 ; it delegates to the generic
     factory with the power-of-2 flag set and the dedicated templates.

    :param charset: charset selection function
    :param pattern: matching pattern for the codec name (first capturing group
                     is used as the parameter for selecting the charset)
    """
    base(charset, pattern, pow2=True, encode_template=base2n_encode,
         decode_template=base2n_decode)
33+
34+
35+
def base2n_encode(string, charset, errors="strict", exc=Base2NEncodeError):
    """
    8-bits characters to base-N encoding for N a power of 2.

    :param string:  string to be encoded (items can be characters or integers)
    :param charset: base-N characters set
    :param errors:  errors handling marker (not used while encoding)
    :param exc:     exception to be raised in case of error
    :return:        encoded string, padded with "=" up to a whole quantum
    """
    n = len(charset)
    # with less than 2 symbols, there is not a single bit per output character
    if n < 2:
        raise exc("Bad charset ; at least 2 characters are required")
    # find the number of bits for the given character set (exact integer
    #  computation, no floating-point logarithm) and the quantum, that is, the
    #  smallest multiple of this number that is a whole number of bytes
    nb_out = n.bit_length() - 1
    q = nb_out
    while q % 8 != 0:
        q += nb_out
    # gather the bits of all the characters
    # NOTE(review): a character with a code point above 255 yields more than 8
    #  bits and shifts everything that follows ; confirm whether this should
    #  be reported through exc
    bits = "".join("{:08b}".format(c if isinstance(c, int) else ord(c))
                   for c in string)
    # map each group of bits to the charset, the last (possibly incomplete)
    #  group being right-padded with zeros
    out = [charset[int(bits[k:k + nb_out].ljust(nb_out, "0"), 2)]
           for k in range(0, len(bits), nb_out)]
    # pad with "=" up to a whole number of quanta
    per_quantum = q // nb_out
    return "".join(out) + "=" * ((-len(out)) % per_quantum)
66+
67+
68+
def base2n_decode(string, charset, errors="strict", exc=Base2NDecodeError):
    """
    Base-N to 8-bits characters decoding for N a power of 2.

    :param string:  string to be decoded
    :param charset: base-N characters set
    :param errors:  errors handling marker ; on a character that is not in the
                     charset, "strict" raises, "replace" counts it as zero bits
                     and "ignore" skips it
    :param exc:     exception to be raised in case of error
    :return:        decoded string ; empty for an empty input
    :raises ValueError: if the errors handling marker is not supported
    """
    n = len(charset)
    # with less than 2 symbols, an input character carries no bit at all
    if n < 2:
        raise exc("Bad charset ; at least 2 characters are required")
    # nothing to decode ; this is not a padding error
    if len(string) == 0:
        return ""
    # find the number of bits for the given character set (exact integer
    #  computation, no floating-point logarithm) and the number of padding
    #  characters
    nb_in = n.bit_length() - 1
    n_pad = len(string) - len(string.rstrip("="))
    # iterate over the characters, mapping them to the character set and
    #  converting the resulting bits to 8-bits characters
    bs, chars = "", []
    for i, c in enumerate(string):
        if c == "=":
            bs += "0" * nb_in
        else:
            try:
                bs += bin(charset.index(c))[2:].zfill(nb_in)
            except ValueError:
                if errors == "strict":
                    raise exc("'base' codec can't decode character '{}' in "
                              "position {}".format(c, i))
                elif errors == "replace":
                    bs += "0" * nb_in
                elif errors == "ignore":
                    continue
                else:
                    raise ValueError("Unsupported error handling {}"
                                     .format(errors))
        # strictly more than 8 bits: the last byte is kept in the buffer so
        #  that the padding check below can be applied to it
        while len(bs) > 8:
            chars.append(chr(int(bs[:8], 2)))
            bs = bs[8:]
    # if the number of bits is not multiple of 8 bits, it could mean a bad
    #  padding
    if len(bs) != 8:
        if errors == "strict":
            # use the configurable exception, like for bad characters
            raise exc("Incorrect padding")
        elif errors not in ("replace", "ignore"):
            raise ValueError("Unsupported error handling {}".format(errors))
    # the buffer can be empty when every character was ignored
    if bs:
        chars.append(chr(int(bs, 2)))
    # drop the characters that only come from the padding
    np = int(ceil(n_pad * nb_in / 8.0))
    r = "".join(chars)
    return r[:-np] if np > 0 else r

0 commit comments

Comments
 (0)