|
| 1 | +# -*- coding: UTF-8 -*- |
| 2 | +"""Generic baseN functions. |
| 3 | +
|
| 4 | +""" |
| 5 | +from math import log |
| 6 | +from six import integer_types, string_types |
| 7 | +from string import printable |
| 8 | +from types import FunctionType |
| 9 | + |
| 10 | +from .__common__ import * |
| 11 | + |
| 12 | + |
| 13 | +# generic base en/decoding functions |
class BaseError(ValueError):
    """Common parent of the errors raised by the generic base-N helpers."""
| 16 | + |
| 17 | + |
class BaseDecodeError(BaseError):
    """BaseError variant for the decoding direction (usable as `exc`)."""
| 20 | + |
| 21 | + |
class BaseEncodeError(BaseError):
    """BaseError variant for the encoding direction; default `exc` of
    `base_encode`."""
| 24 | + |
| 25 | + |
def _generate_charset(n):
    """
    Build a default alphabet made of exactly `n` characters.

    Sizes 2..100 are served by a prefix of `string.printable`; sizes
    101..255 by the characters whose code points are 0..n-1.

    :param n: size of charset
    :return: the generated charset, as a string of length `n`
    :raises ValueError: if `n` is not within 2..255
    """
    # reject out-of-range sizes first, so that only the two valid ranges remain
    if not 1 < n < 256:
        raise ValueError("Bad size of character set")
    if n > 100:
        # string.printable only holds 100 characters ; fall back to raw
        # code points for larger alphabets
        return "".join(map(chr, range(n)))
    return printable[:n]
| 37 | + |
| 38 | + |
def _get_charset(charset, p=""):
    """
    Characters set selection function. It allows to define charsets in many
    different ways.

    :param charset: charset object, can be a string (the charset itself), a
                    function (that chooses the right charset depending on the
                    input parameter) or a dictionary (either by exact key or by
                    pattern matching)
    :param p:       the parameter for choosing the charset
    :return:        the selected charset
    :raises ValueError: if `charset` is of an unsupported type or if no entry
                        of the dictionary could be selected
    """
    # case 1: charset is a function, so return its result
    if isinstance(charset, FunctionType):
        return charset(p)
    # case 2: charset is a string, so return it
    if isinstance(charset, string_types):
        return charset
    if not isinstance(charset, dict):
        raise ValueError("Bad charset descriptor")
    # case 3: charset is a dict with exactly the keys '' and 'inv', typically
    #  for a charset using lowercase and uppercase characters that can be
    #  inverted ; keys are compared as a set so that the detection does not
    #  depend on the order in which the dictionary was built
    if set(charset) == {"", "inv"}:
        return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""]
    # case 4: charset is a dict, but not with the specific keys '' and 'inv', so
    #  consider it as pattern-charset pairs
    # try to handle [p]arameter as a simple key
    try:
        return charset[p]
    except KeyError:
        pass
    # or handle [p]arameter as a pattern
    default, n = None, None
    for pattern, cset in charset.items():
        # NOTE: after the loop, n holds the length of the last charset that
        #  was iterated ; presumably all charsets of a dictionary share the
        #  same length - to be confirmed against callers
        n = len(cset)
        if pattern == "":
            default = cset
            continue
        if re.match(pattern, p):
            return cset
    # special case: the given [p]arameter can be the charset itself if
    #  it has the right length
    p = re.sub(r"^[-_]+", "", p)
    if len(p) == n:
        return p
    # or simply rely on key ''
    if default is not None:
        return default
    raise ValueError("Bad charset descriptor")
| 86 | + |
| 87 | + |
def base_encode(input, charset, errors="strict", exc=BaseEncodeError):
    """
    Base-10 to base-N encoding.

    The input is taken as a non-negative integer (strings are first converted
    with `s2i`) and rewritten with the symbols of `charset`, most significant
    digit first. Zero and negative values yield an empty string.

    :param input:   input (str or int) to be encoded
    :param charset: base-N characters set
    :param errors:  errors handling marker (not used by this template)
    :param exc:     exception to be raised in case of error
    :return:        the encoded string
    :raises exc:    if a positive value has to be encoded with a charset of
                    less than 2 characters
    """
    i = input if isinstance(input, integer_types) else s2i(input)
    n = len(charset)
    # a radix below 2 cannot represent a positive number: with n == 1 the
    #  division loop would never terminate and with n == 0 it would fail with
    #  a bare ZeroDivisionError
    if i > 0 and n < 2:
        raise exc("'base' codec can't encode with a charset of {} character(s)"
                  .format(n))
    # collect digits least significant first, then reverse once ; this avoids
    #  repeatedly prepending to a growing string
    digits = []
    while i > 0:
        i, c = divmod(i, n)
        digits.append(charset[c])
    return "".join(reversed(digits))
| 104 | + |
| 105 | + |
def base_decode(input, charset, errors="strict", exc=BaseDecodeError):
    """
    Base-N to base-10 decoding.

    The characters of the input are folded into a single integer using their
    index in `charset` ; this integer is then rewritten through `base_encode`
    with the 256 characters chr(0)..chr(255) as alphabet.

    :param input:   input to be decoded
    :param charset: base-N characters set
    :param errors:  errors handling marker ; "strict" raises on a character
                    that is not in the charset, "ignore" and "replace" both
                    skip it
    :param exc:     exception to be raised in case of error (defaults to the
                    decoding error, as this is the decoding template)
    :return:        the decoded string
    :raises exc:        in "strict" mode, on a character not in the charset
    :raises ValueError: on an unsupported `errors` value (only checked when a
                        character not in the charset is met)
    """
    i, n = 0, len(charset)
    for k, c in enumerate(input):
        try:
            i = i * n + charset.index(c)
        except ValueError:
            if errors == "strict":
                raise exc("'base' codec can't decode character '{}' in position"
                          " {}".format(c, k))
            elif errors in ["ignore", "replace"]:
                continue
            else:
                raise ValueError("Unsupported error handling {}".format(errors))
    return base_encode(i, [chr(j) for j in range(256)], errors, exc)
| 128 | + |
| 129 | + |
def base(charset, pattern=None, pow2=False,
         encode_template=base_encode, decode_template=base_decode):
    """
    Base-N codec factory. It builds the encode and decode function factories
    for the given charset and registers them through `add` under the name
    "base<N>", N being the size of the charset.

    :param charset:         charset selection object (see `_get_charset`) or
                            an integer, in which case a charset of this size
                            is generated with `_generate_charset`
    :param pattern:         matching pattern for the codec name (first
                            capturing group is used as the parameter for
                            selecting the charset) ; defaults to "base<N>"
    :param pow2:            whether the base codec's N is a power of 2
    :param encode_template: function called as (input, charset, errors) for
                            encoding
    :param decode_template: function called as (input, charset, errors) for
                            decoding
    :raises BaseError: if the charset is empty or if `pow2` is set while N is
                       not a power of 2
    """
    is_n = isinstance(charset, integer_types)
    n = len(_generate_charset(charset) if is_n else _get_charset(charset))
    if n < 1:
        # explicit error instead of a "math domain error" on an empty charset
        raise BaseError("Bad charset ; it is empty")
    # exact integer test: a power of 2 has a single bit set, hence n & (n - 1)
    #  is zero ; this avoids comparing a floating-point logarithm for equality
    if pow2 and n & (n - 1):
        raise BaseError("Bad charset ; {} is not a power of 2".format(n))

    def encode(param=""):
        # the charset is resolved once per codec parameter, not per call
        a = _generate_charset(n) if is_n else _get_charset(charset, param)

        def _encode(input, errors="strict"):
            return encode_template(input, a, errors), len(input)
        return _encode

    def decode(param=""):
        a = _generate_charset(n) if is_n else _get_charset(charset, param)

        def _decode(input, errors="strict"):
            return decode_template(input, a, errors), len(input)
        return _decode

    if pattern is None:
        pattern = "base{}".format(n)
    add("base{}".format(n), encode, decode, pattern)
0 commit comments