Skip to content

Commit 4cdb566

Browse files
committed
Added console scripts for multiple base encodings
1 parent 3ff5147 commit 4cdb566

11 files changed

Lines changed: 232 additions & 56 deletions

File tree

codext/__common__.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
__all__ = ["add", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess",
3232
"generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "is_native", "list_categories",
3333
"list_encodings", "lookup", "maketrans", "rank", "re", "register", "remove", "reset", "s2i", "search",
34-
"stopfunc", "BytesIO", "MASKS", "PY3"]
34+
"stopfunc", "BytesIO", "MASKS", "PY3", "_input"]
3535
CODECS_REGISTRY = None
3636
CODECS_CATEGORIES = ["native", "custom"]
3737
MASKS = {
@@ -58,6 +58,29 @@
5858
s2i = lambda s: int(codecs.encode(s, "base16"), 16)
5959

6060

61+
def __stdin_pipe():
    """ Generator yielding standard input line by line. """
    try:
        # file descriptor 0 is opened in binary mode and iterated line by line
        with open(0, 'rb') as stream:
            for chunk in stream:
                yield chunk
    except TypeError:
        # NOTE(review): presumably raised when open() does not accept an integer file descriptor (e.g. Python 2) ;
        #  in this case, fall back on iterating the sys.stdin object
        for chunk in sys.stdin:
            yield chunk
70+
71+
72+
def _input(infile):
    """ Read the whole input, either from a file (opened in binary mode) or from standard input.

    :param infile: path of the file to be read ; if empty, None or "-", standard input is read instead
    :return:       complete content of the selected input
    """
    # "-" designates stdin, as stated in the help messages of the console scripts
    #  ("With no FILE, or when FILE is -, read standard input")
    if infile and infile != "-":
        with open(infile, 'rb') as f:
            return f.read()
    # join all the lines at once instead of concatenating them one by one
    return b("").join(__stdin_pipe())
82+
83+
6184
def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False, **kwargs):
6285
""" This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically
6386
naming the encoding with a pattern and with file handling.
@@ -612,15 +635,18 @@ def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=Fal
612635
glob = {'__name__': "__main__"}
613636
exec("class %s(ValueError): pass" % exc, glob)
614637

615-
def _handle_error(token, position):
638+
def _handle_error(token, position, output=""):
616639
""" This handles an encoding/decoding error according to the selected handling mode.
617640
618641
:param token: input token to be encoded/decoded
619642
:param position: token position index
643+
:param output: output, as decoded up to the position of the error
620644
"""
621645
if errors == "strict":
622646
msg = "'{}' codec can't {}code character '{}' in {} {}"
623-
raise glob[exc](msg.format(ename, ["en", "de"][decode], token, item, position))
647+
err = glob[exc](msg.format(ename, ["en", "de"][decode], token, item, position))
648+
err.output = output
649+
raise err
624650
elif errors == "leave":
625651
return token + sep
626652
elif errors == "replace":

codext/__init__.py

Lines changed: 18 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
lookup = codecs.lookup
1919
open = codecs.open
2020

21+
_lst = list
2122
list = list_encodings # not included in __all__ because of shadow name
2223

2324

@@ -32,18 +33,6 @@ def __literal_eval(o):
3233
return literal_eval("'" + str(o) + "'")
3334

3435

35-
def __stdin_pipe():
36-
""" Stdin pipe read function. """
37-
try:
38-
with open(0, 'rb') as f:
39-
for l in f:
40-
yield l
41-
except TypeError:
42-
import sys
43-
for l in sys.stdin:
44-
yield l
45-
46-
4736
def main():
4837
import argparse, os
4938
descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \
@@ -79,10 +68,10 @@ def main():
7968
help="error handling (default: strict)")
8069
guess = sparsers.add_parser("guess", help="try guessing the decoding codecs")
8170
guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)")
82-
guess.add_argument("-c", "--codec-categories", help="codec categories to be included in the search ; "
83-
"format: string|tuple|list(strings|tuples)")
84-
guess.add_argument("-e", "--exclude-codecs", help="codecs to be explicitely not used ; "
85-
"format: string|tuple|list(strings|tuples)")
71+
guess.add_argument("-c", "--codec-categories", nargs="*", help="codec categories to be included in the search ; "
72+
"format: string|tuple")
73+
guess.add_argument("-e", "--exclude-codecs", nargs="*", help="codecs to be explicitely not used ; "
74+
"format: string|tuple")
8675
guess.add_argument("-f", "--stop-function", default="text", help="result checking function (default: text) ; "
8776
"format: printables|text|flag|lang_[bigram]|[regex]")
8877
guess.add_argument("--max-depth", default=5, type=int, help="maximum codec search depth (default: 5)")
@@ -107,6 +96,15 @@ def main():
10796
search = sparsers.add_parser("search", help="search for codecs")
10897
search.add_argument("pattern", nargs="+", help="encoding pattern to search")
10998
args = parser.parse_args()
99+
try:
100+
args.codec_categories = _lst(map(__literal_eval, args.codec_categories))
101+
except (AttributeError, TypeError):
102+
pass
103+
try:
104+
args.exclude_codecs = _lst(map(__literal_eval, args.exclude_codecs))
105+
except (AttributeError, TypeError):
106+
pass
107+
#print(args.codec_categories, args.exclude_codecs)
110108
# if a search pattern is given, only handle it
111109
if args.command == "search":
112110
results = []
@@ -115,14 +113,7 @@ def main():
115113
print(", ".join(results) or "No encoding found")
116114
return
117115
# handle input file or stdin
118-
if args.infile:
119-
with open(args.infile, 'rb') as f:
120-
c = f.read()
121-
else:
122-
c = b("")
123-
for line in __stdin_pipe():
124-
c += line
125-
# strip only the very last (CR)LF
116+
c =_input(args.infile)
126117
c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n")
127118
# strip any other (CR)LF
128119
if args.strip:
@@ -142,8 +133,8 @@ def main():
142133
getattr(stopfunc, args.stop_function, args.stop_function),
143134
args.min_depth,
144135
args.max_depth,
145-
__literal_eval(args.codec_categories),
146-
__literal_eval(args.exclude_codecs),
136+
args.codec_categories,
137+
args.exclude_codecs,
147138
args.encoding,
148139
not args.do_not_stop,
149140
True, # show
@@ -162,8 +153,7 @@ def main():
162153
if len(r) == 0:
163154
print("Could not decode :-(")
164155
elif args.command == "rank":
165-
for i, e in codecs.rank(c, args.extended, args.limit,
166-
__literal_eval(args.codec_categories), __literal_eval(args.exclude_codecs)):
156+
for i, e in codecs.rank(c, args.extended, args.limit, args.codec_categories, args.exclude_codecs):
167157
s = "[+] %.5f: %s" % (i[0], e)
168158
print(s if len(s) <= 80 else s[:77] + "...")
169159

codext/base/__init__.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,58 @@
11
# -*- coding: UTF-8 -*-
2+
from argparse import ArgumentParser, RawTextHelpFormatter
3+
from types import MethodType
4+
25
from .ascii85 import *
36
from .base45 import *
47
from .base85 import *
58
from .base91 import *
69
from .base100 import *
710
from .base122 import *
811
from .baseN import *
12+
from ..__common__ import *
13+
from ..__info__ import __version__
14+
15+
16+
def main():
    """ Entry point of the "debase" console script ; it guess-decodes multi-layer base-encoded input.

    The input is read from the FILE argument or, when it is omitted or set to "-", from standard input.

    :return: 0 if a result was found, 1 otherwise
    """
    descr = """Usage: debase [OPTION]... [FILE]
Base decode multi-layer FILE, or standard input, to standard output.

With no FILE, or when FILE is -, read standard input.

Optional arguments:
-f, --stop-function set the result checking function (default: text)
format: printables|text|flag|lang_[bigram]|[regex]
-i, --ignore-generic ignore generic base codecs while guess-decoding
-M, --max-depth maximum codec search depth (default: 5)
-m, --min-depth minimum codec search depth (default: 0)
-s, --do-not-stop do not stop if a valid output is found

--help display this help and exit
--verbose show guessing information and steps
--version output version information and exit

Report debase bugs to <https://github.com/dhondta/python-codext/issues/new>
Full documentation at: <https://python-codext.readthedocs.io/en/latest/enc/base.html>
"""
    parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False)
    # the description already holds the complete help message, so it is displayed as is
    parser.format_help = MethodType(lambda s: s.description, parser)
    parser.add_argument("file", nargs="?")
    parser.add_argument("-f", "--stop-function", default="text")
    parser.add_argument("-i", "--ignore-generic", action="store_true")
    parser.add_argument("-M", "--max-depth", default=5, type=int)
    parser.add_argument("-m", "--min-depth", default=0, type=int)
    parser.add_argument("-s", "--do-not-stop", action="store_true")
    parser.add_argument("--help", action="help")
    parser.add_argument("--version", action="version")
    parser.add_argument("--verbose", action="store_true")
    parser.version = "CodExt " + __version__
    args = parser.parse_args()
    # only build the list of generic base codecs when they have to be excluded
    excl = ["base%d-generic" % i for i in range(2, 255)] if args.ignore_generic else []
    # the stop function is either an attribute of stopfunc or, failing that, the given string itself
    sfunc = getattr(stopfunc, args.stop_function, args.stop_function)
    # "-" means standard input, as documented in the help message
    c = _input(None if args.file == "-" else args.file)
    # strip the trailing (CR)LF
    c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n")
    r = codecs.guess(c, sfunc, args.min_depth, args.max_depth, exclude=excl, codec_categories="base",
                     stop=not args.do_not_stop, show=True, scoring_heuristic=False)
    # no result at all ; report it instead of failing with an IndexError on the first item
    if not r:
        print("Could not decode :-(")
        return 1
    if not args.do_not_stop:
        print(ensure_str(next(iter(r.items()))[1]))
    return 0
958

codext/base/_base.py

Lines changed: 60 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,15 @@
22
"""Generic baseN functions.
33
44
"""
5+
from argparse import ArgumentParser, RawTextHelpFormatter
56
from math import log
67
from six import integer_types, string_types
78
from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable
8-
from types import FunctionType
9+
from textwrap import wrap
10+
from types import FunctionType, MethodType
911

1012
from ..__common__ import *
13+
from ..__info__ import __version__
1114

1215

1316
class BaseError(ValueError):
@@ -86,9 +89,7 @@ def base_encode(input, charset, errors="strict", exc=BaseEncodeError):
8689
:param errors: errors handling marker
8790
:param exc: exception to be raised in case of error
8891
"""
89-
i = input if isinstance(input, integer_types) else s2i(input)
90-
n = len(charset)
91-
r = ""
92+
i, n, r = input if isinstance(input, integer_types) else s2i(input), len(charset), ""
9293
while i > 0:
9394
i, c = divmod(i, n)
9495
r = charset[c] + r
@@ -103,13 +104,13 @@ def base_decode(input, charset, errors="strict", exc=BaseDecodeError):
103104
:param errors: errors handling marker
104105
:param exc: exception to be raised in case of error
105106
"""
106-
i, n = 0, len(charset)
107+
i, n, dec = 0, len(charset), lambda n: base_encode(n, [chr(x) for x in range(256)], errors, exc)
107108
for k, c in enumerate(input):
108109
try:
109110
i = i * n + charset.index(c)
110111
except ValueError:
111-
handle_error("base", errors, exc, decode=True)(c, k)
112-
return base_encode(i, [chr(j) for j in range(256)], errors, exc)
112+
handle_error("base", errors, exc, decode=True)(c, k, dec(i))
113+
return dec(i)
113114

114115

115116
# base codec factory functions
@@ -162,3 +163,55 @@ def _decode(input, errors="strict"):
162163
guess=["base%d-generic" % i for i in range(2, 255)], entropy=lambda e, n: log(int(n.split("-")[0][4:]), 2),
163164
len_charset=lambda n: int(n.split("-")[0][4:]), printables_rate=1., category="base-generic", penalty=.4)
164165

166+
167+
def main(n, ref=None, alt=None):
    """ Factory for the main function of a "base[N]" console script.

    :param n:   base number, used for composing the name of the tool and of the encoding (e.g. 100 for "base100")
    :param ref: reference describing the alphabet ; if given, it is mentioned in the help message
    :param alt: alternative name, appended to the base number with a dash (leading dashes are stripped)
    :return:    function parsing the command line arguments, encoding or decoding the input and returning 0 on
                 success or 1 on error
    """
    base = str(n) + ("-" + alt.lstrip("-") if alt else "")
    src = ""
    if ref:
        # long references are put on their own line
        src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \
              {'base': base, 'reference': "\n" + ref if len(ref) > 10 else ref}
    descr = """Usage: base%(base)s [OPTION]... [FILE]
Base%(base)s encode or decode FILE, or standard input, to standard output.

With no FILE, or when FILE is -, read standard input.

Mandatory arguments to long options are mandatory for short options too.
-d, --decode decode data
-i, --ignore-garbage when decoding, ignore non-alphabet characters
-I, --invert invert charsets from the base alphabet (e.g. lower- and uppercase)
-w, --wrap=COLS wrap encoded lines after COLS character (default 76).
Use 0 to disable line wrapping

--help display this help and exit
--version output version information and exit

%(source)sWhen decoding, the input may contain newlines in addition to the bytes of
the formal base%(base)s alphabet. Use --ignore-garbage to attempt to recover
from any other non-alphabet bytes in the encoded stream.

Report base%(base)s translation bugs to <https://github.com/dhondta/python-codext/issues/new>
Full documentation at: <https://python-codext.readthedocs.io/en/latest/enc/base.html>
""" % {'base': base, 'source': src}

    def _main():
        parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False)
        # the description already holds the complete help message, so it is displayed as is
        parser.format_help = MethodType(lambda s: s.description, parser)
        parser.add_argument("file", nargs="?")
        parser.add_argument("-d", "--decode", action="store_true")
        parser.add_argument("-i", "--ignore-garbage", action="store_true")
        parser.add_argument("-I", "--invert", action="store_true")
        parser.add_argument("-w", "--wrap", type=int, default=76)
        parser.add_argument("--help", action="help")
        parser.add_argument("--version", action="version")
        parser.version = "CodExt " + __version__
        args = parser.parse_args()
        # "-" means standard input, as documented in the help message
        c = _input(None if args.file == "-" else args.file)
        f = decode if args.decode else encode
        # strip the trailing (CR)LF
        c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n")
        try:
            c = f(c, "base" + base + ["", "-inv"][args.invert], ["strict", "ignore"][args.ignore_garbage])
        except Exception as err:
            # not every exception carries the partial result in an "output" attribute ; do not fail on its absence
            print("%sbase%d: invalid input" % (getattr(err, "output", ""), n))
            return 1
        text = ensure_str(c)
        if args.decode or args.wrap <= 0:
            # wrapping only applies to encoded lines and 0 disables it ; decoded data is output untouched
            if text:
                print(text)
        else:
            # cut at fixed columns ; textwrap.wrap would break on and collapse whitespace characters
            for i in range(0, len(text), args.wrap):
                print(text[i:i + args.wrap])
        return 0
    return _main
217+

codext/base/base100.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
- decodes file content to str (read)
1010
- encodes file content from str to bytes (write)
1111
"""
12+
from ._base import main
1213
from ..__common__ import *
1314

1415

@@ -19,18 +20,18 @@
1920
class Base100DecodeError(ValueError):
2021
pass
2122

22-
23-
def base100_encode(input, errors='strict'):
23+
def base100_encode(input, errors="strict"):
2424
input = b(input)
2525
r = [240, 159, 0, 0] * len(input)
2626
for i, c in enumerate(input):
2727
r[4*i+2] = (c + 55) // 64 + 143
2828
r[4*i+3] = (c + 55) % 64 + 128
2929
return bytes(r), len(input)
3030

31-
32-
def base100_decode(input, errors='strict'):
31+
def base100_decode(input, errors="strict"):
3332
input = b(input)
33+
if errors == "ignore":
34+
input = input.replace(b"\n", "")
3435
if len(input) % 4 != 0:
3536
raise Base100DecodeError("Bad input (length should be multiple of 4)")
3637
r = [None] * (len(input) // 4)
@@ -40,7 +41,14 @@ def base100_decode(input, errors='strict'):
4041
elif i % 4 == 3:
4142
r[i//4] = (c - 128 + tmp - 55) & 0xff
4243
return bytes(r), len(input)
44+
else:
45+
def base100_encode(input, errors='strict'):
46+
raise NotImplementedError
47+
48+
def base100_decode(input, errors='strict'):
49+
raise NotImplementedError
4350

4451

45-
add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$")
52+
add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$")
53+
main = main(100, "<https://github.com/AdamNiederer/base100>")
4654

codext/base/base122.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
- decodes file content to str (read)
88
- encodes file content from str to bytes (write)
99
"""
10+
from ._base import main
1011
from ..__common__ import *
1112

1213

@@ -67,7 +68,6 @@ def _get_7bits(idx, bit):
6768
r.extend([B1, B2])
6869
return "".join(map(chr, r)).encode("latin-1"), len(input)
6970

70-
7171
# inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js
7272
def base122_decode(input, errors="strict"):
7373
currB, bob, r, input = 0, 0, [], list(map(ord, input))
@@ -91,7 +91,14 @@ def _get_7bits(currB, bob, B, decoded):
9191
else:
9292
currB, bob = _get_7bits(currB, bob, input[i], r)
9393
return "".join(map(chr, r)), len(input)
94+
else:
95+
def base122_encode(input, errors='strict'):
96+
raise NotImplementedError
97+
98+
def base122_decode(input, errors='strict'):
99+
raise NotImplementedError
94100

95101

96-
add("base122", base122_encode, base122_decode, r"^base[-_]?122$")
102+
add("base122", base122_encode, base122_decode, r"^base[-_]?122$")
103+
main = main(122, "<http://blog.kevinalbs.com/base122>")
97104

0 commit comments

Comments
 (0)