From 281ca1bacbbc0891f7e2987bbbd161f507823bd3 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 7 Sep 2022 20:36:19 +0200 Subject: [PATCH 01/62] Added codec: tokenize --- codext/common/dummy.py | 12 +++++++++++- docs/manipulations.md | 10 +++++++++- tests/test_manual.py | 2 ++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/codext/common/dummy.py b/codext/common/dummy.py index 7f4be19..b45c023 100755 --- a/codext/common/dummy.py +++ b/codext/common/dummy.py @@ -22,7 +22,7 @@ def code(input, errors="strict"): # important note: ^ # using "{2}" here instead will break the codec # this is due to the fact the codext.__common__.generate_string_from_regex DOES NOT handle ASSERT_NOT (?!) and will -# faill to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo +# fail to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo def substitute(token, replacement): @@ -45,3 +45,13 @@ def code(input, errors="strict"): strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i)) add("strip-spaces", strip_spaces, strip_spaces, guess=None) +def tokenize(n): + tlen = int(n[8:].lstrip("-_")) + def code(input, errors="strict"): + l = len(input) + if tlen > l: + raise LookupError("unknown encoding: %s" % n) + return " ".join(input[i:i+tlen] for i in range(0, l, tlen)), l + return code +add("tokenize", tokenize, tokenize, r"^(tokenize[-_]?[1-9][0-9]*)$", guess=None) + diff --git a/docs/manipulations.md b/docs/manipulations.md index 7962278..8857ca7 100644 --- a/docs/manipulations.md +++ b/docs/manipulations.md @@ -43,11 +43,12 @@ These transformation functions are simple string transformations. 
**Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`replace` | text <-> text with single-char replaced | | +`replace` | text <-> text with multi-chars replaced | | parametrized with a _string_ and its _replacement_ `reverse` | text <-> reversed text | | `reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace) `strip-spaces` | text <-> all whitespaces stripped | | `substitute` | text <-> text with token substituted | | +`tokenize` | text <-> text split in tokens of length N | | parametrized with _N_ As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)). @@ -58,6 +59,13 @@ $ echo -en "test string" | codext encode reverse-words | codext encode reverse r string_test ``` +Another example: + +```sh +$ echo -en "3132333435" | codext encode tokenize-2 +31 32 33 34 35 +``` + Or using encodings chaining: ```sh diff --git a/tests/test_manual.py b/tests/test_manual.py index 4211df7..64b1843 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -100,6 +100,8 @@ def test_codec_dummy_str_manips(self): self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) + self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is i s a te st") + self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") def test_codec_hash_functions(self): STR = b"This is a test string!" 
From 4792a99b3a3780765b80c68f0bbcb46da27a2f7b Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 11 Sep 2022 19:13:41 +0200 Subject: [PATCH 02/62] Fixed minor bugs --- codext/__common__.py | 26 +++++++++++++++----------- tests/test_generated.py | 12 +++++++++--- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 41cb5b2..9d9400c 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -109,10 +109,11 @@ def __new__(cls, name): for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items(): if re.match(r"enc(-dec)?\(", action): for e in (examples.keys() if action.startswith("enc(") else examples or []): - rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) if rd: - for n in (rd.group(1) or "512").split(","): - self.encode("".join(chr(randint(0, 255)) for i in range(int(n)))) + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + self.encode(s.lower() if rd.group(1) else s) continue self.encode(e) @@ -1276,10 +1277,9 @@ def __make_encodings_dict(include, exclude): def _develop(d, keep=True): d = d or {} for k, v in d.items(): - l, cc = [], [e for e in v if e in CODECS_CATEGORIES] + l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES] # list from in-scope categories and then everything that is not a category - for enc in ((list_encodings(*cc) if len(cc) > 0 or keep else []) + \ - [e for e in v if e not in CODECS_CATEGORIES]): + for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc): g = [] for e in (search(enc, False) or [enc]): try: @@ -1293,8 +1293,8 @@ def _develop(d, keep=True): l.extend(g) d[k] = list(set(l)) return d - exclude = _develop(exclude, False) - return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in 
_develop(include).items()} + _excl, _incl = _develop(exclude, False), _develop(include) + return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()} def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): @@ -1304,7 +1304,10 @@ def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extende try: codec = CODECS_CACHE[e] except KeyError: - CODECS_CACHE[e] = codec = lookup(e, False) + try: + CODECS_CACHE[e] = codec = lookup(e, False) + except LookupError: + continue t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) if t: ranking[e] = t @@ -1321,7 +1324,7 @@ def __init__(self, text, pad_char=None): pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) self.padding = pad_char is not None and last_char == pad_char if self.padding: - text = text.rstrip(pad_char) + text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char) self.len = len(self.text) self.lcharset = len(set(self.text)) self.printables = float(len([c for c in self.text if c in printable])) / self.len @@ -1501,7 +1504,8 @@ def rank(input, extended=False, limit=-1, include=None, exclude=None): :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) """ - encodings = __make_encodings_dict({-1: include or CODECS_CATEGORIES}, {-1: exclude or []}) + encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES}, + exclude if isinstance(exclude, dict) else {-1: exclude or []}) r = list(__rank(None, input, "", encodings[-1], True, extended, True)) return r[:limit] if len(r) > 1 else r codecs.rank = rank diff --git a/tests/test_generated.py b/tests/test_generated.py index 6b89129..614562f 100644 --- a/tests/test_generated.py +++ 
b/tests/test_generated.py @@ -36,6 +36,11 @@ def _template(self): for ename in m.groups(): if ename is None: continue + # buggy generated encoding names + try: + lookup(ename) + except LookupError: + continue # erroneous encoding name test if examples is None: self.assertRaises(LookupError, f1, "test", ename) @@ -72,11 +77,12 @@ def _template(self): # examples validation tests if k.startswith("enc-dec") and isinstance(examples, list): for e in examples[:]: - rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) if rd: examples.remove(e) - for n in (rd.group(1) or "512").split(","): - examples.append("".join(chr(randint(0, 255)) for i in range(int(n)))) + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + examples.append(s.lower() if rd.group(1) else s) for s in [""] + examples: self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) From b4e1eb66fb8764df992cc6434f0e69a6eedbd9b5 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 11 Sep 2022 19:13:57 +0200 Subject: [PATCH 03/62] Added codec: kbshift --- codext/others/__init__.py | 1 + codext/others/kbshift.py | 66 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100755 codext/others/kbshift.py diff --git a/codext/others/__init__.py b/codext/others/__init__.py index aa7ffa2..3bbf102 100755 --- a/codext/others/__init__.py +++ b/codext/others/__init__.py @@ -1,5 +1,6 @@ # -*- coding: UTF-8 -*- from .dna import * +from .kbshift import * from .letters import * from .markdown import * from .uuencode import * diff --git a/codext/others/kbshift.py b/codext/others/kbshift.py new file mode 100755 index 0000000..2bd0991 --- /dev/null +++ b/codext/others/kbshift.py @@ -0,0 +1,66 @@ +# -*- coding: UTF-8 -*- +"""Keyboard-Shift Codec - keyboard line shifting content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +LAYOUTS = { + 'ansi': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;'\nzxcvbnm<>\n,./", + 'azerty': "azertyuiop\nqsdfghjklm\nwxcvbn", + 'azerty-be': "³1234567890°_\n²&é\"'(§è!çà)-\n|@#^{}\nazertyuiop$\n€[]\n¨*\nqsdfghjklm%£\nùµ\n´`\n>wxcvbn?./+\n<,;:=\n\\~", + 'azerty-fr': "1234567890°+\n²&é\"'(-è_çà)=\n~#{[|`\\^@]}\nazertyuiop¨£\nqsdfghjklm%µ\nù*\n>wxcvbn?./§\n<,;:!", + 'dvorak': "~!@#$%^&*(){}\n`1234567890[]\n\"<>pyfgcrl?+|\n',./=\\\naoeuidhtns_\n-\n:qjkxbmwvz\n;", + 'qwerty': "qwertyuiop\nasdfghjkl\nzxcvbnm", + 'qwerty-us': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;,\nzxcvbnm<>?\n./", +} +__per_len = {} +for k, s in LAYOUTS.items(): + i = max(map(len, s.split("\n"))) + __per_len.setdefault(i, []) + __per_len[i].append(k) + + +__examples__ = {"enc-dec(kbshift_%s_%d)" % (kb, n): ["@irandom{256,512}"] for n in range(10) for kb in LAYOUTS.keys()} +__guess__ = [] +for mlen, kbs in __per_len.items(): + for k in kbs: + __guess__.extend(["kbshift-%s-%d" % (k, i+1) for i in range(mlen)]) + + +def _kbshift(text, keyboard="azerty", n=1, decode=False): + r = "" + for c in text: + nc = None + for l in LAYOUTS[keyboard].splitlines(): + if c.lower() in l: + nc = l[(l.index(c.lower()) + [-1, 1][decode] * n) % len(l)] + break + r += c if nc is None else nc + return r + + +def kbshift_encode(scheme): + kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() + def encode(text, errors="strict"): + r = _kbshift(ensure_str(text), kb, int(shift)) + return r, len(r) + return encode + + +def kbshift_decode(scheme): + kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() + def decode(text, errors="strict"): + r = _kbshift(ensure_str(text), kb, int(shift), True) + return r, len(r) + return 
decode + + +add("kbshift", kbshift_encode, kbshift_decode, entropy=lambda e: e,printables_rate=lambda pr: pr, transitive=True, + pattern=r"^kbshift(?:|[-_]((?:az|qw)erty[-_]?[1-9]|(?:ansi|azerty-(?:be|fr)|dvorak|qwerty-us)[-_]?(?:[1-9]|1[0-2])))$") + From cd234d5d97867f1470b45499694f3776aa74569b Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 12 Sep 2022 21:53:13 +0200 Subject: [PATCH 04/62] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 80138e7..850e742 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.4 +1.14.0 From 13960f1dbf2b322047bdc4285ba075bf023dd176 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 13:52:24 +0100 Subject: [PATCH 05/62] Moved to pyproject.toml --- pyproject.toml | 87 +++++++++++++++++++ setup.cfg | 80 ----------------- setup.py | 4 - {codext => src/codext}/VERSION.txt | 0 {codext => src/codext}/__common__.py | 0 {codext => src/codext}/__info__.py | 0 {codext => src/codext}/__init__.py | 0 {codext => src/codext}/base/__init__.py | 0 {codext => src/codext}/base/_base.py | 0 {codext => src/codext}/base/_base2n.py | 0 {codext => src/codext}/base/base100.py | 0 {codext => src/codext}/base/base122.py | 0 {codext => src/codext}/base/base45.py | 0 {codext => src/codext}/base/base85.py | 0 {codext => src/codext}/base/base91.py | 0 {codext => src/codext}/base/baseN.py | 0 {codext => src/codext}/binary/__init__.py | 0 {codext => src/codext}/binary/baudot.py | 0 {codext => src/codext}/binary/bcd.py | 0 {codext => src/codext}/binary/excess3.py | 0 {codext => src/codext}/binary/gray.py | 0 {codext => src/codext}/binary/manchester.py | 0 {codext => src/codext}/binary/rotate.py | 0 {codext => src/codext}/common/__init__.py | 0 {codext => src/codext}/common/a1z26.py | 0 {codext => src/codext}/common/cases.py | 0 {codext => src/codext}/common/dummy.py | 0 {codext => src/codext}/common/octal.py | 0 {codext => 
src/codext}/common/ordinal.py | 0 .../codext}/compressions/__init__.py | 0 {codext => src/codext}/compressions/gzipp.py | 0 {codext => src/codext}/compressions/lz77.py | 0 {codext => src/codext}/compressions/lz78.py | 0 {codext => src/codext}/compressions/pkzip.py | 0 {codext => src/codext}/crypto/__init__.py | 0 {codext => src/codext}/crypto/affine.py | 0 {codext => src/codext}/crypto/atbash.py | 0 {codext => src/codext}/crypto/bacon.py | 0 {codext => src/codext}/crypto/barbie.py | 0 {codext => src/codext}/crypto/citrix.py | 0 {codext => src/codext}/crypto/railfence.py | 0 {codext => src/codext}/crypto/rot.py | 0 {codext => src/codext}/crypto/scytale.py | 0 {codext => src/codext}/crypto/shift.py | 0 {codext => src/codext}/crypto/xor.py | 0 {codext => src/codext}/hashing/__init__.py | 0 {codext => src/codext}/hashing/blake.py | 0 {codext => src/codext}/hashing/checksums.py | 0 {codext => src/codext}/hashing/crypt.py | 0 {codext => src/codext}/hashing/md.py | 0 {codext => src/codext}/hashing/sha.py | 0 {codext => src/codext}/hashing/shake.py | 0 {codext => src/codext}/languages/__init__.py | 0 {codext => src/codext}/languages/braille.py | 0 {codext => src/codext}/languages/galactic.py | 0 {codext => src/codext}/languages/ipsum.py | 0 {codext => src/codext}/languages/leetspeak.py | 0 {codext => src/codext}/languages/morse.py | 0 {codext => src/codext}/languages/navajo.py | 0 {codext => src/codext}/languages/radio.py | 0 {codext => src/codext}/languages/southpark.py | 0 {codext => src/codext}/languages/tap.py | 0 {codext => src/codext}/languages/tomtom.py | 0 {codext => src/codext}/macros.json | 0 {codext => src/codext}/others/__init__.py | 0 {codext => src/codext}/others/dna.py | 0 {codext => src/codext}/others/kbshift.py | 0 {codext => src/codext}/others/letters.py | 0 {codext => src/codext}/others/markdown.py | 0 {codext => src/codext}/others/uuencode.py | 0 {codext => src/codext}/stegano/__init__.py | 0 {codext => src/codext}/stegano/hexagram.py | 0 {codext => 
src/codext}/stegano/klopf.py | 0 {codext => src/codext}/stegano/resistor.py | 0 {codext => src/codext}/stegano/rick.py | 0 {codext => src/codext}/stegano/sms.py | 0 {codext => src/codext}/stegano/whitespace.py | 0 {codext => src/codext}/web/__init__.py | 0 {codext => src/codext}/web/html.py | 0 {codext => src/codext}/web/url.py | 0 80 files changed, 87 insertions(+), 84 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.cfg delete mode 100644 setup.py rename {codext => src/codext}/VERSION.txt (100%) rename {codext => src/codext}/__common__.py (100%) rename {codext => src/codext}/__info__.py (100%) rename {codext => src/codext}/__init__.py (100%) rename {codext => src/codext}/base/__init__.py (100%) rename {codext => src/codext}/base/_base.py (100%) rename {codext => src/codext}/base/_base2n.py (100%) rename {codext => src/codext}/base/base100.py (100%) rename {codext => src/codext}/base/base122.py (100%) rename {codext => src/codext}/base/base45.py (100%) rename {codext => src/codext}/base/base85.py (100%) rename {codext => src/codext}/base/base91.py (100%) rename {codext => src/codext}/base/baseN.py (100%) rename {codext => src/codext}/binary/__init__.py (100%) rename {codext => src/codext}/binary/baudot.py (100%) rename {codext => src/codext}/binary/bcd.py (100%) rename {codext => src/codext}/binary/excess3.py (100%) rename {codext => src/codext}/binary/gray.py (100%) rename {codext => src/codext}/binary/manchester.py (100%) rename {codext => src/codext}/binary/rotate.py (100%) rename {codext => src/codext}/common/__init__.py (100%) rename {codext => src/codext}/common/a1z26.py (100%) rename {codext => src/codext}/common/cases.py (100%) rename {codext => src/codext}/common/dummy.py (100%) rename {codext => src/codext}/common/octal.py (100%) rename {codext => src/codext}/common/ordinal.py (100%) rename {codext => src/codext}/compressions/__init__.py (100%) rename {codext => src/codext}/compressions/gzipp.py (100%) rename {codext => 
src/codext}/compressions/lz77.py (100%) rename {codext => src/codext}/compressions/lz78.py (100%) rename {codext => src/codext}/compressions/pkzip.py (100%) rename {codext => src/codext}/crypto/__init__.py (100%) rename {codext => src/codext}/crypto/affine.py (100%) rename {codext => src/codext}/crypto/atbash.py (100%) rename {codext => src/codext}/crypto/bacon.py (100%) rename {codext => src/codext}/crypto/barbie.py (100%) rename {codext => src/codext}/crypto/citrix.py (100%) rename {codext => src/codext}/crypto/railfence.py (100%) rename {codext => src/codext}/crypto/rot.py (100%) rename {codext => src/codext}/crypto/scytale.py (100%) rename {codext => src/codext}/crypto/shift.py (100%) rename {codext => src/codext}/crypto/xor.py (100%) rename {codext => src/codext}/hashing/__init__.py (100%) rename {codext => src/codext}/hashing/blake.py (100%) rename {codext => src/codext}/hashing/checksums.py (100%) rename {codext => src/codext}/hashing/crypt.py (100%) rename {codext => src/codext}/hashing/md.py (100%) rename {codext => src/codext}/hashing/sha.py (100%) rename {codext => src/codext}/hashing/shake.py (100%) rename {codext => src/codext}/languages/__init__.py (100%) rename {codext => src/codext}/languages/braille.py (100%) rename {codext => src/codext}/languages/galactic.py (100%) rename {codext => src/codext}/languages/ipsum.py (100%) rename {codext => src/codext}/languages/leetspeak.py (100%) rename {codext => src/codext}/languages/morse.py (100%) rename {codext => src/codext}/languages/navajo.py (100%) rename {codext => src/codext}/languages/radio.py (100%) rename {codext => src/codext}/languages/southpark.py (100%) rename {codext => src/codext}/languages/tap.py (100%) rename {codext => src/codext}/languages/tomtom.py (100%) rename {codext => src/codext}/macros.json (100%) rename {codext => src/codext}/others/__init__.py (100%) rename {codext => src/codext}/others/dna.py (100%) rename {codext => src/codext}/others/kbshift.py (100%) rename {codext => 
src/codext}/others/letters.py (100%) rename {codext => src/codext}/others/markdown.py (100%) rename {codext => src/codext}/others/uuencode.py (100%) rename {codext => src/codext}/stegano/__init__.py (100%) rename {codext => src/codext}/stegano/hexagram.py (100%) rename {codext => src/codext}/stegano/klopf.py (100%) rename {codext => src/codext}/stegano/resistor.py (100%) rename {codext => src/codext}/stegano/rick.py (100%) rename {codext => src/codext}/stegano/sms.py (100%) rename {codext => src/codext}/stegano/whitespace.py (100%) rename {codext => src/codext}/web/__init__.py (100%) rename {codext => src/codext}/web/html.py (100%) rename {codext => src/codext}/web/url.py (100%) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ce377f3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,87 @@ +[build-system] +requires = ["setuptools>=61.0", "setuptools-scm"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +version = {attr = "codext.__info__.__version__"} + +[tool.setuptools.packages.find] +where = ["src"] + +[project] +name = "codext" +authors = [ + {name="Alexandre D'Hondt", email="alexandre.dhondt@gmail.com"}, +] +description = "Native codecs extension" +license = {file = "LICENSE"} +keywords = ["python", "development", "programming", "codecs", "encodings"] +requires-python = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4" +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming 
Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dependencies = [ + "markdown2==2.3.10; python_version=='2.7'", + "markdown2>=2.4.0; python_version>='3.6'", + "six", +] +dynamic = ["version"] + +[project.readme] +file = "README.md" +content-type = "text/markdown" + +[project.urls] +documentation = "https://python-codext.readthedocs.io/en/latest/?badge=latest" +homepage = "https://github.com/dhondta/python-codext" +issues = "https://github.com/dhondta/python-codext/issues" +repository = "https://github.com/dhondta/python-codext" + +[project.scripts] +base1 = "codext.base.baseN:main1" +base2 = "codext.base.baseN:main2" +base3 = "codext.base.baseN:main3" +base4 = "codext.base.baseN:main4" +base8 = "codext.base.baseN:main8" +base10 = "codext.base.baseN:main10" +base16 = "codext.base.baseN:main16" +base26 = "codext.base.baseN:main26" +base32 = "codext.base.baseN:main32" +base32-hex = "codext.base.baseN:main32hex" +base32-geohash = "codext.base.baseN:main32geo" +base32-crockford = "codext.base.baseN:main32crk" +base32-z = "codext.base.baseN:mainz32" +base36 = "codext.base.baseN:main36" +base45 = "codext.base.base45:main" +base58-bitcoin = "codext.base.baseN:main58bc" +base58-ripple = "codext.base.baseN:main58rp" +base58-flickr = "codext.base.baseN:main58fl" +base62 = "codext.base.baseN:main62" +base63 = "codext.base.baseN:main63" +base64 = "codext.base.baseN:main64" +base64-url = "codext.base.baseN:main64url" +base67 = "codext.base.baseN:main67" +base85 = "codext.base.base85:main85" +base85-adobe = "codext.base.base85:main85adobe" +base85-xbtoa = "codext.base.base85:main85xbtoa" +base85-ipv6 = "codext.base.base85:main85rfc1924" +base85-xml = "codext.base.base85:main85xml" +base85-zeromq = "codext.base.base85:main85zeromq" +base91 = "codext.base.base91:main91" +base100 = "codext.base.base100:main100" +base122 = "codext.base.base122:main122" +codext = "codext.__init__:main" 
+unbase = "codext.base.__init__:main" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 958a404..0000000 --- a/setup.cfg +++ /dev/null @@ -1,80 +0,0 @@ -[metadata] -name = codext -version = file: codext/VERSION.txt -author = Alexandre D'Hondt -author-email = alexandre.dhondt@gmail.com -home-page = https://github.com/dhondta/python-codext -description = Native codecs extension -long_description = file: README.md -long_description_content_type = text/markdown -keywords = - python - development - programming - codecs - encodings -license = GPLv3 -license-file = LICENSE -classifier = - Development Status :: 5 - Production/Stable - Environment :: Console - Intended Audience :: Developers - License :: OSI Approved :: GNU General Public License v3 (GPLv3) - Programming Language :: Python :: 2 - Programming Language :: Python :: 2.7 - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Topic :: Software Development :: Libraries :: Python Modules - -[options] -packages = find: -include_package_data = False -install_requires = - markdown2==2.3.10; python_version=='2.7' # rq.filter: >=2.4.0 - markdown2>=2.4.0; python_version>='3.6' - six -setup-requires = setuptools -python-requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4 - -[options.package_data] -* = *.txt,*.json - -[options.entry_points] -console_scripts = - base1 = codext.base.baseN:main1 - base2 = codext.base.baseN:main2 - base3 = codext.base.baseN:main3 - base4 = codext.base.baseN:main4 - base8 = codext.base.baseN:main8 - base10 = codext.base.baseN:main10 - base16 = codext.base.baseN:main16 - base26 = codext.base.baseN:main26 - base32 = codext.base.baseN:main32 - base32-hex = codext.base.baseN:main32hex - base32-geohash = codext.base.baseN:main32geo - base32-crockford = codext.base.baseN:main32crk - base32-z = codext.base.baseN:mainz32 - 
base36 = codext.base.baseN:main36 - base45 = codext.base.base45:main - base58-bitcoin = codext.base.baseN:main58bc - base58-ripple = codext.base.baseN:main58rp - base58-flickr = codext.base.baseN:main58fl - base62 = codext.base.baseN:main62 - base63 = codext.base.baseN:main63 - base64 = codext.base.baseN:main64 - base64-url = codext.base.baseN:main64url - base67 = codext.base.baseN:main67 - base85 = codext.base.base85:main85 - base85-adobe = codext.base.base85:main85adobe - base85-xbtoa = codext.base.base85:main85xbtoa - base85-ipv6 = codext.base.base85:main85rfc1924 - base85-xml = codext.base.base85:main85xml - base85-zeromq = codext.base.base85:main85zeromq - base91 = codext.base.base91:main91 - base100 = codext.base.base100:main100 - base122 = codext.base.base122:main122 - codext = codext.__init__:main - unbase = codext.base.__init__:main diff --git a/setup.py b/setup.py deleted file mode 100644 index c823345..0000000 --- a/setup.py +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env python -from setuptools import setup - -setup() diff --git a/codext/VERSION.txt b/src/codext/VERSION.txt similarity index 100% rename from codext/VERSION.txt rename to src/codext/VERSION.txt diff --git a/codext/__common__.py b/src/codext/__common__.py similarity index 100% rename from codext/__common__.py rename to src/codext/__common__.py diff --git a/codext/__info__.py b/src/codext/__info__.py similarity index 100% rename from codext/__info__.py rename to src/codext/__info__.py diff --git a/codext/__init__.py b/src/codext/__init__.py similarity index 100% rename from codext/__init__.py rename to src/codext/__init__.py diff --git a/codext/base/__init__.py b/src/codext/base/__init__.py similarity index 100% rename from codext/base/__init__.py rename to src/codext/base/__init__.py diff --git a/codext/base/_base.py b/src/codext/base/_base.py similarity index 100% rename from codext/base/_base.py rename to src/codext/base/_base.py diff --git a/codext/base/_base2n.py 
b/src/codext/base/_base2n.py similarity index 100% rename from codext/base/_base2n.py rename to src/codext/base/_base2n.py diff --git a/codext/base/base100.py b/src/codext/base/base100.py similarity index 100% rename from codext/base/base100.py rename to src/codext/base/base100.py diff --git a/codext/base/base122.py b/src/codext/base/base122.py similarity index 100% rename from codext/base/base122.py rename to src/codext/base/base122.py diff --git a/codext/base/base45.py b/src/codext/base/base45.py similarity index 100% rename from codext/base/base45.py rename to src/codext/base/base45.py diff --git a/codext/base/base85.py b/src/codext/base/base85.py similarity index 100% rename from codext/base/base85.py rename to src/codext/base/base85.py diff --git a/codext/base/base91.py b/src/codext/base/base91.py similarity index 100% rename from codext/base/base91.py rename to src/codext/base/base91.py diff --git a/codext/base/baseN.py b/src/codext/base/baseN.py similarity index 100% rename from codext/base/baseN.py rename to src/codext/base/baseN.py diff --git a/codext/binary/__init__.py b/src/codext/binary/__init__.py similarity index 100% rename from codext/binary/__init__.py rename to src/codext/binary/__init__.py diff --git a/codext/binary/baudot.py b/src/codext/binary/baudot.py similarity index 100% rename from codext/binary/baudot.py rename to src/codext/binary/baudot.py diff --git a/codext/binary/bcd.py b/src/codext/binary/bcd.py similarity index 100% rename from codext/binary/bcd.py rename to src/codext/binary/bcd.py diff --git a/codext/binary/excess3.py b/src/codext/binary/excess3.py similarity index 100% rename from codext/binary/excess3.py rename to src/codext/binary/excess3.py diff --git a/codext/binary/gray.py b/src/codext/binary/gray.py similarity index 100% rename from codext/binary/gray.py rename to src/codext/binary/gray.py diff --git a/codext/binary/manchester.py b/src/codext/binary/manchester.py similarity index 100% rename from 
codext/binary/manchester.py rename to src/codext/binary/manchester.py diff --git a/codext/binary/rotate.py b/src/codext/binary/rotate.py similarity index 100% rename from codext/binary/rotate.py rename to src/codext/binary/rotate.py diff --git a/codext/common/__init__.py b/src/codext/common/__init__.py similarity index 100% rename from codext/common/__init__.py rename to src/codext/common/__init__.py diff --git a/codext/common/a1z26.py b/src/codext/common/a1z26.py similarity index 100% rename from codext/common/a1z26.py rename to src/codext/common/a1z26.py diff --git a/codext/common/cases.py b/src/codext/common/cases.py similarity index 100% rename from codext/common/cases.py rename to src/codext/common/cases.py diff --git a/codext/common/dummy.py b/src/codext/common/dummy.py similarity index 100% rename from codext/common/dummy.py rename to src/codext/common/dummy.py diff --git a/codext/common/octal.py b/src/codext/common/octal.py similarity index 100% rename from codext/common/octal.py rename to src/codext/common/octal.py diff --git a/codext/common/ordinal.py b/src/codext/common/ordinal.py similarity index 100% rename from codext/common/ordinal.py rename to src/codext/common/ordinal.py diff --git a/codext/compressions/__init__.py b/src/codext/compressions/__init__.py similarity index 100% rename from codext/compressions/__init__.py rename to src/codext/compressions/__init__.py diff --git a/codext/compressions/gzipp.py b/src/codext/compressions/gzipp.py similarity index 100% rename from codext/compressions/gzipp.py rename to src/codext/compressions/gzipp.py diff --git a/codext/compressions/lz77.py b/src/codext/compressions/lz77.py similarity index 100% rename from codext/compressions/lz77.py rename to src/codext/compressions/lz77.py diff --git a/codext/compressions/lz78.py b/src/codext/compressions/lz78.py similarity index 100% rename from codext/compressions/lz78.py rename to src/codext/compressions/lz78.py diff --git a/codext/compressions/pkzip.py 
b/src/codext/compressions/pkzip.py similarity index 100% rename from codext/compressions/pkzip.py rename to src/codext/compressions/pkzip.py diff --git a/codext/crypto/__init__.py b/src/codext/crypto/__init__.py similarity index 100% rename from codext/crypto/__init__.py rename to src/codext/crypto/__init__.py diff --git a/codext/crypto/affine.py b/src/codext/crypto/affine.py similarity index 100% rename from codext/crypto/affine.py rename to src/codext/crypto/affine.py diff --git a/codext/crypto/atbash.py b/src/codext/crypto/atbash.py similarity index 100% rename from codext/crypto/atbash.py rename to src/codext/crypto/atbash.py diff --git a/codext/crypto/bacon.py b/src/codext/crypto/bacon.py similarity index 100% rename from codext/crypto/bacon.py rename to src/codext/crypto/bacon.py diff --git a/codext/crypto/barbie.py b/src/codext/crypto/barbie.py similarity index 100% rename from codext/crypto/barbie.py rename to src/codext/crypto/barbie.py diff --git a/codext/crypto/citrix.py b/src/codext/crypto/citrix.py similarity index 100% rename from codext/crypto/citrix.py rename to src/codext/crypto/citrix.py diff --git a/codext/crypto/railfence.py b/src/codext/crypto/railfence.py similarity index 100% rename from codext/crypto/railfence.py rename to src/codext/crypto/railfence.py diff --git a/codext/crypto/rot.py b/src/codext/crypto/rot.py similarity index 100% rename from codext/crypto/rot.py rename to src/codext/crypto/rot.py diff --git a/codext/crypto/scytale.py b/src/codext/crypto/scytale.py similarity index 100% rename from codext/crypto/scytale.py rename to src/codext/crypto/scytale.py diff --git a/codext/crypto/shift.py b/src/codext/crypto/shift.py similarity index 100% rename from codext/crypto/shift.py rename to src/codext/crypto/shift.py diff --git a/codext/crypto/xor.py b/src/codext/crypto/xor.py similarity index 100% rename from codext/crypto/xor.py rename to src/codext/crypto/xor.py diff --git a/codext/hashing/__init__.py b/src/codext/hashing/__init__.py 
similarity index 100% rename from codext/hashing/__init__.py rename to src/codext/hashing/__init__.py diff --git a/codext/hashing/blake.py b/src/codext/hashing/blake.py similarity index 100% rename from codext/hashing/blake.py rename to src/codext/hashing/blake.py diff --git a/codext/hashing/checksums.py b/src/codext/hashing/checksums.py similarity index 100% rename from codext/hashing/checksums.py rename to src/codext/hashing/checksums.py diff --git a/codext/hashing/crypt.py b/src/codext/hashing/crypt.py similarity index 100% rename from codext/hashing/crypt.py rename to src/codext/hashing/crypt.py diff --git a/codext/hashing/md.py b/src/codext/hashing/md.py similarity index 100% rename from codext/hashing/md.py rename to src/codext/hashing/md.py diff --git a/codext/hashing/sha.py b/src/codext/hashing/sha.py similarity index 100% rename from codext/hashing/sha.py rename to src/codext/hashing/sha.py diff --git a/codext/hashing/shake.py b/src/codext/hashing/shake.py similarity index 100% rename from codext/hashing/shake.py rename to src/codext/hashing/shake.py diff --git a/codext/languages/__init__.py b/src/codext/languages/__init__.py similarity index 100% rename from codext/languages/__init__.py rename to src/codext/languages/__init__.py diff --git a/codext/languages/braille.py b/src/codext/languages/braille.py similarity index 100% rename from codext/languages/braille.py rename to src/codext/languages/braille.py diff --git a/codext/languages/galactic.py b/src/codext/languages/galactic.py similarity index 100% rename from codext/languages/galactic.py rename to src/codext/languages/galactic.py diff --git a/codext/languages/ipsum.py b/src/codext/languages/ipsum.py similarity index 100% rename from codext/languages/ipsum.py rename to src/codext/languages/ipsum.py diff --git a/codext/languages/leetspeak.py b/src/codext/languages/leetspeak.py similarity index 100% rename from codext/languages/leetspeak.py rename to src/codext/languages/leetspeak.py diff --git 
a/codext/languages/morse.py b/src/codext/languages/morse.py similarity index 100% rename from codext/languages/morse.py rename to src/codext/languages/morse.py diff --git a/codext/languages/navajo.py b/src/codext/languages/navajo.py similarity index 100% rename from codext/languages/navajo.py rename to src/codext/languages/navajo.py diff --git a/codext/languages/radio.py b/src/codext/languages/radio.py similarity index 100% rename from codext/languages/radio.py rename to src/codext/languages/radio.py diff --git a/codext/languages/southpark.py b/src/codext/languages/southpark.py similarity index 100% rename from codext/languages/southpark.py rename to src/codext/languages/southpark.py diff --git a/codext/languages/tap.py b/src/codext/languages/tap.py similarity index 100% rename from codext/languages/tap.py rename to src/codext/languages/tap.py diff --git a/codext/languages/tomtom.py b/src/codext/languages/tomtom.py similarity index 100% rename from codext/languages/tomtom.py rename to src/codext/languages/tomtom.py diff --git a/codext/macros.json b/src/codext/macros.json similarity index 100% rename from codext/macros.json rename to src/codext/macros.json diff --git a/codext/others/__init__.py b/src/codext/others/__init__.py similarity index 100% rename from codext/others/__init__.py rename to src/codext/others/__init__.py diff --git a/codext/others/dna.py b/src/codext/others/dna.py similarity index 100% rename from codext/others/dna.py rename to src/codext/others/dna.py diff --git a/codext/others/kbshift.py b/src/codext/others/kbshift.py similarity index 100% rename from codext/others/kbshift.py rename to src/codext/others/kbshift.py diff --git a/codext/others/letters.py b/src/codext/others/letters.py similarity index 100% rename from codext/others/letters.py rename to src/codext/others/letters.py diff --git a/codext/others/markdown.py b/src/codext/others/markdown.py similarity index 100% rename from codext/others/markdown.py rename to 
src/codext/others/markdown.py diff --git a/codext/others/uuencode.py b/src/codext/others/uuencode.py similarity index 100% rename from codext/others/uuencode.py rename to src/codext/others/uuencode.py diff --git a/codext/stegano/__init__.py b/src/codext/stegano/__init__.py similarity index 100% rename from codext/stegano/__init__.py rename to src/codext/stegano/__init__.py diff --git a/codext/stegano/hexagram.py b/src/codext/stegano/hexagram.py similarity index 100% rename from codext/stegano/hexagram.py rename to src/codext/stegano/hexagram.py diff --git a/codext/stegano/klopf.py b/src/codext/stegano/klopf.py similarity index 100% rename from codext/stegano/klopf.py rename to src/codext/stegano/klopf.py diff --git a/codext/stegano/resistor.py b/src/codext/stegano/resistor.py similarity index 100% rename from codext/stegano/resistor.py rename to src/codext/stegano/resistor.py diff --git a/codext/stegano/rick.py b/src/codext/stegano/rick.py similarity index 100% rename from codext/stegano/rick.py rename to src/codext/stegano/rick.py diff --git a/codext/stegano/sms.py b/src/codext/stegano/sms.py similarity index 100% rename from codext/stegano/sms.py rename to src/codext/stegano/sms.py diff --git a/codext/stegano/whitespace.py b/src/codext/stegano/whitespace.py similarity index 100% rename from codext/stegano/whitespace.py rename to src/codext/stegano/whitespace.py diff --git a/codext/web/__init__.py b/src/codext/web/__init__.py similarity index 100% rename from codext/web/__init__.py rename to src/codext/web/__init__.py diff --git a/codext/web/html.py b/src/codext/web/html.py similarity index 100% rename from codext/web/html.py rename to src/codext/web/html.py diff --git a/codext/web/url.py b/src/codext/web/url.py similarity index 100% rename from codext/web/url.py rename to src/codext/web/url.py From 7c79988bf8aa267bbcb177b78391d41ac5f7db3c Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 13:53:02 +0100 Subject: [PATCH 06/62] Updated the documentation 
--- .readthedocs.yml | 8 ++-- README.md | 15 +++---- docs/coverage.svg | 1 + docs/js/collapsible-navbar.js | 54 ----------------------- docs/mkdocs.yml | 55 ++++++++++++++++++++++++ docs/{ => pages}/cli.md | 0 docs/{ => pages}/demos/using-bases.gif | Bin docs/{ => pages}/demos/using-codext.gif | Bin docs/{ => pages}/demos/using-debase.gif | Bin docs/{ => pages}/enc/base.md | 0 docs/{ => pages}/enc/binary.md | 0 docs/{ => pages}/enc/common.md | 0 docs/{ => pages}/enc/compressions.md | 0 docs/{ => pages}/enc/crypto.md | 0 docs/{ => pages}/enc/hashing.md | 0 docs/{ => pages}/enc/languages.md | 0 docs/{ => pages}/enc/others.md | 0 docs/{ => pages}/enc/stegano.md | 0 docs/{ => pages}/enc/web.md | 0 docs/{ => pages}/features.md | 0 docs/{ => pages}/guessing.md | 0 docs/{ => pages}/howto.md | 0 docs/{imgs => pages/img}/banner.png | Bin docs/pages/img/icon.png | Bin 0 -> 23561 bytes docs/{imgs => pages/img}/logo.png | Bin docs/{ => pages}/index.md | 0 docs/{ => pages}/manipulations.md | 0 docs/requirements.txt | 6 +++ mkdocs.yml | 31 ------------- 29 files changed, 74 insertions(+), 96 deletions(-) create mode 100644 docs/coverage.svg delete mode 100644 docs/js/collapsible-navbar.js create mode 100644 docs/mkdocs.yml rename docs/{ => pages}/cli.md (100%) rename docs/{ => pages}/demos/using-bases.gif (100%) rename docs/{ => pages}/demos/using-codext.gif (100%) rename docs/{ => pages}/demos/using-debase.gif (100%) rename docs/{ => pages}/enc/base.md (100%) rename docs/{ => pages}/enc/binary.md (100%) rename docs/{ => pages}/enc/common.md (100%) rename docs/{ => pages}/enc/compressions.md (100%) rename docs/{ => pages}/enc/crypto.md (100%) rename docs/{ => pages}/enc/hashing.md (100%) rename docs/{ => pages}/enc/languages.md (100%) rename docs/{ => pages}/enc/others.md (100%) rename docs/{ => pages}/enc/stegano.md (100%) rename docs/{ => pages}/enc/web.md (100%) rename docs/{ => pages}/features.md (100%) rename docs/{ => pages}/guessing.md (100%) rename docs/{ => 
pages}/howto.md (100%) rename docs/{imgs => pages/img}/banner.png (100%) create mode 100644 docs/pages/img/icon.png rename docs/{imgs => pages/img}/logo.png (100%) rename docs/{ => pages}/index.md (100%) rename docs/{ => pages}/manipulations.md (100%) create mode 100644 docs/requirements.txt delete mode 100644 mkdocs.yml diff --git a/.readthedocs.yml b/.readthedocs.yml index e8f4e71..0e991f8 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,6 +1,8 @@ version: 2 + mkdocs: - configuration: mkdocs.yml -formats: all + configuration: docs/mkdocs.yml + python: - version: 3.6 + install: + - requirements: docs/requirements.txt diff --git a/README.md b/README.md index 2ce70be..35aa6c2 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,12 @@ -

+

CodExt Tweet

Encode/decode anything.

[![PyPi](https://img.shields.io/pypi/v/codext.svg)](https://pypi.python.org/pypi/codext/) [![Read The Docs](https://readthedocs.org/projects/python-codext/badge/?version=latest)](https://python-codext.readthedocs.io/en/latest/?badge=latest) -[![Build Status](https://travis-ci.com/dhondta/python-codext.svg?branch=master)](https://travis-ci.com/dhondta/python-codext) -[![Coverage Status](https://coveralls.io/repos/github/dhondta/python-codext/badge.svg?branch=master)](https://coveralls.io/github/dhondta/python-codext?branch=master) +[![Build Status](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml/badge.svg)](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml) +[![Coverage Status](https://raw.githubusercontent.com/dhondta/python-codext/main/docs/coverage.svg)](#) [![Python Versions](https://img.shields.io/pypi/pyversions/codext.svg)](https://pypi.python.org/pypi/codext/) -[![Requirements Status](https://requires.io/github/dhondta/python-codext/requirements.svg?branch=master)](https://requires.io/github/dhondta/python-codext/requirements/?branch=master) [![Known Vulnerabilities](https://snyk.io/test/github/dhondta/python-codext/badge.svg?targetFile=requirements.txt)](https://snyk.io/test/github/dhondta/python-codext?targetFile=requirements.txt) [![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) [![License](https://img.shields.io/pypi/l/codext.svg)](https://pypi.python.org/pypi/codext/) @@ -20,13 +19,13 @@ $ pip install codext Want to contribute a new codec ? | Want to contribute a new macro ? :----------------------------------:|:------------------------------------: -Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.html) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/master/codext/macros.json) +Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.html) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/main/codext/macros.json) ## :mag: Demonstrations -

Using CodExt from the command line

-

Using base tools from the command line

-

Using the unbase command line tool

+

Using CodExt from the command line

+

Using base tools from the command line

+

Using the unbase command line tool

## :computer: Usage (main CLI tool) Tweet on codext diff --git a/docs/coverage.svg b/docs/coverage.svg new file mode 100644 index 0000000..bde433b --- /dev/null +++ b/docs/coverage.svg @@ -0,0 +1 @@ +coverage: 99.53%coverage99.53% \ No newline at end of file diff --git a/docs/js/collapsible-navbar.js b/docs/js/collapsible-navbar.js deleted file mode 100644 index b1e1593..0000000 --- a/docs/js/collapsible-navbar.js +++ /dev/null @@ -1,54 +0,0 @@ -String.prototype.format = function() { - a = this; - for (k in arguments) { - a = a.replace("{" + k + "}", arguments[k]) - } - return a -} - -$(document).ready(function () { - $('li.toctree-l1').each(function () { - var parent = $(this); - var span = parent.find('span:first'); - var sibling = null; - var remove = true; - $('li.toctree-l1').each(function() { - var a = $(this).find('a:first'); - if (a.text() != '' && a.text() == span.text()) { - parent.prepend(a); - span.remove(); - span = a; - if ($(this).hasClass('current')) parent.addClass('current'); - sibling = $(this); - return false - } - }); - if (sibling === null && parent.find('ul.subnav:not(li.toctree-l2)').children('li').length) { - sibling = parent; - remove = false; - } - if (sibling !== null) { - var ul = parent.find('ul.subnav:not(li.toctree-l2)'); - var new_a = ''; - if (!ul.children('li.current').length && !parent.hasClass('current')) { - ul.hide(); - $(new_a.format("left")).insertBefore(span); - } else { - $(new_a.format("down")).insertBefore(span); - } - if (remove) sibling.remove(); - } - }); - $('a.collapse-navbar').click(function () { - var parent = $(this).closest('li.toctree-l1'); - var subnav = parent.find('ul.subnav:not(li.toctree-l2)'); - if ($(this).hasClass('fa-caret-left')) { - subnav.show(); - $(this).removeClass('fa-caret-left'); - $(this).addClass('fa-caret-down'); - } else { - subnav.hide(); - $(this).addClass('fa-caret-left'); - $(this).removeClass('fa-caret-down'); - } -});}); diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml new file mode 
100644 index 0000000..a39ccb0 --- /dev/null +++ b/docs/mkdocs.yml @@ -0,0 +1,55 @@ +site_author: dhondta +site_name: "Codext - Extension of native codecs for Python" +repo_url: https://github.com/dhondta/python-codext +copyright: Copyright © 2021-2023 Alexandre D'Hondt +docs_dir: pages +nav: + - Introduction: index.md + - Features: features.md + - 'Guess mode': guessing.md + - Encodings: + - Base: enc/base.md + - Binary: enc/binary.md + - Common: enc/common.md + - Compressions: enc/compressions.md + - Cryptography: enc/crypto.md + - Hashing: enc/hashing.md + - Languages: enc/languages.md + - Others: enc/others.md + - Steganography: enc/stegano.md + - 'String manipulations': manipulations.md + - 'CLI tool': cli.md + - 'Create your codec': howto.md +extra: + generator: false + social: + - icon: fontawesome/solid/paper-plane + link: mailto:alexandre.dhondt@gmail.com + name: Contact Alex + - icon: fontawesome/brands/github + link: https://github.com/dhondta + name: Alex on GitHub + - icon: fontawesome/brands/linkedin + link: https://www.linkedin.com/in/alexandre-d-2ab2aa14/ + name: Alex on LinkedIn + - icon: fontawesome/brands/twitter + link: https://twitter.com/alex_dhondt + name: Alex on Twitter +theme: + name: material + palette: + - scheme: default + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - scheme: slate + toggle: + icon: material/brightness-4 + name: Switch to light mode + logo: img/logo.png + favicon: img/icon.png +use_directory_urls: false +markdown_extensions: + - toc: + permalink: true + - admonition diff --git a/docs/cli.md b/docs/pages/cli.md similarity index 100% rename from docs/cli.md rename to docs/pages/cli.md diff --git a/docs/demos/using-bases.gif b/docs/pages/demos/using-bases.gif similarity index 100% rename from docs/demos/using-bases.gif rename to docs/pages/demos/using-bases.gif diff --git a/docs/demos/using-codext.gif b/docs/pages/demos/using-codext.gif similarity index 100% rename from docs/demos/using-codext.gif 
rename to docs/pages/demos/using-codext.gif diff --git a/docs/demos/using-debase.gif b/docs/pages/demos/using-debase.gif similarity index 100% rename from docs/demos/using-debase.gif rename to docs/pages/demos/using-debase.gif diff --git a/docs/enc/base.md b/docs/pages/enc/base.md similarity index 100% rename from docs/enc/base.md rename to docs/pages/enc/base.md diff --git a/docs/enc/binary.md b/docs/pages/enc/binary.md similarity index 100% rename from docs/enc/binary.md rename to docs/pages/enc/binary.md diff --git a/docs/enc/common.md b/docs/pages/enc/common.md similarity index 100% rename from docs/enc/common.md rename to docs/pages/enc/common.md diff --git a/docs/enc/compressions.md b/docs/pages/enc/compressions.md similarity index 100% rename from docs/enc/compressions.md rename to docs/pages/enc/compressions.md diff --git a/docs/enc/crypto.md b/docs/pages/enc/crypto.md similarity index 100% rename from docs/enc/crypto.md rename to docs/pages/enc/crypto.md diff --git a/docs/enc/hashing.md b/docs/pages/enc/hashing.md similarity index 100% rename from docs/enc/hashing.md rename to docs/pages/enc/hashing.md diff --git a/docs/enc/languages.md b/docs/pages/enc/languages.md similarity index 100% rename from docs/enc/languages.md rename to docs/pages/enc/languages.md diff --git a/docs/enc/others.md b/docs/pages/enc/others.md similarity index 100% rename from docs/enc/others.md rename to docs/pages/enc/others.md diff --git a/docs/enc/stegano.md b/docs/pages/enc/stegano.md similarity index 100% rename from docs/enc/stegano.md rename to docs/pages/enc/stegano.md diff --git a/docs/enc/web.md b/docs/pages/enc/web.md similarity index 100% rename from docs/enc/web.md rename to docs/pages/enc/web.md diff --git a/docs/features.md b/docs/pages/features.md similarity index 100% rename from docs/features.md rename to docs/pages/features.md diff --git a/docs/guessing.md b/docs/pages/guessing.md similarity index 100% rename from docs/guessing.md rename to docs/pages/guessing.md 
diff --git a/docs/howto.md b/docs/pages/howto.md similarity index 100% rename from docs/howto.md rename to docs/pages/howto.md diff --git a/docs/imgs/banner.png b/docs/pages/img/banner.png similarity index 100% rename from docs/imgs/banner.png rename to docs/pages/img/banner.png diff --git a/docs/pages/img/icon.png b/docs/pages/img/icon.png new file mode 100644 index 0000000000000000000000000000000000000000..da3cb311f8063334794b119e1e4e76e86f73407f GIT binary patch literal 23561 zcmYhh2|QG9_%@EBgchk#DWbGsHZ!9Ivu|dfF-t`=X2$F@W}it#l=h^Bw9rN>(V~S= zX^&`Ava}Z|^i7+R_xSzZ_w)bzjB}oIpXa%s{l2dIKF>x5jo>xLe~goplNX7IWdZlS z!~dh*fa~IAd6iC16H84vzDZ+~Db!M@5Gdw|cG8uGQAPLAd zI*CH2kV^jd9Rvh{hJheqUucF9s5TGEG4+e)o z(LgFrp*7opyA26Xgu~_nDU@EXmhz-xioz5E!AC+7;ZXQ+5ekt*jSGQbfV5hnk^(n^ zG)bi!E+R3AtlgoEM3FaE!t;K9&vc=&LC z*dm$8p!omQHY}8yM9Kf#LA`=$Ad`$ng*Xx~4-?`n7zNOY;XD?p!KeU6<$uQL|J6PG z!erM=|5qxJS`{H=4Uk3ge}_-XAxE*qa;=y{Wg_Ti5nmK#{$K4{ z5zvGGQOy8GFbw>^N)YjCp*RGBW>AtWFq4#n!l1FqgeXA5W2tcpiUP!9>Eo2TNI@LJ z0%*d+qj*{hNN0;;Fp^Yep(c)m!>f1@8xJos!_5e|S)k|3KzK+bh71*xsY!4Q$p}TG zQg}2d2PiIApg?E=!=e-e^M%GSP<)aU!G@6-AcY8{1?t74$#OD+OJzcVxxwo!$y}NY zufhst3Vx&{UW5aLG72G*A1`C@kvxUjB2O{#iAp5egpP`WD(F%1lw>d2BPJ{hQScnUemC>#X?GejD7e2!RV zl$d#X6c(5mGu|fFnn8AgiGsvJ;lKpgK}ZaTOe7E>be$$rqDewaVaY&6W&%|XXo6yB zQCuX;Y-fYOJWG z)FQ2bO9c#(l!QoeB$3QQu0tSY0S!e9@hJ!hmy48OC|F4fpKGKMBB=~|6pTYj29f0y zv=xJw+5}W2kXe97!H{Yi)hu;j!D>Drfy?b?HlK+yS=cn4L}|i-ttbiu$%#i;wOE5l zL=~xYT$D7*NR1*1;PG<1j_FX~QzGN}2&EXI74aMzBf^N2z(kmMoty)Kh?o>>GMdID z3t4PB2g-w4Z8$oW$Rq$$L&V!G8o=n{RYDETu8^Z7Oc;scpxccS3@8~2AsSV*WG;!O zfJm7#l#s*_86k4B5sgV9(=BwYQYe!|CD}+|nhh#JQ-OUFhquDHc%B4}vXFqzU=5L6 zI!VIUz_0?kjSI3H;zaQxom4;p>a<8$27*|LP2q#&GOP)THv%O9If0Ojb65ygiJigV z%C$1ETq!_{MGPH5s)&p-%eZiC3K@aWB&X1A1|by#vS~H8;gN?*Bup$($pn%~S}`<< z;6RB|Yz!+&Du7CqNk+a=p~cu7EHVTy784{k64C;~JJig0ff=Vu(NgsYV?2UMSLjqq zg;}mvDEKBV0%Xwe3=Aesj*Pd-#1JY=C6-ck1QH4h!7>pzf>t6o8j=Khp41@IBPddV zAzsW9s;Pn~7#HV&;7J;)G?GTK&?B*Qc9g`X!<0s?MhXjnEIQ!7=9 
zG<+qKiUq~7p+vEkqR<+UmUxE&os8ngA(a@72Iv{ypi>ivJE)|w1Oy6Am?Wp+Ax1Ub zfsw#8DH?hth=&*QmE0(Zf`#J)TNKUU+F)@C42WV?66j2x@k#_vLeV0XXtYJ4!%7W+SOAqK ztJox-lx|Lz((Td|hzdJnp1B)V5;mB|p4kE(PL=-ljjnv}}@eCnG zh_<23Bt4gBAaLkF&tTSJm|rvKa_HB_)olVG7h-1qgzb z(FKZljRKF01WRQE3|%1N#qus0%;;6kT8N+4x)n*uq3Jwq*23gBn3eM z*T?H6nn*Ps%e08&EC5{4*bplg55*EpXee1^MJD4B9GOMK2apbJLxOdxWID%Alo*U$ zlU);!M>x=6Br*k>qOj?d60ie>Mga|s0>SKTHVF!*lF$SqTSNwHnRcU{gcIn|Tnd>c z)+8asXpWuAv^GqNNlS`LCP%;}1q%>m{u5nKibMKL4}yEp{|l7qnzCIb^6CruKm!3I230CW^<(ST`ukk(?;k&IM_CPl6g zXmMZ-7-WhY##WvJOhWJsdY;;BHM58cyNZD$5%i=KD_;_4Vv)3FseuBC!jZH%tq?G6 zQAuJR7N{Iefx)7578%c?q#7{EMy$Zjr11bAiYCZK8eY6ikxbwb`2c~!qlHXUio=Y8 zve*C};tP3f8Jg%28sexJjGb)2MLCQtg#)G)06VsyJk@iH;MPtoBG1LTOebsW_y`U>D)Uk;FK@ ziKUVnqbwkJGQp&PQB-`1Qiu~%a5g4CQqIv5M0^;`1lLH|ktCHJLUk}fTrnb2&67y= z4nBj9!t2=@u13feV+C5F%EUy;^f*2a#l$Pj#7H?^7$<-MBo@sAX#`}Q3M7o;u;SHb zsVI`pAth;si87f=#Zf>+t|A_*fRhDOwMK}*YV{Z)A&whwOk(S!kUX{k02{PQK1}IU za0G=T#WGBkNL7?bB1KT3Xf*;@iEulHK-7}08l?g&r$H=gv6==-(pdx=i9-Q3+8_!( zooQfNxlsZk!UESTR2qvz1>>kz<&j3VHYMI6QSsv0 zd^L&@9|y1>$S`|CBZy|Z0kAh{sGLp_D^qmDcq5XGRV(#+f`o5mOLdrdKAH|T!B|35 z9Nvxw$!Tx`mTEC!S#*U}PSs+Be7YS_z-L;`WU>yUV&PQU6b1mPL^u?w zwxFym(54()LOm(Z&j1+4lu_7w{hdO_Bf+M5y=8UEHEe}iX{*cU~mOM zc9am4K!A2oF)Y|Hb4^wv2oek5D#xdoq>&su&z=;CP2$Ru04B$G7$^vu%@hTVmuMs! 
zGv8w2fYDNr1j;~D2yuiIrUV&p2k|TfhD3*F7?{b)OhP17Wkc`+^mCj}~VaO`3gRaM@@n$GdNrXex++m|HQ^G_# zHj~LjS|xD=CM(&jHE{4eFm_Kg+Q6clFm?h_W7liw5T=$$Ls3L(IbI}EVEA!R zBQOU5U?k%8nDz$siFvPEE!q zE2vNe6h(${2uFw);P^&{M4}hWNTTFJwx$b5&0sy7-{!yx4l zV~CLw6FX9k5nzyd15Sa7jFf{(w0J_K#EPKXD0H113wU}dWT9CJXYjCM2n@`p090BS zhfyO`4zh(S1eiHGN*d3{BX~#(hX6%jK~w@yz#}HxpxPuWoM}%=f#_hEIKY!aN3j)3 zehLyK7OQm-z?fiFR*aF)2AYT>CXxAahdRX+8Arpj*f6V7CKHJD94N|0j6|V_5gdiZ z0}LLYLgeVkNdi6uM20)KCIK;pM>1f*N_~=%7a6CQDd;o=MPr8W4P3lh!;BPD1;SxJ z(w3CMAR$C*DM<~+66_cYLPG`iInk~~4%0=ENry6H=`^;?1m!}>#Q3Cm2;zTUuA@0SoE%Hnf=z(6z&j|kq$r{T%B64^b{tBeK?;Tm7Me*TTI~2FundTt$SiUs zQcDH6Gf;q%gpwoVj>sq&FcxSEhRKSuAtD`okQjw9iseWul!OAQfQ(>^nK4ZFOn{ds z19alCQXGW}3y< z23$b^Q#63P022>;e`dMGmTVq&{V`ZDA52HH^k|+K1Xltp1FbY+9L#vC87UFO%K&J` zV?Y?FOeIB099RxA1x}BYi;*c7rCo_5S)+u*;H|~U)DkLdc%-=?tIC3u;IzaP31FDe zDw$1dOX667xCx+2nnXuYWlSha3Zq#ifMD3wmKf<$9)kphbL`NlNUK=QATfdI!owX% zs$DKd>6kLB#b$`J+3{jr91M%3Ab|eiv05~TkHph-0D|Gs9GO)CtPp^u;W;|o@LG`} z06L5(f|SD#Sc?tM6b#QD4iPoXa8%Oas$q60kD;Q2hB*x&wAeKgfM25HnE)IO3+OT? 
z;7Q{xRNx}UD4_`JaKvZ$H@pe{U$p3dLPx;!|BcK+afq0aWll~rok&;|$CmKrz$hF0 z{lL|iJCBln9#o`o&v@K; zKPjd~YKm<=y0Tc25BnX{`)q&z{+&k13sYb1&y@$G$%b3jns4|2ta`akZ1B#52Q?=dq5QYko{hT_5B%kSeGYLV z-fS$N0Zo1ow~HA3;KG&M5_WNPNbdc&dAD!fx^-ggG$bhbXHR>TlT%syaeRwm536Lq z7UWxVGYO%2VG5o*w`+`(r>j$ZvHEiJ&^G%Dg464pwMU%Fp9OTP*Tx-U_*y~x zFjtbCY$TMl*}k2c%PzXHZvFZn@5m*q{`^>#d+^|c0WNJXG+qZ7oG$vq?<+Euc zHYh2f>cok8y8C;=UZ#ZHJ=Yk$sJ*>?$(@ITL?UrR_p`E>JKLPcf68L!ln-oSwKjjf z`e*+ltG4${((MhV!}}O9zqVDReAeU9>0isY#yIa%o^)DK5mw&f;#X6yJ8|N~4b!Mf zQc$#DC*gtlB}$ ztPP5IP|>ihvA({4&s~Xafro9{cx}w*sL8$ecYSIe5FKj#v1Js)CvsMB@W|UwpJWbP zvz}VFTGI7>;A{QQvy!_x;4^2={23Rpc=X)4bM z$mv64!MG8l#vYA%apa*~rceJhL+%xm9q*#y6N#h$-HB^BxEkbqYp+LoT`jwq=6)vg zr+-B7)6AKeufOLcHE;v`{C@x4IC0Luo3p0dLK=r+Roo_9$gw4bbA86_ANvthg1&n} zd~-~D%eiF1RPbA{zzO8}GjN&rx|vlsoIVFQz0!i7UVjoa(x0|HzIegsT^U6WW>d5F z6wx*vcC5?HEO#s4N6-Y#W zGvP;?tk*#}#OOmQdi_f;$DSjI-y-)-*KH35|Lgm8tL|WA{;b&65l*sw(^G5m^73BJ zn?HYwz7P#c2F1K3Lzb0O48rDgnP|bCZ(ae5e^qbUvSmipRLHV|!2L(Tfpup7Y^PUiL?0g2v@UnX=FYv#*tBj$?C-lf=j;U5R^FU!V+^U}xiKWM z?veR2G4$i1*Wr&3gj|IjdD1zvSH8!wQir{1vJ?bXh& z#)otF(v=|Ja8<<%(e#C;Q0eN9_-6T!<2yc&+)97w_3ZTW)+ptZ1^3#FF*Di zbPb4xnY}OM-yc+_=bt6f&0e>xmiMmI1(Rn?+K5dE+A+FsyB4xny?fujR?)yqWyQj? 
zMMbibYb#dxUBU%1f#wo&_zU8l(*m57Dav`Fp}%Nl&#Yiy&$h(Emec2=5F^68UhJC7 z?g5O(f)~GU&Dl!sl+(O`)2E)GDvHLgbvb@VJAKI>GIrNKJ~kbTPLG@C zwl^;4^_w?)-hKRXIOp!CQXOFTwYzukzTfXyB5<0eUU*<^zvErgvg*L{ZP$~&yf7c8 zGQ)ElD^O=&oa|WfMZT_CE)VQ{SsSxzC1ie(-v_U;jLVmib2efhZMm;FrvJqW z@9b92`eXSnBr~t(uxvCLt>MH=nooYP@ZvV@jts}2{OL-Ti>mCRZ4DjjE4_l8IT1$Dt1j|e^S=z7k}9#lMO@ZYR@>d2(`w5K@N z{4z~?@!Z1KmsfYtwDhs#gtN_ugBYLIjjUW-aM9%L3T%!S*CO$r6~U6zw4hLS_h>oK2YV;& z;zU>a`CPPl$F13O-xHfUzj`x$M!H^#m1i$qpPM_Ux(KslC8>mS`qfv->jMjaCqPz> zbKdx2^vO+oA$jipFOUVRD_0K=)s2g3!`*4(z8%qD{=C9t`b=tr=lKbV%l=)pr6xUx z9C-XF`}Nk8!NJ$MpRcfPm9@@9oYULuM`L$CI~zOIzxOBOBk`H_S3voB^h(>G^u=w# zJ7nEY?(UYish)KO*1dc-za+7-a3Ik+?a-x5zmC0lA$+pmxNo@@y{yT<QojAXsDPa0E(Bhzazftw4 zBB#jv^TXkfc;7$O^RqYJWF#aUJpNF={@K%~+ov9>2$k>gIkPOJnhCB+2G&YKr~i$$ zp39Y!#<==lcbz`A6m>yv8s+QL<*IOfs>cYtXyP%Bi^eQQgx@mWiJ9Q21Fiy6#n{wk z!TNr!->zq!D`GFr8EGfJ{;0ZH{(*m55jW>Ye^%|doQXubl$$18N{Icji}Y$R@!Ts@ zyz7i{S4a5;{3>{CFaIJ=@7kX^(|u%f=L=8yCHn?#uQTPt_w@Yy6WG%DMaxc>zPz${ zbm5@~n$OSA+-9@c?FelfG1#wk7N++WZjUAK@z2ReA_Izf=b9s*Pl~5*m_7c=n!;uIVO{Kx~S_>Cz|5q?egVI4Ef^w_)q^H z{KRK0Io33%e){RTCtoMbogeF-Fx8t}^pI!mNN;sjPLzaxO?YX~HAn4|&DiFib)aNj zc{I+Qe%tlx=k&n7u7W6ipR5j+HD~L#@2=Jo-_xebai^^wyVe8+a&z;Z&`HPTn2`A*{vxK`tdcx zxAkeG=UE+XuISG(baPnB@;!f9JmktU?~axNchaoZow$GXr59)Jn{`+J^TgAs^_IgI zL?NlWswTP)bp}Hp|F~F$Z7pBm{B-Iu!1;LQ8g!;FeOlndUso2`s@I04!=?9pJm1$v z{uL{Kd~Uh-z5Cgyw=SBFEA}h@gfRJ@nM)VKT?Q(Cg3@oKuA5)5CG!%)ns@F`c-tLS zUicdN{B^sxL{2!Ear^bxn1hX3fVruA5}gh@w+3_yx!)^GduIRiJHp6iys;NrK`&$;Gmk+;`4odMaj1_qKC)J8;*s3%AEA_ z^z3w5*|-CYMDK|cos-P&l5Zay5rytffpsyaeUru*DK#bevnBDuzkHYgP0)r`G{AH@16(p0Tdx-1hy*tb6nCZpCJUb%$*_rSRrY&w9Ppy5{NKyInK8cIwq7 zh1mNI4a>1iADZaYtpf|EJNs*DQJnxi`xbi0DZq3r%f0ewtefcZO zC(M2#eH~s-W{*0Mv20_Hhj~RuxHsR_7qGuRG#M42z$P4>eDJ4jO5)jn*O!O%gm=M9 zcuw{H<6VyP>ef{z=+X{Nuikk0dqcIKo=cj4Z~6%TDUd6lUeJeHIyY|)Ih)f-YhT=2 z(HrWnnIg?e`;pdsu_~rxhz59+i@#iL@@>j&+ceF~-^-6FYEnVQO0ESDwPn)VCLfZl>#K_kCa`Gf z(Dg;Xca%NWqEidOGocfmG!xDPj?NG4Id6(z_oq(-Pur`kqxhS~59!i9`x*aT)ZYz> 
z7u0!){%hO6Etm0cW1ByqrpByJh3D_SgvA!&|>C_!igy{OTot z+qXOp`PTB{#*>m~-O2i$U4?VpEl)flrWNPkje-2hJ)EIDAQC<9cr`+QVFKjyTrVL7 zg~w-kmxL;+!o)+xtBX+k7R{2CB$QcCmfjri60hxEME0QkeFL;>QpR{%4)*pOd+1mP>H{Wj&mag6qwV^HX#u^XFE@yIk z*`%5~uA6{aSbf_Id1(7n`IGE@2gY7*u?8X*9bq;xS-+dB7uCL4D?8HG);2F)k`^{5 z->YN}$YpFRXx{AEczbtq=?YVANA=2!0({JeoBx*1H13XFzCzYwjN9$v#Z8J$auM*e zJ#r%gvicYCzP}K9wQ8QNy%PYFH=S_xnEV-iF6UTm+n!zWJ+V9QzYi`Rldis+=Ir() zxheYJ9zz1)5`8ye&$Bku*H}8n?>-ZOBR1?hy3NI_EW^9!b64H5+{PI@ojaG8nkQsr zC9eEZ(KQRd<@v_j3&)>duoeIKesh1x1&0pMHS+AQU$!vWgSR7dg73XIc-F%rk6M0e z+8c@%{7v**ko9%RUs3ag)}rx|s*L-C|9;n?Ko?F%oEhI1QJP-cdnu_m+Wn2!&wC45)u3Po#^%1orPnI26XnygxtGFPVzDDj2TPgx2+&?`{e_`3a^=iPu zICuQ&)vGGN`&@qcAvyIm=WTPMpxI{FmgV)ODW6o}d3UGk#gx*0$$n-)EK7s+bc85uKaiJ z$n!OmCCBHm=k4Ctd2d$zsG|{*agw){zpqVf{~OyAlJWPR@cHU9X|}|^(!1BMNc+UK zGv5$C%$qc^_r=S{(2e0zcE8Jq^Em#cK3d`A^dfFWaafubAqn6kVejd&WmL-J=)hU zKcw0)b^m3abMN#!E@0e*anG|B+HwkgGUAUL3?5sO73uW|Xho}NHLz_6PD$lD| zof@2SAK#l+)4kaE-^~4s1_?)I1TI*hZml$T6Ca^>lbld2H@J=lIo=61LWkmk(X$ZxoCTYi3_h+YxcFYc%59iPv8eA%#K; zE96>}TxUz>to(1NzsFnmub>1x{!`dG*O=t9hp#1%8iKeF-F#%}cj`RqG6IXQbg@^^RYjYaNtlcftB<%iz}J@cA6Bm2kq;=u07 zZ`|@ay0$MpR{j@MCEK}q=V(jqlieeKiJy31*t0gylhNa-$UZ&>G`}%($F+jZH^)EU z&;GcFS`bSDpZvRvTJAcq^6I9G0eAIx|G-F`eMyE1&6i@lOTMpklsIEQc$dO^%c~3% zoT=WhfWlU{i}&t*k}fuBd9r^&ytZ933pBhJI`)g|MGrtkng#F%FXBxAFzH7&I#|bcBb^z z)-R+}=$MS^J?ZD?XJr*E+Uke-qS`+D#?^Nz)v*!=!XM_cbm*IQ_xh_+Am}z>@nytk z7a$(|`_;*^nuRZZ1=qzn&H^#q+`)nFG8yhdUSW$%%ZuV8&d&r#nwOVYL&dO##jas@ z6FR40C;x4J@wU4xXY|)SIg;bof2}0!{q*i;PVmy$)=TrVxB*4rLa*-9)xAv-iMN+d z_2~IYD%+OU**DN2--()fLVxm9vpeoYtEWHI>{D>-rFhfrcl+i=sAM6+9BPN@@vhg| z0eO4Jxz;`D@XFajL#3@A>KxLQ0{*uvh= z`;wA3dkJ#}Nu#E^G#r^je6ziB_ntlBkCYV$>{{hSxhy+Hle_1?>N&+FbOD7On~{0> z>bBx%A6<@2^_m%Wci_A+Yx`~7_1CS^zOOTW3T}%J@5!ApW_DFW;^|2{dxidxE6J3# z@w`iet{6>6uqp;V@MoLhdyUn**z}YgzJLp;-02mOxiI_EWFMbkckGF!u3Oz(rX;4O zWN+HG?ZM@hKeeHRw~g~37Oh${NI8@_zdys;m$Lr*&P8a;Ubhb+#XBz%)yExt_=VLA z>g@4-y$&Y^>el+RO^=0Bb_Nj`QDpFJbiOti$OK%Tp=1X?I>8*}T 
zDgRNk(Dn~PNT2QQ@V~JHaaj3w5Bb;t$?5zO!{xRGs2SDg$t2R(H&1^pf;J;+H?h%fq@afxo@UGgkU-e@v-so93L)-+AU~r0(YAme8DD>b~8(&G^ue zwG6ZG$S?1JAm(iP!MLJTdv3j)uy^v_5gEVdAKd7XHn!$L@TZlHyMwN7i8=n}l|z*I zXW`D}$z@HBtufa(Zv1i~VF3uY#6%l0w5#+-{>lzrx<=|w2CHKgS8)8}!Uo88{04|H_CZH(LA zk@G*;M%r5Wh$pXi8Ekv~c3sllXFi?W@O4u=3Z1=OytW(Xb?gm3xe#qyv!_V#_d$s1 z^E`yLF(!1=3;GP*D8P!1oxTwJUtv>v@b^_(Nzu(wV}eu%q$6|AyUB~U%}Db|-ah)r ze`gMz>8pS1HTT}7DiR?Le;L2_oX)yn@LcxcQ1%bhR1_3d#F>}ehn_t$KWy*h((2k# zuUE#gUzsjndvp&bGWxdGh-#C+mC-O@LnK(|5>MKg0HD^AyfPzrpK#(eBD(Nvr2cx<0Ta7 z4-om#3A62B571c`|K`$SHizkR;Mnu$igVY0UXTgT|9BH{v{%u{KRvrP1T`@qa>V7N zr50h`pPS$cekU;kA_EYJ%G8Z5a)`=?Wt>fNV&r~euvV(_u`tI)B zQRFgV*`3to2VUL$HFJZm*E#yM_wiFD8JnYamv^b73tQpmUjOX$esXd7;2GZ$X>%W* zNti8rW-(#boaHUNTi;w9<|W)p2LCN?@T#h}PMM89nixg0+AcY?j|-u?5l_eehLy#D8|3ajowH%M0Rn+IFH%l+)+ z5!6raXAea&`21goX~iqo|G|715jJn$r<48t{VlTWhds~G1LLkd_o(s!?K+4My5ArF zFR6V0(ecM0ej;u3uV39Ts^#?FQO49e$0}>?r~&f|N5I7$8W|JIl!xFQ9;K@UwpT<$pp}g2v_$|CZtNBB>6iHSO_=&l zIYxCPDdKnV-tnGa0Mgv>gqF=cmEN*QyKnk>`yVjh*Z7OR$zx_oD>#4Jh_vY~mt-M? 
zwDp5YA3zi59jY6Ze|klCe}Q$?xsofdXooNKPb>4j5QX~oc?jdSVM)IaZaRpTZnDst z%b~&f?vP5iC|wb*W^>DtEfrHBFn!q6siggr&Mcl;uP}-PzqJ18)XNAtB@D8S*4{qRfNPjZq^WdUC z=U$y%=lyW&4n10Sta+r@V|$P!-#E)=UEgnnkCF$KCCp219NW3$Yq)Xoy4WP2-V<5% z!XjJ|RyCHiUUmL52182cY^lD{e)xs2S?iwN6R7R$`W1R7>~Z!OLtFm)t?8TR-p#Bh zrxw0f(sr=96(_drf7IrfKEmk|dHJgAspu?rEa~##+!OPnm!Fz9yJq=n?dl!>dRCv} zoM`JdZ|iqEesUz{so5|$<|l0&Ddgae)J8!`{QlFALgQ8^rQRlEgEo9q+@6ZR&DAlq z2b`9#0YWm#>oYS`?_ViiE(@XTovdj344&T-bXPz9XUCB*&YPbfsqwr|Zgue;%rj-%n*l+E?zWliJH?tkG!Xs@|>`f>XrAGc9hfFc+&uXj9LCw;$!l%tUHkF@w$5_xg*jyAS>kH{ih4_1_r6k zyEZs3&q(NuJJCpOvt^`p9=|opFJWKRw0+{u4}-|ZcZJPfKH}OSw-DJ+%^2M zG1z+}$uMy%e#zpHx3O<7Putm0S#nXg<6E)U-#LHxUYOF8`{ZMrIrq*AtbRe}LP`T_ zGe39q>ZJ9LbGGypj+Ymt{&)A{==_wuPghjkXqO6wLKq#l33uqwq0L`22{mzEle$U< zh)d^FM?(^M?&{Ux$qaAN!7m%8hoqK5IGBFIW`V-F;h6KW&Pg*#t@_O|n z|GcWIz_jPW0`*|RyhG#*aXE=6PCov&$ZgYc&vA&4!Dsl`*jDrFKUS^JGRV?Pt&ZAe zsNNnfBo?ZRUcKHPzpJF|8dhSwNvWAqPc?_W!#;t-od#=ZR6H#Q^ipTQ;nN%2K;+c(1Bo@`dhGF`@Cjv{{E#rcne$gs=HqPyS5^h7L4 zJ*f_7Tou+u4M;CGf0rD7@o4#u?Rob)_#?jL*1wGCDEkJ-As!B1a6PN9^|f8O{q0rv zWa5e|yi|b9bXSX(99T8;$sF&$?HQ;e?7Eu!F;~6byIvD@`8=68`{zp6n#BXoKHR5s zp6rY6qc(LaS9G^TP7bV#V4q{Z%DrD?*f+i_z-d;-LWyIuv15-#xFvJO(NJ=7#^9ro z0ve4L0hp9xkC3^0?!T{!ez~*j+>rL6t@>pBg0Fbhwb#f0I7k)d7;J6dG^p`VNYewK zj=9zRqQ2K1Fa0G;TD>w;mL#MZ%09d2 z!R%d~H2Q)OX@fqv9b@75PTj~=Fy)_2DJfB@e-3J&-cDRTd-}tXmn|oayRqwj->F<@ zr_@ir>VM!+zc8Ueq?w)vD>{QZn{{PBZo$@yfI@g!qVdvUeB!-n6Xt-4RqK%BayQPM zu_iG&d*Q~bg_RyJDk~ptd6QFTc2sv&igytGyLuP3k%HkXyV~9^KgKE;ThnpH|95gw zkNc;smvlcF(d~ zvc*#^i7%XPa=*_TYb{x{c-r^=bv zs}wjj)aZ0NF=mDwve)xTr@fn| z1Oz93egDRLQ{jh29V{)kM)i#Z3%tYl@5LO}^cU>-_ zyM3T|_TF(tY?bjPk_$W)WPdFGKnm4Dv2U3$PsF}ZJu8S2MG{7gbOxHkAp-YrwfMr3+U zEt)#Ta9$PO6b~E-iO;isd-*Wh&+sAR#lGcDI||)VA3nbuql+H!Yr98TH~nKR%sgev z+ou)to~Nvy{`>QxI}<=>??bU$(>{&-ukH}G?|PaY+rT(hlGZr3niNf#`|C%N>h<=0 zi(Z@Xn4Y2Vz*Q^61+~{+%<7F*m?uR2cpup7L@qCQutqw&^UVt{*q^x05zC;E;*Y+y z9VI(Q{j1I$4>%$l1f-dd4+~0sim*n5;rcZ9kq~v5`-}e&Z$))HiY#ou-{b0%7T4*n 
z`Qs%>*P1`Sa`hn}eg3N3Dh}m~dNxcdM|@v;DSxHgQE$KB7f)QsiAy&R3}mi(&N#cZ z_q%u7+)HCxADoU{6`g$Jc6QMfBHY6?HDHl5&>l$1yz#eOkMMT74&AN#PVpdAbOd4! z@$S@}s$6Q*2oHk;lP)K=vaW7+qXv7gn!khi>d{v5aj5e1(lM~i&+Qiz6KcC=@IFQb zRTa*PU9>a-x?lHt``dLnp62-4sp+x1UCGDIYij7~tR>e2*3Nk=yZU-xRba>=R~h> zI~&(rMVNGZ;*#PWxHZ*|5m)cu9M|DnIN?%^Hy`_{y%#s&Kfb#8di`h+Ii#=AC5aL zfXsj0(@TD!*xBM%d_HAKBG8VB8^e|2UO^MCV@h*?{n^pXW4FI`MV||VAHAq}xV<2z zO)<^03+WaelQVMCC)U|M!hBTkSa-=l829OX!`H=AN)|1(b@buqyLUxjD)jSQ51urMJByyk0!7ez9`7&I{QYmlZei&dQ_7?HH3h4--1t!e z2floNGh7>wsvuqVoBrIZ=vFPQIT6Pt?+nJY$IBj4AFmwqpXV%(pPycke|>IR`p$TM z%-^urOD>gMAvT*#6NUY`+$q-{c*tE1>LbOJ58SAIIMkh{gP``%H?||p@v!QIm=R`$ zTZYKV;av7IRcgcL88@fxDX`AS6a6u*-`DI$y<^$92-_z542&Eg;m&4mYaJ`YO|1vRf|!q616GY2v=$ZXR2-=WTXy|(sw@7PJ2k`?=4^{-WD zXWcwda%A6j<5k|os+vuAcUF&##e5reVD9?eiXEOts^73l#6z4!f@7d>AdOSdH~nyfFw%4Aha`1#^bgE3=M>mI@0E=lzFD%iR$`rf@at&a-byYFTu zM7Tzb%_9UipZ1tKGpskKVOsz8Y0gI~7mxUO_FrbkvkNV$2WO4L-szb7^Sk&E`I^js zsAkol6O9Eiy^GiU`#%9y8LH-dm_L8M#Y~IbjygFxRbnjOj5p6@)YjEI7&aGZbw%5z z88rE3?lhwxj?q3p;Lbmpsl!`(jb=Y&K-Ow#AV(uMLU!WLub?c)`cmdz7&xW2^NEI` zF;4&*)^QiPH+2)4N?W7Rkm0@gKoc}886m^-Yz7SN&Ri)^`g&WsDbBAsK&w-J z`K3W$8^#?}%xS-T;TW?V^(_fnV<5x+p;n%D&%ren+9pv~R#j6DH99r8IJp8^2-^8X zk^kaO^GI4U*)X(blZu=_UikRuZ_|C?g2sf5_><9@78#J)1sd8h(Q{(;cehQoAssEz zwrtju7nG4qT<~AeA^nt*!>DYpXy} zc8|COUC%vd^w?-k!~l8(4b1Bq@Ng`1X~hJMV^%6NWm@DwW;1BC9TQDVckiiJ(V1%; z3EE$W_m*8vJ?nCrY!@at);4uViJeaTI7|DF z>Z#N}k5$l^u0MDJGeuHUK`Y41h8>$%TOCA@Wl&wy;CR;*iDqxPf8-!(b;`RE%^mJ=1 zX!aI3o;6C5dBts#4rNG7k|FbiclI9 z{TTF*cm&oQc^9;G4TTx-8g$SvsyGlRCS;-B5iotwTjW}3T0-3$nw2p7&NnAMGDRFY z5;j!EzT9{5J-(1hhD4(y&WP=jKo%7hwLU#P{qM}q*elTr3JQJ_i^Wf?HFa?PHY=*F zQJVo<>v7Y4mO0faf1G{%Z?@*BcNA!r^klUEKDz5*1Q5FGygF+0&vvghpB zv7_|Bfdgg%y4D6GBA_yP#pDYc@wnyy4bzRQ)#}Kanwq5sgTW2&b1wTZoXu{C9Afry zVAP|4K_1Hwz5%s54VgX@M=UpaV7N*>$Xe|88$00A{Sy`+k57`}S`F817)tlkg|CP^ zP;)mXWOy$=&{&YsnHHHEe*w^V(-v6Tby5^H4FgWl#>s3lnQW=MyZf*7XQid3NqW7W zrDww82)Vhrc8`~{RImKuZ^!;dpC|;HwBnDCq@}xA&Q`Xp(PaJVVN&ayBWnyE1hjLx 
zC*iDu9}rxBVHEeNjNjo@`XNZ@^f>ftKbTB!=sSiIr(rHy*V20E@vdX zWtIbl2^tL<%g2AtoH?2W3l^}l3)%^qP<2-!S8mqvXpWG{WC&W%pMU-t_V3@{=hHJTj(W6N~F9zWHxd*|x9@B`Q8tyjtq<#x`j8RUF9!#DSQBT$CY_!d>P+hAt+79vET%!b5XO4h)PS$Fu~vS@`5uG?I!^LOE;=EuSGy>j9F=X zub08kJrKS*zLey=!1zp@HZj>9PJb9xek<=1?7zC%^gLY121Jd55wR2D*K1ozz6)l< zSWa~5#@I}Z(R5_vX#CAI5_m6q8D<6<>dY*_!{^!|v>9-SL}P-Mnwkn9urkd~K9qRLO+{S7YM`^%Ahk!uNs#Bo5QwFScj zAv1y|804$bXofM}ZymTsV<~uoiAz#+UQV zlI@b)QOvxkH`KwFGYz%H@fge(=I!DShi?1?m}sR}`ysIY*kUNGY>2VslVr^6qZ{{Q z-Jd04bf~_-oM?2eGOGh-#{pJC;^>Vc`YB?YK^x}Z|WU;R-zsKsfyUY{D_Iwc{{bQk`rVNhU+-;VG*cxQ?1?jLkj7rC>K8|kI zLgLU!-oAZ%l}@J{L?9b5V1VJ&sZ+FBi8s??TLMigmG(Sz=n!n#!s5iHaYV>sA_E}M zt?ZXxzQ1nn;9F^m^=}GjwnAo8CoLSrg$(n^q9Ghp8zf))59>@!^b+jZ9zHtoio-+a zIJQgT*U!}kj2SbgDm69Loe#45^$SmdrnUl_mzUQ$gTWBJVa*~qdBHpxustu*T~?Rj z=~|oV;ar>V>#i+`?>{UdtbKTI=2{M~$EUDmO0?F34Et7}Gqyc!Jh_78nZ<^iidTny zNNQ1`;oQ-UtMO;fH1xeDPMioRydonb$C;)}u31R{G?VFv({Z~t0|Ha0&_;u`M$>Aw zqg5)^B5mIO{%bZJhfCM(w{O}r=$)ei$>9|hWjWO<74cu%F!LnI@N27C$)5Z19c#&- z4E@V68eON?kzFK)(P2J+PZuAvN+V6}2Yo)D#-9rs4cQMr{BYN_@aGAMGm~h1#4*+)C+Ho>oAqELQAWO6DLl?>C$Gi5_fuQ zP0&0&J?FW(xfLR0*q96Uy@#&B@5gR6yHxw}!^U~7Nf)+1c%(h)xGAp$e(4;GT?JwCGott48t`U`3D-v=_Gj8D#g*i3P(${+S=N=))c%_sT9+@tHd{F zBr!Aj@Tnlu(hJ~dN?PTWo4J}AE#>d1Uo%36jox+sw?T%vFyg~T1#`xe3mS~LL}rD4IWJ}>rKh@Q;cYy(WtOi^%D zj~+cXUAb~)8m1|ztgKwfyttj9r2+t7ksrHuoz_63A;Ypaw$?tvXW|}vE+ZiDmeM`Q z;QjD)%E`Uc{oGQhR%_wDqSWe|jtLoF`_G(Hr_Ep1B%|C`Ng2QRaP*9NrW4b>_j-hJ+VAK;jM`FP+bnkAChRWFIyS=@db}|)^Xc(9I z0nU)r{Q2_@STB0^?Adt>7cOL_1F|I1xG-3VYQoDn3^Y26kfj1%mLwV$31yZTrYF71_0lpG&5ABfw z7zXnG`_P*vzkWL3D^@YVNTiJ?mzy;Ccpf?91#R4V-z-Hr8y4P|0l6Es^O z!&ejJ849r>eMqqfEcJZkW-??I-(q=6V5u5RfQLr(xg(dGik#9v228(>ux>uHns{iS z?Gh{f(q(lp4LcUFU(RsJcJ!3V&9uk|T4G`%?AWmbFld~@X>$Gg^+BB4N^Kgnf`WoK z#A5Mk&{cuHv>_C&wLr5CG9Y!uF}euXN*(+(mX^M!*Xut7LmlYLP5}bfT2rT`0$$7)jiqQX zwfa{_7ZIN^jJf>kuf-;S(CcISKl}llxce*o$tXfko|6)TOyW!= zBC}zboy^Gv3ha}h@Bjb@|4BqaR4cK}Li04COp8p==#V({m_Zc=Rv{q;7Z5Upjcx+I z+m`&sSQ5})aO^$4xVU&80Mu72HR!=mc@+#AWs9%TEEVwLzGhs|<9ZAm`~SUQ&WL5i 
zt993fuT7_6Aqfn1Tm8qvHjytDJ7z7@Vopv@%8(&LV)0lyB+gW#1fBke3>nhkvn)^r ztSw12EUO|Gi#KB0D$|cclLu7AxeOQOa@J1L5xY*lR8)>o#?opFTx(%)3Ek+8z`iR# zSWRQlJaWtYs@zPA+!-${EbO_QoSYxS!ouL}*|S8FnIP-mzyAlx$;l`k%rY%<-L2ag zXe4Ti&iF_ViwF#dJ&M<^(%oMRU`)qU9sJg_b+Sk-*}`gk>p48YmR+YKAZsZ=9C05T zqLql-K3(1tA;TP)bV-c8nVFf>1(a+;xnnW1l+VYvMW;^g?w8i5iA18q-rnA;5VE?w zouJG77&Pvqhu6CFGAlaPSkU&?4IX~0uASNS9PvWi;x%lDRzkl!%vQcwp-_a=nNZM2 z2BjDN*T;gA&1#Zj&Y-j^XxJAtK{+`&?e+8X!-bBB?w_ivWOz+wej?Q5fi9%pZILkR(Mo8>NH%tT7{*`HkkLkR{0?`Nsg|_4SS5}Q@}S4T zR`+as7x(raZ^;xUm&>u(R{#J+;aI6us*xIvhAn{%I@~xM_NcqE3RHIhYVQMRiU8^= zpxk3pqF(GXoT4NE_$QJAC3aD{N-E#jc;ok}#j27KZIla#cFR(<(v&Gv?jAdKEZTTO z!WRu0bILP$W3&~}tay9uk;~<*3^TUcTyX%f)U{r(-+=2m{4=fR=xLVodie6GiA17| zqT1X~x1Ifbt)XE|9r|M1(#3?Lm9n$5VdTh>P*Fj4&lvDYvMth(G1p}noe@DlR4ajI z2971r?8!c4@3C^Zd=@51H2#tCQYw|}DT;cIlj13L`?b$jRBFne@+_YHNf9v&W>aO{|n(f&Vp z90G=*nVO{+Y)=Rs7aqDE507sWZ)#CTfyUQod!|_&7$MMTb0AR8~`1Z|OHSZ|8vEaPQdZ6eVJnU{=*uq~0#R$Z~TQI+pL+ zP9kUL&H=rrOGHwx!yyz_=j_QTzl-i#rWWQE9T(slW|m4F`!RXo-&ximA0MA1rKP2W znMso-!KzjDIXrNjnULX@lUa=fr%YC0V{?IKFF5udF9c6ZfM$b)=gqq-wBg~C@b)B+ zws-ks!7*aBTrQ_KO-um=XiDnZwJR@V_@U|Gz`($37cX9XfxC-*5I_R}<^Yg~ZD!Dz zyK|Z=d(=I=pxGl?TLU!Xj}(_#w8{d_k{8m_(j>;2={vbxKAoL0xL$!~@^Cf!n9Y^T z<*bZ_4+3cRNGW@cwR5f5)K z@Eq2H(;5%}U>1ufwCm(b65|U)={b$tKRYD}EzNDn2rQd%0Ez^*Ui#)T!llCqoZ3PW`Ds-%! 
zpxGlG>^aun-LFF>3G+=h7ic_^5`XkK>x47!@9OIMRdsdsKllWepd@1)j2c^%V6;~T z?l$ozNUfozmE5P-nIz*h?j3f2h$ zW(#C^3~mOv3q*tsKf`H=G`8JL1z4zW5-MxGF*TrHn%jYgNHTB~qOO_mD9^0{FNB2dG7Oi%6c1Am?sj1cK zyNeeuPP}vHPQ5Fyv8`LTZedVR5LQk$HAX*J$;a%O?u}C?zAILYq>w`@j`iKUcSo;W zxiWpU;IY$g^U%gST?$Z%ibR|;&DKeb8>LR2I(4XAE@vqn6$(Y_ z)~#F3)^vUP^qDD>$=WmTS65f3yLj>9X{}b(`IyYjSgQFK^qn%}nai$45m)1t%mVOixQolNqJn8J$kY zl6v&95{U$V!9D3;a8_2J9S{%@il3F1mgc9WrQxyMQ2aKFP+VMG z)wOHaE?XOqPMtcrhlGS^_U+rZDLp-%J08sG&m;|-^3eYW+pA(H(KMam00000NkvXX Hu0mjf-7_Wm literal 0 HcmV?d00001 diff --git a/docs/imgs/logo.png b/docs/pages/img/logo.png similarity index 100% rename from docs/imgs/logo.png rename to docs/pages/img/logo.png diff --git a/docs/index.md b/docs/pages/index.md similarity index 100% rename from docs/index.md rename to docs/pages/index.md diff --git a/docs/manipulations.md b/docs/pages/manipulations.md similarity index 100% rename from docs/manipulations.md rename to docs/pages/manipulations.md diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..b27f8dd --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,6 @@ +jinja2<3.1.0 +mkdocs==1.2.3 +mkdocs-bootswatch +mkdocs-material +mkdocs-rtd-dropdown +pymdown-extensions diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index e9fa675..0000000 --- a/mkdocs.yml +++ /dev/null @@ -1,31 +0,0 @@ -site_name: "Codext - Extension of native codecs for Python" -repo_url: https://github.com/dhondta/python-codext -site_author: dhondta -docs_dir: docs -nav: - - Introduction: index.md - - Features: features.md - - 'Guess mode': guessing.md - - Encodings: - - Base: enc/base.md - - Binary: enc/binary.md - - Common: enc/common.md - - Compressions: enc/compressions.md - - Cryptography: enc/crypto.md - - Hashing: enc/hashing.md - - Languages: enc/languages.md - - Others: enc/others.md - - Steganography: enc/stegano.md - - 'String manipulations': manipulations.md - - 'CLI tool': cli.md - - 'Create your codec': howto.md -extra: - 
mailto: alexandre.dhondt@gmail.com -theme: readthedocs -extra_javascript: - - js/collapsible-navbar.js -use_directory_urls: false -markdown_extensions: - - toc: - permalink: true - - admonition From f28816667d78e2dfaa76996730ffe6ff74843561 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 13:53:28 +0100 Subject: [PATCH 07/62] Removed Travis CI config --- .travis.yml | 86 ----------------------------------------------------- 1 file changed, 86 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 39ff698..0000000 --- a/.travis.yml +++ /dev/null @@ -1,86 +0,0 @@ -language: python -jobs: - allow_failures: - - arch: arm64 - - os: osx - - python: nightly - fast_finish: true - include: - - python: 2.7 - - python: 3.6 - - python: 3.7 - - python: 3.8 - - python: 3.9 - - python: nightly - - os: windows - language: shell - before_install: choco install python2 --version 2.7.18 - env: PATH=/c/Python27:/c/Python27/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.6.8 - env: PATH=/c/Python36:/c/Python36/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.7.6 - env: PATH=/c/Python37:/c/Python37/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.8.1 - env: PATH=/c/Python38:/c/Python38/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.9.0 - env: PATH=/c/Python39:/c/Python39/Scripts:$PATH - - python: 2.7 - arch: arm64 - - python: 3.6 - arch: arm64 - - python: 3.7 - arch: arm64 - dist: focal - - python: 3.8 - arch: arm64 - - python: 3.9 - arch: arm64 - - python: nightly - arch: arm64 - - os: osx - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=2.7.18 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - language: shell - env: - - 
PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.6.8 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - osx_image: xcode11.3 - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.7.6 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - osx_image: xcode11.3 - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.8.1 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - osx_image: xcode11.3 - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.9.0 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION -cache: pip -install: - - python -m pip install --upgrade pip - - pip install pytest pytest-cov coveralls markdown2 six . -script: pytest --cov=codext --cov-report=term-missing tests -after_success: coveralls From 8508e2882ab6ce90d69ff4342511e5a7a32917df Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 13:53:40 +0100 Subject: [PATCH 08/62] Added GitHub Actions --- .github/workflows/pypi-publish.yml | 37 +++++++++++++ .github/workflows/python-package.yml | 79 ++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 .github/workflows/pypi-publish.yml create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml new file mode 100644 index 0000000..392e026 --- /dev/null +++ b/.github/workflows/pypi-publish.yml @@ -0,0 +1,37 @@ +# This workflow will deploy the Python package to PyPi.org + +name: deploy + +env: + package: codext + +on: + push: + branches: + - main + paths: + - '**/VERSION.txt' + workflow_run: + workflows: ["build"] + types: [completed] + +jobs: + deploy: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'success' }} + steps: + - uses: actions/checkout@v3 + with: + 
fetch-depth: 0 + - name: Cleanup README + run: | + sed -ri 's/^(##*)\s*:.*:\s*/\1 /g' README.md + awk '{if (match($0,"## Supporters")) exit; print}' README.md > README + mv -f README README.md + - run: python3 -m pip install --upgrade build && python3 -m build + - name: Upload ${{ env.package }} to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + verbose: true + verify_metadata: false diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..9010fab --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,79 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: build + +env: + package: codext + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest pytest-cov coverage + pip install -r requirements.txt + pip install . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test ${{ env.package }} with pytest + run: | + pytest --cov=$package + coverage: + needs: build + runs-on: ubuntu-latest + env: + cov_badge_path: docs/coverage.svg + steps: + - uses: actions/checkout@v3 + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install pytest pytest-cov + pip install -r requirements.txt + pip install . + - name: Make coverage badge for ${{ env.package }} + run: | + pip install genbadge[coverage] + pytest --cov=$package --cov-report=xml + genbadge coverage -i coverage.xml -o $cov_badge_path + - name: Verify Changed files + uses: tj-actions/verify-changed-files@v12 + id: changed_files + with: + files: ${{ env.cov_badge_path }} + - name: Commit files + if: steps.changed_files.outputs.files_changed == 'true' + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add $cov_badge_path + git commit -m "Updated coverage.svg" + - name: Push changes + if: steps.changed_files.outputs.files_changed == 'true' + uses: ad-m/github-push-action@master + with: + github_token: ${{ secrets.github_token }} + branch: ${{ github.ref }} From fb303c91a17b91d13b0d6449026c7661ad593807 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 14:15:23 +0100 Subject: [PATCH 09/62] Fixed issue with md4 --- src/codext/__common__.py | 5 +++++ src/codext/hashing/md.py | 3 ++- tests/test_manual.py | 3 ++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/codext/__common__.py b/src/codext/__common__.py index 9d9400c..d88dcbe 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -35,6 +35,11 @@ from importlib import reload except ImportError: pass +try: # from Python 3.11, it seems that 'sre_parse' is not bound to 're' anymore + re.sre_parse +except AttributeError: + import sre_parse as __sre_parse + re.sre_parse = 
__sre_parse __all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", diff --git a/src/codext/hashing/md.py b/src/codext/hashing/md.py index 181d85c..6463722 100644 --- a/src/codext/hashing/md.py +++ b/src/codext/hashing/md.py @@ -55,6 +55,7 @@ def md2(data): add("md2", lambda s, error="strict": (md2(s), len(s)), guess=None) -add("md4", lambda s, error="strict": (hashlib.new("md4", b(s)).hexdigest(), len(s)), guess=None) add("md5", lambda s, error="strict": (hashlib.new("md5", b(s)).hexdigest(), len(s)), guess=None) +if "md4" in hashlib.algorithms_available: + add("md4", lambda s, error="strict": (hashlib.new("md4", b(s)).hexdigest(), len(s)), guess=None) diff --git a/tests/test_manual.py b/tests/test_manual.py index 64b1843..6a1d09f 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -3,6 +3,7 @@ """Manual codec tests. """ +import hashlib import os import random from six import binary_type, string_types @@ -105,7 +106,7 @@ def test_codec_dummy_str_manips(self): def test_codec_hash_functions(self): STR = b"This is a test string!" 
- for h in ["adler32", "md2", "md4", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: + for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: self.assertIsNotNone(codecs.encode(STR, h)) self.assertRaises(NotImplementedError, codecs.decode, STR, h) if PY3: From c46912fcfd22ebcdde268dbdf7713052b9dec74b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 12 Feb 2023 13:23:12 +0000 Subject: [PATCH 10/62] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index bde433b..78f9f98 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.53%coverage99.53% \ No newline at end of file +coverage: 99.03%coverage99.03% \ No newline at end of file From 5438e5363500b5665d1ce646fc54e1879410300e Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 14:29:05 +0100 Subject: [PATCH 11/62] New release --- pyproject.toml | 4 ++-- src/codext/VERSION.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ce377f3..099d04b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,9 +13,9 @@ name = "codext" authors = [ {name="Alexandre D'Hondt", email="alexandre.dhondt@gmail.com"}, ] -description = "Library for producing ASCII arts from a text or an image" +description = "Native codecs extension" license = {file = "LICENSE"} -keywords = ["python", "development", "programming", "ascii-art", "banner-generator", "quote-generator", "cowsay"] +keywords = ["python", "development", "programming", "codecs", "encodings"] requires-python = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4" classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 850e742..a4cc557 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.14.0 +1.14.2 From 
8a902fa6aa1768e33c649d51633703087e486e39 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 15 Feb 2023 15:30:17 +0100 Subject: [PATCH 12/62] Updated docs requirements --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index b27f8dd..a4427bc 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ jinja2<3.1.0 -mkdocs==1.2.3 +mkdocs>=1.3.0 mkdocs-bootswatch mkdocs-material mkdocs-rtd-dropdown From 3190f8ed91d16d8ba6582477cf8c2e8b930fe9ba Mon Sep 17 00:00:00 2001 From: dhondta Date: Fri, 28 Apr 2023 00:19:21 +0200 Subject: [PATCH 13/62] Dropped support for Python2 + Applied minor changes --- .coveragerc | 51 +- .github/workflows/python-package.yml | 158 +- docs/pages/cli.md | 366 ++-- docs/pages/enc/base.md | 346 ++- docs/pages/enc/binary.md | 334 ++- docs/pages/enc/common.md | 140 +- docs/pages/enc/compressions.md | 2 - docs/pages/enc/crypto.md | 410 ++-- docs/pages/enc/hashing.md | 2 - docs/pages/enc/languages.md | 396 ++-- docs/pages/enc/stegano.md | 244 +-- docs/pages/enc/web.md | 78 +- docs/pages/features.md | 674 +++--- docs/pages/guessing.md | 342 ++- docs/pages/howto.md | 482 ++-- docs/pages/index.md | 20 +- docs/pages/manipulations.md | 149 +- pyproject.toml | 10 +- pytest.ini | 2 + src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 3037 +++++++++++++------------- src/codext/__init__.py | 512 ++--- src/codext/base/_base.py | 581 +++-- src/codext/base/base100.py | 103 +- src/codext/base/base122.py | 204 +- src/codext/base/base85.py | 371 ++-- src/codext/binary/baudot.py | 576 +++-- src/codext/binary/rotate.py | 103 +- src/codext/common/cases.py | 5 +- src/codext/compressions/pkzip.py | 111 +- src/codext/crypto/railfence.py | 192 +- src/codext/hashing/blake.py | 22 +- src/codext/hashing/crypt.py | 4 +- src/codext/hashing/md.py | 4 +- src/codext/hashing/sha.py | 23 +- src/codext/hashing/shake.py | 22 +- src/codext/languages/braille.py | 67 +- 
src/codext/languages/galactic.py | 5 +- src/codext/languages/tap.py | 77 +- src/codext/others/uuencode.py | 2 +- src/codext/stegano/hexagram.py | 76 +- src/codext/web/html.py | 580 +++-- tests/test_base.py | 471 ++-- tests/test_common.py | 493 ++--- tests/test_generated.py | 297 +-- tests/test_manual.py | 340 ++- 46 files changed, 6200 insertions(+), 6286 deletions(-) create mode 100644 pytest.ini diff --git a/.coveragerc b/.coveragerc index 4ccc970..b677975 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,27 +1,24 @@ -[run] -source = codext -omit = - codext/__info__.py - codext/**/__init__.py - -[report] -exclude_lines = - pragma: no cover - if.*?__name__.*?==.*?.__main__.: - def main\(\)\: - def __stdin_pipe\(\)\: - for line in __stdin_pipe\(\)\: - def __format_list\(items, include\=True\)\: - def __print_tabular\(lst, space\=4\)\: - except ImportError: - except NameError: - raise NotImplementedError - if not PY3 - if PY3 - def encode\(self, input, final\=False\)\: - def decode\(self, input, final\=False\)\: - def _detect\(text\)\: - def _lang\(lang\)\: - if stopfunc\.LANG_BACKEND\: - def _validate\(stop_function, lang_backend\=\"none\"\)\: - except KeyboardInterrupt\: +[run] +source = codext +omit = + src/codext/__info__.py + src/codext/**/__init__.py + +[report] +exclude_lines = + pragma: no cover + if.*?__name__.*?==.*?.__main__.: + def main\(\)\: + def __stdin_pipe\(\)\: + for line in __stdin_pipe\(\)\: + def __format_list\(items, include\=True\)\: + def __print_tabular\(lst, space\=4\)\: + except ImportError: + except NameError: + raise NotImplementedError + def _detect\(text\)\: + def _lang\(lang\)\: + if stopfunc\.LANG_BACKEND\: + def _validate\(stop_function, lang_backend\=\"none\"\)\: + except KeyboardInterrupt\: + if alt and len\(t\) \% 2 \=\= 1\: diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 9010fab..62476a7 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ 
-1,79 +1,79 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - -name: build - -env: - package: codext - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: Install ${{ env.package }} - run: | - python -m pip install --upgrade pip - python -m pip install flake8 pytest pytest-cov coverage - pip install -r requirements.txt - pip install . - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test ${{ env.package }} with pytest - run: | - pytest --cov=$package - coverage: - needs: build - runs-on: ubuntu-latest - env: - cov_badge_path: docs/coverage.svg - steps: - - uses: actions/checkout@v3 - - name: Install ${{ env.package }} - run: | - python -m pip install --upgrade pip - python -m pip install pytest pytest-cov - pip install -r requirements.txt - pip install . 
- - name: Make coverage badge for ${{ env.package }} - run: | - pip install genbadge[coverage] - pytest --cov=$package --cov-report=xml - genbadge coverage -i coverage.xml -o $cov_badge_path - - name: Verify Changed files - uses: tj-actions/verify-changed-files@v12 - id: changed_files - with: - files: ${{ env.cov_badge_path }} - - name: Commit files - if: steps.changed_files.outputs.files_changed == 'true' - run: | - git config --local user.email "github-actions[bot]@users.noreply.github.com" - git config --local user.name "github-actions[bot]" - git add $cov_badge_path - git commit -m "Updated coverage.svg" - - name: Push changes - if: steps.changed_files.outputs.files_changed == 'true' - uses: ad-m/github-push-action@master - with: - github_token: ${{ secrets.github_token }} - branch: ${{ github.ref }} +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: build + +env: + package: codext + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest pytest-cov pytest-pythonpath coverage + pip install -r requirements.txt + pip install . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test ${{ env.package }} with pytest + run: | + pytest --cov=$package + coverage: + needs: build + runs-on: ubuntu-latest + env: + cov_badge_path: docs/coverage.svg + steps: + - uses: actions/checkout@v3 + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install pytest pytest-cov pytest-pythonpath + pip install -r requirements.txt + pip install . + - name: Make coverage badge for ${{ env.package }} + run: | + pip install genbadge[coverage] + pytest --cov=$package --cov-report=xml + genbadge coverage -i coverage.xml -o $cov_badge_path + - name: Verify Changed files + uses: tj-actions/verify-changed-files@v12 + id: changed_files + with: + files: ${{ env.cov_badge_path }} + - name: Commit files + if: steps.changed_files.outputs.files_changed == 'true' + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add $cov_badge_path + git commit -m "Updated coverage.svg" + - name: Push changes + if: steps.changed_files.outputs.files_changed == 'true' + uses: ad-m/github-push-action@master + with: + github_token: ${{ secrets.github_token }} + branch: ${{ github.ref }} diff --git a/docs/pages/cli.md b/docs/pages/cli.md index 111913c..4b22cd4 100644 --- a/docs/pages/cli.md +++ b/docs/pages/cli.md @@ -1,184 +1,182 @@ -## CLI Tool - -`codext` has a Command-Line Interface tool. - ------ - -### Using Codext from the terminal - -The help message describes everything to know: - -```sh -usage: codext [-h] [-i INFILE] [-o OUTFILE] [-s] {encode,decode,guess,search} ... - -Codecs Extension (CodExt) 1.8.1 - -Author : Alexandre D'Hondt (alexandre.dhondt@gmail.com) -Copyright: © 2019-2021 A. 
D'Hondt -License : GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html) -Source : https://github.com/dhondta/python-codext - -This tool allows to encode/decode input strings/files with an extended set of codecs. - -positional arguments: - {encode,decode,guess,search} - command to be executed - encode encode input using the specified codecs - decode decode input using the specified codecs - guess try guessing the decoding codecs - search search for codecs - -optional arguments: - -h, --help show this help message and exit - -i INFILE, --input-file INFILE - input file (if none, take stdin as input) - -o OUTFILE, --output-file OUTFILE - output file (if none, display result to stdout) - -s, --strip-newlines strip newlines from input - -usage examples: -- codext search bitcoin -- codext decode base32 -i file.b32 -- codext encode morse < to_be_encoded.txt -- echo "test" | codext encode base100 -- echo -en "test" | codext encode braille -o test.braille -- codext encode base64 < to_be_encoded.txt > text.b64 -- echo -en "test" | codext encode base64 | codext encode base32 -- echo -en "mrdvm6teie6t2cq=" | codext encode upper | codext decode base32 | codext decode base64 -- echo -en "test" | codext encode upper reverse base32 | codext decode base32 reverse lower -- echo -en "test" | codext encode upper reverse base32 base64 morse -- echo -en "test" | codext encode base64 gzip | codext guess -- echo -en "test" | codext encode base64 gzip | codext guess gzip -c base -``` - -!!! note "Input/output" - - STDIN can be used as shown in an example from the help message, like when using the common Linux tool `base64`. - - Unless an output file is specified, the result is displayed in STDOUT. - -!!! note "Encodings chaining" - - Encodings can be chained as shown in the last examples of the help message. This can be practical for quickly manipulating data. 
- -### Execution examples - -**Scenario 1**: 2-stages encoded flag - -Creating the payload: - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 -pwTDSWRUbXTuMQs5EDgKpjgW8MiJVw1 -``` - -From this point, the only thing we know is that we are searching for "*flag*" (with eventually other characters, i.e. leetspeak). - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag -Codecs: base58, rotate-3 -A somewhat weird F1@9 ! -``` - -Executing the previous command will take a few tens of seconds. With few stages to be guessed, using the scoring heuristic can be far quicker to get to the right output. The following takes less than a second. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag --heuristic -Codecs: base58, rotate-3 -A somewhat weird F1@9 ! -``` - -**Scenario 2**: Multi-stage-encoded flag - -Creating the payload: - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse -.... -.-- --.- --. -- ....- - -.- -- . ..... -..- --. ..--- .-.. .. . .- ..... .-- -.-. ..... -.. --- -. --.- --.- . --. -- .-. --... ..-. ..- --.- -.-. -- -...- -...- -...- -``` - -When looking at the string, it is easy to figure out it is morse. The problem, at this point, is that this codec is case-insensitive and always returns lowercase characters, as shown hereafter. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse -hyqgm4tkme5xg2liea5wc5donqqegmr7fuqcm=== -``` - -In order to get it guessed as Base32, it is necessary to put it back to uppercase (in other words, decode from lowercase). - -```session -$ echo "A somewhat weird F1@9 !" 
| codext encode barbie-1 base32 morse | codext decode morse lowercase -HYQGM4TKME5XG2LIEA5WC5DONQQEGMR7FUQCM=== -``` - -Now that we know we are searching for something with "*flag*" (with eventually other characters), we can use the predefined "`flag`" stop function. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase | codext guess -f flag -Codecs: base32, barbie -A somewhat weird F1@9 ! -``` - -**Scenario 3**: Base-encoded rotated shifted secret (English) message - -Creating the payload: - -```session -$ echo "My super secret string" | codext encode shift-1 rotate-2 base58 base64 -NDNxaFdieXh0Z29XOVZpWWpjRGNpRWgyZE44Z2FNU0g= -``` - -First, we shall simplify as much as possible ; we can easily guess that Base64 was used as the first encoding scheme: - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext rank -[+] 1.00002: base62 -[+] 0.99401: base64 -[+] 0.70806: rotate-1 -[+] 0.70806: rotate-2 -[+] 0.70806: rotate-3 -[+] 0.70806: rotate-4 -[+] 0.70806: rotate-5 -[+] 0.70806: rotate-6 -[+] 0.70806: rotate-7 -[+] 0.70806: rotate-left-1 - -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base62 -%¤q ´!.[æ&[fÿhbð^ - -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 -h4nRqFifSnRjFfQxRHuVpxjxpP8cCR -``` - -Afterwards, we can still try to simplify ; - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext rank -[+] 1.00185: base58 -[+] 0.99091: base62 -[+] 0.67001: rotate-1 -[+] 0.67001: rotate-2 -[+] 0.67001: rotate-3 -[+] 0.67001: rotate-4 -[+] 0.67001: rotate-5 -[+] 0.67001: rotate-6 -[+] 0.67001: rotate-7 -[+] 0.67001: rotate-left-1 -``` - -From here, let us assume that `base58` is effectively the right second-stage encoding. Guessing the two remaining encodings with no more information will now take a few seconds. 
As multiple outputs can be recognized as normal text, we will use the "`-s`" option not to stop on the first output successfully decoded as text. Moreover, if we have the intuition that the output shall be English text, we can use a more refined stop function like "`lang_en`" with the "`-f`" option. - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext decode base58 | codext guess -s -f lang_en -[...] -[+] rotate-2, rot-1: My!super!secret!string -[+] rotate-2, rot-23: Qc!wytiv!wigvix!wxvmrk -[+] rotate-2, shift-1: My super secret string -[+] rotate-2, shift-20: :f\r`b]R_\r`RP_Ra\r`a_V[T -[...] -[+] rotate-left-6, shift-1: My super secret string -^C^C^C -``` - -We can then stop the research with Ctrl+C. The right output has been found ! - +`codext` has a Command-Line Interface tool. + +----- + +### Using Codext from the terminal + +The help message describes everything to know: + +```sh +usage: codext [-h] [-i INFILE] [-o OUTFILE] [-s] {encode,decode,guess,search} ... + +Codecs Extension (CodExt) 1.8.1 + +Author : Alexandre D'Hondt (alexandre.dhondt@gmail.com) +Copyright: © 2019-2021 A. D'Hondt +License : GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html) +Source : https://github.com/dhondta/python-codext + +This tool allows to encode/decode input strings/files with an extended set of codecs. 
+ +positional arguments: + {encode,decode,guess,search} + command to be executed + encode encode input using the specified codecs + decode decode input using the specified codecs + guess try guessing the decoding codecs + search search for codecs + +optional arguments: + -h, --help show this help message and exit + -i INFILE, --input-file INFILE + input file (if none, take stdin as input) + -o OUTFILE, --output-file OUTFILE + output file (if none, display result to stdout) + -s, --strip-newlines strip newlines from input + +usage examples: +- codext search bitcoin +- codext decode base32 -i file.b32 +- codext encode morse < to_be_encoded.txt +- echo "test" | codext encode base100 +- echo -en "test" | codext encode braille -o test.braille +- codext encode base64 < to_be_encoded.txt > text.b64 +- echo -en "test" | codext encode base64 | codext encode base32 +- echo -en "mrdvm6teie6t2cq=" | codext encode upper | codext decode base32 | codext decode base64 +- echo -en "test" | codext encode upper reverse base32 | codext decode base32 reverse lower +- echo -en "test" | codext encode upper reverse base32 base64 morse +- echo -en "test" | codext encode base64 gzip | codext guess +- echo -en "test" | codext encode base64 gzip | codext guess gzip -c base +``` + +!!! note "Input/output" + + STDIN can be used as shown in an example from the help message, like when using the common Linux tool `base64`. + + Unless an output file is specified, the result is displayed in STDOUT. + +!!! note "Encodings chaining" + + Encodings can be chained as shown in the last examples of the help message. This can be practical for quickly manipulating data. + +### Execution examples + +**Scenario 1**: 2-stages encoded flag + +Creating the payload: + +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 +pwTDSWRUbXTuMQs5EDgKpjgW8MiJVw1 +``` + +From this point, the only thing we know is that we are searching for "*flag*" (with eventually other characters, i.e. leetspeak). 
+ +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag +Codecs: base58, rotate-3 +A somewhat weird F1@9 ! +``` + +Executing the previous command will take a few tens of seconds. With few stages to be guessed, using the scoring heuristic can be far quicker to get to the right output. The following takes less than a second. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag --heuristic +Codecs: base58, rotate-3 +A somewhat weird F1@9 ! +``` + +**Scenario 2**: Multi-stage-encoded flag + +Creating the payload: + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse +.... -.-- --.- --. -- ....- - -.- -- . ..... -..- --. ..--- .-.. .. . .- ..... .-- -.-. ..... -.. --- -. --.- --.- . --. -- .-. --... ..-. ..- --.- -.-. -- -...- -...- -...- +``` + +When looking at the string, it is easy to figure out it is morse. The problem, at this point, is that this codec is case-insensitive and always returns lowercase characters, as shown hereafter. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse +hyqgm4tkme5xg2liea5wc5donqqegmr7fuqcm=== +``` + +In order to get it guessed as Base32, it is necessary to put it back to uppercase (in other words, decode from lowercase). + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase +HYQGM4TKME5XG2LIEA5WC5DONQQEGMR7FUQCM=== +``` + +Now that we know we are searching for something with "*flag*" (with eventually other characters), we can use the predefined "`flag`" stop function. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase | codext guess -f flag +Codecs: base32, barbie +A somewhat weird F1@9 ! 
+``` + +**Scenario 3**: Base-encoded rotated shifted secret (English) message + +Creating the payload: + +```session +$ echo "My super secret string" | codext encode shift-1 rotate-2 base58 base64 +NDNxaFdieXh0Z29XOVZpWWpjRGNpRWgyZE44Z2FNU0g= +``` + +First, we shall simplify as much as possible ; we can easily guess that Base64 was used as the first encoding scheme: + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext rank +[+] 1.00002: base62 +[+] 0.99401: base64 +[+] 0.70806: rotate-1 +[+] 0.70806: rotate-2 +[+] 0.70806: rotate-3 +[+] 0.70806: rotate-4 +[+] 0.70806: rotate-5 +[+] 0.70806: rotate-6 +[+] 0.70806: rotate-7 +[+] 0.70806: rotate-left-1 + +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base62 +%¤q ´!.[æ&[fÿhbð^ + +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 +h4nRqFifSnRjFfQxRHuVpxjxpP8cCR +``` + +Afterwards, we can still try to simplify ; + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext rank +[+] 1.00185: base58 +[+] 0.99091: base62 +[+] 0.67001: rotate-1 +[+] 0.67001: rotate-2 +[+] 0.67001: rotate-3 +[+] 0.67001: rotate-4 +[+] 0.67001: rotate-5 +[+] 0.67001: rotate-6 +[+] 0.67001: rotate-7 +[+] 0.67001: rotate-left-1 +``` + +From here, let us assume that `base58` is effectively the right second-stage encoding. Guessing the two remaining encodings with no more information will now take a few seconds. As multiple outputs can be recognized as normal text, we will use the "`-s`" option not to stop on the first output successfully decoded as text. Moreover, if we have the intuition that the output shall be English text, we can use a more refined stop function like "`lang_en`" with the "`-f`" option. + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext decode base58 | codext guess -s -f lang_en +[...] 
+[+] rotate-2, rot-1: My!super!secret!string +[+] rotate-2, rot-23: Qc!wytiv!wigvix!wxvmrk +[+] rotate-2, shift-1: My super secret string +[+] rotate-2, shift-20: :f\r`b]R_\r`RP_Ra\r`a_V[T +[...] +[+] rotate-left-6, shift-1: My super secret string +^C^C^C +``` + +We can then stop the research with Ctrl+C. The right output has been found ! + diff --git a/docs/pages/enc/base.md b/docs/pages/enc/base.md index 757965e..dc7b26c 100644 --- a/docs/pages/enc/base.md +++ b/docs/pages/enc/base.md @@ -1,174 +1,172 @@ -## Base - -`codext` defines a far broader set of Base-encodings than in the original library. - ------ - -### Classical base 2^N encodings - -This namely adds the classical BaseXX encodings like 16 (hexadecimal) and 32 (RFC 3548), which are not available in the native codecs. - -Common base encodings with N a power of 2: - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_AB`) -`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_6VC9`) -`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`) -`base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | -`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... | Also supports Base32 Crockford, Geohash and Hex -`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32 -`base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | - -!!! note "Aliases" - - All the aliases are case insensitive for base encodings. 
- -```python ->>> codext.encode("test", "base2") -'01110100011001010111001101110100' ->>> codext.encode("test", "base2-inv") -'10001011100110101000110010001011' -``` - -```python ->>> codecs.encode("this is a test", "base16") -'7468697320697320612074657374' ->>> codecs.decode("7468697320697320612074657374", "base16") -'this is a test' ->>> codecs.encode("this is a test", "base16-inv") -'1E02031DCA031DCA0BCA1E0F1D1E' -``` - -```python ->>> codext.encode("this is a test", "base32") -'ORUGS4ZANFZSAYJAORSXG5A=' ->>> codext.decode("ORUGS4ZANFZSAYJAORSXG5A=", "base32") -'this is a test' -``` - -Note that for `base64`, it overwrites the native `base64_codec` to also support en/decoding from str. - -```python ->>> codecs.encode("this is a test", "base64") -'dGhpcyBpcyBhIHRlc3Q=' ->>> codecs.decode("dGhpcyBpcyBhIHRlc3Q=", "base64") -'this is a test' -``` - ------ - -### Generic base encodings - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. 
`_C2Z`) -`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | -`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | -`base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | -`base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | -`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL -`base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | -`base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | -`base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | -`base91-alt` | text <-> Alternate Base91 encoded text | `base[-_]?91[-_]alt(?:ernate)?(|[-_]inv(erted)?)` | Another version of Base91 - -```python ->>> codext.encode("test", "base3") -'23112113223321323322' -``` - -```python ->>> codecs.encode("test", "base36") -'WANEK4' ->>> codecs.decode("4WMHTK6UZL044O91NKCEB8", "base36") -'this is a test' -``` - -```python ->>> codext.encode("this is a test!", "base45") -'AWE+EDH44.OEOCC7WE QEX0' ->>> codext.decode('AWE+EDH44.OEOCC7WE QEX0', "base45") -'this is a test!' -``` - -```python ->>> codext.encode("this is a test", "base58") -'jo91waLQA1NNeBmZKUF' ->>> codext.encode("this is a test", "base58-ripple") -'jo9rA2LQwr44eBmZK7E' ->>> codext.encode("this is a test", "base58-url") -'JN91Wzkpa1nnDbLyjtf' -``` - -```python ->>> codecs.encode("test", "base62") -'289lyu' ->>> codecs.encode("this is a test", "base62") -'CsoB4HQ5gmgMyCenF7E' -``` - -```python ->>> codecs.encode("This is a test !", "base91") -'nX,<:WRT%yxth90oZB^C' ->>> codext.encode("This is a test !", "base91-alt") -'?a&[jv4S3Wg>,71@Jo#K' -``` - -!!! note "Generic encodings" - - Base encodings are available for any N other than the ones explicitely specified using the "`-generic`" suffix. 
Their charsets consist of printable characters from the `string` module for N up to 100 and for characters composed from the 256 possible ordinals for a greater N. - - :::python - >>> codext.encode("test", "base3-generic") - '12001002112210212211' - >>> codext.encode("test", "base17-generic") - '4cf60456' - ------ - -### Base85 - -This encoding implements various different versions of Base85. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | - -```python ->>> codext.encode("this is a test", "ascii85") -"FD,B0+DGm>@3BZ'F*%" ->>> codext.decode("FD,B0+DGm>@3BZ'F*%", "ascii85") -'this is a test' ->>> with open("ascii85.txt", 'w', encoding="ascii85") as f: - f.write("this is a test") -14 ->>> with open("ascii85.txt", encoding="ascii85") as f: - f.read() -'this is a test' -``` - ------ - -### Other base encodings - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only -`base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only -`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset - -```python ->>> codecs.encode("this is a test", "base100") -'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' ->>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") -'this is a test' -``` - -```python ->>> codecs.encode("this is a test", "base122") -':\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft' ->>> codecs.decode(":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", "base122") -'this is a test' -``` - +`codext` defines a far broader set of Base-encodings than in the original library. + +----- + +### Classical base 2^N encodings + +This namely adds the classical BaseXX encodings like 16 (hexadecimal) and 32 (RFC 3548), which are not available in the native codecs. 
+ +Common base encodings with N a power of 2: + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_AB`) +`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_6VC9`) +`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`) +`base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | +`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... | Also supports Base32 Crockford, Geohash and Hex +`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32 +`base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | + +!!! note "Aliases" + + All the aliases are case insensitive for base encodings. + +```python +>>> codext.encode("test", "base2") +'01110100011001010111001101110100' +>>> codext.encode("test", "base2-inv") +'10001011100110101000110010001011' +``` + +```python +>>> codecs.encode("this is a test", "base16") +'7468697320697320612074657374' +>>> codecs.decode("7468697320697320612074657374", "base16") +'this is a test' +>>> codecs.encode("this is a test", "base16-inv") +'1E02031DCA031DCA0BCA1E0F1D1E' +``` + +```python +>>> codext.encode("this is a test", "base32") +'ORUGS4ZANFZSAYJAORSXG5A=' +>>> codext.decode("ORUGS4ZANFZSAYJAORSXG5A=", "base32") +'this is a test' +``` + +Note that for `base64`, it overwrites the native `base64_codec` to also support en/decoding from str. 
+ +```python +>>> codecs.encode("this is a test", "base64") +'dGhpcyBpcyBhIHRlc3Q=' +>>> codecs.decode("dGhpcyBpcyBhIHRlc3Q=", "base64") +'this is a test' +``` + +----- + +### Generic base encodings + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_C2Z`) +`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | +`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | +`base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | +`base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | +`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL +`base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | +`base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | +`base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | +`base91-alt` | text <-> Alternate Base91 encoded text | `base[-_]?91[-_]alt(?:ernate)?(|[-_]inv(erted)?)` | Another version of Base91 + +```python +>>> codext.encode("test", "base3") +'23112113223321323322' +``` + +```python +>>> codecs.encode("test", "base36") +'WANEK4' +>>> codecs.decode("4WMHTK6UZL044O91NKCEB8", "base36") +'this is a test' +``` + +```python +>>> codext.encode("this is a test!", "base45") +'AWE+EDH44.OEOCC7WE QEX0' +>>> codext.decode('AWE+EDH44.OEOCC7WE QEX0', "base45") +'this is a test!' 
+``` + +```python +>>> codext.encode("this is a test", "base58") +'jo91waLQA1NNeBmZKUF' +>>> codext.encode("this is a test", "base58-ripple") +'jo9rA2LQwr44eBmZK7E' +>>> codext.encode("this is a test", "base58-url") +'JN91Wzkpa1nnDbLyjtf' +``` + +```python +>>> codecs.encode("test", "base62") +'289lyu' +>>> codecs.encode("this is a test", "base62") +'CsoB4HQ5gmgMyCenF7E' +``` + +```python +>>> codecs.encode("This is a test !", "base91") +'nX,<:WRT%yxth90oZB^C' +>>> codext.encode("This is a test !", "base91-alt") +'?a&[jv4S3Wg>,71@Jo#K' +``` + +!!! note "Generic encodings" + + Base encodings are available for any N other than the ones explicitly specified using the "`-generic`" suffix. Their charsets consist of printable characters from the `string` module for N up to 100 and for characters composed from the 256 possible ordinals for a greater N. + + :::python + >>> codext.encode("test", "base3-generic") + '12001002112210212211' + >>> codext.encode("test", "base17-generic") + '4cf60456' + +----- + +### Base85 + +This encoding implements various different versions of Base85. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | + +```python +>>> codext.encode("this is a test", "ascii85") +"FD,B0+DGm>@3BZ'F*%" +>>> codext.decode("FD,B0+DGm>@3BZ'F*%", "ascii85") +'this is a test' +>>> with open("ascii85.txt", 'w', encoding="ascii85") as f: + f.write("this is a test") +14 +>>> with open("ascii85.txt", encoding="ascii85") as f: + f.read() +'this is a test' +``` + +----- + +### Other base encodings + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only +`base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only +`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset + +```python +>>> codecs.encode("this is a test", "base100") +'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' +>>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") +'this is a test' +``` + +```python +>>> codecs.encode("this is a test", "base122") +':\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft' +>>> codecs.decode(":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", "base122") +'this is a test' +``` + diff --git a/docs/pages/enc/binary.md b/docs/pages/enc/binary.md index 745ef82..0ed7fb0 100644 --- a/docs/pages/enc/binary.md +++ b/docs/pages/enc/binary.md @@ -1,168 +1,166 @@ -## Binary - -`codext` also adds common binary encodings. For instance, the Manchester code, that encodes digits, is applied to the ordinals of the input text and the resulting binary stream is converted back to characters. - ------ - -### Baudot - -It supports various formats such as CCITT-1 and CCITT-2, ITA1 and ITA2, and some others. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`baudot` | text <-> text | Baudot code bits | `baudot-ccitt1`, `baudot_ccitt2_lsb`, ... 
| supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... -`baudot-spaced` | text <-> Baudot code groups of bits | `baudot-spaced-ita1_lsb`, `baudot_spaced_ita2_msb`, ... | groups of 5 bits are whitespace-separated -`baudot-tape` | text <-> Baudot code tape | `baudot-tape-mtk2`, `baudot_tape_murray`, ... | outputs a string that looks like a perforated tape - -!!! note "LSB / MSB" - - "`_lsb`" or "`_msb`" can be specified in the codec name to set the bits order. If not specified, it defaults to MSB. - - -```python ->>> codext.encode("12345", "baudot-fr") -'010000000100010001000010100111' ->>> codext.decode("010000000100010001000010100111", "baudot-fr") -'12345' -``` - -```python ->>> codext.encode("TEST", "baudot-spaced_uk") -'10101 00010 10100 10101' ->>> codext.decode("10101 00010 10100 10101", "baudot-spaced_uk") -'TEST' -``` - -```python ->>> s = codext.encode("HELLO WORLD!", "baudot-tape_ita2") ->>> print(s) -***.** -* *. - . * -* .* -* .* -** . - *. -* .** -** . - * .* -* .* - * . * -** .** - **. * ->>> codext.decode(s, "baudot-tape_ita2") -'HELLO WORLD!' -``` - ------ - -### Binary Coded Decimal (BCD) - -It converts characters to their odrinals, left-pads with zeros, converts digits to 4-bits groups and then make characters with the assembled groups. It can also use a 4-bits prefix for making new characters. It then allows to define extended versions of BCD. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`bcd` | text <-> BCD encoded text | `binary_coded_decimals` | -`bcd-extended0` | text <-> BCD encoded text using prefix `0000` | `bcd_ext0`, `bcd-extended-zeros`, `binary_coded_decimals_extended_0` | -`bcd-extended1` | text <-> BCD encoded text using prefix `1111` | `bcd_ext1`, `bcd-extended-ones`, `binary_coded_decimals_extended_1` | - -```python ->>> codext.encode("Test", "bcd") -'\x08A\x01\x11Q\x16' ->>> codext.decode("\x08A\x01\x11Q\x16", "binary_coded_decimal") -'Test' ->>> codext.encode("Test", "bcd_ext_zero") -'\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00' ->>> codext.decode("\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", "bcd-ext0") -'Test' ->>> codext.encode("Test", "bcd_extended_ones") -'\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0' ->>> codext.decode("\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", "bcd_ext1") -'Test' -``` - ------ - -### Excess-3 - -Also called *Stibitz code*, it converts characters to ordinals, left-pads with zeros and then applies Excess-3 (Stibitz) code to get groups of 4 bits that are finally reassembled into bytes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`excess3` | text <-> XS3 encoded text | `excess-3`, `xs3`, `stibitz` | - -```python ->>> codext.encode("This is a test!", "excess-3") -';t7C\x84H6T8D\x83e<£eD\x944D\x84I6`' ->>> codext.decode(";t7C\x84H6T8D\x83e<£eD\x944D\x84I6`", "stibitz") -'This is a test!' -``` - ------ - -### Gray - -Also called *reflected binary code*, it implements the Gray code applied to characters while converted to bytes. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`gray` | text <-> gray encoded text | `reflected-bin`, `reflected_binary` | - -```python ->>> codext.encode("this is a test", "gray") -'N\\]J0]J0Q0NWJN' ->>> codext.decode("N\\]J0]J0Q0NWJN", "gray") -'this is a test' ->>> codext.encode("THIS IS A TEST", "gray") -'~lmz0mz0a0~gz~' ->>> codext.decode("~lmz0mz0a0~gz~", "gray") -'THIS IS A TEST' -``` - ------ - -### Manchester - -This codec XORes each group of 4 bits of the input text with a 1-byte clock signal, e.g. `0x55` giving in binary `01010101`. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`manchester` | text <-> manchester encoded text | | clock signal is `0x55` (`01010101`) -`manchester-inverted` | text <-> manchester encoded text | `ethernet`, `ieee802.4` | clock signal is `0xaa` (`10101010`) - -```python ->>> codext.encode("This is a test!", "manchester") -'fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV' ->>> codext.decode("fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV", "manchester") -'This is a test!' ->>> codext.encode("This is a test!", "manchester-inverted") -'\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©' ->>> codext.decode("\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©", "ethernet") -'This is a test!' -``` - ------ - -### Rotate N bits - -This codec rotates of N bits each byte of an input string. - -!!! note "Lossless" - - This codec does not use the "`<<`" and "`>>`" operators as it is lossy in some cases. Instead, it rotates per group of 8 bits. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rotate` | text <-> N-bits-rotated text | `rotate-N`, `rotate_bits-N`, `rotate-right-N`, `rotate_left_N` | N belongs to [1,7] ; when nothing specified, it rotates to the right - -```python ->>> codext.encode("test", "rotate-1") -':29:' ->>> codext.encode("test", "rotatebits-1") -':29:' ->>> codext.encode("test", "rotate_right-1") -':29:' ->>> codext.encode("test", "rotate_left_1") -'èÊæè' -``` - +`codext` also adds common binary encodings. For instance, the Manchester code, that encodes digits, is applied to the ordinals of the input text and the resulting binary stream is converted back to characters. + +----- + +### Baudot + +It supports various formats such as CCITT-1 and CCITT-2, ITA1 and ITA2, and some others. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`baudot` | text <-> text | Baudot code bits | `baudot-ccitt1`, `baudot_ccitt2_lsb`, ... | supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... +`baudot-spaced` | text <-> Baudot code groups of bits | `baudot-spaced-ita1_lsb`, `baudot_spaced_ita2_msb`, ... | groups of 5 bits are whitespace-separated +`baudot-tape` | text <-> Baudot code tape | `baudot-tape-mtk2`, `baudot_tape_murray`, ... | outputs a string that looks like a perforated tape + +!!! note "LSB / MSB" + + "`_lsb`" or "`_msb`" can be specified in the codec name to set the bits order. If not specified, it defaults to MSB. + + +```python +>>> codext.encode("12345", "baudot-fr") +'010000000100010001000010100111' +>>> codext.decode("010000000100010001000010100111", "baudot-fr") +'12345' +``` + +```python +>>> codext.encode("TEST", "baudot-spaced_uk") +'10101 00010 10100 10101' +>>> codext.decode("10101 00010 10100 10101", "baudot-spaced_uk") +'TEST' +``` + +```python +>>> s = codext.encode("HELLO WORLD!", "baudot-tape_ita2") +>>> print(s) +***.** +* *. + . * +* .* +* .* +** . + *. +* .** +** . 
+ * .* +* .* + * . * +** .** + **. * +>>> codext.decode(s, "baudot-tape_ita2") +'HELLO WORLD!' +``` + +----- + +### Binary Coded Decimal (BCD) + +It converts characters to their ordinals, left-pads with zeros, converts digits to 4-bits groups and then makes characters with the assembled groups. It can also use a 4-bits prefix for making new characters. This allows extended versions of BCD to be defined. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`bcd` | text <-> BCD encoded text | `binary_coded_decimals` | +`bcd-extended0` | text <-> BCD encoded text using prefix `0000` | `bcd_ext0`, `bcd-extended-zeros`, `binary_coded_decimals_extended_0` | +`bcd-extended1` | text <-> BCD encoded text using prefix `1111` | `bcd_ext1`, `bcd-extended-ones`, `binary_coded_decimals_extended_1` | + +```python +>>> codext.encode("Test", "bcd") +'\x08A\x01\x11Q\x16' +>>> codext.decode("\x08A\x01\x11Q\x16", "binary_coded_decimal") +'Test' +>>> codext.encode("Test", "bcd_ext_zero") +'\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00' +>>> codext.decode("\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", "bcd-ext0") +'Test' +>>> codext.encode("Test", "bcd_extended_ones") +'\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0' +>>> codext.decode("\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", "bcd_ext1") +'Test' +``` + +----- + +### Excess-3 + +Also called *Stibitz code*, it converts characters to ordinals, left-pads with zeros and then applies Excess-3 (Stibitz) code to get groups of 4 bits that are finally reassembled into bytes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`excess3` | text <-> XS3 encoded text | `excess-3`, `xs3`, `stibitz` | + +```python +>>> codext.encode("This is a test!", "excess-3") +';t7C\x84H6T8D\x83e<£eD\x944D\x84I6`' +>>> codext.decode(";t7C\x84H6T8D\x83e<£eD\x944D\x84I6`", "stibitz") +'This is a test!' 
+``` + +----- + +### Gray + +Also called *reflected binary code*, it implements the Gray code applied to characters while converted to bytes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`gray` | text <-> gray encoded text | `reflected-bin`, `reflected_binary` | + +```python +>>> codext.encode("this is a test", "gray") +'N\\]J0]J0Q0NWJN' +>>> codext.decode("N\\]J0]J0Q0NWJN", "gray") +'this is a test' +>>> codext.encode("THIS IS A TEST", "gray") +'~lmz0mz0a0~gz~' +>>> codext.decode("~lmz0mz0a0~gz~", "gray") +'THIS IS A TEST' +``` + +----- + +### Manchester + +This codec XORes each group of 4 bits of the input text with a 1-byte clock signal, e.g. `0x55` giving in binary `01010101`. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`manchester` | text <-> manchester encoded text | | clock signal is `0x55` (`01010101`) +`manchester-inverted` | text <-> manchester encoded text | `ethernet`, `ieee802.4` | clock signal is `0xaa` (`10101010`) + +```python +>>> codext.encode("This is a test!", "manchester") +'fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV' +>>> codext.decode("fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV", "manchester") +'This is a test!' +>>> codext.encode("This is a test!", "manchester-inverted") +'\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©' +>>> codext.decode("\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©", "ethernet") +'This is a test!' +``` + +----- + +### Rotate N bits + +This codec rotates each byte of an input string by N bits. + +!!! note "Lossless" + + This codec does not use the "`<<`" and "`>>`" operators as they are lossy in some cases. Instead, it rotates per group of 8 bits. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rotate` | text <-> N-bits-rotated text | `rotate-N`, `rotate_bits-N`, `rotate-right-N`, `rotate_left_N` | N belongs to [1,7] ; when nothing specified, it rotates to the right + +```python +>>> codext.encode("test", "rotate-1") +':29:' +>>> codext.encode("test", "rotatebits-1") +':29:' +>>> codext.encode("test", "rotate_right-1") +':29:' +>>> codext.encode("test", "rotate_left_1") +'èÊæè' +``` + diff --git a/docs/pages/enc/common.md b/docs/pages/enc/common.md index 34a566c..1739ca8 100644 --- a/docs/pages/enc/common.md +++ b/docs/pages/enc/common.md @@ -1,71 +1,69 @@ -## Common - -`codext` also provides some very common encodings, for the sake of simplicity (e.g. while chaining codecs with [the CLI tool](../cli.html)). - ------ - -### A1Z26 - -This simple codec converts letters to their order number in the alphabet using a separator between characters and keeping words separated by a whitespace. It is similar to the [`consonant-vowel-indices`](others.html#letter-indices) encoding. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`a1z26` | text <-> alphabet order numbers | `a1z26`, `a1z26-/`, `a1z26-,`, ... | this codec does not preserve the case and is dynamic (separator of characters in each word can be customized among these: "`-_/|,;:*`") - -```python ->>> codext.encode("This is a test", "a1z26") -'20-8-9-19 9-19 1 20-5-19-20' ->>> codext.decode("20-8-9-19 9-19 1 20-5-19-20", "a1z26") -'this is a test' -``` - ------ - -### Octal - -This simple codec converts characters into their octal values. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`octal` | text <-> octal digits | `octals` | groups of 3-chars octal values when encoded -`octal-spaced` | text <-> spaced octal digits | `octals-spaced` | whitespace-separated suite of variable-length groups of octal digits when encoded - -```python ->>> codext.encode("this is a test", "octal") -'164150151163040151163040141040164145163164' ->>> codext.decode("164150151163040151163040141040164145163164", "octals") -'this is a test' -``` - -```python ->>> codext.encode("this is a test", "octal-spaced") -'164 150 151 163 40 151 163 40 141 40 164 145 163 164' ->>> codext.decode("164 150 151 163 40 151 163 40 141 40 164 145 163 164", "octals-spaced") -'this is a test' -``` - ------ - -### Ordinal - -This simple codec converts characters into their ordinals. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`ordinal` | text <-> ordinal digits | `ordinals` | groups of 3-chars ordinal values when encoded -`ordinal-spaced` | text <-> spaced ordinal digits | `ordinals-spaced` | whitespace-separated suite of variable-length groups of ordinal digits when encoded - -```python ->>> codext.encode("this is a test", "ordinal") -'116104105115032105115032097032116101115116' ->>> codext.decode("116104105115032105115032097032116101115116", "ordinals") -'this is a test' -``` - -```python ->>> codext.encode("this is a test", "ordinal-spaced") -'116 104 105 115 32 105 115 32 97 32 116 101 115 116' ->>> codext.decode("116 104 105 115 32 105 115 32 97 32 116 101 115 116", "ordinals-spaced") -'this is a test' -``` - +`codext` also provides some very common encodings, for the sake of simplicity (e.g. while chaining codecs with [the CLI tool](../cli.html)). + +----- + +### A1Z26 + +This simple codec converts letters to their order number in the alphabet using a separator between characters and keeping words separated by a whitespace. 
It is similar to the [`consonant-vowel-indices`](others.html#letter-indices) encoding. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`a1z26` | text <-> alphabet order numbers | `a1z26`, `a1z26-/`, `a1z26-,`, ... | this codec does not preserve the case and is dynamic (separator of characters in each word can be customized among these: "`-_/|,;:*`") + +```python +>>> codext.encode("This is a test", "a1z26") +'20-8-9-19 9-19 1 20-5-19-20' +>>> codext.decode("20-8-9-19 9-19 1 20-5-19-20", "a1z26") +'this is a test' +``` + +----- + +### Octal + +This simple codec converts characters into their octal values. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`octal` | text <-> octal digits | `octals` | groups of 3-chars octal values when encoded +`octal-spaced` | text <-> spaced octal digits | `octals-spaced` | whitespace-separated suite of variable-length groups of octal digits when encoded + +```python +>>> codext.encode("this is a test", "octal") +'164150151163040151163040141040164145163164' +>>> codext.decode("164150151163040151163040141040164145163164", "octals") +'this is a test' +``` + +```python +>>> codext.encode("this is a test", "octal-spaced") +'164 150 151 163 40 151 163 40 141 40 164 145 163 164' +>>> codext.decode("164 150 151 163 40 151 163 40 141 40 164 145 163 164", "octals-spaced") +'this is a test' +``` + +----- + +### Ordinal + +This simple codec converts characters into their ordinals. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`ordinal` | text <-> ordinal digits | `ordinals` | groups of 3-chars ordinal values when encoded +`ordinal-spaced` | text <-> spaced ordinal digits | `ordinals-spaced` | whitespace-separated suite of variable-length groups of ordinal digits when encoded + +```python +>>> codext.encode("this is a test", "ordinal") +'116104105115032105115032097032116101115116' +>>> codext.decode("116104105115032105115032097032116101115116", "ordinals") +'this is a test' +``` + +```python +>>> codext.encode("this is a test", "ordinal-spaced") +'116 104 105 115 32 105 115 32 97 32 116 101 115 116' +>>> codext.decode("116 104 105 115 32 105 115 32 97 32 116 101 115 116", "ordinals-spaced") +'this is a test' +``` + diff --git a/docs/pages/enc/compressions.md b/docs/pages/enc/compressions.md index a5437cf..5c4fd2e 100644 --- a/docs/pages/enc/compressions.md +++ b/docs/pages/enc/compressions.md @@ -1,5 +1,3 @@ -## Compressions - `codext` provides a few common compression codecs. ----- diff --git a/docs/pages/enc/crypto.md b/docs/pages/enc/crypto.md index e59ab0f..b189c0e 100644 --- a/docs/pages/enc/crypto.md +++ b/docs/pages/enc/crypto.md @@ -1,206 +1,204 @@ -## Cryptography - -`codext` also implements several simple cryptographic ciphers. But how does it relate to encoding while a key is required ? `codext` focuses on ciphers that have a weak key. With dynamically named encodings, it is then possible to define a bunch of encodings, one for each value of the key. For instance, Barbie Typewriter has a key with only 4 possible values. The `barbie` codec can then be `barbie-1`, ..., `barbie-4`. - -!!! note "Available masks" - - Some cipher codecs use character masks to generate their alphabets. Groups of characters are indicated using a headin "`?`". 
- - `a`: printable characters - `b`: all 8-bits chars - `d`: digits - `h`: lowercase hexadecimal - `H`: uppercase hexadecimal - `l`: lowercase letters - `p`: punctuation characters - `s`: whitespace - `u`: uppercase letters - - When combining masks, only one occurrence of each character is taken in the final alphabet. - - So, for instance, the following masks yield the following alphabets: - - - `?l?u?d?s`: "`abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 `" - - `?s.,?!?u?d`: "` .,?!ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`" - ------ - -### Affine Cipher - -This codec implements the Affine monoalphabetic substitution cipher. It is parametrizable with a mask for generating the alphabet and the parameters `a` and `b`. By default, it uses mask "`lus`" and parameters `a=1` and `b=2` but it can be set as in the examples hereafter. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`affine` | text <-> affine ciphertext | `affine`, `affine_cipher-?l?u?d?s-5,8`, `affine-?s.,?!?u?d-23,6`, ... | Mask-generated alphabet ; uses default mask "`?l?u?s`" with `a=1` and `b=2` - -```python ->>> codext.encode("this is a test", "affine") -'vjkubkubcbvguv' ->>> codext.decode("vjkubkubcbvguv", "affine") -'this is a test' ->>> codext.encode("this is a test", "affine-?l?u?d?s-5,8") -'ORWJdWJdidOCJO' ->>> codext.decode("ORWJdWJdidOCJO", "affine-?l?u?d?s-5,8") -'this is a test' ->>> codext.encode("THIS IS A TEST", "affine-?s.,?!?u?d-5,8") -'AW1 D1 D2DAH A' ->>> codext.decode("AW1 D1 D2DAH A", "affine-?s.,?!?u?d-5,8") -'THIS IS A TEST' -``` - -!!! warning "Parameters `a` and `b`" - - Not all values are suitable for `a` and `b`. If a generated encoding map has mapping collisions, an exception is raised telling that `a` and `b` are bad. - ------ - -### Atbash Cipher - -It implements the monoalphabetic substitution cipher used for the Hebrew alphabet. 
By default, it considers the lowercase and uppercase letters, inverted per group, as the alphabet. It can also use a mask to extend it. Note that it does not generate any error for characters that are not part of the alphabet. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`atbash` | text <-> Atbash ciphertext | `atbash`, `atbash_cipher-?l?d?s`, ... | Mask-generated alphabet ; uses default mask "`?u?l`" - -```python ->>> codext.encode("this is a test", "atbash") -'gsrh rh z gvhg' ->>> codext.encode("this is a test", "atbash-[?l?u?p?s]") -'.^]/a]/a a.{/.' ->>> codext.decode(".^]/a]/a a.{/.", "atbash_cipher_[?l?u?p?s]") -'this is a test' -``` - ------ - -### Baconian Cipher - -It support only letters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`bacon` | text <-> Bacon ciphertext | `bacon-cipher`, `baconian_cipher`, `bacon-01`, `bacon-10` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `ab`) - -```python ->>> codext.encode("this is a test", "bacon") -'baaba aabbb abaaa baaab abaaa baaab aaaaa baaba aabaa baaab baaba' ->>> codext.encode("this is a test", "bacon_01") -'10010 00111 01000 10001 01000 10001 00000 10010 00100 10001 10010' ->>> codext.decode("-..-. ..--- .-... -...- .-... -...- ..... -..-. ..-.. -...- -..-.", "bacon_.-") -'THIS IS A TEST' -``` - ------ - -### Barbie Typewriter - -It implements the cipher for its 4 different keys. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`barbie` | text <-> Barbie ciphertext | `barbie-1`, `barbie-2`, `barbie-3`, `barbie-4` - -```python ->>> codext.encode("this is a test", "barbie-1") -'hstf tf i hafh' ->>> codext.encode("this is a test", "barbie_3") -'fpsu su h ftuf' ->>> codext.decode("fpsu su h ftuf", "barbie-3") -'this is a test' -``` - ------ - -### Citrix CTX1 - -This implements the Citrix CTX1 password encoding algorithm. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`citrix` | text <-> Citrix CTX1 ciphertext | `citrix`, `citrix-1`, `citrix_ctx1` | - -```python ->>> codext.encode("this is a test", "citrix-ctx1") -'NBBMNAAGIDEPJJBMNIFNIMEMJKEL' ->>> codext.decode("NBBMNAAGIDEPJJBMNIFNIMEMJKEL", "citrix-ctx1") -'this is a test' -``` - ------ - -### Rail Fence Cipher - -This implements the Rail Fence encoding algorithm, using 3 rails and offset 0 as the default parameters. The encoding fence is built from the top ; the `up` flag can be used to build the fence from the bottom. Note that trying parameters that do not fit the input length will trigger a `ValueError` mentioning the bad value. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`, `zigzag`, ... | - -```python ->>> codext.encode("this is a test", "zigzag") -'t ashsi etist' ->>> codext.encode("this is a test", "rail-5-3") -'it sss etiath ' ->>> codext.decode("it sss etiath ", "zigzag_5-3") -'this is a test' -``` - ------ -### ROT N - -This is a dynamic encoding, that is, it can be called with an integer to define the ROT offset. Encoding will apply a positive offset, decoding will apply a negative one. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter ; belongs to [1, 26[ -`rot47` | text <-> rot47 ciphertext | | - -```python ->>> codext.encode("this is a test", "rot-15") -'iwxh xh p ithi' ->>> codext.encode("iwxh xh p ithi", "rot20") -'cqrb rb j cnbc' ->>> codext.decode("cqrb rb j cnbc", "rot_9") -'this is a test' -``` - ------ - -### Shift - -This is a dynamic encoding, that is, it can be called with an integer to define the shift offset. Encoding will apply a positive offset, decoding will apply a negative one. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter ; belongs to [1, 256[ - -```python ->>> codext.encode("this is a test", "shift-3") -'wklv#lv#d#whvw' ->>> codext.decode("wklv#lv#d#whvw", "shift10") -'mabl\x19bl\x19Z\x19m^lm' ->>> codext.encode("mabl\x19bl\x19Z\x19m^lm", "ordshift_7") -'this is a test' -``` - ------ - -### XOR with 1 byte - -This is a dynamic encoding, that is, it can be called with an integer to define the ordinal of the byte to XOR with the input text. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter ; belongs to [1, 256[ - -```python ->>> codext.encode("this is a test", "xor-10") -'~bcy*cy*k*~oy~' ->>> codext.encode("this is a test", "xor-30") -'jvwm>wm>\x7f>j{mj' ->>> codext.decode("this is a test", "xor-30") -'jvwm>wm>\x7f>j{mj' ->>> codext.encode("~bcy*cy*k*~oy~", "xor-10") -'this is a test' -``` - +`codext` also implements several simple cryptographic ciphers. But how does it relate to encoding while a key is required ? `codext` focuses on ciphers that have a weak key. With dynamically named encodings, it is then possible to define a bunch of encodings, one for each value of the key. For instance, Barbie Typewriter has a key with only 4 possible values. The `barbie` codec can then be `barbie-1`, ..., `barbie-4`. + +!!! note "Available masks" + + Some cipher codecs use character masks to generate their alphabets. Groups of characters are indicated using a leading "`?`". 
+ + `a`: printable characters + `b`: all 8-bits chars + `d`: digits + `h`: lowercase hexadecimal + `H`: uppercase hexadecimal + `l`: lowercase letters + `p`: punctuation characters + `s`: whitespace + `u`: uppercase letters + + When combining masks, only one occurrence of each character is taken in the final alphabet. + + So, for instance, the following masks yield the following alphabets: + + - `?l?u?d?s`: "`abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 `" + - `?s.,?!?u?d`: "` .,?!ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`" + +----- + +### Affine Cipher + +This codec implements the Affine monoalphabetic substitution cipher. It is parametrizable with a mask for generating the alphabet and the parameters `a` and `b`. By default, it uses mask "`?l?u?s`" and parameters `a=1` and `b=2` but it can be set as in the examples hereafter. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`affine` | text <-> affine ciphertext | `affine`, `affine_cipher-?l?u?d?s-5,8`, `affine-?s.,?!?u?d-23,6`, ... | Mask-generated alphabet ; uses default mask "`?l?u?s`" with `a=1` and `b=2` + +```python +>>> codext.encode("this is a test", "affine") +'vjkubkubcbvguv' +>>> codext.decode("vjkubkubcbvguv", "affine") +'this is a test' +>>> codext.encode("this is a test", "affine-?l?u?d?s-5,8") +'ORWJdWJdidOCJO' +>>> codext.decode("ORWJdWJdidOCJO", "affine-?l?u?d?s-5,8") +'this is a test' +>>> codext.encode("THIS IS A TEST", "affine-?s.,?!?u?d-5,8") +'AW1 D1 D2DAH A' +>>> codext.decode("AW1 D1 D2DAH A", "affine-?s.,?!?u?d-5,8") +'THIS IS A TEST' +``` + +!!! warning "Parameters `a` and `b`" + + Not all values are suitable for `a` and `b`. If a generated encoding map has mapping collisions, an exception is raised telling that `a` and `b` are bad. + +----- + +### Atbash Cipher + +It implements the monoalphabetic substitution cipher used for the Hebrew alphabet. 
By default, it considers the lowercase and uppercase letters, inverted per group, as the alphabet. It can also use a mask to extend it. Note that it does not generate any error for characters that are not part of the alphabet. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`atbash` | text <-> Atbash ciphertext | `atbash`, `atbash_cipher-?l?d?s`, ... | Mask-generated alphabet ; uses default mask "`?u?l`" + +```python +>>> codext.encode("this is a test", "atbash") +'gsrh rh z gvhg' +>>> codext.encode("this is a test", "atbash-[?l?u?p?s]") +'.^]/a]/a a.{/.' +>>> codext.decode(".^]/a]/a a.{/.", "atbash_cipher_[?l?u?p?s]") +'this is a test' +``` + +----- + +### Baconian Cipher + +It supports only letters. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`bacon` | text <-> Bacon ciphertext | `bacon-cipher`, `baconian_cipher`, `bacon-01`, `bacon-10` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `ab`) + +```python +>>> codext.encode("this is a test", "bacon") +'baaba aabbb abaaa baaab abaaa baaab aaaaa baaba aabaa baaab baaba' +>>> codext.encode("this is a test", "bacon_01") +'10010 00111 01000 10001 01000 10001 00000 10010 00100 10001 10010' +>>> codext.decode("-..-. ..--- .-... -...- .-... -...- ..... -..-. ..-.. -...- -..-.", "bacon_.-") +'THIS IS A TEST' +``` + +----- + +### Barbie Typewriter + +It implements the cipher for its 4 different keys. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`barbie` | text <-> Barbie ciphertext | `barbie-1`, `barbie-2`, `barbie-3`, `barbie-4` + +```python +>>> codext.encode("this is a test", "barbie-1") +'hstf tf i hafh' +>>> codext.encode("this is a test", "barbie_3") +'fpsu su h ftuf' +>>> codext.decode("fpsu su h ftuf", "barbie-3") +'this is a test' +``` + +----- + +### Citrix CTX1 + +This implements the Citrix CTX1 password encoding algorithm. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`citrix` | text <-> Citrix CTX1 ciphertext | `citrix`, `citrix-1`, `citrix_ctx1` | + +```python +>>> codext.encode("this is a test", "citrix-ctx1") +'NBBMNAAGIDEPJJBMNIFNIMEMJKEL' +>>> codext.decode("NBBMNAAGIDEPJJBMNIFNIMEMJKEL", "citrix-ctx1") +'this is a test' +``` + +----- + +### Rail Fence Cipher + +This implements the Rail Fence encoding algorithm, using 3 rails and offset 0 as the default parameters. The encoding fence is built from the top ; the `up` flag can be used to build the fence from the bottom. Note that trying parameters that do not fit the input length will trigger a `ValueError` mentioning the bad value. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`, `zigzag`, ... | + +```python +>>> codext.encode("this is a test", "zigzag") +'t ashsi etist' +>>> codext.encode("this is a test", "rail-5-3") +'it sss etiath ' +>>> codext.decode("it sss etiath ", "zigzag_5-3") +'this is a test' +``` + +----- +### ROT N + +This is a dynamic encoding, that is, it can be called with an integer to define the ROT offset. Encoding will apply a positive offset, decoding will apply a negative one. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter ; belongs to [1, 26[ +`rot47` | text <-> rot47 ciphertext | | + +```python +>>> codext.encode("this is a test", "rot-15") +'iwxh xh p ithi' +>>> codext.encode("iwxh xh p ithi", "rot20") +'cqrb rb j cnbc' +>>> codext.decode("cqrb rb j cnbc", "rot_9") +'this is a test' +``` + +----- + +### Shift + +This is a dynamic encoding, that is, it can be called with an integer to define the shift offset. Encoding will apply a positive offset, decoding will apply a negative one. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter ; belongs to [1, 256[ + +```python +>>> codext.encode("this is a test", "shift-3") +'wklv#lv#d#whvw' +>>> codext.decode("wklv#lv#d#whvw", "shift10") +'mabl\x19bl\x19Z\x19m^lm' +>>> codext.encode("mabl\x19bl\x19Z\x19m^lm", "ordshift_7") +'this is a test' +``` + +----- + +### XOR with 1 byte + +This is a dynamic encoding, that is, it can be called with an integer to define the ordinal of the byte to XOR with the input text. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter ; belongs to [1, 256[ + +```python +>>> codext.encode("this is a test", "xor-10") +'~bcy*cy*k*~oy~' +>>> codext.encode("this is a test", "xor-30") +'jvwm>wm>\x7f>j{mj' +>>> codext.decode("this is a test", "xor-30") +'jvwm>wm>\x7f>j{mj' +>>> codext.encode("~bcy*cy*k*~oy~", "xor-10") +'this is a test' +``` + diff --git a/docs/pages/enc/hashing.md b/docs/pages/enc/hashing.md index d1b0298..0f6f151 100644 --- a/docs/pages/enc/hashing.md +++ b/docs/pages/enc/hashing.md @@ -1,5 +1,3 @@ -## Hashing - `codext` provides hash functions through the `.encode(...)` API for convenience (e.g. while chaining codecs with [the CLI tool](../cli.html)). ----- diff --git a/docs/pages/enc/languages.md b/docs/pages/enc/languages.md index 3735d15..9aa805c 100644 --- a/docs/pages/enc/languages.md +++ b/docs/pages/enc/languages.md @@ -1,199 +1,197 @@ -## Languages - -`codext` also adds some common languages for encoding. - ------ - -### Braille - -It supports letters, digits and some special characters. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`braille` | text <-> braille symbols | | Python 3 only - -```python ->>> codext.encode("this is a test", "braille") -'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' ->>> codext.encode("THIS IS A TEST", "braille") -'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' ->>> codext.decode("⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞", "braille") -'this is a test' -``` - ------ - -### Galactic - -This implements the [Minecraft's enchanting table](https://www.thegamer.com/minecraft-enchantment-table-language-guide/) using resembling Unicode characters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`galactic` | text <-> Minecraft enchantment symbols | `galactic-alphabet`, `minecraft_enchantment`, `minecraft-enchanting-language` | Python 3 only - -```python ->>> codext.encode("this is a test", "galactic") -'ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ' ->>> codext.decode("ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ", "galactic") -'this is a test' -``` - ------ - -### Ipsum - -This implements a codec that uses lorem ipsum words. It selects random words per letter and keeps the following punctuations: "`.,:;+=-*/\\`". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`ipsum` | text <-> latin words | `loremipsum`, `lorem-ipsum` | words from the classical lorem ipsum - -```python ->>> codext.encode("This is a test.", "ipsum") -'Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.' ->>> codext.decode("Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.", "lorem-ipsum") -'This is a test.' -``` - ------ - -### Leetspeak - -This implements a very basic ruleset of elite speaking. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`leetspeak` | text <-> leetspeak encoded text | `leet`, `1337`, `leetspeak` | based on minimalistic elite speaking rules - -```python ->>> codext.encode("this is a test", "leetspeak") -'7h15 15 4 7357' ->>> codext.decode("7h15 15 4 7357", "leetspeak") -'ThIS IS A TEST' -``` - ------ - -### Morse - -It supports of course letters and digits, but also a few special characters: `.,;:?!/\\@&=-_'" $()`. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`morse` | text <-> morse encoded text | none | uses whitespace as a separator, dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `/-.`) - -```python ->>> codext.encode("this is a test", "morse") -'- .... .. ... / .. ... / .- / - . ... -' ->>> codext.encode("this is a test", "morse/-.") -'- .... .. ... / .. ... / .- / - . ... -' ->>> codext.encode("this is a test", "morse_ABC") -'B CCCC CC CCC A CC CCC A CB A B C CCC B' ->>> codext.decode("- .... .. ... / .. ... / .- / - . ... -", "morse") -'this is a test' ->>> with codext.open("morse.txt", 'w', encoding="morse") as f: - f.write("this is a test") -14 ->>> with codext.open("morse.txt", encoding="morse") as f: - f.read() -'this is a test' -``` - ------ - -### Navajo - -It implements the letters from the [Navajo Code Talkers' Dictionary](https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html). It conserves digits and newlines. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`navajo` | text <-> Navajo | | - -```python ->>> import codext ->>> codext.encode("this is a test 123", "navajo") -'a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3' ->>> codext.decode("a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3", "navajo") -'this is a test 123' -``` - ------ - -### Radio Alphabet - -This is also known as the [NATO phonetic alphabet](https://en.wikipedia.org/wiki/NATO_phonetic_alphabet). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`radio` | text <-> radio alphabet words | `military_alphabet`, `nato-phonetic-alphabet`, `radio-alphabet` | - -```python ->>> codext.encode("foobar", "nato_phonetic_alphabet") -'Foxtrot Oscar Oscar Bravo Alpha Romeo' ->>> codext.decode("Foxtrot Oscar Oscar Bravo Alpha Romeo", "radio-alphabet") -'FOOBAR' -``` - ------ - -### Southpark - -This encodes text according to Kenny's language in Southpark. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`southpark` | text <-> Kenny's language | `kenny` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `fFMmpP`) -`southpark-icase` | text <-> Kenny's language | `kenny_icase` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `FMP`) - -```python ->>> codext.encode("This is a Test", "southpark") -'FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp' ->>> codext.decode('FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp', "kenny") -'This is a Test' ->>> codext.encode("This is a test", "kenny_123456") -'245415411144111411144211444111145455144145' ->>> codext.decode("245415411144111411144211444111145455144145", "kenny-123456") -'This is a test' ->>> codext.encode("this is a test", "kenny_icase") -'FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP' ->>> codext.decode("FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP", "southpark-icase") -'this is a test' ->>> codext.encode("this is a test", "southpark-icase_123") -'123213211122111211122111222111123233122123' ->>> codext.decode('123213211122111211122111222111123233122123', "kenny_icase-123") -'this is a test' -``` - ------ - -### Tap - -This codec implements the [tap/knock code](https://en.wikipedia.org/wiki/Tap_code) commonly used by prisoners. It uses 25 letters, "*k*" is encoded to the same token than "*c*". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`tap` | text <-> tap/knock encoded text | `knock`, `tap-code` | uses a large Unicode whitespace as a token separator ; Python 3 only - -```python ->>> codext.encode("this is a test", "tap") -'.... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....' ->>> codext.decode(".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....", "knock") -'this is a test' -``` - ------ - -### Tom-Tom - -This codec is similar to morse. 
It converts text into slashes and backslashes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`tomtom` | text <-> tom-tom encoded text | `tom-tom` | uses "`|`" as a separator - -```python ->>> codext.encode("this is a test", "tom-tom") -'\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\' ->>> codext.decode("\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\", "tomtom") -'THIS IS A TEST' -``` +`codext` also adds some common languages for encoding. + +----- + +### Braille + +It supports letters, digits and some special characters. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`braille` | text <-> braille symbols | | Python 3 only + +```python +>>> codext.encode("this is a test", "braille") +'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' +>>> codext.encode("THIS IS A TEST", "braille") +'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' +>>> codext.decode("⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞", "braille") +'this is a test' +``` + +----- + +### Galactic + +This implements the [Minecraft's enchanting table](https://www.thegamer.com/minecraft-enchantment-table-language-guide/) using resembling Unicode characters. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`galactic` | text <-> Minecraft enchantment symbols | `galactic-alphabet`, `minecraft_enchantment`, `minecraft-enchanting-language` | Python 3 only + +```python +>>> codext.encode("this is a test", "galactic") +'ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ' +>>> codext.decode("ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ", "galactic") +'this is a test' +``` + +----- + +### Ipsum + +This implements a codec that uses lorem ipsum words. It selects random words per letter and keeps the following punctuations: "`.,:;+=-*/\\`". 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`ipsum` | text <-> latin words | `loremipsum`, `lorem-ipsum` | words from the classical lorem ipsum + +```python +>>> codext.encode("This is a test.", "ipsum") +'Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.' +>>> codext.decode("Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.", "lorem-ipsum") +'This is a test.' +``` + +----- + +### Leetspeak + +This implements a very basic ruleset of elite speaking. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`leetspeak` | text <-> leetspeak encoded text | `leet`, `1337`, `leetspeak` | based on minimalistic elite speaking rules + +```python +>>> codext.encode("this is a test", "leetspeak") +'7h15 15 4 7357' +>>> codext.decode("7h15 15 4 7357", "leetspeak") +'ThIS IS A TEST' +``` + +----- + +### Morse + +It supports of course letters and digits, but also a few special characters: `.,;:?!/\\@&=-_'" $()`. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`morse` | text <-> morse encoded text | none | uses whitespace as a separator, dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `/-.`) + +```python +>>> codext.encode("this is a test", "morse") +'- .... .. ... / .. ... / .- / - . ... -' +>>> codext.encode("this is a test", "morse/-.") +'- .... .. ... / .. ... / .- / - . ... -' +>>> codext.encode("this is a test", "morse_ABC") +'B CCCC CC CCC A CC CCC A CB A B C CCC B' +>>> codext.decode("- .... .. ... / .. ... / .- / - . ... 
-", "morse") +'this is a test' +>>> with codext.open("morse.txt", 'w', encoding="morse") as f: + f.write("this is a test") +14 +>>> with codext.open("morse.txt", encoding="morse") as f: + f.read() +'this is a test' +``` + +----- + +### Navajo + +It implements the letters from the [Navajo Code Talkers' Dictionary](https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html). It conserves digits and newlines. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`navajo` | text <-> Navajo | | + +```python +>>> import codext +>>> codext.encode("this is a test 123", "navajo") +'a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3' +>>> codext.decode("a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3", "navajo") +'this is a test 123' +``` + +----- + +### Radio Alphabet + +This is also known as the [NATO phonetic alphabet](https://en.wikipedia.org/wiki/NATO_phonetic_alphabet). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`radio` | text <-> radio alphabet words | `military_alphabet`, `nato-phonetic-alphabet`, `radio-alphabet` | + +```python +>>> codext.encode("foobar", "nato_phonetic_alphabet") +'Foxtrot Oscar Oscar Bravo Alpha Romeo' +>>> codext.decode("Foxtrot Oscar Oscar Bravo Alpha Romeo", "radio-alphabet") +'FOOBAR' +``` + +----- + +### Southpark + +This encodes text according to Kenny's language in Southpark. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`southpark` | text <-> Kenny's language | `kenny` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `fFMmpP`) +`southpark-icase` | text <-> Kenny's language | `kenny_icase` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `FMP`) + +```python +>>> codext.encode("This is a Test", "southpark") +'FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp' +>>> codext.decode('FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp', "kenny") +'This is a Test' +>>> codext.encode("This is a test", "kenny_123456") +'245415411144111411144211444111145455144145' +>>> codext.decode("245415411144111411144211444111145455144145", "kenny-123456") +'This is a test' +>>> codext.encode("this is a test", "kenny_icase") +'FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP' +>>> codext.decode("FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP", "southpark-icase") +'this is a test' +>>> codext.encode("this is a test", "southpark-icase_123") +'123213211122111211122111222111123233122123' +>>> codext.decode('123213211122111211122111222111123233122123', "kenny_icase-123") +'this is a test' +``` + +----- + +### Tap + +This codec implements the [tap/knock code](https://en.wikipedia.org/wiki/Tap_code) commonly used by prisoners. It uses 25 letters, "*k*" is encoded to the same token than "*c*". + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`tap` | text <-> tap/knock encoded text | `knock`, `tap-code` | uses a large Unicode whitespace as a token separator ; Python 3 only + +```python +>>> codext.encode("this is a test", "tap") +'.... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....' +>>> codext.decode(".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....", "knock") +'this is a test' +``` + +----- + +### Tom-Tom + +This codec is similar to morse. 
It converts text into slashes and backslashes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`tomtom` | text <-> tom-tom encoded text | `tom-tom` | uses "`|`" as a separator + +```python +>>> codext.encode("this is a test", "tom-tom") +'\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\' +>>> codext.decode("\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\", "tomtom") +'THIS IS A TEST' +``` diff --git a/docs/pages/enc/stegano.md b/docs/pages/enc/stegano.md index 57dfb18..1a3a5fa 100644 --- a/docs/pages/enc/stegano.md +++ b/docs/pages/enc/stegano.md @@ -1,123 +1,121 @@ -## Steganography - -`codext` defines a few steganography-related encodings. While encoding is not really steganography (that is, concealing data within data), the following codecs are worth creating this category as they relate to converting data into something that could mislead the unaware reader. - ------ - -### Hexagrams (I Ching) - -This uses Base64 and then encodes output characters to [I Ching Hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) such that implemented [here](https://github.com/qntm/hexagram-encode). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`hexagram` | text <-> hexagrams-encoded Base64 | `hexagrams`, `i-ching-hexagrams`, `iching` | Python3 only - -```python ->>> codext.encode("this is a test", "hexagram") -'䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯' ->>> codext.decode("䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯", "iching") -'this is a test' -``` - ------ - -### Klopf Code - -This is a Polybius code with the trivial alphabetical distribution ("A" -> (1,1), "B" -> (2,1), ...). This can be tested [here](https://gc.de/gc/klopfcode/). 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`klopf` | text <-> klopf encoded text | `klopfcode` | - -```python ->>> codext.encode("this is a test", "klopf") -'44324234 4234 11 44513444' ->>> codext.decode("44324234 4234 11 44513444", "klopf") -'THIS IS A TEST' -``` - ------ - -### Resistor Color Codes - -This uses the [electronic color code](https://en.wikipedia.org/wiki/Electronic_color_code#Resistor_color-coding) to encode digits, displaying colors in the terminal with ANSI color codes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`resistor` | text <-> resistor colors | `condensator`, `resistors-color`, `resistor_color_code` | visually, it only works in a terminal supporting ANSI color codes - -```python ->>> codext.encode("1234", "resistor") -'\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m' ->>> codext.decode("\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m", "condensators_color") -'1234' -``` - ------ - -### Rick Cipher - -This converts letters to words from Rick Astley's famous song "*Never gonna give you up*". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rick` | text <-> words from Risk's song | `rick-astley`, `rick_cipher`, `rick-astley-cipher` | case-insensitive while encoding - -```python ->>> codext.encode("Test String", "rick") -'TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna' ->>> codext.decode("TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna", "rick") -'TEST STRING' -``` - ------ - -### SMS (T9) - -This codec implements the SMS encoding, also caled T9, that is the conversion from characters to their corresponding phone keystrokes. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`sms` | text <-> phone keystrokes | `nokia`, `nokia_3310`, `t9` | uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding - -```python ->>> codext.encode("this is a test", "sms") -'8-44-444-7777-0-444-7777-0-2-0-8-33-7777-8' ->>> codext.decode("8_44_444_7777_0_444_7777_0_2_0_8_33_7777_8", "nokia") -'this is a test' ->>> codext.decode("8_44_444_7777_0-444-7777_0-2_0_8_33-7777-8", "t9") -'this is a test' -``` - ------ - -### Whitespaces - -This simple encoding replaces zeros and ones of the binary version of the input text with spaces and tabs. It is supported either with its original mapping or with the inverted mapping. - -!!! warning "Encoding, not programming !" - - This should not be confused with the [whitespace esoteric language](https://en.wikipedia.org/wiki/Whitespace_(programming_language)). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`whitespace` | text <-> whitespaces and tabs | `whitespaces?-inv(erted)?` | The default encoding uses tabs for zeros and spaces for ones -`whitespace_after_before` | text <-> whitespaces[letter]whitespaces | | This codec encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. 
"`whitespace+2*after-3*before`") - -```python ->>> codext.encode("test", "whitespace") -'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' ->>> codext.encode("test", "whitespaces") -'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' ->>> codext.encode("test", "whitespaces-inv") -' \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ' ->>> codext.decode(" \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ", "whitespaces_inverted") -'test' -``` - -```python ->>> codext.encode("test", "whitespace+after-before") -' m \n l \n u \n m ' ->>> codext.decode(" m \n l \n u \n m ", "whitespace+after-before") -'test' -``` +`codext` defines a few steganography-related encodings. While encoding is not really steganography (that is, concealing data within data), the following codecs are worth creating this category as they relate to converting data into something that could mislead the unaware reader. + +----- + +### Hexagrams (I Ching) + +This uses Base64 and then encodes output characters to [I Ching Hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) such that implemented [here](https://github.com/qntm/hexagram-encode). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`hexagram` | text <-> hexagrams-encoded Base64 | `hexagrams`, `i-ching-hexagrams`, `iching` | Python3 only + +```python +>>> codext.encode("this is a test", "hexagram") +'䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯' +>>> codext.decode("䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯", "iching") +'this is a test' +``` + +----- + +### Klopf Code + +This is a Polybius code with the trivial alphabetical distribution ("A" -> (1,1), "B" -> (2,1), ...). This can be tested [here](https://gc.de/gc/klopfcode/). 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`klopf` | text <-> klopf encoded text | `klopfcode` | + +```python +>>> codext.encode("this is a test", "klopf") +'44324234 4234 11 44513444' +>>> codext.decode("44324234 4234 11 44513444", "klopf") +'THIS IS A TEST' +``` + +----- + +### Resistor Color Codes + +This uses the [electronic color code](https://en.wikipedia.org/wiki/Electronic_color_code#Resistor_color-coding) to encode digits, displaying colors in the terminal with ANSI color codes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`resistor` | text <-> resistor colors | `condensator`, `resistors-color`, `resistor_color_code` | visually, it only works in a terminal supporting ANSI color codes + +```python +>>> codext.encode("1234", "resistor") +'\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m' +>>> codext.decode("\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m", "condensators_color") +'1234' +``` + +----- + +### Rick Cipher + +This converts letters to words from Rick Astley's famous song "*Never gonna give you up*". + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rick` | text <-> words from Rick's song | `rick-astley`, `rick_cipher`, `rick-astley-cipher` | case-insensitive while encoding + +```python +>>> codext.encode("Test String", "rick") +'TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna' +>>> codext.decode("TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna", "rick") +'TEST STRING' +``` + +----- + +### SMS (T9) + +This codec implements the SMS encoding, also called T9, that is the conversion from characters to their corresponding phone keystrokes. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`sms` | text <-> phone keystrokes | `nokia`, `nokia_3310`, `t9` | uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding + +```python +>>> codext.encode("this is a test", "sms") +'8-44-444-7777-0-444-7777-0-2-0-8-33-7777-8' +>>> codext.decode("8_44_444_7777_0_444_7777_0_2_0_8_33_7777_8", "nokia") +'this is a test' +>>> codext.decode("8_44_444_7777_0-444-7777_0-2_0_8_33-7777-8", "t9") +'this is a test' +``` + +----- + +### Whitespaces + +This simple encoding replaces zeros and ones of the binary version of the input text with spaces and tabs. It is supported either with its original mapping or with the inverted mapping. + +!!! warning "Encoding, not programming !" + + This should not be confused with the [whitespace esoteric language](https://en.wikipedia.org/wiki/Whitespace_(programming_language)). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`whitespace` | text <-> whitespaces and tabs | `whitespaces?-inv(erted)?` | The default encoding uses tabs for zeros and spaces for ones +`whitespace_after_before` | text <-> whitespaces[letter]whitespaces | | This codec encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. 
"`whitespace+2*after-3*before`") + +```python +>>> codext.encode("test", "whitespace") +'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' +>>> codext.encode("test", "whitespaces") +'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' +>>> codext.encode("test", "whitespaces-inv") +' \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ' +>>> codext.decode(" \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ", "whitespaces_inverted") +'test' +``` + +```python +>>> codext.encode("test", "whitespace+after-before") +' m \n l \n u \n m ' +>>> codext.decode(" m \n l \n u \n m ", "whitespace+after-before") +'test' +``` diff --git a/docs/pages/enc/web.md b/docs/pages/enc/web.md index 80c6a20..4477a1f 100644 --- a/docs/pages/enc/web.md +++ b/docs/pages/enc/web.md @@ -1,40 +1,38 @@ -## Web - -`codext` implements some common Web-related encodings. - ------ - -### HTML Entities - -This implements the full list of characters available at [this reference](https://dev.w3.org/html5/html-author/charref). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`html` | text <-> HTML entities | `html-entity`, `html_entities` | implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) - -```python ->>> codext.encode("Тħĩş Їś ą Ţêšŧ", "html") -'Тħĩş Їś ą Ţêšŧ' ->>> codext.decode("Тħĩş Їś ą Ţêšŧ", "html-entities") -'Тħĩş Їś ą Ţêšŧ' -``` - ------ - -### URL - -This handles URL encoding, regardless of the case when decoding and with no error. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`url` | text <-> URL encoded text | `url`, `urlencode` | - -```python ->>> codecs.encode("?=this/is-a_test/../", "url") -'%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F' ->>> codext.decode("%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F", "urlencode") -'?=this/is-a_test/../' ->>> codext.decode("%3f%3dthis%2fis-a_test%2f%2e%2e%2f", "urlencode") -'?=this/is-a_test/../' -``` - +`codext` implements some common Web-related encodings. 
+ +----- + +### HTML Entities + +This implements the full list of characters available at [this reference](https://dev.w3.org/html5/html-author/charref). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`html` | text <-> HTML entities | `html-entity`, `html_entities` | implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) + +```python +>>> codext.encode("Тħĩş Їś ą Ţêšŧ", "html") +'Тħĩş Їś ą Ţêšŧ' +>>> codext.decode("Тħĩş Їś ą Ţêšŧ", "html-entities") +'Тħĩş Їś ą Ţêšŧ' +``` + +----- + +### URL + +This handles URL encoding, regardless of the case when decoding and with no error. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`url` | text <-> URL encoded text | `url`, `urlencode` | + +```python +>>> codecs.encode("?=this/is-a_test/../", "url") +'%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F' +>>> codext.decode("%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F", "urlencode") +'?=this/is-a_test/../' +>>> codext.decode("%3f%3dthis%2fis-a_test%2f%2e%2e%2f", "urlencode") +'?=this/is-a_test/../' +``` + diff --git a/docs/pages/features.md b/docs/pages/features.md index 11316f0..02b375b 100644 --- a/docs/pages/features.md +++ b/docs/pages/features.md @@ -1,338 +1,336 @@ -## Features - -Basically, the `codecs` library provides a series of functions from the built-in `_codecs` library which maintains a registry of search functions (a simple list) that maps ancodings to the right de/encode functions by returning a `CodecInfo` object once first matched. - -`codext` hooks `codecs`'s functions to insert its own proxy registry between the function calls and the native registry so that new encodings can be added or replace existing ones while using `code[cs|xt].open`. Indeed, as the proxy registry is called first, the first possible match occurs in a custom codec, while if not existing, the native registry is used. - -!!! 
note "The `open` built-in function" - - Two behaviors are to be considered when using `codext`: - - 1. Encodings added from `codext` are only added to the proxy codecs registry of `codext` and are NOT available using `open(...)` (but well using `code[cs|xt].open(...)`. - 2. Encodings added from `codecs` are added to the proxy registry AND ALSO to the native registry and are therefore available using `open(...)`. - - This difference allows to keep encodings added from `codext` removable while these added from `codecs` are not. This is the consequence from the fact that there is no unregister function in the native `_codecs` library. - -!!! warning "Lossy conversion" - - Some encodings are lossy, meaning that it is not always possible to decode back to the exact start string. This should be considered especially when chaining codecs. - ------ - -### Add a custom encoding - -New codecs can be added easily using the new function `add`. - -```python ->>> import codext ->>> help(codext.add) -Help on function add in module codext.__common__: - -add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False) - This adds a new codec to the codecs module setting its encode and/or decode - functions, eventually dynamically naming the encoding with a pattern and - with file handling (if text is True). - - :param ename: encoding name - :param encode: encoding function or None - :param decode: decoding function or None - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the - built-in open(...) 
but will make it impossible - to remove the codec later - -``` - -Here is a simple example of how to add a basic codec: - -```python -import codext - -def mycodec_encode(text, errors="strict"): - # do some encoding stuff - return encoded, len(text) - -def mycodec_decode(text, errors="strict"): - # do some decoding stuff - return decoded, len(text) - -codext.add("mycodec", mycodec_encode, mycodec_decode) -``` - -In this first example, we can see that: - -- The `decode`/`encode` functions have a signature holding a keyword-argument "`errors`" for error handling. This comes from the syntax for making a codec for the `codecs` native library. This argument can have multiple values, namely "`strict`" for raising an exception when an de/encoding error occurs, while "`replace`" allows to replace the character at the position of the error with a generic character and also "`ignore`" that simply ignores the error and continues without adding anything to the resulting string. -- These functions always return a pair with the resulting string and the length of consumed input text. - -Another example for a more complex and dynamic codec: - -```python -import codext - -def mydyncodec_encode(i): - def encode(text, error="strict"): - # do somthing depending on i - return result, len(text) - return encode - -codext.add("mydyncodec", mydyncodec_encode, pattern=r"mydyn-(\d+)$") -``` - -In this second example, we can see that: - -- Only the encoding function is defined. -- A pattern is defined to match the prefix "`mydyn-`" and then an integer which is captured and used with `mydyncodec_encode(i)`. - -!!! warning "Pattern capture group" - - A capture group means that the parameter will be used with a dynamic (decorated) encoding function. In order to avoid this, i.e. for matching multiple names leading to the same encoding while calling a static encoding function, we can simply define a non-capturing group, e.g. "`(?:my|special_)codec`". 
- ------ - -### Add a custom map encoding - -New codecs using encoding maps can be added easily using the new function `add_map`. - -```python ->>> import codext ->>> help(codext.add) -Help on function add_map in module codext.__common__: - -add_map(ename, encmap, repl_char='?', sep='', ignore_case=None, no_error=False, intype=None, outype=None, **kwargs) - This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs module - dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern - and with file handling (if text is True). - - :param ename: encoding name - :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture - group of the regex pattern) or a function building the encoding map - :param repl_char: replacement char (used when errors handling is set to "replace") - :param sep: string of possible character separators (hence, only single-char separators are considered) ; - - while encoding, the first separator is used - - while decoding, separators can be mixed in the input text - :param ignore_case: ignore text case while encoding and/or decoding - :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) - :param intype: specify the input type for pre-transforming the input text - :param outype: specify the output type for post-transforming the output text - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - -``` - -This relies on the [`add`](#add-a-custom-encoding) function and simplifies creating new encodings when they can be described as a mapping dictionary. 
- -Here is a simple example of how to add a map codec: - -```python -import codext - -ENCMAP = {'a': "A", 'b': "B", 'c': "C"} - -codext.add_map("mycodec", ENCMAP) -``` - -In this first example, we can see that: - -- The `decode`/`encode` functions do not have to be declared anymore. -- `ENCMAP` is the mapping between characters, it is also used to compute the decoding function. - -Another example for a more complex and dynamic codec: - -```python -import codext - -ENCMAP = [ - {'00': "A", '01': "B", '10': "C", '11': "D"}, - {'00': "D", '01': "C", '10': "B", '11': "A"}, -] - -codext.add("mydyncodec", ENCMAP, "#", ignore_case=True, intype="bin", pattern=r"mydyn-(\d+)$") -``` - -In this second example, we can see that: - -- `ENCMAP` is now a list of mappings. The capture group in the pattern is used to select the right encoding map. Consequently, using encoding "`mydyn-8`" will fail with a `LookupError` as the only possibility are "`mydyn-1`" and "`mydyn-2`". Note that the index begins at 1 in the encoding name. -- Instead of using the default character "`?`" for replacements, we use "`#`". -- The case is ignored ; decoding either "`abcd`" or "`ABCD`" will succeed. -- The binary mode is enabled, meaning that the input text is converted to a binary string for encoding, while it is converted from binary to text when decoding. - -!!! warning "Input/Output types" - - By default, when `intype` is defined, `outype` takes the same value. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). - ------ - -### Add a macro - -**Macros** are chains of encodings. It is possible to define own macros with this feature. It works by giving the precedence to user's macros saved in `~/.codext-macros.json` then using embedded macros from the `codext` package. 
- -Here is an example of adding a macro (and verifying it was indeed added): - -```python ->>> codext.list_macros() -['example-macro'] ->>> codext.add_macro("test-macro", "gzip", "base64") ->>> codext.list_macros() -['example-macro', 'test-macro'] -``` - -!!! note "Removing a macro" - - As macros are resolved like codecs (with the precedence for codecs), they can be removed the same way as a codec. - - :::python - >>> codext.remove("test-macro") - - If this is a built-in macro, it will removed from the runtime list within the `codext` package. Next time this will be loaded, it will reset the builtin list of macros. Otherwise, if this is a custom macro, it will removed from the list of custom macros AND removed from `~/.codext-macros.json`. - ------ - -### List codecs - -Codecs can be listed with the `list` function, either the whole codecs or only some categories. - -```python ->>> codext.list() -['affine', 'ascii', 'ascii85', 'atbash', 'bacon', ..., 'base36', 'base58', 'base62', 'base64', 'base64_codec', ..., 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'big5', 'big5hkscs', 'braille', 'bz2_codec', 'capitalize', 'cp037', ...] -``` - -!!! note "Codecs categories" - - - `native`: the built-in codecs from the original `codecs` package - - `non-native`: this special category regroups all the categories mentioned hereafter - - `base`: baseX codecs (e.g. `base`, `base100`) - - `binary`: codecs working on strings but applying their algorithms on their binary forms (e.g. `baudot`, `manchester`) - - `common`: common codecs not included in the native ones or simly added for the purpose of standardization (e.g. `octal`, `ordinal`) - - `crypto`: codecs related to cryptography algorithms (e.g. `barbie`, `rot`, `xor`) - - `language`: language-related codecs (e.g. `morse`, `navajo`) - - `other`: uncategorized codecs (e.g. `letters`, `url`) - - `stegano`: steganography-related codecs (e.g. 
`sms`, `resistor`) - - Except the `native` and `non-native` categories, the other ones are simply the name of the subdirectories (with "`s`" right-stripped) of the `codext` package. - -```python ->>> codext.list("binary") -['baudot', 'baudot-spaced', 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'excess3', 'gray', 'manchester', 'manchester-inverted'] ->>> codext.list("language") -['braille', 'leet', 'morse', 'navajo', 'radio', 'southpark', 'southpark-icase', 'tom-tom'] ->>> codext.list("native") -['ascii', 'base64_codec', 'big5', 'big5hkscs', 'bz2_codec', 'cp037', 'cp273', 'cp424', 'cp437', 'cp500', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', ...] -``` - -!!! warning "Codecs listed, not encodings" - - Beware that this function only lists the codecs, not the encodings. This means that, for instance, it only lists `base` (codecs' name) instead of `base17`, `base61`, `base97`, ... (the valid encoding names related to the `base` codec). - ------ - -### Search for encodings - -Natively, `codecs` provides a `lookup` function that allows to get the `CodecInfo` object for the desired encoding. This performs a lookup in the registry based on an exact match. Sometimes, it can be useful to search for available encodings based on a regular expression. Therefore, a `search` function is added by `codext` to allow to get a list of encoding names matching the input regex. - -```python ->>> codext.search("baudot") -['baudot', 'baudot_spaced', 'baudot_tape'] ->>> codext.search("al") -['capitalize', 'octal', 'octal_spaced', 'ordinal', 'ordinal_spaced', 'radio'] ->>> codext.search("white") -['whitespace', 'whitespace_after_before'] -``` - -Also, `codext` provides an `examples` function to get some examples of valid encoding names. This is especially useful when it concerns dynamicly named encodings (e.g. `rot`, `shift` or `dna`). 
- -```python ->>> codext.examples("rot") -['rot-14', 'rot-24', 'rot-7', 'rot18', 'rot3', 'rot4', 'rot6', 'rot_1', 'rot_12', 'rot_2'] ->>> codext.examples("dna") -['dna-1', 'dna-2', 'dna-5', 'dna1', 'dna4', 'dna5', 'dna6', 'dna8', 'dna_3', 'dna_5'] ->>> codext.examples("barbie", 5) -['barbie-1', 'barbie1', 'barbie4', 'barbie_2', 'barbie_4'] -``` - ------ - -### Remove a custom encoding or macro - -New codecs can be removed easily using the new function `remove`, which will only remove every codec matching the given encoding name in the proxy codecs registry and NOT in the native one. - -```python ->>> codext.encode("test", "bin") -'01110100011001010111001101110100' ->>> codext.remove("bin") ->>> codext.encode("test", "bin") - -Traceback (most recent call last): - [...] -LookupError: unknown encoding: bin -``` - -Trying to remove a codec that is in the native registry won't raise a `LookupError`. - -```python ->>> codext.remove("utf-8") ->>> codext.encode("test", "utf-8") -b'test' -``` - -Removing a macro works exactly the same way as for a codec. - -```python ->>> codext.remove("test-macro") -``` - ------ - -### Remove or restore `codext` encodings and macros - -It can be useful while playing with encodings and/or macros e.g. from Idle to be able to remove or restore `codext`'s encodings and macros. This can be achieved using respectively the new `clear` and `reset` functions. - -```python ->>> codext.clear() ->>> codext.encode("test", "bin") - -Traceback (most recent call last): - [...] -LookupError: unknown encoding: bin -``` - -```python ->>> codext.reset() ->>> codext.encode("test", "bin") -'01110100011001010111001101110100' -``` - ------ - -### Multi-rounds encoding - -It is possible to use multiple times the same encoding through the following convention: `encoding[X]` - -A simple example for a 1-round and a 2-rounds morse-encoded string: - -```python ->>> codext.encode("This is a test", "morse") -'- .... .. ... / .. ... / .- / - . ... 
-' >>> codext.encode("This is a test", "morse[2]") -'-....- / .-.-.- .-.-.- .-.-.- .-.-.- / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- -....- / -..-. / -....- / .-.-.- / .-.-.- .-.-.- .-.-.- / -....-' -``` - -Another example using 5-rounds base58: - -```python ->>> codext.encode("Sup3rS3cr3t", "base58[5]") -'3YrjaeeJE1qfUVkpUbMymEMLJenvRrtcZ4vaDQ3httdiqWV8wGYFpqw' -``` - ------ - -### Hooked `codecs` functions - -In order to select the right de/encoding function and avoid any conflict, the native `codecs` library registers search functions (using the `register(search_function)` function), called in order of registration while searching for a codec. - -While being imported, `codext` hooks the following base functions of `codecs` dealing with the codecs registry: `encode`, `decode`, `lookup` and `register`. This way, `codext` holds a private registry that is called before reaching out to the native one, causing the codecs defined in `codext` to override native codecs with a matching registry search function. - +Basically, the `codecs` library provides a series of functions from the built-in `_codecs` library which maintains a registry of search functions (a simple list) that maps encodings to the right de/encode functions by returning a `CodecInfo` object once first matched. + +`codext` hooks `codecs`'s functions to insert its own proxy registry between the function calls and the native registry so that new encodings can be added or replace existing ones while using `code[cs|xt].open`. Indeed, as the proxy registry is called first, the first possible match occurs in a custom codec, while if not existing, the native registry is used. + +!!! note "The `open` built-in function" + + Two behaviors are to be considered when using `codext`: + + 1. Encodings added from `codext` are only added to the proxy codecs registry of `codext` and are NOT available using `open(...)` (but are available using `code[cs|xt].open(...)`). + 2. 
Encodings added from `codecs` are added to the proxy registry AND ALSO to the native registry and are therefore available using `open(...)`. + + This difference allows to keep encodings added from `codext` removable while these added from `codecs` are not. This is the consequence from the fact that there is no unregister function in the native `_codecs` library. + +!!! warning "Lossy conversion" + + Some encodings are lossy, meaning that it is not always possible to decode back to the exact start string. This should be considered especially when chaining codecs. + +----- + +### Add a custom encoding + +New codecs can be added easily using the new function `add`. + +```python +>>> import codext +>>> help(codext.add) +Help on function add in module codext.__common__: + +add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False) + This adds a new codec to the codecs module setting its encode and/or decode + functions, eventually dynamically naming the encoding with a pattern and + with file handling (if text is True). + + :param ename: encoding name + :param encode: encoding function or None + :param decode: decoding function or None + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the + built-in open(...) 
but will make it impossible + to remove the codec later + +``` + +Here is a simple example of how to add a basic codec: + +```python +import codext + +def mycodec_encode(text, errors="strict"): + # do some encoding stuff + return encoded, len(text) + +def mycodec_decode(text, errors="strict"): + # do some decoding stuff + return decoded, len(text) + +codext.add("mycodec", mycodec_encode, mycodec_decode) +``` + +In this first example, we can see that: + +- The `decode`/`encode` functions have a signature holding a keyword-argument "`errors`" for error handling. This comes from the syntax for making a codec for the `codecs` native library. This argument can have multiple values, namely "`strict`" for raising an exception when a de/encoding error occurs, while "`replace`" allows to replace the character at the position of the error with a generic character and also "`ignore`" that simply ignores the error and continues without adding anything to the resulting string. +- These functions always return a pair with the resulting string and the length of consumed input text. + +Another example for a more complex and dynamic codec: + +```python +import codext + +def mydyncodec_encode(i): + def encode(text, errors="strict"): + # do something depending on i + return result, len(text) + return encode + +codext.add("mydyncodec", mydyncodec_encode, pattern=r"mydyn-(\d+)$") +``` + +In this second example, we can see that: + +- Only the encoding function is defined. +- A pattern is defined to match the prefix "`mydyn-`" and then an integer which is captured and used with `mydyncodec_encode(i)`. + +!!! warning "Pattern capture group" + + A capture group means that the parameter will be used with a dynamic (decorated) encoding function. In order to avoid this, i.e. for matching multiple names leading to the same encoding while calling a static encoding function, we can simply define a non-capturing group, e.g. "`(?:my|special_)codec`". 
+ +----- + +### Add a custom map encoding + +New codecs using encoding maps can be added easily using the new function `add_map`. + +```python +>>> import codext +>>> help(codext.add_map) +Help on function add_map in module codext.__common__: + +add_map(ename, encmap, repl_char='?', sep='', ignore_case=None, no_error=False, intype=None, outype=None, **kwargs) + This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs module + dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern + and with file handling (if text is True). + + :param ename: encoding name + :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture + group of the regex pattern) or a function building the encoding map + :param repl_char: replacement char (used when errors handling is set to "replace") + :param sep: string of possible character separators (hence, only single-char separators are considered) ; + - while encoding, the first separator is used + - while decoding, separators can be mixed in the input text + :param ignore_case: ignore text case while encoding and/or decoding + :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) + :param intype: specify the input type for pre-transforming the input text + :param outype: specify the output type for post-transforming the output text + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + +``` + +This relies on the [`add`](#add-a-custom-encoding) function and simplifies creating new encodings when they can be described as a mapping dictionary. 
+ +Here is a simple example of how to add a map codec: + +```python +import codext + +ENCMAP = {'a': "A", 'b': "B", 'c': "C"} + +codext.add_map("mycodec", ENCMAP) +``` + +In this first example, we can see that: + +- The `decode`/`encode` functions do not have to be declared anymore. +- `ENCMAP` is the mapping between characters, it is also used to compute the decoding function. + +Another example for a more complex and dynamic codec: + +```python +import codext + +ENCMAP = [ + {'00': "A", '01': "B", '10': "C", '11': "D"}, + {'00': "D", '01': "C", '10': "B", '11': "A"}, +] + +codext.add_map("mydyncodec", ENCMAP, "#", ignore_case=True, intype="bin", pattern=r"mydyn-(\d+)$") +``` + +In this second example, we can see that: + +- `ENCMAP` is now a list of mappings. The capture group in the pattern is used to select the right encoding map. Consequently, using encoding "`mydyn-8`" will fail with a `LookupError` as the only possibilities are "`mydyn-1`" and "`mydyn-2`". Note that the index begins at 1 in the encoding name. +- Instead of using the default character "`?`" for replacements, we use "`#`". +- The case is ignored ; decoding either "`abcd`" or "`ABCD`" will succeed. +- The binary mode is enabled, meaning that the input text is converted to a binary string for encoding, while it is converted from binary to text when decoding. + +!!! warning "Input/Output types" + + By default, when `intype` is defined, `outype` takes the same value. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). + +----- + +### Add a macro + +**Macros** are chains of encodings. It is possible to define your own macros with this feature. It works by giving the precedence to user's macros saved in `~/.codext-macros.json` then using embedded macros from the `codext` package. 
+ +Here is an example of adding a macro (and verifying it was indeed added): + +```python +>>> codext.list_macros() +['example-macro'] +>>> codext.add_macro("test-macro", "gzip", "base64") +>>> codext.list_macros() +['example-macro', 'test-macro'] +``` + +!!! note "Removing a macro" + + As macros are resolved like codecs (with the precedence for codecs), they can be removed the same way as a codec. + + :::python + >>> codext.remove("test-macro") + + If this is a built-in macro, it will be removed from the runtime list within the `codext` package. Next time this will be loaded, it will reset the builtin list of macros. Otherwise, if this is a custom macro, it will be removed from the list of custom macros AND removed from `~/.codext-macros.json`. + +----- + +### List codecs + +Codecs can be listed with the `list` function, either the whole codecs or only some categories. + +```python +>>> codext.list() +['affine', 'ascii', 'ascii85', 'atbash', 'bacon', ..., 'base36', 'base58', 'base62', 'base64', 'base64_codec', ..., 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'big5', 'big5hkscs', 'braille', 'bz2_codec', 'capitalize', 'cp037', ...] +``` + +!!! note "Codecs categories" + + - `native`: the built-in codecs from the original `codecs` package + - `non-native`: this special category regroups all the categories mentioned hereafter + - `base`: baseX codecs (e.g. `base`, `base100`) + - `binary`: codecs working on strings but applying their algorithms on their binary forms (e.g. `baudot`, `manchester`) + - `common`: common codecs not included in the native ones or simply added for the purpose of standardization (e.g. `octal`, `ordinal`) + - `crypto`: codecs related to cryptography algorithms (e.g. `barbie`, `rot`, `xor`) + - `language`: language-related codecs (e.g. `morse`, `navajo`) + - `other`: uncategorized codecs (e.g. `letters`, `url`) + - `stegano`: steganography-related codecs (e.g. 
`sms`, `resistor`) + + Except the `native` and `non-native` categories, the other ones are simply the name of the subdirectories (with "`s`" right-stripped) of the `codext` package. + +```python +>>> codext.list("binary") +['baudot', 'baudot-spaced', 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'excess3', 'gray', 'manchester', 'manchester-inverted'] +>>> codext.list("language") +['braille', 'leet', 'morse', 'navajo', 'radio', 'southpark', 'southpark-icase', 'tom-tom'] +>>> codext.list("native") +['ascii', 'base64_codec', 'big5', 'big5hkscs', 'bz2_codec', 'cp037', 'cp273', 'cp424', 'cp437', 'cp500', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', ...] +``` + +!!! warning "Codecs listed, not encodings" + + Beware that this function only lists the codecs, not the encodings. This means that, for instance, it only lists `base` (codecs' name) instead of `base17`, `base61`, `base97`, ... (the valid encoding names related to the `base` codec). + +----- + +### Search for encodings + +Natively, `codecs` provides a `lookup` function that allows to get the `CodecInfo` object for the desired encoding. This performs a lookup in the registry based on an exact match. Sometimes, it can be useful to search for available encodings based on a regular expression. Therefore, a `search` function is added by `codext` to allow to get a list of encoding names matching the input regex. + +```python +>>> codext.search("baudot") +['baudot', 'baudot_spaced', 'baudot_tape'] +>>> codext.search("al") +['capitalize', 'octal', 'octal_spaced', 'ordinal', 'ordinal_spaced', 'radio'] +>>> codext.search("white") +['whitespace', 'whitespace_after_before'] +``` + +Also, `codext` provides an `examples` function to get some examples of valid encoding names. This is especially useful when it concerns dynamically named encodings (e.g. `rot`, `shift` or `dna`). 
+ +```python +>>> codext.examples("rot") +['rot-14', 'rot-24', 'rot-7', 'rot18', 'rot3', 'rot4', 'rot6', 'rot_1', 'rot_12', 'rot_2'] +>>> codext.examples("dna") +['dna-1', 'dna-2', 'dna-5', 'dna1', 'dna4', 'dna5', 'dna6', 'dna8', 'dna_3', 'dna_5'] +>>> codext.examples("barbie", 5) +['barbie-1', 'barbie1', 'barbie4', 'barbie_2', 'barbie_4'] +``` + +----- + +### Remove a custom encoding or macro + +New codecs can be removed easily using the new function `remove`, which will only remove every codec matching the given encoding name in the proxy codecs registry and NOT in the native one. + +```python +>>> codext.encode("test", "bin") +'01110100011001010111001101110100' +>>> codext.remove("bin") +>>> codext.encode("test", "bin") + +Traceback (most recent call last): + [...] +LookupError: unknown encoding: bin +``` + +Trying to remove a codec that is in the native registry won't raise a `LookupError`. + +```python +>>> codext.remove("utf-8") +>>> codext.encode("test", "utf-8") +b'test' +``` + +Removing a macro works exactly the same way as for a codec. + +```python +>>> codext.remove("test-macro") +``` + +----- + +### Remove or restore `codext` encodings and macros + +It can be useful while playing with encodings and/or macros e.g. from Idle to be able to remove or restore `codext`'s encodings and macros. This can be achieved using respectively the new `clear` and `reset` functions. + +```python +>>> codext.clear() +>>> codext.encode("test", "bin") + +Traceback (most recent call last): + [...] +LookupError: unknown encoding: bin +``` + +```python +>>> codext.reset() +>>> codext.encode("test", "bin") +'01110100011001010111001101110100' +``` + +----- + +### Multi-rounds encoding + +It is possible to use multiple times the same encoding through the following convention: `encoding[X]` + +A simple example for a 1-round and a 2-rounds morse-encoded string: + +```python +>>> codext.encode("This is a test", "morse") +'- .... .. ... / .. ... / .- / - . ... 
-' +>>> codext.encode("This is a test", "morse[2]") +'-....- / .-.-.- .-.-.- .-.-.- .-.-.- / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- -....- / -..-. / -....- / .-.-.- / .-.-.- .-.-.- .-.-.- / -....-' +``` + +Another example using 5-rounds base58: + +```python +>>> codext.encode("Sup3rS3cr3t", "base58[5]") +'3YrjaeeJE1qfUVkpUbMymEMLJenvRrtcZ4vaDQ3httdiqWV8wGYFpqw' +``` + +----- + +### Hooked `codecs` functions + +In order to select the right de/encoding function and avoid any conflict, the native `codecs` library registers search functions (using the `register(search_function)` function), called in order of registration while searching for a codec. + +While being imported, `codext` hooks the following base functions of `codecs` dealing with the codecs registry: `encode`, `decode`, `lookup` and `register`. This way, `codext` holds a private registry that is called before reaching out to the native one, causing the codecs defined in `codext` to override native codecs with a matching registry search function. + diff --git a/docs/pages/guessing.md b/docs/pages/guessing.md index 9bac11c..5745918 100644 --- a/docs/pages/guessing.md +++ b/docs/pages/guessing.md @@ -1,172 +1,170 @@ -## Guess Mode - -For decoding multiple layers of codecs, `codext` features a guess mode relying on an Artificial Intelligence algorithm, the Breadth-First tree Search (BFS). For many cases, the default parameters are sufficient for guess-decoding things. But it may require parameters tuning. - ------ - -### Parameters - -BFS stops when a given condition, in the form of a function applied to the decoded string at the current depth, is met. It returns two results: the decoded string and a tuple with the related encoding names in order of application. 
- -The following parameters are tunable: - -- `stop_func`: can be a function or a regular expression to be matched (automatically converted to a function that uses the `re` module) ; by default, checks if all input characters are printable. -- `min_depth`: the minimum depth for the tree search (allows to avoid a bit of overhead while checking the current decoded output at a depth with the stop function when we are sure it should not be the right result) ; by default 0. -- `max_depth`: the maximum depth for the tree search ; by default 5. -- `codec_categories`: a string indicating a codec [category](#list-codecs) or a list of [category](#list-codecs) strings ; by default, `None`, meaning the whole [categories](#list-codecs) (very slow). -- `found`: a list or tuple of currently found encodings that can be used to save time if the first decoding steps are known ; by default, an empty tuple. - -A simple example for a 1-stage base64-encoded string: - -```python ->>> codext.guess("VGhpcyBpcyBhIHRlc3Q=") -{('base64',): 'This is a test'} -``` - -An example of a 2-stages base64- then base62-encoded string: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7") -{('base62',): 'VGhpcyBpcyBhIHRlc3Q='} -``` - -In the second example, we can see that the given encoded string is not decoded as expected. This is the case because the (default) stop condition is too broad and stops if all the characters of the output are printable. If we have a prior knowledge on what we should expect, we can input a simple string or a regex: - -!!! note "Default stop function" - - :::python - >>> codext.stopfunc.default.__name__ - '...' - - The output depends on whether you have a language detection backend library installed ; see section [*Natural Language Detection*](#natural-language-detection). If no such library is installed, the default function is "`text`". 
- -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test") -{('base62', 'base64'): 'This is a test'} -``` - -In this example, the string "*test*" is converted to a function that uses this string as regular expression. Instead of a string, we can also pass a function. For this purpose, standard [stop functions](#available-stop-functions) are predefined. So, we can for instance use `stopfunc.lang_en` to stop when we find something that is English. Note that working this way gives lots of false positives if the text is very short like in the example case. That's why the `codec_categories` argument is used to only consider baseX codecs. This is also demonstrated in the next examples. - -```python ->>> codext.stopfunc._reload_lang("langdetect") ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", codext.stopfunc.lang_en, codec_categories="base") -('This is a test', ('base62', 'base64')) -``` - -If we know the first encoding, we can set this in the `found` parameter to save time: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", found=["base62"]) -('This is a test', ('base62', 'base64')) -``` - -If we are sure that only `base` (which is a valid [category](#list-codecs)) encodings are used, we can restrict the tree search using the `codec_categories` parameter to save time: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", codec_categories="base") -('This is a test', ('base62', 'base64')) -``` - -Another example of 2-stages encoded string: - -```python ->>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test") -('this is a test', ('base64', 'morse')) ->>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test", codec_categories=["base", "language"]) -('this is a test', ('base64', 'morse')) -``` - -When multiple results are expected, `stop` and `show` arguments can be used respectively to avoid stopping while finding a result and to display the intermediate result. - -!!! 
warning "Computation time" - - Note that, in the very last examples, the first call takes much longer than the second one but requires no knowledge about the possible [categories](#list-codecs) of encodings. - ------ - -### Available Stop Functions - -A few stop functions are predefined in the `stopfunc` submodule. - -```python ->>> import codext ->>> dir(codext.stopfunc) -['LANG_BACKEND', 'LANG_BACKENDS', ..., '_reload_lang', 'default', 'flag', ..., 'printables', 'regex', 'text'] -``` - -Currently, the following stop functions are provided: - -- `flag`: searches for the pattern "`[Ff][Ll1][Aa4@][Gg9]`" (either UTF-8 or UTF-16) -- `lang_**`: checks if the given lang is detected (note that it first checks if all characters are text ; see `text` hereafter) -- `printables`: checks that every output character is in the set of printables -- `regex(pattern)`: takes one argument, the regular expression, for checking a string against the given pattern -- `text`: checks for printables and an entropy less than 4.6 (empirically determined) - -A stop function can be used as the second argument of the `guess` function or as a keyword-argument, as shown in the following examples: - -```python ->>> codext.guess("...", codext.stopfunc.text) -[...] ->>> codext.guess("...", [...], stop_func=codext.stopfunc.text) -[...] -``` - -When a string is given, it is automatically converted to a `regex` stop function. - -```python ->>> s = codext.encode("pattern testing", "leetspeak") ->>> s -'p4773rn 73571n9' ->>> stop_func = codext.stopfunc.regex("p[a4@][t7]{2}[e3]rn") ->>> stop_func(s) -True ->>> codext.guess(s, stop_func) -[...] -``` - -Additionally, a simple stop function is predefined for CTF players, matching various declinations of the word *flag*. Alternatively, a pattern can always be used when flags have a particular format. 
- -```python ->>> codext.stopfunc.flag("test string") -False ->>> codext.stopfunc.flag("test f1@9") -True ->>> codext.stopfunc.regex(r"^CTF\{.*?\}$")("CTF{098f6bcd4621d373cade4e832627b4f6}") -True -``` - -The particular type of stop function `lang_**` is explained in the [next section](#natural-language-detection). - ------ - -### Natural Language Detection - -As in many cases, we are trying to decode inputs to readable text, it is necessary to narrow the scope while searching for valid decoded outputs. As matching printables and even text (as defined here before as printables with an entropy of less than 4.6) is too broad for many cases, it may be very useful to apply natural language detection. In `codext`, this is done by relying on Natural Language Processing (NLP) backend libraries, loaded only if they were separately installed. - -Currently, the following backends are supported, in order of precedence (this order was empirically determined by testing): - -- [`langid`](https://github.com/saffsd/langid.py): *Standalone Language Identification (LangID) tool.* -- [`langdetect`](https://github.com/Mimino666/langdetect): *Port of Nakatani Shuyo's language-detection library (version from 03/03/2014) to Python.* -- [`pycld2`](https://github.com/aboSamoor/pycld2): *Python bindings for the Compact Langauge Detect 2 (CLD2).* -- [`cld3`](https://github.com/bsolomon1124/pycld3): *Python bindings to the Compact Language Detector v3 (CLD3).* -- [`textblob`](https://github.com/sloria/TextBlob): *Python (2 and 3) library for processing textual data.* - -The way NLP is used is to check that these libraries exist and to take the first one by default. This sets up the `stopfunc.default` for the guess mode. This behavior aims to keep language detection as optional and to avoid multiple specific requirements having the same purpose. 
- -While loaded, the default backend can be switched to another one by using the `_reload_lang` function: - -```python ->>> codext.stopfunc._reload_lang("pycld2") # this loads pycld2 and attaches lang_** functions to the stopfunc submodule ->>> codext.stopfunc._reload_lang() # this unloads any loaded backend -``` - -Each time a backend is loaded, it gets `lang_**` stop functions attached to the `stopfunc` submodule for each supported language. - ------ - -### Ranking Heuristic - -!!! warning "Work in progress" - - This part is still in progress and shall be improved with better features and/or using machine learning. - +For decoding multiple layers of codecs, `codext` features a guess mode relying on an Artificial Intelligence algorithm, the Breadth-First tree Search (BFS). For many cases, the default parameters are sufficient for guess-decoding things. But it may require parameters tuning. + +----- + +### Parameters + +BFS stops when a given condition, in the form of a function applied to the decoded string at the current depth, is met. It returns two results: the decoded string and a tuple with the related encoding names in order of application. + +The following parameters are tunable: + +- `stop_func`: can be a function or a regular expression to be matched (automatically converted to a function that uses the `re` module) ; by default, checks if all input characters are printable. +- `min_depth`: the minimum depth for the tree search (allows to avoid a bit of overhead while checking the current decoded output at a depth with the stop function when we are sure it should not be the right result) ; by default 0. +- `max_depth`: the maximum depth for the tree search ; by default 5. +- `codec_categories`: a string indicating a codec [category](#list-codecs) or a list of [category](#list-codecs) strings ; by default, `None`, meaning the whole [categories](#list-codecs) (very slow). 
+- `found`: a list or tuple of currently found encodings that can be used to save time if the first decoding steps are known ; by default, an empty tuple. + +A simple example for a 1-stage base64-encoded string: + +```python +>>> codext.guess("VGhpcyBpcyBhIHRlc3Q=") +{('base64',): 'This is a test'} +``` + +An example of a 2-stages base64- then base62-encoded string: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7") +{('base62',): 'VGhpcyBpcyBhIHRlc3Q='} +``` + +In the second example, we can see that the given encoded string is not decoded as expected. This is the case because the (default) stop condition is too broad and stops if all the characters of the output are printable. If we have a prior knowledge on what we should expect, we can input a simple string or a regex: + +!!! note "Default stop function" + + :::python + >>> codext.stopfunc.default.__name__ + '...' + + The output depends on whether you have a language detection backend library installed ; see section [*Natural Language Detection*](#natural-language-detection). If no such library is installed, the default function is "`text`". + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test") +{('base62', 'base64'): 'This is a test'} +``` + +In this example, the string "*test*" is converted to a function that uses this string as regular expression. Instead of a string, we can also pass a function. For this purpose, standard [stop functions](#available-stop-functions) are predefined. So, we can for instance use `stopfunc.lang_en` to stop when we find something that is English. Note that working this way gives lots of false positives if the text is very short like in the example case. That's why the `codec_categories` argument is used to only consider baseX codecs. This is also demonstrated in the next examples. 
+ +```python +>>> codext.stopfunc._reload_lang("langdetect") +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", codext.stopfunc.lang_en, codec_categories="base") +('This is a test', ('base62', 'base64')) +``` + +If we know the first encoding, we can set this in the `found` parameter to save time: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", found=["base62"]) +('This is a test', ('base62', 'base64')) +``` + +If we are sure that only `base` (which is a valid [category](#list-codecs)) encodings are used, we can restrict the tree search using the `codec_categories` parameter to save time: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", codec_categories="base") +('This is a test', ('base62', 'base64')) +``` + +Another example of 2-stages encoded string: + +```python +>>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test") +('this is a test', ('base64', 'morse')) +>>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test", codec_categories=["base", "language"]) +('this is a test', ('base64', 'morse')) +``` + +When multiple results are expected, `stop` and `show` arguments can be used respectively to avoid stopping while finding a result and to display the intermediate result. + +!!! warning "Computation time" + + Note that, in the very last examples, the first call takes much longer than the second one but requires no knowledge about the possible [categories](#list-codecs) of encodings. + +----- + +### Available Stop Functions + +A few stop functions are predefined in the `stopfunc` submodule. 
+ +```python +>>> import codext +>>> dir(codext.stopfunc) +['LANG_BACKEND', 'LANG_BACKENDS', ..., '_reload_lang', 'default', 'flag', ..., 'printables', 'regex', 'text'] +``` + +Currently, the following stop functions are provided: + +- `flag`: searches for the pattern "`[Ff][Ll1][Aa4@][Gg9]`" (either UTF-8 or UTF-16) +- `lang_**`: checks if the given lang is detected (note that it first checks if all characters are text ; see `text` hereafter) +- `printables`: checks that every output character is in the set of printables +- `regex(pattern)`: takes one argument, the regular expression, for checking a string against the given pattern +- `text`: checks for printables and an entropy less than 4.6 (empirically determined) + +A stop function can be used as the second argument of the `guess` function or as a keyword-argument, as shown in the following examples: + +```python +>>> codext.guess("...", codext.stopfunc.text) +[...] +>>> codext.guess("...", [...], stop_func=codext.stopfunc.text) +[...] +``` + +When a string is given, it is automatically converted to a `regex` stop function. + +```python +>>> s = codext.encode("pattern testing", "leetspeak") +>>> s +'p4773rn 73571n9' +>>> stop_func = codext.stopfunc.regex("p[a4@][t7]{2}[e3]rn") +>>> stop_func(s) +True +>>> codext.guess(s, stop_func) +[...] +``` + +Additionally, a simple stop function is predefined for CTF players, matching various declinations of the word *flag*. Alternatively, a pattern can always be used when flags have a particular format. + +```python +>>> codext.stopfunc.flag("test string") +False +>>> codext.stopfunc.flag("test f1@9") +True +>>> codext.stopfunc.regex(r"^CTF\{.*?\}$")("CTF{098f6bcd4621d373cade4e832627b4f6}") +True +``` + +The particular type of stop function `lang_**` is explained in the [next section](#natural-language-detection). 
+ +----- + +### Natural Language Detection + +As in many cases, we are trying to decode inputs to readable text, it is necessary to narrow the scope while searching for valid decoded outputs. As matching printables and even text (as defined here before as printables with an entropy of less than 4.6) is too broad for many cases, it may be very useful to apply natural language detection. In `codext`, this is done by relying on Natural Language Processing (NLP) backend libraries, loaded only if they were separately installed. + +Currently, the following backends are supported, in order of precedence (this order was empirically determined by testing): + +- [`langid`](https://github.com/saffsd/langid.py): *Standalone Language Identification (LangID) tool.* +- [`langdetect`](https://github.com/Mimino666/langdetect): *Port of Nakatani Shuyo's language-detection library (version from 03/03/2014) to Python.* +- [`pycld2`](https://github.com/aboSamoor/pycld2): *Python bindings for the Compact Language Detector 2 (CLD2).* +- [`cld3`](https://github.com/bsolomon1124/pycld3): *Python bindings to the Compact Language Detector v3 (CLD3).* +- [`textblob`](https://github.com/sloria/TextBlob): *Python (2 and 3) library for processing textual data.* + +The way NLP is used is to check that these libraries exist and to take the first one by default. This sets up the `stopfunc.default` for the guess mode. This behavior aims to keep language detection as optional and to avoid multiple specific requirements having the same purpose. + +While loaded, the default backend can be switched to another one by using the `_reload_lang` function: + +```python +>>> codext.stopfunc._reload_lang("pycld2") # this loads pycld2 and attaches lang_** functions to the stopfunc submodule +>>> codext.stopfunc._reload_lang() # this unloads any loaded backend +``` + +Each time a backend is loaded, it gets `lang_**` stop functions attached to the `stopfunc` submodule for each supported language. 
+ +----- + +### Ranking Heuristic + +!!! warning "Work in progress" + + This part is still in progress and shall be improved with better features and/or using machine learning. + diff --git a/docs/pages/howto.md b/docs/pages/howto.md index 6163ef6..9e59805 100644 --- a/docs/pages/howto.md +++ b/docs/pages/howto.md @@ -1,242 +1,240 @@ -## How To Create Your Codec - -The purpose of this section is to provide a tutorial for creating new codecs accordingly. - -As explained in [this section](./features.html), `codext` provides the possibility to add new codecs in two ways: - -1. [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56): using this function, the *encode* and *decode* functions must be given as arguments. -2. [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160): using this function, an *encoding map* must be given but can be formatted in different ways to handle various use cases. - -In both cases, a *pattern* is given in argument and aims to define the set of all strings that aim to select this codec. - -!!! important "Codec precedence" - - `codext` uses a local registry that is queried first before attempting native `codecs` lookups. This means that a native codec can be overridden with a *pattern* that matches the same strings. - -The remainder of this section explains how to successfully create a new codec and/or how to make so that it can be added to the library. - -!!! reminder "Contributions welcome !" - - Remember that you can always [submit a request for a new codec](https://github.com/dhondta/python-codext/issues/new) or submit your own with a PR for improving `codext` ! - ------ - -### Generic arguments - -Whatever solution is chosen, the following arguments shall be considered: - -- `ename` (first positional argument): Choose the shortest possible encoding name. 
If it clashes with another codec, always remember that `codext` resolves codecs in order of registry, that is from the first added. Also, it resolves codecs based on the given pattern. So, a codec with a clashing name could still be selected if the pattern does not match for the codec with the precedence but matches for this codec. -- `pattern` (keyword-argument): If not defined, it defaults to the encoding name. It can be a regular expression ; in this case, it should not be too broad. A codec decode or encode function can be parametrized through the pattern using the **first capture group**. It is important to note that the first capture group is used and not any other. This means that any other group definition shall use the do-not-capture specifier, that is "`(?:...)`". - -!!! danger "Too broad pattern" - - Let us consider the following ; we add a codec that handles every character in any number of occurrence. It will then capture anything in the given encoding name and will then always resolve to this codec, preventing any other codec added afterwards to resolve. - - >>> import codext - >>> identity = lambda text, errors="strict": (text, len(text)) - >>> codext.add("everything", identity, identity, pattern=r".*") - >>> codext.encode("test string", "test-encoding-name") # r".*" matches anything, thus including "test-encoding-name" - 'test string' - >>> codext.decode("test string", "test-encoding-name") - 'test string' - >>> codext.encode("test string", "morse") # "morse" has the precedence on codec "everything" we just added - '- . ... - / ... - .-. .. -. --.' 
- >>> test = lambda text, errors="strict": ("TEST", len(t)) - >>> codext.add("test", test) # no pattern given ; should then be matched by encoding name "test" - >>> codext.encode("test string", "test") # should give "TEST" if codec "test" was selected - 'test string' # gives the output of codec "test-encoding-name", - # which has precedence on "test" and a too broad pattern - ------ - -### Which `add` function ? - -At this point, it is necessary to determine what kind of codec you want. If it is a simple map of characters, you should definitely use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160). If it is more complex and cannot be handled using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160)'s options, then you should use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) and define the encode/decode functions yourself. - -A few examples: - -- `morse` is a simple map that does not handle case ; it then uses [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `ignore_case` set to "`encode`" (not "`both`" for encoding and decoding as it does not matter anyway for decoding) -- `whitespace` has 2 codecs defined ; the simple one is a simple bit encoding map, therefore using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype` set to "`bin`" (for pre-converting characters to bits before applying the encoding map), and the complex one uses [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) with its specific endocde/decode functions -- `atbash` defines a dynamic map with a "factory" function, that creates the encoding map according to the parameters supplied in the codec name - -So, before going further, determine the following: - -- What does the new codec map from and to ? E.g. 
if binary input and ordinal output, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype="bin"` and `outype="ord"`. -- Is this codec ignoring case ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and specify which operation(s) should ignore case, e.g. `ignore_case="both"` or `ignore_case="decode"`. -- Should this codec handle no error ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) do not forget to specify `no_error=True`. -- Does the codec yields variable-length encoded tokens ? If so, you can still use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) but you should define `sep` (separator) as `codext` will not be able to handle ambiguities. - -If you find aspects that are not covered in these questions, you shall use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56), then refering to [Case 1](#case-1-generic-encoding-definition). Otherwise, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and refer -to [Case 2](#case-2-encoding-map). - ------ - -### Case 1: Generic encoding definition - -This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) - -The following shall be considered: - -- `encode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot encode. -- `decode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot decode. - -Both functions must take 2 arguments and return 2 values (in order to stick to `codec`'s encode/decode function format): - -- Inputs: `text`, `errors="strict"` ; respectively the text to encode/decode and the error handling mode. -- Outputs: encoded text and length of consumed input text. - -!!! 
note "Error handling mode" - - - `strict`: this is the default ; it means that any error shall raise an exception. - - `ignore`: any error is ignored, adding nothing to the output. - - `replace`: any error yields the given replacement character(s). - - `leave`: any error yields the erroneous input token in the output. - - This last mode is an addition to the native ones. It can be useful for some encodings that must cause no error while encoding and can therefore have their original characters in the output. - -Also, while defining the `encode` and/or `decode` functions, `codext.handle_error` can be used as a shortcut to handle the different modes. It returns a wrapped function that takes `token` and `position` as arguments (see [`excess3`](https://github.com/dhondta/python-codext/blob/master/codext/binary/excess3.py) for an example). - -```python ->>> help(codext.handle_error) -Help on function handle_error in module codext.__common__: - -handle_error(ename, errors, sep='', repl_char='?', repl_minlen=1, decode=False, item='position') - This shortcut function allows to handle error modes given some tuning parameters. - - :param ename: encoding name - :param errors: error handling mode - :param sep: token separator - :param repl_char: replacement character (for use when errors="replace") - :param repl_minlen: repeat number for the replacement character - :param decode: whether we are encoding or decoding - :param item: position item description (for describing the error ; e.g. "group" or "token") - ->>> err = codext.handle_error("test", "strict") ->>> help(err) -Help on function _handle_error in module codext.__common__: - -_handle_error(token, position) - This handles an encoding/decoding error according to the selected handling mode. 
- - :param token: input token to be encoded/decoded - :param position: token position index - -``` - ------ - -### Case 2: Encoding map - -This uses: [`codext.add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) - -The following options shall be considered: - -- `encmap` (second positional argument): This defines the encoding map and is the core of the codec ; 4 subcases are handled and explained hereafter. -- `repl_char` (keyword-argument ; default: "`?`"): The replacement character can be tuned, especially if the default one clashes with a character from the encoding. -- `sep` (keyword-argument ; default: ""): The separator between encoded tokens can be useful to tune, especially when the encoded tokens have a variable length. -- `ignore_case` (keyword-argument ; default: `None`): This defines where the case shall be ignored ; it can be one of the followings: "`encode`", "`decode`" or "`both`". -- `no_error` (keyword-argument ; default: `False`): This sets if errors should be handled as normal or if no error should be considered, simply leaving the input token as is in the output. -- `intype` (keyword-argument ; default: `None`): This specifies the type the input text should be converted to before applying the encoding map (pre-conversion before really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. -- `outype` (keyword-argument ; default: `None`): This specifies the type the output text of the encoding map should be converted from (post-conversion after really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. - -!!! warning "Input/Output types" - - By default, when `intype` is defined, `outype` takes the same value if left `None`. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be explicitely set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). 
- -`encmap` can be defined as follows: - -1. **Simple map**: In this case, the encoding map is a dictionary mapping each input character to an output one (see [`radio`](https://github.com/dhondta/python-codext/blob/master/codext/languages/radio.py) for an example). -2. **List of maps**: In this case, encoding maps are put in a list and referenced by their order number starting from 1, meaning that the `pattern` shall define a capture group with values from 1 to the length of this list (see [`dna`](https://github.com/dhondta/python-codext/blob/master/codext/others/dna.py) for an example). -3. **Parametrized map**: This variant defines a dictionary of regex-selected encoding maps, that is, a dictionary of dictionaries with keys matching the captured groups from codec's pattern. -4. **Map factory function**: This one is implemented by a function that returns the composed encoding map. This function takes a single argument according to the capture group from the `pattern` (see [`affine`](https://github.com/dhondta/python-codext/blob/master/codext/crypto/affine.py) for an example). - -!!! note "Mapping one input character to multiple output characters" - - In some particular cases (e.g. the `navajo` codec), a single input character can be mapped to multiple output ones. It is possible to define them in a map by simply putting them into a list (e.g. a map with `{'A': ["B", "C", "D"]}`). In this case, while encoding, the output character is randomly chosen (e.g. "`A`" will map to "`D`", another time to "`B`", ...). - ------ - -### Self-generated tests - -In order to facilitate testing, a test suite can be automatically generated from a set of *examples*. This is defined in the `__examples__` dunder inside codec's source file (see [`sms`](https://github.com/dhondta/python-codext/blob/master/codext/stegano/sms.py) for an example). 
By default, the `add`/`add_map` function will get `__examples__` from the global scope but this behavior can be overridden by specifying the keyword-argument `examples` (e.g. `add(..., examples=__examples1__)` ; see [`ordinal`](https://github.com/dhondta/python-codext/blob/master/codext/common/ordinal.py) for an example). - -A set of examples is a dictionary specifying the test cases to be considered. The keys are the descriptions of the test cases and the values can be either dictionaries of input texts and their output encoded texts or lists of input texts. Each key has the format "`operation(encodings)`". Operations can be: - -- `enc`: This is for testing the encoding of the nested values (that is, a dictionary of input/outputs). -- `dec`: This is for testing the decoding of the nested values (that is, a dictionary of input/outputs). If this is not specified, the test suite automatically tries to decode from what is defined in `enc`. -- `enc-dec`: This is for testing the encoding AND decoding of the nested values (that is, a list of inputs) ; this one does not enforce what should be the output of the encoding but checks that encoding AND decoding leads to the same input text. This is particularly useful when encoding can yield randomly chosen tokens in the encoded output. - -The `encodings` are a `|`-separated list of encoding names, compliant or not with tested codec's pattern. Faulty names can also be tested as of the examples hereafter. - -Examples of `__examples__` test suites: - -```python -__my_examples__ = { - 'enc(BAD)': None -} -``` - -!!! note "Observations" - - - `__my__examples__` is not the standard dunder, therefore requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. - - `BAD` is assumed to be a bad encoding name, therefore having a dictionary value of `None`, meaning that the test should raise a `LookupError`. - -```python -__examples__ = { - 'enc(codec)': {'string': None} -} -``` - -!!! 
note "Observations" - - - `__examples__` is the standard dunder, therefore NOT requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. - - `codec` is assumed to be a valid encoding name, therefore having a dictionary as its value, but in this special case "`string`" is assumed not to be encoded, its corresponding value is then `None`, meaning that the test should raise a `ValueError`. - -```python -__examples__ = { - 'enc-dec(codec)': ["test string", "TEST STRING", "@random", "@random{1024}"] -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc-dec` is used, meaning that a list of inputs is defined. - - So, whatever its encoded output, the input string shall give the same while applying encoding then decoding. - - The special values `@random` and `@random{1024}`, meaning that test strings are generated from any possible byte-character with a specified length (512 when not specified, otherwise specified with `{...}`). - -```python -__examples__ = { - 'enc(codec)': {"test string": "..."} -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc` only is used, meaning that a dictionary of inputs/outputs is given and `dec` is automatically handled while requiring the exact encoded text but recovering the exact same input while decoding. - -```python -__examples__ = { - 'enc(codec)': {"Test String": "..."}, - 'dec(codec)': {"...": "test string"}, -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc` and `dec` are used, meaning that dictionaries of inputs/outputs are given and the input texts are not necessarily the same (i.e. if text case is not handled by the codec). - ------ - -### Adding a new codec to `codext` - -As a checklist when making a codec for addition in `codext`, please follow these steps: - -1. 
Create your codec file (i.e. starting with a copy of an existing similar one) -2. Place it into the right category folder -3. Add it to the list in [`README.md`](https://github.com/dhondta/python-codext/blob/master/README.md#list-of-codecs) -4. Add its documentation in the [right Markdown file](https://github.com/dhondta/python-codext/tree/master/docs/enc) -5. If self-generated tests are not enough, add manual tests in [the related file](https://github.com/dhondta/python-codext/blob/master/tests/test_manual.py) - +The purpose of this section is to provide a tutorial for creating new codecs accordingly. + +As explained in [this section](./features.html), `codext` provides the possibility to add new codecs in two ways: + +1. [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56): using this function, the *encode* and *decode* functions must be given as arguments. +2. [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160): using this function, an *encoding map* must be given but can be formatted in different ways to handle various use cases. + +In both cases, a *pattern* is given in argument and aims to define the set of all strings that aim to select this codec. + +!!! important "Codec precedence" + + `codext` uses a local registry that is queried first before attempting native `codecs` lookups. This means that a native codec can be overridden with a *pattern* that matches the same strings. + +The remainder of this section explains how to successfully create a new codec and/or how to make so that it can be added to the library. + +!!! reminder "Contributions welcome !" + + Remember that you can always [submit a request for a new codec](https://github.com/dhondta/python-codext/issues/new) or submit your own with a PR for improving `codext` ! 
+ +----- + +### Generic arguments + +Whatever solution is chosen, the following arguments shall be considered: + +- `ename` (first positional argument): Choose the shortest possible encoding name. If it clashes with another codec, always remember that `codext` resolves codecs in order of registry, that is from the first added. Also, it resolves codecs based on the given pattern. So, a codec with a clashing name could still be selected if the pattern does not match for the codec with the precedence but matches for this codec. +- `pattern` (keyword-argument): If not defined, it defaults to the encoding name. It can be a regular expression ; in this case, it should not be too broad. A codec decode or encode function can be parametrized through the pattern using the **first capture group**. It is important to note that the first capture group is used and not any other. This means that any other group definition shall use the do-not-capture specifier, that is "`(?:...)`". + +!!! danger "Too broad pattern" + + Let us consider the following ; we add a codec that handles every character in any number of occurrences. It will then capture anything in the given encoding name and will then always resolve to this codec, preventing any codec added afterwards from resolving. + + >>> import codext + >>> identity = lambda text, errors="strict": (text, len(text)) + >>> codext.add("everything", identity, identity, pattern=r".*") + >>> codext.encode("test string", "test-encoding-name") # r".*" matches anything, thus including "test-encoding-name" + 'test string' + >>> codext.decode("test string", "test-encoding-name") + 'test string' + >>> codext.encode("test string", "morse") # "morse" has the precedence on codec "everything" we just added + '- . ... - / ... - .-. .. -. --.' 
+ >>> test = lambda text, errors="strict": ("TEST", len(text)) + >>> codext.add("test", test) # no pattern given ; should then be matched by encoding name "test" + >>> codext.encode("test string", "test") # should give "TEST" if codec "test" was selected + 'test string' # gives the output of codec "everything", + # which has precedence on "test" and a too broad pattern + +----- + +### Which `add` function ? + +At this point, it is necessary to determine what kind of codec you want. If it is a simple map of characters, you should definitely use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160). If it is more complex and cannot be handled using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160)'s options, then you should use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) and define the encode/decode functions yourself. + +A few examples: + +- `morse` is a simple map that does not handle case ; it then uses [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `ignore_case` set to "`encode`" (not "`both`" for encoding and decoding as it does not matter anyway for decoding) +- `whitespace` has 2 codecs defined ; the simple one is a simple bit encoding map, therefore using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype` set to "`bin`" (for pre-converting characters to bits before applying the encoding map), and the complex one uses [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) with its specific encode/decode functions +- `atbash` defines a dynamic map with a "factory" function, that creates the encoding map according to the parameters supplied in the codec name + +So, before going further, determine the following: + +- What does the new codec map from and to ? E.g. 
if binary input and ordinal output, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype="bin"` and `outype="ord"`. +- Is this codec ignoring case ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and specify which operation(s) should ignore case, e.g. `ignore_case="both"` or `ignore_case="decode"`. +- Should this codec handle no error ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) ; do not forget to specify `no_error=True`. +- Does the codec yield variable-length encoded tokens ? If so, you can still use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) but you should define `sep` (separator) as `codext` will not be able to handle ambiguities. + +If you find aspects that are not covered in these questions, you shall use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56), then referring to [Case 1](#case-1-generic-encoding-definition). Otherwise, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and refer +to [Case 2](#case-2-encoding-map). + +----- + +### Case 1: Generic encoding definition + +This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) + +The following shall be considered: + +- `encode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot encode. +- `decode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot decode. + +Both functions must take 2 arguments and return 2 values (in order to stick to `codecs`' encode/decode function format): + +- Inputs: `text`, `errors="strict"` ; respectively the text to encode/decode and the error handling mode. +- Outputs: encoded text and length of consumed input text. + +!!! 
note "Error handling mode" + + - `strict`: this is the default ; it means that any error shall raise an exception. + - `ignore`: any error is ignored, adding nothing to the output. + - `replace`: any error yields the given replacement character(s). + - `leave`: any error yields the erroneous input token in the output. + + This last mode is an addition to the native ones. It can be useful for some encodings that must cause no error while encoding and can therefore have their original characters in the output. + +Also, while defining the `encode` and/or `decode` functions, `codext.handle_error` can be used as a shortcut to handle the different modes. It returns a wrapped function that takes `token` and `position` as arguments (see [`excess3`](https://github.com/dhondta/python-codext/blob/master/codext/binary/excess3.py) for an example). + +```python +>>> help(codext.handle_error) +Help on function handle_error in module codext.__common__: + +handle_error(ename, errors, sep='', repl_char='?', repl_minlen=1, decode=False, item='position') + This shortcut function allows to handle error modes given some tuning parameters. + + :param ename: encoding name + :param errors: error handling mode + :param sep: token separator + :param repl_char: replacement character (for use when errors="replace") + :param repl_minlen: repeat number for the replacement character + :param decode: whether we are encoding or decoding + :param item: position item description (for describing the error ; e.g. "group" or "token") + +>>> err = codext.handle_error("test", "strict") +>>> help(err) +Help on function _handle_error in module codext.__common__: + +_handle_error(token, position) + This handles an encoding/decoding error according to the selected handling mode. 
+ + :param token: input token to be encoded/decoded + :param position: token position index + +``` + +----- + +### Case 2: Encoding map + +This uses: [`codext.add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) + +The following options shall be considered: + +- `encmap` (second positional argument): This defines the encoding map and is the core of the codec ; 4 subcases are handled and explained hereafter. +- `repl_char` (keyword-argument ; default: "`?`"): The replacement character can be tuned, especially if the default one clashes with a character from the encoding. +- `sep` (keyword-argument ; default: ""): The separator between encoded tokens can be useful to tune, especially when the encoded tokens have a variable length. +- `ignore_case` (keyword-argument ; default: `None`): This defines where the case shall be ignored ; it can be one of the following: "`encode`", "`decode`" or "`both`". +- `no_error` (keyword-argument ; default: `False`): This sets if errors should be handled as normal or if no error should be considered, simply leaving the input token as is in the output. +- `intype` (keyword-argument ; default: `None`): This specifies the type the input text should be converted to before applying the encoding map (pre-conversion before really encoding) ; this can be one of the following: `str`, `bin` or `ord`. +- `outype` (keyword-argument ; default: `None`): This specifies the type the output text of the encoding map should be converted from (post-conversion after really encoding) ; this can be one of the following: `str`, `bin` or `ord`. + +!!! warning "Input/Output types" + + By default, when `intype` is defined, `outype` takes the same value if left `None`. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be explicitly set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). 
+ +`encmap` can be defined as follows: + +1. **Simple map**: In this case, the encoding map is a dictionary mapping each input character to an output one (see [`radio`](https://github.com/dhondta/python-codext/blob/master/codext/languages/radio.py) for an example). +2. **List of maps**: In this case, encoding maps are put in a list and referenced by their order number starting from 1, meaning that the `pattern` shall define a capture group with values from 1 to the length of this list (see [`dna`](https://github.com/dhondta/python-codext/blob/master/codext/others/dna.py) for an example). +3. **Parametrized map**: This variant defines a dictionary of regex-selected encoding maps, that is, a dictionary of dictionaries with keys matching the captured groups from codec's pattern. +4. **Map factory function**: This one is implemented by a function that returns the composed encoding map. This function takes a single argument according to the capture group from the `pattern` (see [`affine`](https://github.com/dhondta/python-codext/blob/master/codext/crypto/affine.py) for an example). + +!!! note "Mapping one input character to multiple output characters" + + In some particular cases (e.g. the `navajo` codec), a single input character can be mapped to multiple output ones. It is possible to define them in a map by simply putting them into a list (e.g. a map with `{'A': ["B", "C", "D"]}`). In this case, while encoding, the output character is randomly chosen (e.g. "`A`" will map to "`D`", another time to "`B`", ...). + +----- + +### Self-generated tests + +In order to facilitate testing, a test suite can be automatically generated from a set of *examples*. This is defined in the `__examples__` dunder inside codec's source file (see [`sms`](https://github.com/dhondta/python-codext/blob/master/codext/stegano/sms.py) for an example). 
By default, the `add`/`add_map` function will get `__examples__` from the global scope but this behavior can be overridden by specifying the keyword-argument `examples` (e.g. `add(..., examples=__examples1__)` ; see [`ordinal`](https://github.com/dhondta/python-codext/blob/master/codext/common/ordinal.py) for an example). + +A set of examples is a dictionary specifying the test cases to be considered. The keys are the descriptions of the test cases and the values can be either dictionaries of input texts and their output encoded texts or lists of input texts. Each key has the format "`operation(encodings)`". Operations can be: + +- `enc`: This is for testing the encoding of the nested values (that is, a dictionary of input/outputs). +- `dec`: This is for testing the decoding of the nested values (that is, a dictionary of input/outputs). If this is not specified, the test suite automatically tries to decode from what is defined in `enc`. +- `enc-dec`: This is for testing the encoding AND decoding of the nested values (that is, a list of inputs) ; this one does not enforce what should be the output of the encoding but checks that encoding AND decoding lead to the same input text. This is particularly useful when encoding can yield randomly chosen tokens in the encoded output. + +The `encodings` are a `|`-separated list of encoding names, compliant or not with tested codec's pattern. Faulty names can also be tested as in the examples hereafter. + +Examples of `__examples__` test suites: + +```python +__my_examples__ = { + 'enc(BAD)': None +} +``` + +!!! note "Observations" + + - `__my_examples__` is not the standard dunder, therefore requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. + - `BAD` is assumed to be a bad encoding name, therefore having a dictionary value of `None`, meaning that the test should raise a `LookupError`. + +```python +__examples__ = { + 'enc(codec)': {'string': None} +} +``` + +!!! 
note "Observations" + + - `__examples__` is the standard dunder, therefore NOT requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. + - `codec` is assumed to be a valid encoding name, therefore having a dictionary as its value, but in this special case "`string`" is assumed not to be encoded, its corresponding value is then `None`, meaning that the test should raise a `ValueError`. + +```python +__examples__ = { + 'enc-dec(codec)': ["test string", "TEST STRING", "@random", "@random{1024}"] +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc-dec` is used, meaning that a list of inputs is defined. + - So, whatever its encoded output, the input string shall give the same while applying encoding then decoding. + - The special values `@random` and `@random{1024}`, meaning that test strings are generated from any possible byte-character with a specified length (512 when not specified, otherwise specified with `{...}`). + +```python +__examples__ = { + 'enc(codec)': {"test string": "..."} +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc` only is used, meaning that a dictionary of inputs/outputs is given and `dec` is automatically handled while requiring the exact encoded text but recovering the exact same input while decoding. + +```python +__examples__ = { + 'enc(codec)': {"Test String": "..."}, + 'dec(codec)': {"...": "test string"}, +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc` and `dec` are used, meaning that dictionaries of inputs/outputs are given and the input texts are not necessarily the same (i.e. if text case is not handled by the codec). + +----- + +### Adding a new codec to `codext` + +As a checklist when making a codec for addition in `codext`, please follow these steps: + +1. 
Create your codec file (i.e. starting with a copy of an existing similar one) +2. Place it into the right category folder +3. Add it to the list in [`README.md`](https://github.com/dhondta/python-codext/blob/master/README.md#list-of-codecs) +4. Add its documentation in the [right Markdown file](https://github.com/dhondta/python-codext/tree/master/docs/enc) +5. If self-generated tests are not enough, add manual tests in [the related file](https://github.com/dhondta/python-codext/blob/master/tests/test_manual.py) + diff --git a/docs/pages/index.md b/docs/pages/index.md index 185dd25..2579b17 100644 --- a/docs/pages/index.md +++ b/docs/pages/index.md @@ -1,11 +1,9 @@ -## Introduction - -Codext, contraction of "*codecs*" and "*extension*", is a library that gathers many additional encodings for use with [`codecs`](https://docs.python.org/3/library/codecs.html). When imported, it registers new encodings to an extended codecs registry for making the encodings available from the `codecs.(decode|encode|open)` API. It also features [CLI tools](./cli.html) and a [guess mode](./features.html#guess-decode-an-arbitrary-input) for decoding multiple layers of codecs. - -### Setup - -This library is available on [PyPi](https://pypi.python.org/pypi/codext/) and can be simply installed using Pip: - -```sh -pip install codext -``` +Codext, contraction of "*codecs*" and "*extension*", is a library that gathers many additional encodings for use with [`codecs`](https://docs.python.org/3/library/codecs.html). When imported, it registers new encodings to an extended codecs registry for making the encodings available from the `codecs.(decode|encode|open)` API. It also features [CLI tools](./cli.html) and a [guess mode](./features.html#guess-decode-an-arbitrary-input) for decoding multiple layers of codecs. 
+ +### Setup + +This library is available on [PyPi](https://pypi.python.org/pypi/codext/) and can be simply installed using Pip: + +```sh +pip install codext +``` diff --git a/docs/pages/manipulations.md b/docs/pages/manipulations.md index 8857ca7..340f89c 100644 --- a/docs/pages/manipulations.md +++ b/docs/pages/manipulations.md @@ -1,75 +1,74 @@ -## String tranformations - -`codext` also defines multiple dummy string manipulation/transformation codecs, essentially for use with the CLI tool and for the sake of simplicity. - ------ - -### Case-related operations - -These transformation functions are simple string transformations, including `str`'s methods. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`camelcase` | text --> camel-case text | `camel` | no decoding -`capitalize` | text <-> capitalized text | | decoding "uncapitalizes" the text -`lowercase` | text <-> lowercase text | `lower` | decoding is `uppercase` -`pascalcase` | text --> pascal-case text | `pascal` | no decoding -`slugify` | text --> slug | `slug`, `kebab`, `kebabcase` | no decoding -`snakecase` | text --> snake-case text | `snake` | no decoding -`swapcase` | text <-> case-swapped text | `swap`, `invert`, `invertcase` | -`title` | text <-> titled text | | decoding "untitles" the text -`uppercase` | text <-> uppercase text | `upper` | decoding is `lowercase` - -Of course, these transformations have no interest while using them in Python as the `str` methods can be called. It can be useful while using `codext` from the terminal (see [*CLI tool*](cli.html)). - -Some simple examples: - -```sh -$ echo -en "test string" | codext encode swap-case -TEST STRING - -$ echo -en "test string" | codext encode camel_case -testString - -$ echo -en "test string" | codext encode kebab_case -test-string -``` - ------ - -### Dummy string operations - -These transformation functions are simple string transformations. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`replace` | text <-> text with multi-chars replaced | | parametrized with a _string_ and its _replacement_ -`reverse` | text <-> reversed text | | -`reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace) -`strip-spaces` | text <-> all whitespaces stripped | | -`substitute` | text <-> text with token substituted | | -`tokenize` | text <-> text split in tokens of length N | | parametrized with _N_ - -As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)). - -A simple example: - -```sh -$ echo -en "test string" | codext encode reverse-words | codext encode reverse replace-\ _ -string_test -``` - -Another example: - -```sh -$ echo -en "3132333435" | codext encode tokenize-2 -31 32 33 34 35 -``` - -Or using encodings chaining: - -```sh -$ echo -en "test string" | codext encode reverse-words reverse substitute-string/phrase -phrase test -``` - +`codext` also defines multiple dummy string manipulation/transformation codecs, essentially for use with the CLI tool and for the sake of simplicity. + +----- + +### Case-related operations + +These transformation functions are simple string transformations, including `str`'s methods. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`camelcase` | text --> camel-case text | `camel` | no decoding +`capitalize` | text <-> capitalized text | | decoding "uncapitalizes" the text +`lowercase` | text <-> lowercase text | `lower` | decoding is `uppercase` +`pascalcase` | text --> pascal-case text | `pascal` | no decoding +`screamingsnakecase` | text --> screaming-snake-case text | `screaming-snake`, `screaming_snake_case` | no decoding +`slugify` | text --> slug | `slug`, `kebab`, `kebabcase` | no decoding +`snakecase` | text --> snake-case text | `snake` | no decoding +`swapcase` | text <-> case-swapped text | `swap`, `invert`, `invertcase` | +`title` | text <-> titled text | | decoding "untitles" the text +`uppercase` | text <-> uppercase text | `upper` | decoding is `lowercase` + +Of course, these transformations have no interest while using them in Python as the `str` methods can be called. It can be useful while using `codext` from the terminal (see [*CLI tool*](cli.html)). + +Some simple examples: + +```sh +$ echo -en "test string" | codext encode swap-case +TEST STRING + +$ echo -en "test string" | codext encode camel_case +testString + +$ echo -en "test string" | codext encode kebab_case +test-string +``` + +----- + +### Dummy string operations + +These transformation functions are simple string transformations. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`replace` | text <-> text with multi-chars replaced | | parametrized with a _string_ and its _replacement_ +`reverse` | text <-> reversed text | | +`reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace) +`strip-spaces` | text <-> all whitespaces stripped | | +`substitute` | text <-> text with token substituted | | +`tokenize` | text <-> text split in tokens of length N | | parametrized with _N_ + +As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)). + +A simple example: + +```sh +$ echo -en "test string" | codext encode reverse-words | codext encode reverse replace-\ _ +string_test +``` + +Another example: + +```sh +$ echo -en "3132333435" | codext encode tokenize-2 +31 32 33 34 35 +``` + +Or using encodings chaining: + +```sh +$ echo -en "test string" | codext encode reverse-words reverse substitute-string/phrase +phrase test +``` + diff --git a/pyproject.toml b/pyproject.toml index 099d04b..b204596 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,17 +16,13 @@ authors = [ description = "Native codecs extension" license = {file = "LICENSE"} keywords = ["python", "development", "programming", "codecs", "encodings"] -requires-python = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4" +requires-python = ">=3.8,<4" classifiers = [ "Development Status :: 5 - Production/Stable", "Environment :: Console", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming 
Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", @@ -34,9 +30,7 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ - "markdown2==2.3.10; python_version=='2.7'", - "markdown2>=2.4.0; python_version>='3.6'", - "six", + "markdown2>=2.4.0", ] dynamic = ["version"] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..fcccae1 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = src diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index a4cc557..d3fbbb2 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.14.2 +1.15.0 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index d88dcbe..a2ff0ef 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -1,1517 +1,1520 @@ -# -*- coding: UTF-8 -*- -import _codecs -import codecs -import json -import os -import random -import re -import sys -from encodings.aliases import aliases as ALIASES -from functools import reduce, update_wrapper, wraps -from importlib import import_module -from inspect import currentframe -from itertools import chain, product -from locale import getlocale -from math import log -from pkgutil import iter_modules -from platform import system -from random import randint -from six import binary_type, string_types, text_type, BytesIO -from string import * -from types import FunctionType, ModuleType -try: # Python2 - import __builtin__ as builtins -except ImportError: - import builtins -try: # Python2 - from inspect import getfullargspec -except ImportError: - from inspect import getargspec as getfullargspec -try: # Python2 - from string import maketrans -except ImportError: - maketrans = str.maketrans -try: # Python3 - from importlib import reload -except ImportError: - pass -try: # from Python 3.11, it seems that 'sre_parse' is not bound to 're' anymore - re.sre_parse -except AttributeError: - import sre_parse as __sre_parse - re.sre_parse 
= __sre_parse - - -__all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", - "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "i2s", "is_native", - "list_categories", "list_encodings", "list_macros", "lookup", "maketrans", "os", "rank", "re", "register", - "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", - "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] -CODECS_REGISTRY = None -CODECS_OVERWRITTEN = [] -CODECS_CATEGORIES = ["native", "custom"] -CODECS_CACHE = {} -LANG = getlocale() -if LANG: - LANG = (LANG[0] or "")[:2].lower() -MASKS = { - 'a': printable, - 'b': "".join(chr(i) for i in range(256)), - 'd': digits, - 'h': digits + "abcdef", - 'H': digits + "ABCDEF", - 'l': ascii_lowercase, - 'p': punctuation, - 's': " ", - 'u': ascii_uppercase, -} - -__codecs_registry = [] - -MACROS = {} -PERS_MACROS = {} -PERS_MACROS_FILE = os.path.expanduser("~/.codext-macros.json") - -DARWIN = system() == "Darwin" -LINUX = system() == "Linux" -PY3 = sys.version[0] == "3" -UNIX = DARWIN or LINUX -WINDOWS = system() == "Windows" - -entropy = lambda s: -sum([p * log(p, 2) for p in [float(s.count(c)) / len(s) for c in set(s)]]) - -isb = lambda s: isinstance(s, binary_type) -iss = lambda s: isinstance(s, string_types) -fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x - -s2i = lambda s: int(codecs.encode(s, "base16"), 16) -exc_name = lambda e: "".join(t.capitalize() for t in re.split(r"[-_+]", e)) - - -def i2s(input): - h = hex(input)[2:].rstrip("eL") - return codecs.decode(h.zfill(len(h) + len(h) % 2), "hex") - - -class CodecMacro(tuple): - """Macro details when looking up the codec registry. 
""" - def __new__(cls, name): - self = tuple.__new__(cls) - self.name = name - # get from personal macros first - try: - self.codecs = PERS_MACROS[name] - except KeyError: - try: - self.codecs = MACROS[name] - except KeyError: - raise LookupError("unknown macro: %s" % name) - if not isinstance(self.codecs, (tuple, list)): - raise ValueError("bad macro list: %s" % str(self.codecs)) - self.codecs = [lookup(e, False) for e in self.codecs] # lookup(e, False) - self.parameters = {'name': name, 'category': "macro"} # ^ means that macros won't be nestable - # test examples to check that the chain of encodings works - for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items(): - if re.match(r"enc(-dec)?\(", action): - for e in (examples.keys() if action.startswith("enc(") else examples or []): - rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) - if rd: - for n in (rd.group(2) or "512").split(","): - s = "".join(chr(randint(0, 255)) for i in range(int(n))) - self.encode(s.lower() if rd.group(1) else s) - continue - self.encode(e) - - class Codec: - decode = self.decode - encode = self.encode - - class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - return b(self.encode(input, self.errors)[0]) - self.incrementalencoder = IncrementalEncoder - - class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return ensure_str(self.decode(input, self.errors)[0]) - self.incrementaldecoder = IncrementalDecoder - - class StreamWriter(Codec, codecs.StreamWriter): - charbuffertype = bytes - self.streamwriter = StreamWriter - - class StreamReader(Codec, codecs.StreamReader): - charbuffertype = bytes - self.streamreader = StreamReader - - return self - - def decode(self, input, error="strict"): - """ Decode with each codec in reverse order. 
""" - for ci in self.codecs[::-1]: - input, l = ci.decode(input, error) - return input, l - - def encode(self, input, error="strict"): - """ Encode with each codec. """ - for ci in self.codecs: - input, l = ci.encode(input, error) - return input, l - - def __repr__(self): - return "" % (self.name, id(self)) - - -# inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python -class Repr(object): - def __init__(self, name, func): - self.__name = name - self.__func = func - update_wrapper(self, func) - - def __call__(self, *args, **kwargs): - return self.__func(*args, **kwargs) - - def __repr__(self): - return "" % (self.__name, id(self)) - - -def __stdin_pipe(): - """ Stdin pipe read function. """ - try: - with open(0, 'rb') as f: - for l in f: - yield l - except TypeError: - for l in sys.stdin: - yield l - - -def _input(infile): - # handle input file or stdin - c = b("") - if infile: - with open(infile, 'rb') as f: - c = f.read() - else: - for line in __stdin_pipe(): - c += line - return c - - -def _set_exc(name, etype="ValueError"): - if not hasattr(builtins, name): - exec("class %s(%s): __module__ = 'builtins'" % (name, etype)) - setattr(builtins, name, locals()[name]) -_set_exc("InputSizeLimitError") -_set_exc("ParameterError") - - -def _stripl(s, st_lines, st_crlf): - if st_crlf: - s = s.replace(b"\r\n", b"") if isb(s) else s.replace("\r\n", "") - if st_lines: - s = s.replace(b"\n", b"") if isb(s) else s.replace("\n", "") - return s - - -def _with_repr(name): - def _wrapper(f): - return Repr(name, f) - return _wrapper - - -def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False, **kwargs): - """ This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically - naming the encoding with a pattern and with file handling. 
- - :param ename: encoding name - :param encode: encoding function or None - :param decode: decoding function or None - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - """ - remove(ename) - if encode: - if not isinstance(encode, FunctionType): - raise ValueError("Bad 'encode' function") - _set_exc("%sEncodeError" % exc_name(ename)) # create the custom encode exception as a builtin - if decode: - if not isinstance(decode, FunctionType): - raise ValueError("Bad 'decode' function") - _set_exc("%sDecodeError" % exc_name(ename)) # create the custom decode exception as a builtin - if not encode and not decode: - raise ValueError("At least one en/decoding function must be defined") - for exc in kwargs.get('extra_exceptions', []): - _set_exc(exc) # create additional custom exceptions as builtins - glob = currentframe().f_back.f_globals - # search function for the new encoding - @_with_repr(ename) - def getregentry(encoding): - if encoding != ename and not (pattern and re.match(pattern, encoding)): - return - fenc, fdec, name = encode, decode, encoding - # prepare CodecInfo input arguments - if pattern: - m, args, i = re.match(pattern, encoding), [], 1 - try: - while True: - try: - g = m.group(i) or "" - if g.isdigit() and not g.startswith("0") and "".join(set(g)) != "01": - g = int(g) - args += [g] - i += 1 - except AttributeError: - # this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match - if m is not None: - raise - return - except IndexError: - # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; - # in this case, if fenc/fdec is a decorated function, execute it with no arg - if len(args) == 0: - if fenc and 
len(getfullargspec(fenc).args) == 1: - fenc = fenc() - if fdec and len(getfullargspec(fdec).args) == 1: - fdec = fdec() - else: - fenc = fenc(*args) if fenc else fenc - fdec = fdec(*args) if fdec else fdec - if fenc: - fenc = fix_inout_formats(fenc) - if fdec: - fdec = fix_inout_formats(fdec) - sl, sc = kwargs.pop('strip_lines', False), kwargs.pop('strip_crlf', False) - if sl or sc: - def _striplines(f): - def __wrapper(input, *a, **kw): - return f(_stripl(input, sc, sl), *a, **kw) - return __wrapper - # this fixes issues with wrapped encoded inputs - fdec = _striplines(fdec) - - class Codec(codecs.Codec): - def encode(self, input, errors="strict"): - if fenc is None: - raise NotImplementedError - return fenc(input, errors) - - def decode(self, input, errors="strict"): - if fdec is None: - raise NotImplementedError - return fdec(input, errors) - - class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - if fenc is None: - raise NotImplementedError - return b(fenc(input, self.errors)[0]) - - class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - if fdec is None: - raise NotImplementedError - return ensure_str(fdec(input, self.errors)[0]) - - class StreamWriter(Codec, codecs.StreamWriter): - charbuffertype = bytes - - class StreamReader(Codec, codecs.StreamReader): - charbuffertype = bytes - - ci = codecs.CodecInfo( - name=name, - encode=Codec().encode, - decode=Codec().decode, - incrementalencoder=IncrementalEncoder, - incrementaldecoder=IncrementalDecoder, - streamwriter=StreamWriter, - streamreader=StreamReader, - _is_text_encoding=text, - ) - ci.parameters = kwargs - ci.parameters['name'] = ename - ci.parameters['add_to_codecs'] = add_to_codecs - ci.parameters['pattern'] = pattern - ci.parameters['text'] = text - f = glob.get('__file__', os.path.join("custom", "_")) - cat = f.split(os.path.sep)[-2].rstrip("s") - if cat not in CODECS_CATEGORIES: - CODECS_CATEGORIES.append(cat) - 
ci.parameters['category'] = kwargs.get('category', cat) - ci.parameters['examples'] = kwargs.get('examples', glob.get('__examples__')) - ci.parameters['guess'] = kwargs.get('guess', glob.get('__guess__', [ename])) or [] - ci.parameters['module'] = kwargs.get('module', glob.get('__name__')) - ci.parameters.setdefault("scoring", {}) - for attr in ["bonus_func", "entropy", "expansion_factor", "len_charset", "penalty", "printables_rate", - "padding_char", "transitive"]: - a = kwargs.pop(attr, None) - if a is not None: - ci.parameters['scoring'][attr] = a - return ci - - getregentry.__name__ = re.sub(r"[\s\-]", "_", ename) - if kwargs.get('aliases'): - getregentry.__aliases__ = list(map(lambda n: re.sub(r"[\s\-]", "_", n), kwargs['aliases'])) - getregentry.__pattern__ = pattern - register(getregentry, add_to_codecs) - return getregentry - - -def add_macro(mname, *encodings): - """ This allows to define a macro, chaining multiple codecs one after the other. This relies on a default set of - macros from a YAML file embedded in the package and a local YAML file from the home folder that takes - precedence for defining personal macros. 
- - :param mname: macro name - :param encodings: encoding names of the encodings to be chained with the macro - """ - global PERS_MACROS - # check for name clash with alreday existing macros and codecs - if mname in MACROS or mname in PERS_MACROS: - raise ValueError("Macro name already exists") - try: - ci = lookup(mname, False) - raise ValueError("Macro name clashes with codec '%s'" % ci.name) - except LookupError: - pass - try: - PERS_MACROS[mname] = encodings - CodecMacro(mname) - with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f, indent=2) - except ValueError: - del PERS_MACROS[mname] - raise -codecs.add_macro = add_macro - - -def add_map(ename, encmap, repl_char="?", sep="", ignore_case=None, no_error=False, intype=None, outype=None, **kwargs): - """ This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs - module dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with - a pattern and with file handling (if text is True). 
- - :param ename: encoding name - :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture - group of the regex pattern) or a function building the encoding map - :param repl_char: replacement char (used when errors handling is set to "replace") - :param sep: string of possible character separators (hence, only single-char separators are considered) ; - - while encoding, the first separator is used - - while decoding, separators can be mixed in the input text - :param ignore_case: ignore text case while encoding and/or decoding - :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) - :param intype: specify the input type for pre-transforming the input text - :param outype: specify the output type for post-transforming the output text - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - """ - outype = outype or intype - if ignore_case not in [None, "encode", "decode", "both"]: - raise ValueError("Bad ignore_case parameter while creating encoding map") - if intype not in [None, "str", "bin", "ord"]: - raise ValueError("Bad input type parameter while creating encoding map") - if outype not in [None, "str", "bin", "ord"]: - raise ValueError("Bad output type parameter while creating encoding map") - - def __generic_code(decode=False): - def _wrapper(param): - """ The parameter for wrapping comes from the encoding regex pattern ; e.g. - [no pattern] => param will be None everytime - r"barbie[-_]?([1-4])$" => param could be int 1, 2, 3 or 4 - r"^morse(|[-_]?.{3})$" => param could be None, "-ABC" (for mapping to ".-/") - - In order of precedence: - 1. 
when param is a key in mapdict or mapdict is a list of encoding maps (hence in the case of "barbie...", - param MUST be an int, otherwise for the first case it could clash with a character of the encoding map) - 2. otherwise handle it as a new encoding character map "ABC" translates to ".-/" for morse - """ - p = param - if isinstance(encmap, FunctionType): - mapdict = encmap(p) - p = None - else: - mapdict = encmap - if isinstance(mapdict, dict): - smapdict = {k: v for k, v in mapdict.items()} - elif isinstance(mapdict, list) and isinstance(mapdict[0], dict): - smapdict = {k: v for k, v in mapdict[0].items()} - else: - raise ValueError("Bad mapping dictionary or list of mapping dictionaries") - if p is not None: - # case 1: param is empty string - if p == "": - if isinstance(mapdict, list): - smapdict = {k: v for k, v in mapdict[0].items()} - elif isinstance(mapdict, dict): - if '' in mapdict.keys() and isinstance(mapdict[''], dict): - smapdict = {k: v for k, v in mapdict[''].items()} - else: - smapdict = {k: v for k, v in mapdict.items()} - # no 'else' handling a LookupError here ; this case is covered by the first if/elif/else block - # case 2: list or dictionary or dictionary of numbered encodings - elif isinstance(p, int): - # if mapdict is a list, we shall align the parameter (starting from 1) as an index (starting from 0) - if isinstance(mapdict, list): - p -= 1 - if isinstance(mapdict, list) and 0 <= p < len(mapdict) or \ - isinstance(mapdict, dict) and p in mapdict.keys(): - smapdict = {k: v for k, v in mapdict[p].items()} - else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - # case 3: dictionary of regex-selected encoding mappings - elif isinstance(mapdict, dict) and isinstance(list(mapdict.values())[0], dict): - tmp = None - for r, d in mapdict.items(): - if r == '': # this is already handled in case 1 ; anyway, an empty regex always matches, hence - continue # it must be excluded - if re.match(r, p): - tmp = d - break - 
if tmp is None: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - smapdict = tmp - # case 4: encoding characters translation - else: - # collect base tokens in order of appearance in the mapping dictionary - base_tokens = "" - for _, c in sorted(mapdict.items()): - for t in c: - for st in t: - if st not in base_tokens: - base_tokens += st - if " " not in sep: - base_tokens = base_tokens.replace(" ", "") - if len(p) > 0 and p[0] in "-_" and len(p[1:]) == len(set(p[1:])) == len(base_tokens): - p = p[1:] - if len(p) == len(set(p)) == len(base_tokens): - t = maketrans(base_tokens, p) - for k, v in smapdict.items(): - smapdict[k] = [x.translate(t) for x in v] if isinstance(v, list) else v.translate(t) - else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - if ignore_case is not None: - cases = ["upper", "lower"] - case_d = cases[any(c in str(list(smapdict.values())) for c in "abcdefghijklmnopqrstuvwxyz")] - case_e = cases[any(c in str(list(smapdict.keys())) for c in "abcdefghijklmnopqrstuvwxyz")] - i = ignore_case - smapdict = {getattr(k, case_e)() if i in ["both", "encode"] else k: \ - ([getattr(x, case_d)() for x in v] if isinstance(v, list) else getattr(v, case_d)()) \ - if i in ["both", "decode"] else v for k, v in smapdict.items()} - if decode: - tmp = {} - # this has a meaning for encoding maps that could have clashes in encoded chars (e.g. 
Bacon's cipher ; - # I => abaaa but also J => abaaa, with the following, we keep I instead of letting J overwrite it) - for k, v in sorted(smapdict.items()): - if not isinstance(v, list): - v = [v] - for x in v: - if x not in tmp.keys(): - tmp[x] = k - smapdict, cs = tmp, reduce(lambda acc, x: acc + x, tmp.keys()) - kwargs['strip_lines'], kwargs['strip_crlf'] = "\n" not in set(cs), "\r\n" not in cs - # this allows to avoid an error with Python2 in the "for i, c in enumerate(parts)" loop - if '' not in smapdict.keys(): - smapdict[''] = "" - # determine token and result lengths - tmaxlen = max(map(len, smapdict.keys())) - tminlen = max(1, min(map(len, set(smapdict.keys()) - {''}))) - l = [] - for x in smapdict.values(): - getattr(l, ["append", "extend"][isinstance(x, list)])(x) - rminlen = max(1, min(map(len, set(l) - {''}))) - - # generic encoding/decoding function for map encodings - def code(text, errors="strict"): - icase = ignore_case == "both" or \ - decode and ignore_case == "decode" or \ - not decode and ignore_case == "encode" - if icase: - case = case_d if decode else case_e - if no_error: - errors = "leave" - text = ensure_str(text) - if not decode: - if intype == "bin": - text = "".join("{:0>8}".format(bin(ord(c))[2:]) for c in text) - elif intype == "ord": - text = "".join(str(ord(c)).zfill(3) for c in text) - r = "" - lsep = "" if decode else sep if len(sep) <= 1 else sep[0] - kind = ["character", "token"][tmaxlen > 1] - error_func = handle_error(ename, errors, lsep, repl_char, rminlen, decode, kind) - - # get the value from the mapping dictionary, trying the token with its inverted case if relevant - def __get_value(token, position, case_changed=False): - try: - result = smapdict[token] - except KeyError: - if icase and not case_changed: - token_inv_case = getattr(token, case)() - return __get_value(token_inv_case, position, True) - return error_func(token, position) - if isinstance(result, list): - result = result[0] - return result + lsep - - # if a 
separator is defined, rely on it by splitting the input text - if decode and len(sep) > 0: - for i, c in enumerate(re.split("[" + sep + "]", text)): - r += __get_value(c, i) - # otherwise, move through the text using a cursor for tokenizing it ; this allows defining more complex - # encodings with variable token lengths - else: - cursor, bad = 0, "" - while cursor < len(text): - token = text[cursor:cursor+1] - for l in range(tminlen, tmaxlen + 1): - token = text[cursor:cursor+l] - if token in smapdict.keys() or icase and getattr(token, case)() in smapdict.keys(): - r += __get_value(token, cursor) - cursor += l - break - else: - # collect bad chars and only move the cursor one char to the right - bad += text[cursor] - cursor += 1 - # if the number of bad chars is the minimum token length, consume it and start a new buffer - if len(bad) == tminlen or errors == "leave": - posn = cursor - len(bad) - r += error_func(bad, posn) - bad = "" - if decode: - if outype in ["bin", "ord"]: - tmp, r = "", r.replace(lsep, "") - step = [3, 8][outype == "bin"] - for i in range(0, len(r), step): - s = r[i:i+step] - try: - tmp += chr(int(s, 2) if outype == "bin" else int(s)) - except ValueError: - if len(s) > 0: - tmp += "[" + s + "]" - r = tmp + lsep - return r[:len(r)-len(lsep)], len(b(text)) - return code - if re.search(r"\([^(?:)]", kwargs.get('pattern', "")) is None: - # in this case, there is no capturing group for parametrization - return _wrapper(None) - return _wrapper - - glob = currentframe().f_back.f_globals - kwargs['category'] = glob['__file__'].split(os.path.sep)[-2].rstrip("s") - kwargs['examples'] = kwargs.get('examples', glob.get('__examples__')) - kwargs['encmap'] = encmap - kwargs['repl_char'] = repl_char - kwargs['sep'] = sep - kwargs['ignore_case'] = ignore_case - kwargs['no_error'] = no_error - kwargs['intype'] = intype - kwargs['outype'] = outype - kwargs['module'] = glob.get('__name__') - try: - if isinstance(encmap, dict): - smapdict = {k: v for k, v in 
encmap.items()} - elif isinstance(encmap, list) and isinstance(encmap[0], dict): - smapdict = {k: v for k, v in encmap[0].items()} - kwargs['repl_minlen'] = i = max(1, min(map(len, set(smapdict.values()) - {''}))) - kwargs['repl_minlen_b'] = max(1, min(map(len, map(b, set(smapdict.values()) - {''})))) - except: - pass - return add(ename, __generic_code(), __generic_code(True), **kwargs) -codecs.add_map = add_map - - -def clear(): - """ Clear codext's local registry of search functions. """ - global __codecs_registry, MACROS, PERS_MACROS - __codecs_registry, MACROS, PERS_MACROS = [], {}, {} -codecs.clear = clear - - -def examples(encoding, number=10): - """ Use the search function to get the matching encodings and provide examples of valid encoding names. """ - e = [] - for name in search(encoding): - for search_function in __codecs_registry: - n = search_function.__name__ - if name in [n, n.replace("_", "-")]: - temp = [] - for s in generate_strings_from_regex(search_function.__pattern__, yield_max=16*number): - temp.append(s) - random.shuffle(temp) - i = 0 - while i < min(number, len(temp)): - if not temp[i].isdigit(): - try: - lookup(temp[i], False) - e.append(temp[i]) - except LookupError: - pass - i += 1 - for alias, codec in ALIASES.items(): - if name == codec: - if codec not in e: - e.append(codec) - if not alias.isdigit(): - e.append(alias) - random.shuffle(e) - return sorted([e[i] for i in range(min(number, len(e)))], key=_human_keys) -codecs.examples = examples - - -def is_native(encoding): - """ Determine if a given encoding is native or not. """ - return lookup(encoding, False).parameters['category'] == "native" - - -def list_categories(): - """ Get a list of all codec categories. 
""" - c = CODECS_CATEGORIES - root = os.path.dirname(__file__) - for d in os.listdir(root): - if os.path.isdir(os.path.join(root, d)) and not d.startswith("__"): - c.append(d.rstrip("s")) - # particular category, hardcoded from base/_base.py - c += ["base-generic"] - return c -list_categories() - - -def list_encodings(*categories): - """ Get a list of all codecs. """ - # if "non-native" is in the input list, extend the list with the whole categories but "native" - categories, exclude = list(categories), [] - for c in categories[:]: - if c == "non-native": - for c in CODECS_CATEGORIES: - if c == "native" or c in categories: - continue - categories.append(c) - categories.remove("non-native") - if c.startswith("~"): - exclude.append(c[1:]) - categories.remove(c) - try: - categories.remove(c[1:]) - except ValueError: - pass - # now, filter codecs according to the input list of categories - enc = [] - if (len(categories) == 0 or "native" in categories) and "native" not in exclude: - for a in set(ALIASES.values()): - try: - ci = __orig_lookup(a) - except LookupError: - continue - if lookup(a) is ci: - enc.append(ci.name) - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - name = search_function.__name__.replace("_", "-") - p = search_function.__pattern__ - ci = search_function(name) if p is None else search_function(generate_string_from_regex(p)) - c = "other" if ci is None else ci.parameters['category'] - if (len(categories) == 0 or c in categories) and c not in exclude: - enc.append(name) - for category in categories: - if category not in CODECS_CATEGORIES: - raise ValueError("Category '%s' does not exist" % category) - return sorted(list(set(enc)), key=_human_keys) - - -def list_macros(): - """ Get a list of all macros, with the precedence on personal ones. 
""" - return sorted(list(set(list(MACROS.keys()) + list(PERS_MACROS.keys())))) - - -def remove(name): - """ Remove all search functions matching the input encoding name from codext's local registry or any macro with the - given name. """ - global __codecs_registry, MACROS, PERS_MACROS - tbr = [] - for search_function in __codecs_registry: - if search_function(name) is not None: - tbr.append(search_function) - for search_function in tbr: - __codecs_registry.remove(search_function) - try: - del MACROS[name] - except KeyError: - pass - try: - del PERS_MACROS[name] - with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f, indent=2) - except KeyError: - pass - try: - del CODECS_CACHE[name] - except KeyError: - pass - for s in ["En", "De"]: - try: - delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) - except AttributeError: - pass -codecs.remove = remove - - -def reset(): - """ Reset codext's local registry of search functions and macros. """ - global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS - clear() - d = os.path.dirname(__file__) - for pkg in sorted(os.listdir(d)): - if pkg.startswith("_") or not os.path.isdir(os.path.join(d, pkg)): - continue - reload(import_module("codext." + pkg)) - # backup codext's registry - if CODECS_REGISTRY is None: - CODECS_REGISTRY = __codecs_registry[:] - # restore codext's registry - else: - __codecs_registry = CODECS_REGISTRY[:] - # restore codext's embedded set of macros - with open(os.path.join(os.path.dirname(__file__), "macros.json")) as f: - MACROS = json.load(f) - # reload personal set of macros - PERS_MACROS = {} - if os.path.exists(PERS_MACROS_FILE): - with open(PERS_MACROS_FILE) as f: - PERS_MACROS = json.load(f) -codecs.reset = reset - - -# conversion functions -def b(s): - """ Non-crashing bytes conversion function. 
""" - if PY3: - try: - return s.encode("latin-1") - except: - pass - try: - return s.encode("utf-8") - except: - pass - return s - - -def ensure_str(s, encoding='utf-8', errors='strict'): - """ Similar to six.ensure_str. Adapted here to avoid messing up with six version errors. """ - if not PY3 and isinstance(s, text_type): - return s.encode(encoding, errors) - elif PY3 and isinstance(s, binary_type): - try: - return s.decode(encoding, errors) - except: - return s.decode("latin-1") - return s - - -# make conversion functions compatible with input/output strings/bytes -def fix_inout_formats(f): - """ This decorator ensures that the first output of f will have the same text format as the first input (str or - bytes). """ - @wraps(f) - def _wrapper(*args, **kwargs): - a0 = args[0] - a0_isb = isb(a0) - a0 = ensure_str(a0) if iss(a0) or a0_isb else a0 - r = f(a0, *args[1:], **kwargs) - # special case: input is in bytes ; ensure that the returned length is this of the bytes, not this processed by - # the decode/encode function - if isinstance(r, (tuple, list)) and isinstance(r[1], int) and a0_isb: - r = tuple([list(r)[0]] + [len(args[0])] + list(r)[2:]) - return (fix(r[0], args[0]), ) + r[1:] if isinstance(r, (tuple, list)) else fix(r, args[0]) - return _wrapper - - -# alphabet generation function from a given mask -def get_alphabet_from_mask(mask): - """ This function generates an alphabet from the given mask. The style used is similar to Hashcat ; group keys are - marked with a heading "?". """ - i, alphabet = 0, "" - while i < len(mask): - c = mask[i] - if c == "?" 
and i < len(mask) - 1 and mask[i+1] in MASKS.keys(): - for c in MASKS[mask[i+1]]: - if c not in alphabet: - alphabet += c - i += 1 - elif c not in alphabet: - alphabet += c - i += 1 - return alphabet - - -# generic error handling function -def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=False, kind="character", item="position"): - """ This shortcut function allows to handle error modes given some tuning parameters. - - :param ename: encoding name - :param errors: error handling mode - :param sep: token separator - :param repl_char: replacement character (for use when errors="replace") - :param repl_minlen: repeat number for the replacement character - :param decode: whether we are encoding or decoding - :param item: position item description (for describing the error ; e.g. "group" or "token") - """ - exc = "%s%scodeError" % (exc_name(ename), ["En", "De"][decode]) - - def _handle_error(token, position, output="", eename=None): - """ This handles an encoding/decoding error according to the selected handling mode. - - :param token: input token to be encoded/decoded - :param position: token position index - :param output: output, as decoded up to the position of the error - """ - if errors == "strict": - msg = "'%s' codec can't %scode %s '%s' in %s %d" - token = ensure_str(token) - token = token[:7] + "..." 
if len(token) > 10 else token - err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position)) - err.output = output - err.__cause__ = err - raise err - elif errors == "leave": - return token + sep - elif errors == "replace": - return repl_char * repl_minlen + sep - elif errors == "ignore": - return "" - else: - raise ValueError("Unsupported error handling '{}'".format(errors)) - return _handle_error - - -# codecs module hooks -__orig_lookup = _codecs.lookup -__orig_register = _codecs.register - - -def __add(ename, encode=None, decode=None, pattern=None, text=True, **kwargs): - kwargs.pop('add_to_codecs', None) - return add(ename, encode, decode, pattern, text, True, **kwargs) -__add.__doc__ = add.__doc__ -codecs.add = __add - - -def decode(obj, encoding='utf-8', errors='strict'): - """ Custom decode function relying on the hooked lookup function. """ - return lookup(encoding).decode(obj, errors)[0] -codecs.decode = decode - - -def encode(obj, encoding='utf-8', errors='strict'): - """ Custom encode function relying on the hooked lookup function. """ - n, m = 1, re.search(r"\[(\d+)\]$", encoding) - if m: - n = int(m.group(1)) - encoding = re.sub(r"\[(\d+)\]$", "", encoding) - ci = lookup(encoding) - for i in range(n): - obj = ci.encode(obj, errors)[0] - return obj -codecs.encode = encode - - -def lookup(encoding, macro=True): - """ Hooked lookup function for searching first for codecs in the local registry of this module. 
""" - # first, try to match the given encoding with codecs' search functions - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - codecinfo = search_function(encoding) - if codecinfo is not None: - return codecinfo - # then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - if search_function.__name__.replace("_", "-") == encoding or \ - encoding in getattr(search_function, "__aliases__", []): - codecinfo = search_function(generate_string_from_regex(search_function.__pattern__)) - if codecinfo is not None: - return codecinfo - # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters - try: - ci = __orig_lookup(encoding) - ci.parameters = {'category': "native", 'module': "codecs", 'name': ALIASES.get(ci.name, ci.name)} - return ci - except LookupError: - if not macro: - raise - try: - return CodecMacro(encoding) - except LookupError: - e = LookupError("unknown encoding: %s" % encoding) - e.__cause__ = e # stop exception chaining - raise e -codecs.lookup = lookup - - -def register(search_function, add_to_codecs=False): - """ Register function for registering new codecs in the local registry of this module and, if required, in the - native codecs registry (for use with the built-in 'open' function). - - :param search_function: search function for the codecs registry - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) 
but will make it impossible - to remove the codec later - """ - if search_function not in __codecs_registry: - try: - __orig_lookup(search_function.__name__) - l = CODECS_OVERWRITTEN - except LookupError: - l = __codecs_registry - l.append(search_function) - if add_to_codecs: - __orig_register(search_function) - - -def __register(search_function): - """ Same as register(...), but with add_to_codecs set by default to True. """ - register(search_function, True) -codecs.register = __register - - -def search(encoding_regex, extended=True): - """ Function similar to lookup but allows to search for an encoding based on a regex instead. It searches this way - into the local registry but also tries a simple lookup with the original lookup function. """ - matches = [] - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - n = search_function.__name__ - for name in [n, n.replace("_", "-")]: - if re.search(encoding_regex, name): - matches.append(n.replace("_", "-")) - continue - if extended: - # in some cases, encoding_regex can match a generated string that uses a particular portion of its - # generating pattern ; e.g. 
we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also - # find "morse" or "atbash" very rarely because of their dynamic patterns and the limited number of randomly - # generated strings - # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of - # matches ; executing 5 times the string generation for a given codec but adding the codec to the list of - # matches only if we get at least 3 matches ensures that we consider up to 2 failures that could be - # stochastic, therefore drastically decreasing the probability to get a "junk" encoding in the matches list - c = 0 - for i in range(5): - for s in generate_strings_from_regex(search_function.__pattern__): - if re.search(encoding_regex, s): - c += 1 - break - if c >= 3: - matches.append(n) - break - for s, n in ALIASES.items(): - if re.search(encoding_regex, s) or re.search(encoding_regex, n): - matches.append(n) - return sorted(list(set(matches)), key=_human_keys) -codecs.search = search - - -# utility function for the search feature -CATEGORIES = { - 'digit': digits, - 'not_digit': reduce(lambda x, c: x.replace(c, ""), digits, printable), - 'space': whitespace, - 'not_space': reduce(lambda x, c: x.replace(c, ""), whitespace, printable), - 'word': ascii_letters + digits + '_', - 'not_word': reduce(lambda x, c: x.replace(c, ""), ascii_letters + digits + '_', printable), -} -REPEAT_MAX = 10 -STAR_PLUS_MAX = 10 -YIELD_MAX = 100 - - -def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False): - """ Recursive function to generate strings from a regex pattern. 
""" - if regex is None: - return - __groups = {} - tokens = [] - negate, last_rand = False, None - for state in (regex if parsed else re.sre_parse.parse(b(getattr(regex, "pattern", regex)))): - code = getattr(state[0], "name", state[0]).lower() - value = getattr(state[1], "name", state[1]) - value = value.lower() if isinstance(value, str) else value - if code in ["assert_not", "at"]: - continue - elif code == "any": - charset = list(printable.replace("\n", "")) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) # should be ord(x) with x belongs to [0, 256[ - elif code == "assert": - tokens.append(list(__gen_str_from_re(value[1], star_plus_max, repeat_max, yield_max, True))) - elif code == "branch": - result = [] - for r in value[1]: - result += list(__gen_str_from_re(r, star_plus_max, repeat_max, yield_max, True)) or [""] - tokens.append(result) - elif code == "category": - charset = list(CATEGORIES[value[9:]]) - if negate: - negate = False - charset = list(set(printable).difference(charset)) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) - elif code == "groupref": - tokens.extend(__groups[value]) - elif code == "in": - subtokens = list(__gen_str_from_re(value, star_plus_max, repeat_max, yield_max, True)) - subtokens = [x for l in subtokens for x in l] - tokens.append(subtokens) - elif code == "literal": - tokens.append(chr(value)) - elif code in ["max_repeat", "min_repeat"]: - start, end = value[:2] - end = min(end, star_plus_max) - start = min(start, end) - charset = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) - subtokens = [] - if start == 0 and end == 1: - subtokens.append("") - subtokens.extend(charset) - elif len(charset) ** end > repeat_max: - for i in range(min(repeat_max, 10 * len(charset))): - n = random.randint(start, end + 1) - token = "" if n == 0 else 
"".join(random.choice(charset) for i in range(n)) - if token not in subtokens: - subtokens.append(token) - else: - i -= 1 - else: - for n in range(start, end + 1): - for c in product(charset, repeat=n): - subtokens.append("".join(c)) - tokens.append(subtokens) - elif code == "negate": - negate = True - elif code == "not_literal": - charset = list(printable.replace(chr(value), "")) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) - elif code == "range": - tokens.append("".join(chr(i) for i in range(value[0], value[1] + 1))) - elif code == "subpattern": - result = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) - if value[0]: - __groups[value[0]] = result - tokens.append(result) - else: - raise NotImplementedError("Unhandled code '{}'".format(code)) - if len(tokens) == 0: - tokens = [""] - i = 0 - for result in product(*tokens): - yield "".join(result) - i += 1 - if i >= yield_max: - break - - -def _human_keys(text): - """ Sorting function for considering strings with numbers (e.g. base2, base10, base100) """ - tokens = [] - for s in re.split(r"(\d+|\D+)", text): - tokens.append(int(s) if s.isdigit() else s) - return tokens - - -def generate_string_from_regex(regex): - """ Utility function to generate a single string from a regex pattern. """ - if regex: - return list(generate_strings_from_regex(regex, yield_max=1))[0] - - -def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=REPEAT_MAX, yield_max=YIELD_MAX): - """ Utility function to generate strings from a regex pattern. 
""" - i = 0 - for result in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max): - yield result - - -# guess feature objects -__module_exists = lambda n: n in [x[1] for x in iter_modules()] -stopfunc = ModuleType("stopfunc", """ - Predefined stop functions - ~~~~~~~~~~~~~~~~~~~~~~~~~ - - This submodule contains stop functions for the guess feature of codext. - - - `flag`: searches for the pattern "[Ff][Ll1][Aa4@][Gg9]" (either UTF-8 or UTF-16) - - `lang_**`: checks if the given lang (any from the PROFILES_DIRECTORY of the langdetect module) is detected - - `printables`: checks that every output character is in the set of printables - - `regex`: takes one argument, the regular expression, for checking a string against the given pattern - - `text`: checks for printables and an entropy less than 4.6 (empirically determined) -""") -stopfunc.printables = lambda s: all(c in printable for c in ensure_str(s)) -stopfunc.printables.__name__ = stopfunc.printables.__qualname__ = "printables" -stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None -stopfunc.regex.__name__ = stopfunc.regex.__qualname__ = "regex" -stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6 -stopfunc.text.__name__ = stopfunc.text.__qualname__ = "text" -stopfunc.flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", ensure_str(x)) is not None -stopfunc.flag.__name__ = stopfunc.flag.__qualname__ = "flag" -stopfunc.default = stopfunc.text - -stopfunc.LANG_BACKEND = None -stopfunc.LANG_BACKENDS = [n for n in ["pycld2", "langdetect", "langid", "cld3", "textblob"] if __module_exists(n)] -if len(stopfunc.LANG_BACKENDS) > 0: - stopfunc.LANG_BACKEND = stopfunc.LANG_BACKENDS[0] -if "cld3" in stopfunc.LANG_BACKENDS: - stopfunc.CLD3_LANGUAGES = "af|am|ar|bg|bn|bs|ca|ce|co|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|fy|ga|gd|gl|gu|ha|" \ - "hi|hm|hr|ht|hu|hy|id|ig|is|it|iw|ja|jv|ka|kk|km|kn|ko|ku|ky|la|lb|lo|lt|lv|mg|mi|mk|" \ - 
"ml|mn|mr|ms|mt|my|ne|nl|no|ny|pa|pl|ps|pt|ro|ru|sd|si|sk|sl|sm|sn|so|sq|sr|st|su|sv|" \ - "sw|ta|te|tg|th|tr|uk|ur|uz|vi|xh|yi|yo|zh|zu".split("|") -if "textblob" in stopfunc.LANG_BACKENDS: - stopfunc.TEXTBLOB_LANGUAGES = "af|ar|az|be|bg|bn|ca|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|ga|gl|gu|hi|hr|ht|hu|" \ - "id|is|it|iw|ja|ka|kn|ko|la|lt|lv|mk|ms|mt|nl|no|pl|pt|ro|ru|sk|sl|sq|sr|sv|sw|ta|" \ - "te|th|tl|tr|uk|ur|vi|yi|zh".split("|") - - -def _detect(text): - _lb, t = stopfunc.LANG_BACKEND, ensure_str(text) - if _lb is None: - raise ValueError("No language backend %s" % ["selected", "installed"][len(stopfunc.LANG_BACKENDS) == 0]) - return langid.classify(t)[0] if _lb == "langid" else \ - langdetect.detect(t) if _lb == "langdetect" else \ - pycld2.detect(t)[2][0][1] if _lb == "pycld2" else \ - cld3.get_language(t).language[:2] if _lb == "cld3" else \ - textblob.TextBlob(t).detect_language()[:2] - - -def _lang(lang): - def _test(s): - if not stopfunc.text(s): - return False - try: - return _detect(ensure_str(s))[:2] == lang - except: - return False - return _test - - -def _load_lang_backend(backend=None): - # import the requested backend library if not imported yet - if backend is None or backend in stopfunc.LANG_BACKENDS: - stopfunc.LANG_BACKEND = backend - if backend: - globals()[backend] = __import__(backend) - else: - raise ValueError("Unsupported language detection backend") - # remove language-related stop functions - for attr in dir(stopfunc): - if attr.startswith("_") or not isinstance(getattr(stopfunc, attr), FunctionType): - continue - if re.match(r"lang_[a-z]{2}$", attr): - delattr(stopfunc, attr) - # rebind applicable language-related stop functions - if stopfunc.LANG_BACKEND: - _lb = stopfunc.LANG_BACKEND - if _lb == "langid": - langid.langid.load_model() - for lang in ( - langid.langid.identifier.nb_classes if _lb == "langid" else \ - list(set(p[:2] for p in os.listdir(langdetect.PROFILES_DIRECTORY))) if _lb == "langdetect" else \ - list(set(x[1][:2] 
for x in pycld2.LANGUAGES if x[0] in pycld2.DETECTED_LANGUAGES)) if _lb == "pycld2" else \ - stopfunc.CLD3_LANGUAGES if _lb == "cld3" else \ - stopfunc.TEXTBLOB_LANGUAGES if _lb == "textblob" else \ - []): - n = "lang_%s" % lang - setattr(stopfunc, n, _lang(lang)) - getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n - if LANG: - flng = "lang_%s" % LANG - if getattr(stopfunc, flng, None): - stopfunc.default = getattr(stopfunc, flng) -stopfunc._reload_lang = _load_lang_backend - - -def _validate(stop_function, lang_backend="none"): - s, lb = stop_function, lang_backend - if isinstance(s, string_types): - if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ - all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): - stopfunc._reload_lang(lb) - f = getattr(stopfunc, s, None) - if f: - return f - elif not isinstance(s, FunctionType): - raise ValueError("Bad stop function") - return s -stopfunc._validate = _validate - - -def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(), - stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): - """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. 
""" - if depth > min_depth and stop_func(input): - if not stop and (show or debug) and found not in result: - s = repr(input) - s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s - s = "[+] %s: %s" % (", ".join(found), s) - print(s if len(s) <= 80 else s[:77] + "...") - result[found] = input - if depth >= max_depth or len(result) > 0 and stop: - return - prev_enc = found[-1] if len(found) > 0 else "" - e = encodings.get(depth, encodings.get(-1, [])) - for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended): - if len(result) > 0 and stop: - return - if debug: - print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) - __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), - stop, show, scoring_heuristic, extended, debug) - - -def __make_encodings_dict(include, exclude): - """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible - encoding names. It also creates a cache with the CodecInfo objects for improving performance. """ - def _develop(d, keep=True): - d = d or {} - for k, v in d.items(): - l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES] - # list from in-scope categories and then everything that is not a category - for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc): - g = [] - for e in (search(enc, False) or [enc]): - try: - ci = lookup(e, False) - g.extend(ci.parameters['guess']) - except: - pass - if enc in g: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected - l.append(enc) - else: # e.g. "rot" => ["rot-1", "rot-2", ...] 
; all the "rot-N" shall be selected - l.extend(g) - d[k] = list(set(l)) - return d - _excl, _incl = _develop(exclude, False), _develop(include) - return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()} - - -def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): - """ Filter valid encodings and rank them by relevance. """ - ranking = {} - for e in encodings: - try: - codec = CODECS_CACHE[e] - except KeyError: - try: - CODECS_CACHE[e] = codec = lookup(e, False) - except LookupError: - continue - t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) - if t: - ranking[e] = t - for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])): - yield result if yield_score else result[1], encoding - - -class _Text(object): - __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] - - def __init__(self, text, pad_char=None): - self.text = ensure_str(text) - c = self.text[-1] - pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) - self.padding = pad_char is not None and last_char == pad_char - if self.padding: - text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char) - self.len = len(self.text) - self.lcharset = len(set(self.text)) - self.printables = float(len([c for c in self.text if c in printable])) / self.len - self.entropy = entropy(self.text) - - -def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): - """ Score relevant encodings given an input. 
""" - obj = None - sc = codec.parameters.get('scoring', {}) - no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) - # ignore encodings that fail to decode with their default errors handling value - try: - new_input = codec.decode(input)[0] - except: - return - # ignore encodings that give an output identical to the input (identity transformation) or to the previous input - if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): - return - # ignore encodings that transitively give the same output (identity transformation by chaining twice a same - # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) - if transitive and prev_encoding: - ci_prev = lookup(prev_encoding, False) - if ci_prev.parameters['name'] == codec.parameters['name']: - return - # compute input's characteristics only once and only if the control flow reaches this point - pad = sc.get('padding_char') - if obj is None: - obj = _Text(input, pad) - if heuristic: - # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. 
multiple base - # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates - s = -sc.get('penalty', .0) - # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; - # on the contrary, if the length of input text's charset is strictly greater, give a penalty - lcs = sc.get('len_charset', 256) - if isinstance(lcs, type(lambda: None)): - lcs = int(lcs(encoding)) - if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: - s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) - elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: - s -= .2 # this can occur for encodings with no_error set to True - # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, - # or a penalty when it should not be encountered but it is present - if pad and obj.padding: - s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus - elif not pad and obj.padding: - s -= .1 # it could arise a padding character is encountered while not being padding => small penalty - # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when - # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) - if not no_error: - pr = sc.get('printables_rate', 0) - if isinstance(pr, type(lambda: None)): - pr = float(pr(obj.printables)) - if obj.printables - pr <= .05: - s += .1 - expf = sc.get('expansion_factor', 1.) 
- if expf: - f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f - if isinstance(expf, type(lambda: None)): - try: # this case allows to consider the current encoding name from the current codec - expf = expf(f, encoding) - except TypeError: - expf = expf(f) - if isinstance(expf, (int, float)): - tmp = expf - expf = (1/f - .1 <= 1/expf <= 1/f + .1) - elif isinstance(expf, (tuple, list)) and len(expf) == 2: - expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] - s += [-1., .1][expf] - # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the - # number of input characters to take bad entropies of shorter strings into account - entr = sc.get('entropy', lambda e: e) - entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr - if isinstance(entr, type(lambda: None)): - try: # this case allows to consider the current encoding name from the current codec - entr = entr(obj.entropy, encoding) - except TypeError: - entr = entr(obj.entropy) - if entr is not None: - # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) - d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy) - if d_entr <= .5: - s += .5 - d_entr - # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) - bonus = sc.get('bonus_func') - if bonus is not None: - if isinstance(bonus, type(lambda: None)): - bonus = bonus(obj, codec, encoding) - if bonus: - s += .2 - else: - s = 1. - # exclude negative (and eventually null) scores as they are (hopefully) not relevant - if extended and s >= .0 or not extended and s > .0: - return s, new_input - - -def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(), - stop=True, show=False, scoring_heuristic=True, extended=False, debug=False): - """ Try decoding without the knowledge of the encoding(s). 
- - :param input: input text to be guessed - :param stop_func: function defining the stop condition - :param min_depth: minimum search depth - :param max_depth: maximum search depth - ;param include: inclusion item OR list with category, codec or encoding names OR dictionary with lists per - depth (nothing means include every encoding) - :param exclude: exclusion item OR list with category, codec or encoding names OR dictionary with lists per - depth (nothing means exclude no encoding) - :param found: tuple of already found encodings - :param stop: whether to stop or not when a valid solution is found - :param show: whether to immediately show once a solution is found - :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1., - meaning that every non-failing encoding will be considered with no order of precedence) - :param extended: whether to also consider null scores with the heuristic - :param debug: whether to show each attempt at each depth during computation - """ - if len(input) == 0: - return "" - # check for min and max depths - if max_depth <= 0: - raise ValueError("Depth must be a non-null positive integer") - if min_depth > max_depth: - raise ValueError("Min depth shall be less than or equal to the max depth") - # take the tuple of found encodings into account - if len(found) > 0: - for encoding in found: - input = decode(input, encoding) - # handle the stop function as a regex if a string was given - if isinstance(stop_func, string_types): - stop_func = stopfunc.regex(stop_func) - # reformat include and exclude arguments ; supported formats: - for n, l in zip(["inc", "exc"], [include, exclude]): - if l is None: - if n == "inc": - include = l = {-1: CODECS_CATEGORIES} - else: - exclude = l = {} - # "category" OR "enc_name" OR whatever => means a single item for all depths - if isinstance(l, string_types): - if n == "inc": - include = l = {-1: [l]} - else: - exclude = l = {-1: [l]} - # 
["enc_name1", "enc_name2", ...] => means for all depths - if isinstance(l, (list, tuple)): - if n == "inc": - include = l = {-1: l} - else: - exclude = l = {-1: l} - # {-1: [...], 2: [...], ...} => means prefedined depths with their lists of in-/excluded encodings - if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): - raise ValueError("Include argument shall be a list or a dictionary with integer keys") - # precompute encodings lists per depth and cache the related CodecInfo objects - encodings, result = __make_encodings_dict(include, exclude), {} - try: - # breadth-first search - for d in range(max_depth): - __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show, - scoring_heuristic, extended, debug) - if stop and len(result) > 0: - break - except KeyboardInterrupt: - pass - CODECS_CACHE = {} - return result -codecs.guess = guess - - -def rank(input, extended=False, limit=-1, include=None, exclude=None): - """ Rank the most probable encodings based on the given input. - - :param input: input text to be evaluated - :param extended: whether to consider null scores too (NB: negative scores are not output !) 
- :param limit: number of encodings to be returned (-1 means all of them) - :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) - :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) - """ - encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES}, - exclude if isinstance(exclude, dict) else {-1: exclude or []}) - r = list(__rank(None, input, "", encodings[-1], True, extended, True)) - return r[:limit] if len(r) > 1 else r -codecs.rank = rank - +# -*- coding: UTF-8 -*- +import _codecs +import codecs +import hashlib +import json +import os +import random +import re +import sre_parse +import sys +from encodings.aliases import aliases as ALIASES +from functools import reduce, update_wrapper, wraps +from importlib import import_module +from inspect import currentframe +from io import BytesIO +from itertools import chain, product +from locale import getlocale +from math import log +from pkgutil import iter_modules +from platform import system +from random import randint +from string import * +from types import FunctionType, ModuleType +try: # Python2 + import __builtin__ as builtins +except ImportError: + import builtins +try: # Python2 + from inspect import getfullargspec +except ImportError: + from inspect import getargspec as getfullargspec +try: # Python2 + from string import maketrans +except ImportError: + maketrans = str.maketrans +try: # Python3 + from importlib import reload +except ImportError: + pass + +# from Python 3.11, it seems that 'sre_parse' is not bound to 're' anymore +re.sre_parse = sre_parse + + +__all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", + "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "hashlib", "i2s", + "is_native", "list_categories", "list_encodings", "list_macros", 
"lookup", "maketrans", "os", "rank", "re", + "register", "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", + "DARWIN", "LANG", "LINUX", "MASKS", "UNIX", "WINDOWS"] +CODECS_REGISTRY = None +CODECS_OVERWRITTEN = [] +CODECS_CATEGORIES = ["native", "custom"] +CODECS_CACHE = {} +LANG = getlocale() +if LANG: + LANG = (LANG[0] or "")[:2].lower() +MASKS = { + 'a': printable, + 'b': "".join(chr(i) for i in range(256)), + 'd': digits, + 'h': digits + "abcdef", + 'H': digits + "ABCDEF", + 'l': ascii_lowercase, + 'p': punctuation, + 's': " ", + 'u': ascii_uppercase, +} + +__codecs_registry = [] + +MACROS = {} +PERS_MACROS = {} +PERS_MACROS_FILE = os.path.expanduser("~/.codext-macros.json") + +DARWIN = system() == "Darwin" +LINUX = system() == "Linux" +UNIX = DARWIN or LINUX +WINDOWS = system() == "Windows" + +entropy = lambda s: -sum([p * log(p, 2) for p in [float(s.count(c)) / len(s) for c in set(s)]]) + +isb = lambda s: isinstance(s, bytes) +iss = lambda s: isinstance(s, str) +fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x + +s2i = lambda s: int(codecs.encode(s, "base16"), 16) +exc_name = lambda e: "".join(t.capitalize() for t in re.split(r"[-_+]", e)) + + +def i2s(input): + h = hex(input)[2:].rstrip("eL") + return codecs.decode(h.zfill(len(h) + len(h) % 2), "hex") + + +class CodecMacro(tuple): + """Macro details when looking up the codec registry. 
""" + def __new__(cls, name): + self = tuple.__new__(cls) + self.name = name + # get from personal macros first + try: + self.codecs = PERS_MACROS[name] + except KeyError: + try: + self.codecs = MACROS[name] + except KeyError: + raise LookupError("unknown macro: %s" % name) + if not isinstance(self.codecs, (tuple, list)): + raise ValueError("bad macro list: %s" % str(self.codecs)) + self.codecs = [lookup(e, False) for e in self.codecs] # lookup(e, False) + self.parameters = {'name': name, 'category': "macro"} # ^ means that macros won't be nestable + # test examples to check that the chain of encodings works + for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items(): + if re.match(r"enc(-dec)?\(", action): + for e in (examples.keys() if action.startswith("enc(") else examples or []): + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + if rd: + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + self.encode(s.lower() if rd.group(1) else s) + continue + self.encode(e) + + class Codec: + decode = self.decode + encode = self.encode + + class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return b(self.encode(input, self.errors)[0]) + self.incrementalencoder = IncrementalEncoder + + class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + return ensure_str(self.decode(input, self.errors)[0]) + self.incrementaldecoder = IncrementalDecoder + + class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + self.streamwriter = StreamWriter + + class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + self.streamreader = StreamReader + + return self + + def decode(self, input, error="strict"): + """ Decode with each codec in reverse order. 
# inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python
class Repr(object):
    """ Callable wrapper giving a function a custom representation based on the given name. """
    def __init__(self, name, func):
        self.__name = name
        self.__func = func
        # copy the metadata (__name__, __doc__, ...) of the wrapped function onto this object
        update_wrapper(self, func)

    def __call__(self, *args, **kwargs):
        return self.__func(*args, **kwargs)

    def __repr__(self):
        # the format string must consume both arguments ; an empty one raises a TypeError on every repr() call
        return "<search-function %s at 0x%x>" % (self.__name, id(self))


def __stdin_pipe():
    """ Stdin pipe read function. """
    try:
        with open(0, 'rb') as f:
            for l in f:
                yield l
    except TypeError:
        # open(...) does not accept a file descriptor here ; fall back on the sys.stdin object
        for l in sys.stdin:
            yield l


def _input(infile):
    """ Read the whole content of the given input file or, if no file is given, of stdin.

    :param infile: path to the input file ; any falsy value means reading from stdin
    :return:       content that was read
    """
    if infile:
        with open(infile, 'rb') as f:
            return f.read()
    # join once instead of repeatedly concatenating, which copies the whole buffer for each line
    return b"".join(__stdin_pipe())


def _set_exc(name, etype="ValueError"):
    """ Create an exception class with the given name and register it as a builtin, if no builtin has this name yet.

    :param name:  name of the exception class to be created
    :param etype: name of the base exception class, resolved in this module and then in the builtins
    """
    if not hasattr(builtins, name):
        # build the class with type(...) instead of exec(...) + locals()[name] ; since Python 3.13, locals() returns
        #  an independent snapshot in a function, so a name defined through exec(...) cannot be read back from it
        base = globals()[etype] if etype in globals() else getattr(builtins, etype)
        setattr(builtins, name, type(name, (base, ), {'__module__': "builtins"}))
_set_exc("InputSizeLimitError")
_set_exc("ParameterError")


def _stripl(s, st_lines, st_crlf):
    """ Remove newlines from the input string or bytes.

    :param s:        input string or bytes
    :param st_lines: whether "\\n" shall be removed
    :param st_crlf:  whether "\\r\\n" shall be removed (done first, so that no lone "\\r" is left behind)
    :return:         input without the selected newlines, of the same type as the input
    """
    crlf, lf, empty = (b"\r\n", b"\n", b"") if isinstance(s, bytes) else ("\r\n", "\n", "")
    if st_crlf:
        s = s.replace(crlf, empty)
    if st_lines:
        s = s.replace(lf, empty)
    return s


def _with_repr(name):
    """ Decorator for wrapping a function into a Repr object with the given name. """
    def _wrapper(f):
        return Repr(name, f)
    return _wrapper
+ + :param ename: encoding name + :param encode: encoding function or None + :param decode: decoding function or None + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + """ + remove(ename) + if encode: + if not isinstance(encode, FunctionType): + raise ValueError("Bad 'encode' function") + _set_exc("%sEncodeError" % exc_name(ename)) # create the custom encode exception as a builtin + if decode: + if not isinstance(decode, FunctionType): + raise ValueError("Bad 'decode' function") + _set_exc("%sDecodeError" % exc_name(ename)) # create the custom decode exception as a builtin + if not encode and not decode: + raise ValueError("At least one en/decoding function must be defined") + for exc in kwargs.get('extra_exceptions', []): + _set_exc(exc) # create additional custom exceptions as builtins + glob = currentframe().f_back.f_globals + # search function for the new encoding + @_with_repr(ename) + def getregentry(encoding): + if encoding != ename and not (pattern and re.match(pattern, encoding)): + return + fenc, fdec, name = encode, decode, encoding + # prepare CodecInfo input arguments + if pattern: + m, args, i = re.match(pattern, encoding), [], 1 + try: + while True: + try: + g = m.group(i) or "" + if g.isdigit() and not g.startswith("0") and "".join(set(g)) != "01": + g = int(g) + args += [g] + i += 1 + except AttributeError: + # this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match + if m is not None: + raise + return + except IndexError: + # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; + # in this case, if fenc/fdec is a decorated function, execute it with no arg + if len(args) == 0: + if fenc and 
len(getfullargspec(fenc).args) == 1: + fenc = fenc() + if fdec and len(getfullargspec(fdec).args) == 1: + fdec = fdec() + else: + fenc = fenc(*args) if fenc else fenc + fdec = fdec(*args) if fdec else fdec + if fenc: + fenc = fix_inout_formats(fenc) + if fdec: + fdec = fix_inout_formats(fdec) + sl, sc = kwargs.pop('strip_lines', False), kwargs.pop('strip_crlf', False) + if sl or sc: + def _striplines(f): + def __wrapper(input, *a, **kw): + return f(_stripl(input, sc, sl), *a, **kw) + return __wrapper + # this fixes issues with wrapped encoded inputs + fdec = _striplines(fdec) + + class Codec(codecs.Codec): + def encode(self, input, errors="strict"): + if fenc is None: + raise NotImplementedError + return fenc(input, errors) + + def decode(self, input, errors="strict"): + if fdec is None: + raise NotImplementedError + return fdec(input, errors) + + class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + if fenc is None: + raise NotImplementedError + return b(fenc(input, self.errors)[0]) + + class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + if fdec is None: + raise NotImplementedError + return ensure_str(fdec(input, self.errors)[0]) + + class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + + class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + + ci = codecs.CodecInfo( + name=name, + encode=Codec().encode, + decode=Codec().decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamwriter=StreamWriter, + streamreader=StreamReader, + _is_text_encoding=text, + ) + ci.parameters = kwargs + ci.parameters['name'] = ename + ci.parameters['add_to_codecs'] = add_to_codecs + ci.parameters['pattern'] = pattern + ci.parameters['text'] = text + f = glob.get('__file__', os.path.join("custom", "_")) + cat = f.split(os.path.sep)[-2].rstrip("s") + if cat not in CODECS_CATEGORIES: + CODECS_CATEGORIES.append(cat) + 
def add_macro(mname, *encodings):
    """ This allows to define a macro, chaining multiple codecs one after the other. The new macro is stored in the
         dictionary of personal macros and saved to the JSON file of personal macros.

    :param mname:     macro name
    :param encodings: encoding names of the encodings to be chained with the macro
    :raise ValueError: when the name is already used by a macro or by a codec
    """
    # check for name clash with already existing macros and codecs
    if mname in MACROS or mname in PERS_MACROS:
        raise ValueError("Macro name already exists")
    try:
        ci = lookup(mname, False)
    except LookupError:
        pass
    else:
        raise ValueError("Macro name clashes with codec '%s'" % ci.name)
    PERS_MACROS[mname] = encodings
    try:
        # instantiating the macro validates it (lookup of each encoding and test of the chain)
        CodecMacro(mname)
        with open(PERS_MACROS_FILE, 'w') as f:
            json.dump(PERS_MACROS, f, indent=2)
    except Exception:
        # whatever failed (not only a ValueError ; e.g. a LookupError for an unknown encoding), do not leave a
        #  half-registered macro behind, then let the original error go up to the caller
        PERS_MACROS.pop(mname, None)
        raise
codecs.add_macro = add_macro
+ + :param ename: encoding name + :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture + group of the regex pattern) or a function building the encoding map + :param repl_char: replacement char (used when errors handling is set to "replace") + :param sep: string of possible character separators (hence, only single-char separators are considered) ; + - while encoding, the first separator is used + - while decoding, separators can be mixed in the input text + :param ignore_case: ignore text case while encoding and/or decoding + :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) + :param intype: specify the input type for pre-transforming the input text + :param outype: specify the output type for post-transforming the output text + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + """ + outype = outype or intype + if ignore_case not in [None, "encode", "decode", "both"]: + raise ValueError("Bad ignore_case parameter while creating encoding map") + if intype not in [None, "str", "bin", "ord"]: + raise ValueError("Bad input type parameter while creating encoding map") + if outype not in [None, "str", "bin", "ord"]: + raise ValueError("Bad output type parameter while creating encoding map") + + def __generic_code(decode=False): + def _wrapper(param): + """ The parameter for wrapping comes from the encoding regex pattern ; e.g. + [no pattern] => param will be None everytime + r"barbie[-_]?([1-4])$" => param could be int 1, 2, 3 or 4 + r"^morse(|[-_]?.{3})$" => param could be None, "-ABC" (for mapping to ".-/") + + In order of precedence: + 1. 
when param is a key in mapdict or mapdict is a list of encoding maps (hence in the case of "barbie...", + param MUST be an int, otherwise for the first case it could clash with a character of the encoding map) + 2. otherwise handle it as a new encoding character map "ABC" translates to ".-/" for morse + """ + p = param + if isinstance(encmap, FunctionType): + mapdict = encmap(p) + p = None + else: + mapdict = encmap + if isinstance(mapdict, dict): + smapdict = {k: v for k, v in mapdict.items()} + elif isinstance(mapdict, list) and isinstance(mapdict[0], dict): + smapdict = {k: v for k, v in mapdict[0].items()} + else: + raise ValueError("Bad mapping dictionary or list of mapping dictionaries") + if p is not None: + # case 1: param is empty string + if p == "": + if isinstance(mapdict, list): + smapdict = {k: v for k, v in mapdict[0].items()} + elif isinstance(mapdict, dict): + if '' in mapdict.keys() and isinstance(mapdict[''], dict): + smapdict = {k: v for k, v in mapdict[''].items()} + else: + smapdict = {k: v for k, v in mapdict.items()} + # no 'else' handling a LookupError here ; this case is covered by the first if/elif/else block + # case 2: list or dictionary or dictionary of numbered encodings + elif isinstance(p, int): + # if mapdict is a list, we shall align the parameter (starting from 1) as an index (starting from 0) + if isinstance(mapdict, list): + p -= 1 + if isinstance(mapdict, list) and 0 <= p < len(mapdict) or \ + isinstance(mapdict, dict) and p in mapdict.keys(): + smapdict = {k: v for k, v in mapdict[p].items()} + else: + raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + # case 3: dictionary of regex-selected encoding mappings + elif isinstance(mapdict, dict) and isinstance(list(mapdict.values())[0], dict): + tmp = None + for r, d in mapdict.items(): + if r == '': # this is already handled in case 1 ; anyway, an empty regex always matches, hence + continue # it must be excluded + if re.match(r, p): + tmp = d + break + 
if tmp is None: + raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + smapdict = tmp + # case 4: encoding characters translation + else: + # collect base tokens in order of appearance in the mapping dictionary + base_tokens = "" + for _, c in sorted(mapdict.items()): + for t in c: + for st in t: + if st not in base_tokens: + base_tokens += st + if " " not in sep: + base_tokens = base_tokens.replace(" ", "") + if len(p) > 0 and p[0] in "-_" and len(p[1:]) == len(set(p[1:])) == len(base_tokens): + p = p[1:] + if len(p) == len(set(p)) == len(base_tokens): + t = maketrans(base_tokens, p) + for k, v in smapdict.items(): + smapdict[k] = [x.translate(t) for x in v] if isinstance(v, list) else v.translate(t) + else: + raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + if ignore_case is not None: + cases = ["upper", "lower"] + case_d = cases[any(c in str(list(smapdict.values())) for c in "abcdefghijklmnopqrstuvwxyz")] + case_e = cases[any(c in str(list(smapdict.keys())) for c in "abcdefghijklmnopqrstuvwxyz")] + i = ignore_case + smapdict = {getattr(k, case_e)() if i in ["both", "encode"] else k: \ + ([getattr(x, case_d)() for x in v] if isinstance(v, list) else getattr(v, case_d)()) \ + if i in ["both", "decode"] else v for k, v in smapdict.items()} + if decode: + tmp = {} + # this has a meaning for encoding maps that could have clashes in encoded chars (e.g. 
Bacon's cipher ; + # I => abaaa but also J => abaaa, with the following, we keep I instead of letting J overwrite it) + for k, v in sorted(smapdict.items()): + if not isinstance(v, list): + v = [v] + for x in v: + if x not in tmp.keys(): + tmp[x] = k + smapdict, cs = tmp, reduce(lambda acc, x: acc + x, tmp.keys()) + kwargs['strip_lines'], kwargs['strip_crlf'] = "\n" not in set(cs), "\r\n" not in cs + # this allows to avoid an error with Python2 in the "for i, c in enumerate(parts)" loop + if '' not in smapdict.keys(): + smapdict[''] = "" + # determine token and result lengths + tmaxlen = max(map(len, smapdict.keys())) + tminlen = max(1, min(map(len, set(smapdict.keys()) - {''}))) + l = [] + for x in smapdict.values(): + getattr(l, ["append", "extend"][isinstance(x, list)])(x) + rminlen = max(1, min(map(len, set(l) - {''}))) + + # generic encoding/decoding function for map encodings + def code(text, errors="strict"): + icase = ignore_case == "both" or \ + decode and ignore_case == "decode" or \ + not decode and ignore_case == "encode" + if icase: + case = case_d if decode else case_e + if no_error: + errors = "leave" + text = ensure_str(text) + if not decode: + if intype == "bin": + text = "".join("{:0>8}".format(bin(ord(c))[2:]) for c in text) + elif intype == "ord": + text = "".join(str(ord(c)).zfill(3) for c in text) + r = "" + lsep = "" if decode else sep if len(sep) <= 1 else sep[0] + kind = ["character", "token"][tmaxlen > 1] + error_func = handle_error(ename, errors, lsep, repl_char, rminlen, decode, kind) + + # get the value from the mapping dictionary, trying the token with its inverted case if relevant + def __get_value(token, position, case_changed=False): + try: + result = smapdict[token] + except KeyError: + if icase and not case_changed: + token_inv_case = getattr(token, case)() + return __get_value(token_inv_case, position, True) + return error_func(token, position) + if isinstance(result, list): + result = result[0] + return result + lsep + + # if a 
separator is defined, rely on it by splitting the input text + if decode and len(sep) > 0: + for i, c in enumerate(re.split("[" + sep + "]", text)): + r += __get_value(c, i) + # otherwise, move through the text using a cursor for tokenizing it ; this allows defining more complex + # encodings with variable token lengths + else: + cursor, bad = 0, "" + while cursor < len(text): + token = text[cursor:cursor+1] + for l in range(tminlen, tmaxlen + 1): + token = text[cursor:cursor+l] + if token in smapdict.keys() or icase and getattr(token, case)() in smapdict.keys(): + r += __get_value(token, cursor) + cursor += l + break + else: + # collect bad chars and only move the cursor one char to the right + bad += text[cursor] + cursor += 1 + # if the number of bad chars is the minimum token length, consume it and start a new buffer + if len(bad) == tminlen or errors == "leave": + posn = cursor - len(bad) + r += error_func(bad, posn) + bad = "" + if decode: + if outype in ["bin", "ord"]: + tmp, r = "", r.replace(lsep, "") + step = [3, 8][outype == "bin"] + for i in range(0, len(r), step): + s = r[i:i+step] + try: + tmp += chr(int(s, 2) if outype == "bin" else int(s)) + except ValueError: + if len(s) > 0: + tmp += "[" + s + "]" + r = tmp + lsep + return r[:len(r)-len(lsep)], len(b(text)) + return code + if re.search(r"\([^(?:)]", kwargs.get('pattern', "")) is None: + # in this case, there is no capturing group for parametrization + return _wrapper(None) + return _wrapper + + glob = currentframe().f_back.f_globals + kwargs['category'] = glob['__file__'].split(os.path.sep)[-2].rstrip("s") + kwargs['examples'] = kwargs.get('examples', glob.get('__examples__')) + kwargs['encmap'] = encmap + kwargs['repl_char'] = repl_char + kwargs['sep'] = sep + kwargs['ignore_case'] = ignore_case + kwargs['no_error'] = no_error + kwargs['intype'] = intype + kwargs['outype'] = outype + kwargs['module'] = glob.get('__name__') + try: + if isinstance(encmap, dict): + smapdict = {k: v for k, v in 
def clear():
    """ Empty codext's local registry of search functions and both dictionaries of macros. """
    global __codecs_registry, MACROS, PERS_MACROS
    __codecs_registry = []
    MACROS = {}
    PERS_MACROS = {}
codecs.clear = clear


def examples(encoding, number=10):
    """ Provide examples of valid encoding names for the encodings found by the search function.

    :param encoding: regular expression passed to the search function
    :param number:   maximum number of examples to be returned
    :return:         sorted list of at most 'number' encoding names
    """
    limit = max(number, 0)
    found = []
    for name in search(encoding):
        for search_function in __codecs_registry:
            fname = search_function.__name__
            if name != fname and name != fname.replace("_", "-"):
                continue
            # generate more candidates than needed, then only test a random selection of them
            candidates = list(generate_strings_from_regex(search_function.__pattern__, yield_max=16*number))
            random.shuffle(candidates)
            for candidate in candidates[:limit]:
                if candidate.isdigit():
                    continue
                try:
                    lookup(candidate, False)
                except LookupError:
                    continue
                found.append(candidate)
        for alias, codec in ALIASES.items():
            if name != codec:
                continue
            if codec not in found:
                found.append(codec)
            if not alias.isdigit():
                found.append(alias)
    random.shuffle(found)
    return sorted(found[:limit], key=_human_keys)
codecs.examples = examples


def is_native(encoding):
    """ Tell whether the category of the given encoding is "native". """
    ci = lookup(encoding, False)
    return ci.parameters['category'] == "native"
def list_encodings(*categories):
    """ Get a list of all codecs, optionally filtered by category.

    :param categories: categories to be selected ; "non-native" stands for every category but "native" and a
                        category prefixed with "~" is excluded
    :return:           sorted list of encoding names
    :raise ValueError: when a selected category does not exist
    """
    # if "non-native" is in the input list, extend the list with the whole categories but "native"
    categories, exclude = list(categories), []
    for c in categories[:]:
        if c == "non-native":
            # use a distinct loop variable ; reusing 'c' here made the "~" test below run on the last known
            #  category instead of the current item
            for known in CODECS_CATEGORIES:
                if known == "native" or known in categories:
                    continue
                categories.append(known)
            categories.remove("non-native")
        elif c.startswith("~"):
            exclude.append(c[1:])
            categories.remove(c)
            try:
                categories.remove(c[1:])
            except ValueError:
                pass
    # now, filter codecs according to the input list of categories
    enc = []
    if (len(categories) == 0 or "native" in categories) and "native" not in exclude:
        for a in set(ALIASES.values()):
            try:
                ci = __orig_lookup(a)
            except LookupError:
                continue
            # only keep native codecs that are not shadowed in the hooked lookup
            if lookup(a) is ci:
                enc.append(ci.name)
    for search_function in CODECS_OVERWRITTEN + __codecs_registry:
        name = search_function.__name__.replace("_", "-")
        p = search_function.__pattern__
        ci = search_function(name) if p is None else search_function(generate_string_from_regex(p))
        c = "other" if ci is None else ci.parameters['category']
        if (len(categories) == 0 or c in categories) and c not in exclude:
            enc.append(name)
    # checked only now on purpose: calling the search functions above can still register new categories
    for category in categories:
        if category not in CODECS_CATEGORIES:
            raise ValueError("Category '%s' does not exist" % category)
    return sorted(set(enc), key=_human_keys)
def remove(name):
    """ Remove all search functions matching the input encoding name from codext's local registry or any macro with the
         given name.

    :param name: encoding or macro name
    """
    # collect the matching search functions first, so that the registry is not modified while being iterated
    for search_function in [sf for sf in __codecs_registry if sf(name) is not None]:
        __codecs_registry.remove(search_function)
    MACROS.pop(name, None)
    if name in PERS_MACROS:
        del PERS_MACROS[name]
        # keep the file of personal macros in sync
        with open(PERS_MACROS_FILE, 'w') as f:
            json.dump(PERS_MACROS, f, indent=2)
    CODECS_CACHE.pop(name, None)
    # drop the custom exceptions ; the name must be built with exc_name(...) as done when they are created, otherwise
    #  the exceptions of codecs with a composed name (e.g. "strip-spaces") are never found
    for s in ["En", "De"]:
        try:
            delattr(builtins, "%s%scodeError" % (exc_name(name), s))
        except AttributeError:
            pass
codecs.remove = remove


def reset():
    """ Reset codext's local registry of search functions and macros. """
    global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS
    clear()
    d = os.path.dirname(__file__)
    # reload every subpackage that is not private
    for pkg in sorted(os.listdir(d)):
        if pkg.startswith("_") or not os.path.isdir(os.path.join(d, pkg)):
            continue
        reload(import_module("codext." + pkg))
    # backup codext's registry
    if CODECS_REGISTRY is None:
        CODECS_REGISTRY = __codecs_registry[:]
    # restore codext's registry
    else:
        __codecs_registry = CODECS_REGISTRY[:]
    # restore codext's embedded set of macros
    with open(os.path.join(d, "macros.json")) as f:
        MACROS = json.load(f)
    # reload personal set of macros
    PERS_MACROS = {}
    if os.path.exists(PERS_MACROS_FILE):
        with open(PERS_MACROS_FILE) as f:
            PERS_MACROS = json.load(f)
codecs.reset = reset
def ensure_str(s, encoding='utf-8', errors='strict'):
    """ Dummy str conversion function ; bytes that cannot be decoded with the given encoding are decoded as latin-1.

    :param s:        input, only converted if it is bytes
    :param encoding: encoding to be tried first
    :param errors:   error handling mode for the first decoding attempt
    """
    if isinstance(s, bytes):
        try:
            return s.decode(encoding, errors)
        except Exception:  # best effort on purpose, but without swallowing KeyboardInterrupt or SystemExit
            return s.decode("latin-1")
    return s


# make conversion functions compatible with input/output strings/bytes
def fix_inout_formats(f):
    """ This decorator ensures that the first output of f will have the same text format as the first input (str or
         bytes). """
    @wraps(f)
    def _wrapper(*args, **kwargs):
        a0 = args[0]
        a0_isb = isb(a0)
        a0 = ensure_str(a0) if iss(a0) or a0_isb else a0
        r = f(a0, *args[1:], **kwargs)
        if not isinstance(r, (tuple, list)):
            return fix(r, args[0])
        # normalize to a tuple, so that a function returning a list does not break the concatenation below
        r = tuple(r)
        # special case: input is in bytes ; ensure that the returned length is this of the bytes, not this processed by
        #  the decode/encode function
        if a0_isb and len(r) > 1 and isinstance(r[1], int):
            r = (r[0], len(args[0])) + r[2:]
        return (fix(r[0], args[0]), ) + r[1:]
    return _wrapper


# alphabet generation function from a given mask
def get_alphabet_from_mask(mask, masks=None):
    """ This function generates an alphabet from the given mask. The style used is similar to Hashcat ; group keys are
         marked with a heading "?".

    :param mask:  mask string ; "?" followed by a group key expands to the characters of this group
    :param masks: dictionary mapping group keys to their characters (defaults to MASKS)
    :return:      characters of the alphabet, without duplicates and in order of appearance
    """
    if masks is None:
        masks = MASKS
    i, alphabet = 0, ""
    while i < len(mask):
        c = mask[i]
        if c == "?" and i < len(mask) - 1 and mask[i+1] in masks:
            for gc in masks[mask[i+1]]:
                if gc not in alphabet:
                    alphabet += gc
            i += 1  # also consume the group key
        elif c not in alphabet:
            alphabet += c
        i += 1
    return alphabet
def __add(ename, encode=None, decode=None, pattern=None, text=True, **kwargs):
    # flavour of add(...) exposed through the codecs module: whatever the caller asked for, the search function is
    #  always added to the native registry too
    kwargs['add_to_codecs'] = True
    return add(ename, encode, decode, pattern, text, **kwargs)
__add.__doc__ = add.__doc__
codecs.add = __add
def encode(obj, encoding='utf-8', errors='strict'):
    """ Custom encode function relying on the hooked lookup function. An encoding name ending with "[N]" applies the
         encoding N times.

    :param obj:      object to be encoded
    :param encoding: encoding name, optionally suffixed with "[N]"
    :param errors:   error handling mode
    """
    repeat_suffix = r"\[(\d+)\]$"
    times, found = 1, re.search(repeat_suffix, encoding)
    if found is not None:
        times = int(found.group(1))
        encoding = re.sub(repeat_suffix, "", encoding)
    ci = lookup(encoding)
    # errors raised by encodings that require str as input while 'obj' is bytes
    retriable = ("'bytes' object has no attribute 'encode'", "ord() expected string of length 1, but int found")
    for _ in range(times):
        try:
            obj = ci.encode(obj, errors)[0]
        except (AttributeError, TypeError) as e:
            # "latin-1" and "utf-8" are the encodings considered when using b(...) ; no retry for them
            if encoding in ("latin-1", "utf-8") or str(e) not in retriable:
                raise
            obj = ci.encode(ensure_str(obj), errors)[0]
    return obj
codecs.encode = encode
def register(search_function, add_to_codecs=False):
    """ Register function for registering new codecs in the local registry of this module and, if required, in the
         native codecs registry (for use with the built-in 'open' function).

    :param search_function: search function for the codecs registry
    :param add_to_codecs:   also add the search function to the native registry
                            NB: this will make the codec available in the built-in open(...) but will make it impossible
                                to remove the codec later
    """
    # check both lists ; a function stored in the list of overwritten codecs was otherwise appended again at each
    #  new registration
    if search_function not in __codecs_registry and search_function not in CODECS_OVERWRITTEN:
        try:
            __orig_lookup(search_function.__name__)
        except LookupError:
            __codecs_registry.append(search_function)
        else:
            # the name is already known by the native lookup ; the function overwrites a native codec
            CODECS_OVERWRITTEN.append(search_function)
    if add_to_codecs:
        __orig_register(search_function)


def __register(search_function):
    """ Same as register(...), but with add_to_codecs set by default to True. """
    register(search_function, True)
codecs.register = __register
# utility function for the search feature
CATEGORIES = {
    'digit':     digits,
    'not_digit': reduce(lambda x, c: x.replace(c, ""), digits, printable),
    'space':     whitespace,
    'not_space': reduce(lambda x, c: x.replace(c, ""), whitespace, printable),
    'word':      ascii_letters + digits + '_',
    'not_word':  reduce(lambda x, c: x.replace(c, ""), ascii_letters + digits + '_', printable),
}
REPEAT_MAX    = 10
STAR_PLUS_MAX = 10
YIELD_MAX     = 100


def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False):
    """ Recursive function to generate strings from a regex pattern.

    :param regex:         regex pattern (string, bytes or compiled pattern) or, when parsed is True, a sequence of
                           already parsed (opcode, value) items
    :param star_plus_max: maximum number of repetitions considered for a quantifier
    :param repeat_max:    number of combinations above which repetitions are randomly sampled instead of enumerated
    :param yield_max:     maximum number of strings to be yielded (at least one is yielded when available)
    :param parsed:        whether 'regex' is already parsed
    """
    if regex is None:
        return
    if parsed:
        states = regex
    else:
        # the parser lives in re._parser from Python 3.11 ; re.sre_parse does not exist anymore
        try:
            from re import _parser as sre_parser
        except ImportError:
            import sre_parse as sre_parser
        # parse the pattern as bytes (latin-1 first, then utf-8), as the module's bytes conversion does
        pattern = getattr(regex, "pattern", regex)
        if isinstance(pattern, str):
            for enc in ("latin-1", "utf-8"):
                try:
                    pattern = pattern.encode(enc)
                    break
                except UnicodeEncodeError:
                    continue
        states = sre_parser.parse(pattern)
    groups, tokens = {}, []
    negate, last_rand = False, None

    def _recurse(items):
        return list(__gen_str_from_re(items, star_plus_max, repeat_max, yield_max, True))

    def _fresh(charset):
        # shuffle until the leading char differs from the one of the previous charset, so that two consecutive
        #  charsets do not start with the same char
        nonlocal last_rand
        while charset[0] == last_rand and len(charset) > 1:
            random.shuffle(charset)
        last_rand = charset[0]
        return charset

    for state in states:
        code = getattr(state[0], "name", state[0]).lower()
        value = getattr(state[1], "name", state[1])
        if isinstance(value, str):
            value = value.lower()
        if code in ("assert_not", "at"):
            continue
        elif code == "any":
            tokens.append(_fresh(list(printable.replace("\n", ""))))  # should be ord(x) with x belongs to [0, 256[
        elif code == "assert":
            tokens.append(_recurse(value[1]))
        elif code == "branch":
            result = []
            for r in value[1]:
                result += _recurse(r) or [""]
            tokens.append(result)
        elif code == "category":
            charset = list(CATEGORIES[value[9:]])  # strip the "category_" prefix
            if negate:
                negate = False
                charset = list(set(printable).difference(charset))
            tokens.append(_fresh(charset))
        elif code == "groupref":
            tokens.extend(groups[value])
        elif code == "in":
            tokens.append([c for s in _recurse(value) for c in s])
        elif code == "literal":
            tokens.append(chr(value))
        elif code in ("max_repeat", "min_repeat"):
            start, end = value[:2]
            end = min(end, star_plus_max)
            start = min(start, end)
            charset = _recurse(value[-1])
            subtokens = []
            if start == 0 and end == 1:
                subtokens.append("")
                subtokens.extend(charset)
            elif len(charset) ** end > repeat_max:
                # too many combinations ; sample them randomly
                for _ in range(min(repeat_max, 10 * len(charset))):
                    # random.randint includes both bounds ; using end + 1 here generated strings exceeding the
                    #  maximum of the quantifier, hence not matching the pattern
                    n = random.randint(start, end)
                    token = "".join(random.choice(charset) for _ in range(n))
                    if token not in subtokens:
                        subtokens.append(token)
            else:
                for n in range(start, end + 1):
                    for c in product(charset, repeat=n):
                        subtokens.append("".join(c))
            tokens.append(subtokens)
        elif code == "negate":
            negate = True
        elif code == "not_literal":
            tokens.append(_fresh(list(printable.replace(chr(value), ""))))
        elif code == "range":
            tokens.append("".join(chr(i) for i in range(value[0], value[1] + 1)))
        elif code == "subpattern":
            result = _recurse(value[-1])
            if value[0]:
                groups[value[0]] = result
            tokens.append(result)
        else:
            raise NotImplementedError("Unhandled code '{}'".format(code))
    if len(tokens) == 0:
        tokens = [""]
    for count, result in enumerate(product(*tokens), 1):
        yield "".join(result)
        if count >= yield_max:
            break


def _human_keys(text):
    """ Sorting function for considering strings with numbers (e.g. base2, base10, base100) """
    return [int(s) if s.isdigit() else s for s in re.split(r"(\d+|\D+)", text)]


def generate_string_from_regex(regex):
    """ Utility function to generate a single string from a regex pattern (None if no pattern is given). """
    if regex:
        return list(generate_strings_from_regex(regex, yield_max=1))[0]


def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=REPEAT_MAX, yield_max=YIELD_MAX):
    """ Utility function to generate strings from a regex pattern.

    :param regex:         regex pattern
    :param star_plus_max: maximum number of repetitions considered for a quantifier
    :param repeat_max:    number of combinations above which repetitions are randomly sampled
    :param yield_max:     maximum number of strings to be yielded
    """
    for result in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max):
        yield result
"ml|mn|mr|ms|mt|my|ne|nl|no|ny|pa|pl|ps|pt|ro|ru|sd|si|sk|sl|sm|sn|so|sq|sr|st|su|sv|" \ + "sw|ta|te|tg|th|tr|uk|ur|uz|vi|xh|yi|yo|zh|zu".split("|") +if "textblob" in stopfunc.LANG_BACKENDS: + stopfunc.TEXTBLOB_LANGUAGES = "af|ar|az|be|bg|bn|ca|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|ga|gl|gu|hi|hr|ht|hu|" \ + "id|is|it|iw|ja|ka|kn|ko|la|lt|lv|mk|ms|mt|nl|no|pl|pt|ro|ru|sk|sl|sq|sr|sv|sw|ta|" \ + "te|th|tl|tr|uk|ur|vi|yi|zh".split("|") + + +def _detect(text): + _lb, t = stopfunc.LANG_BACKEND, ensure_str(text) + if _lb is None: + raise ValueError("No language backend %s" % ["selected", "installed"][len(stopfunc.LANG_BACKENDS) == 0]) + return langid.classify(t)[0] if _lb == "langid" else \ + langdetect.detect(t) if _lb == "langdetect" else \ + pycld2.detect(t)[2][0][1] if _lb == "pycld2" else \ + cld3.get_language(t).language[:2] if _lb == "cld3" else \ + textblob.TextBlob(t).detect_language()[:2] + + +def _lang(lang): + def _test(s): + if not stopfunc.text(s): + return False + try: + return _detect(ensure_str(s))[:2] == lang + except: + return False + return _test + + +def _load_lang_backend(backend=None): + # import the requested backend library if not imported yet + if backend is None or backend in stopfunc.LANG_BACKENDS: + stopfunc.LANG_BACKEND = backend + if backend: + globals()[backend] = __import__(backend) + else: + raise ValueError("Unsupported language detection backend") + # remove language-related stop functions + for attr in dir(stopfunc): + if attr.startswith("_") or not isinstance(getattr(stopfunc, attr), FunctionType): + continue + if re.match(r"lang_[a-z]{2}$", attr): + delattr(stopfunc, attr) + # rebind applicable language-related stop functions + if stopfunc.LANG_BACKEND: + _lb = stopfunc.LANG_BACKEND + if _lb == "langid": + langid.langid.load_model() + for lang in ( + langid.langid.identifier.nb_classes if _lb == "langid" else \ + list(set(p[:2] for p in os.listdir(langdetect.PROFILES_DIRECTORY))) if _lb == "langdetect" else \ + list(set(x[1][:2] 
for x in pycld2.LANGUAGES if x[0] in pycld2.DETECTED_LANGUAGES)) if _lb == "pycld2" else \ + stopfunc.CLD3_LANGUAGES if _lb == "cld3" else \ + stopfunc.TEXTBLOB_LANGUAGES if _lb == "textblob" else \ + []): + n = "lang_%s" % lang + setattr(stopfunc, n, _lang(lang)) + getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n + if LANG: + flng = "lang_%s" % LANG + if getattr(stopfunc, flng, None): + stopfunc.default = getattr(stopfunc, flng) +stopfunc._reload_lang = _load_lang_backend + + +def _validate(stop_function, lang_backend="none"): + s, lb = stop_function, lang_backend + if isinstance(s, str): + if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ + all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(lb) + f = getattr(stopfunc, s, None) + if f: + return f + elif not isinstance(s, FunctionType): + raise ValueError("Bad stop function") + return s +stopfunc._validate = _validate + + +def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(), + stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): + """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. 
""" + if depth > min_depth and stop_func(input): + if not stop and (show or debug) and found not in result: + s = repr(input) + s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s + s = "[+] %s: %s" % (", ".join(found), s) + print(s if len(s) <= 80 else s[:77] + "...") + result[found] = input + if depth >= max_depth or len(result) > 0 and stop: + return + prev_enc = found[-1] if len(found) > 0 else "" + e = encodings.get(depth, encodings.get(-1, [])) + for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended): + if len(result) > 0 and stop: + return + if debug: + print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) + __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), + stop, show, scoring_heuristic, extended, debug) + + +def __make_encodings_dict(include, exclude): + """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible + encoding names. It also creates a cache with the CodecInfo objects for improving performance. """ + def _develop(d, keep=True): + d = d or {} + for k, v in d.items(): + l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES] + # list from in-scope categories and then everything that is not a category + for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc): + g = [] + for e in (search(enc, False) or [enc]): + try: + ci = lookup(e, False) + g.extend(ci.parameters['guess']) + except: + pass + if enc in g: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected + l.append(enc) + else: # e.g. "rot" => ["rot-1", "rot-2", ...] 
; all the "rot-N" shall be selected + l.extend(g) + d[k] = list(set(l)) + return d + _excl, _incl = _develop(exclude, False), _develop(include) + return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()} + + +def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): + """ Filter valid encodings and rank them by relevance. """ + ranking = {} + for e in encodings: + try: + codec = CODECS_CACHE[e] + except KeyError: + try: + CODECS_CACHE[e] = codec = lookup(e, False) + except LookupError: + continue + t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) + if t: + ranking[e] = t + for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])): + yield result if yield_score else result[1], encoding + + +class _Text(object): + __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] + + def __init__(self, text, pad_char=None): + self.text = ensure_str(text) + c = self.text[-1] + pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) + self.padding = pad_char is not None and last_char == pad_char + if self.padding: + text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char) + self.len = len(self.text) + self.lcharset = len(set(self.text)) + self.printables = float(len([c for c in self.text if c in printable])) / self.len + self.entropy = entropy(self.text) + + +def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): + """ Score relevant encodings given an input. 
""" + obj = None + sc = codec.parameters.get('scoring', {}) + no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) + # ignore encodings that fail to decode with their default errors handling value + try: + new_input = codec.decode(input)[0] + except: + return + # ignore encodings that give an output identical to the input (identity transformation) or to the previous input + if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): + return + # ignore encodings that transitively give the same output (identity transformation by chaining twice a same + # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) + if transitive and prev_encoding: + ci_prev = lookup(prev_encoding, False) + if ci_prev.parameters['name'] == codec.parameters['name']: + return + # compute input's characteristics only once and only if the control flow reaches this point + pad = sc.get('padding_char') + if obj is None: + obj = _Text(input, pad) + if heuristic: + # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. 
multiple base + # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates + s = -sc.get('penalty', .0) + # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; + # on the contrary, if the length of input text's charset is strictly greater, give a penalty + lcs = sc.get('len_charset', 256) + if isinstance(lcs, type(lambda: None)): + lcs = int(lcs(encoding)) + if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: + s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) + elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: + s -= .2 # this can occur for encodings with no_error set to True + # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, + # or a penalty when it should not be encountered but it is present + if pad and obj.padding: + s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus + elif not pad and obj.padding: + s -= .1 # it could arise a padding character is encountered while not being padding => small penalty + # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when + # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) + if not no_error: + pr = sc.get('printables_rate', 0) + if isinstance(pr, type(lambda: None)): + pr = float(pr(obj.printables)) + if obj.printables - pr <= .05: + s += .1 + expf = sc.get('expansion_factor', 1.) 
+ if expf: + f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f + if isinstance(expf, type(lambda: None)): + try: # this case allows to consider the current encoding name from the current codec + expf = expf(f, encoding) + except TypeError: + expf = expf(f) + if isinstance(expf, (int, float)): + tmp = expf + expf = (1/f - .1 <= 1/expf <= 1/f + .1) + elif isinstance(expf, (tuple, list)) and len(expf) == 2: + expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] + s += [-1., .1][expf] + # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the + # number of input characters to take bad entropies of shorter strings into account + entr = sc.get('entropy', lambda e: e) + entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr + if isinstance(entr, type(lambda: None)): + try: # this case allows to consider the current encoding name from the current codec + entr = entr(obj.entropy, encoding) + except TypeError: + entr = entr(obj.entropy) + if entr is not None: + # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) + d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy) + if d_entr <= .5: + s += .5 - d_entr + # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) + bonus = sc.get('bonus_func') + if bonus is not None: + if isinstance(bonus, type(lambda: None)): + bonus = bonus(obj, codec, encoding) + if bonus: + s += .2 + else: + s = 1. + # exclude negative (and eventually null) scores as they are (hopefully) not relevant + if extended and s >= .0 or not extended and s > .0: + return s, new_input + + +def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(), + stop=True, show=False, scoring_heuristic=True, extended=False, debug=False): + """ Try decoding without the knowledge of the encoding(s). 
+ + :param input: input text to be guessed + :param stop_func: function defining the stop condition + :param min_depth: minimum search depth + :param max_depth: maximum search depth + ;param include: inclusion item OR list with category, codec or encoding names OR dictionary with lists per + depth (nothing means include every encoding) + :param exclude: exclusion item OR list with category, codec or encoding names OR dictionary with lists per + depth (nothing means exclude no encoding) + :param found: tuple of already found encodings + :param stop: whether to stop or not when a valid solution is found + :param show: whether to immediately show once a solution is found + :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1., + meaning that every non-failing encoding will be considered with no order of precedence) + :param extended: whether to also consider null scores with the heuristic + :param debug: whether to show each attempt at each depth during computation + """ + if len(input) == 0: + return "" + # check for min and max depths + if max_depth <= 0: + raise ValueError("Depth must be a non-null positive integer") + if min_depth > max_depth: + raise ValueError("Min depth shall be less than or equal to the max depth") + # take the tuple of found encodings into account + if len(found) > 0: + for encoding in found: + input = decode(input, encoding) + # handle the stop function as a regex if a string was given + if isinstance(stop_func, str): + stop_func = stopfunc.regex(stop_func) + # reformat include and exclude arguments ; supported formats: + for n, l in zip(["inc", "exc"], [include, exclude]): + if l is None: + if n == "inc": + include = l = {-1: CODECS_CATEGORIES} + else: + exclude = l = {} + # "category" OR "enc_name" OR whatever => means a single item for all depths + if isinstance(l, str): + if n == "inc": + include = l = {-1: [l]} + else: + exclude = l = {-1: [l]} + # ["enc_name1", "enc_name2", 
...] => means for all depths + if isinstance(l, (list, tuple)): + if n == "inc": + include = l = {-1: l} + else: + exclude = l = {-1: l} + # {-1: [...], 2: [...], ...} => means prefedined depths with their lists of in-/excluded encodings + if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): + raise ValueError("Include argument shall be a list or a dictionary with integer keys") + # precompute encodings lists per depth and cache the related CodecInfo objects + encodings, result = __make_encodings_dict(include, exclude), {} + try: + # breadth-first search + for d in range(max_depth): + __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show, + scoring_heuristic, extended, debug) + if stop and len(result) > 0: + break + except KeyboardInterrupt: + pass + CODECS_CACHE = {} + return result +codecs.guess = guess + + +def rank(input, extended=False, limit=-1, include=None, exclude=None): + """ Rank the most probable encodings based on the given input. + + :param input: input text to be evaluated + :param extended: whether to consider null scores too (NB: negative scores are not output !) + :param limit: number of encodings to be returned (-1 means all of them) + :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) + :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) + """ + encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES}, + exclude if isinstance(exclude, dict) else {-1: exclude or []}) + r = list(__rank(None, input, "", encodings[-1], True, extended, True)) + return r[:limit] if len(r) > 1 else r +codecs.rank = rank + diff --git a/src/codext/__init__.py b/src/codext/__init__.py index f95abb8..67d6b5a 100644 --- a/src/codext/__init__.py +++ b/src/codext/__init__.py @@ -1,255 +1,257 @@ -# -*- coding: UTF-8 -*- -"""Codecs extension module. 
- -""" -from __future__ import print_function -from _codecs import lookup as orig_lookup -from ast import literal_eval -from six import binary_type, text_type - -from .__common__ import * -from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__ - - -__all__ = ["add", "add_map", "clear", "decode", "encode", "guess", "lookup", "open", "rank", "register", "remove", - "reset"] - -decode = codecs.decode -encode = codecs.encode -guess = codecs.guess -lookup = codecs.lookup -open = codecs.open - -_lst = list -list = list_encodings # not included in __all__ because of shadow name - - -reset() - - -def __format_list(items, include=True): - if items is None: - return - d = {-1: list_encodings() if include else []} - for n, i in enumerate(items): - try: - depth, i = i.split(":") - depth = int(depth.strip().replace("~", "-")) - if depth < 0: - depth = -1 - except ValueError: - if n == 0: - d[-1] = [] - depth = -1 - d.setdefault(depth, []) - d[depth].append(i.strip()) - return d - - -def __print_tabular(lst, space=4): - try: - cols, _ = os.get_terminal_size() - # first, convert the list to a table that fits into the terminal - i, line, w = 0, "", [] - while i < len(lst): - x = lst[i] - l = len(x) - col = "%-{}s".format(l + space) % x - i += 1 - w.append(l) - if len(line) + len(col) > cols: - break - line += col - while True: - t = [lst[j:j+i] for j in range(0, len(lst), i)] - w = [max(0 if j+k*i >= len(lst) else len(lst[j+k*i]) for k in range(len(t))) for j, _ in enumerate(w)] - if sum(w) + space * len(w) >= cols: - i -= 1 - w.pop() - else: - break - print("\n".join("".join("%-{}s".format(w[n] + space) % x for n, x in enumerate(r)) for r in t) + "\n") - except (AttributeError, OSError): - print(", ".join(lst) + "\n") - - -def main(): - import argparse, os - - class _CustomFormatter(argparse.RawTextHelpFormatter): - def __init__(self, prog, **kwargs): - kwargs['max_help_position'] = 32 - super(_CustomFormatter, self).__init__(prog, 
**kwargs) - - def _format_action_invocation(self, action): - if not action.option_strings: - metavar, = self._metavar_formatter(action, action.dest)(1) - return metavar - else: - return ", ".join(action.option_strings) - - descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \ - "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \ - .format(__version__, __author__, __email__, __copyright__, __license__, __source__) - examples = "usage examples:\n- " + "\n- ".join([ - "codext search bitcoin", - "codext decode base32 -i file.b32", - "codext encode morse < to_be_encoded.txt", - "echo \"test\" | codext encode base100", - "echo -en \"test\" | codext encode braille -o test.braille", - "codext encode base64 < to_be_encoded.txt > text.b64", - "echo -en \"test\" | codext encode base64 | codext encode base32", - "echo -en \"mrdvm6teie6t2cq=\" | codext encode upper | codext decode base32 | codext decode base64", - "echo -en \"test\" | codext encode upper reverse base32 | codext decode base32 reverse lower", - "echo -en \"test\" | codext encode upper reverse base32 base64 morse", - "echo -en \"test\" | codext encode base64 gzip | codext guess", - "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", - ]) - kw = {'formatter_class': _CustomFormatter} - parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw) - kw2 = {'required': True} if PY3 else {} - sparsers = parser.add_subparsers(dest="command", help="command to be executed", **kw2) - parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") - parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") - parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", - help="strip newlines from input (default: False)") - encode = sparsers.add_parser("encode", help="encode 
input using the specified codecs", **kw) - encode.add_argument("encoding", nargs="+", help="list of encodings to apply") - encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], - help="error handling (default: strict)") - decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw) - decode.add_argument("encoding", nargs="+", help="list of encodings to apply") - decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], - help="error handling (default: strict)") - guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw) - guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)") - guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely not used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - guess.add_argument("-E", "--extended", action="store_true", - help="while using the scoring heuristic, also consider null scores (default: False)") - lng = "lang_%s" % LANG - def_func = lng if getattr(stopfunc, lng, None) else "text" - guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function " - "(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-" - "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression" - % def_func) - guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" - " the search but may be more accurate (default: False)") - guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - 
guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true", - help="while using the regex stop function, set it as case-insensitive (default: False)") - if len(stopfunc.LANG_BACKENDS) > 0: - _lb = stopfunc.LANG_BACKEND - guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], - help="natural language detection backend (default: %s)" % _lb) - guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT", - help="minimum codec search depth before triggering results (default: 0)") - guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT", - help="maximum codec search depth (default: 5)") - guess.add_argument("-s", "--do-not-stop", action="store_true", - help="do not stop if a valid output is found (default: False)") - guess.add_argument("-v", "--verbose", action="store_true", - help="show guessing information and steps (default: False)") - rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) - rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely not used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - rank.add_argument("-E", "--extended", action="store_true", - help="while using the scoring heuristic, also consider null scores (default: False)") - rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") - search = sparsers.add_parser("search", help="search for codecs") - search.add_argument("pattern", nargs="+", help="encoding pattern to search") - listi = sparsers.add_parser("list", help="list items") - lsparsers = 
listi.add_subparsers(dest="type", help="type of item to be listed", **kw2) - liste = lsparsers.add_parser("encodings", help="list encodings") - liste.add_argument("category", nargs="+", help="selected categories") - listm = lsparsers.add_parser("macros", help="list macros") - addm = sparsers.add_parser("add-macro", help="add a macro to the registry") - addm.add_argument("name", help="macro's name") - addm.add_argument("encoding", nargs="+", help="list of encodings to chain") - remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") - remm.add_argument("name", help="macro's name") - args = parser.parse_args() - if args.command in ["guess", "rank"]: - args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) - try: - # if a search pattern is given, only handle it - if args.command == "search": - results = [] - for enc in args.pattern: - results.extend(codecs.search(enc)) - print(", ".join(results) or "No encoding found") - return 0 - # add/remove macros (not requiring to input a file or text) - elif args.command == "add-macro": - add_macro(args.name, *args.encoding) - return 0 - elif args.command == "remove-macro": - remove_macro(args.name) - return 0 - # list encodings or macros - elif args.command == "list": - if args.type == "encodings": - cats = args.category or list_categories() - for c in sorted(cats): - l = list_encodings(c) - if len(l) > 0: - if len(cats) > 0: - print(c.upper() + ":") - __print_tabular(l) - elif args.type == "macros": - l = list_macros() - if len(l) > 0: - __print_tabular(l) - return 0 - # handle input file or stdin - c =_input(args.infile) - c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") - # strip any other (CR)LF - if args.strip: - c = re.sub(r"\r?\n", "", c) if isinstance(c, str) else c.replace(b"\r\n", b"").replace(b"\n", b"") - if args.command in ["decode", "encode"]: - # encode or decode - for encoding in args.encoding: - c = getattr(codecs, ["encode", 
"decode"][args.command == "decode"])(c, encoding, args.errors) - # handle output file or stdout - if args.outfile: - with open(args.outfile, 'wb') as f: - f.write(c) - else: - print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") - elif args.command == "guess": - s, lb = args.stop_function, args.lang_backend - if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ - all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): - stopfunc._reload_lang(lb) - r = codecs.guess(c, - getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, - args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show - not args.no_heuristic, args.extended, args.verbose) - for i, o in enumerate(r.items()): - e, out = o - if len(e) > 0: - if args.outfile: - n, ext = os.path.splitext(args.outfile) - fn = args.outfile if len(r) == 1 else "%s-%d%s" % (n, i+1, ext) - else: - print("Codecs: %s" % ", ".join(e)) - print(ensure_str(out)) - if len(r) == 0: - print("Could not decode :-(") - elif args.command == "rank": - for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude): - s = "[+] %.5f: %s" % (i[0], e) - print(s if len(s) <= 80 else s[:77] + "...") - except Exception as e: - raise e - m = str(e) - print("codext: " + m[0].lower() + m[1:]) - +# -*- coding: UTF-8 -*- +"""Codecs extension module. 
+ +""" +from .__common__ import * +from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__ + + +__all__ = ["add", "add_map", "clear", "decode", "encode", "guess", "lookup", "open", "rank", "register", "remove", + "reset"] + +decode = codecs.decode +encode = codecs.encode +guess = codecs.guess +lookup = codecs.lookup +open = codecs.open + +_lst = list +list = list_encodings # not included in __all__ because of shadow name + + +reset() + + +# populate codext with attributes from codecs that were not modified +for attr in codecs.__all__: + if attr in __all__: + continue + locals()[attr] = getattr(codecs, attr) + __all__.append(attr) + + +def __format_list(items, include=True): + if items is None: + return + d = {-1: list_encodings() if include else []} + for n, i in enumerate(items): + try: + depth, i = i.split(":") + depth = int(depth.strip().replace("~", "-")) + if depth < 0: + depth = -1 + except ValueError: + if n == 0: + d[-1] = [] + depth = -1 + d.setdefault(depth, []) + d[depth].append(i.strip()) + return d + + +def __print_tabular(lst, space=4): + try: + cols, _ = os.get_terminal_size() + # first, convert the list to a table that fits into the terminal + i, line, w = 0, "", [] + while i < len(lst): + x = lst[i] + l = len(x) + col = "%-{}s".format(l + space) % x + i += 1 + w.append(l) + if len(line) + len(col) > cols: + break + line += col + while True: + t = [lst[j:j+i] for j in range(0, len(lst), i)] + w = [max(0 if j+k*i >= len(lst) else len(lst[j+k*i]) for k in range(len(t))) for j, _ in enumerate(w)] + if sum(w) + space * len(w) >= cols: + i -= 1 + w.pop() + else: + break + print("\n".join("".join("%-{}s".format(w[n] + space) % x for n, x in enumerate(r)) for r in t) + "\n") + except (AttributeError, OSError): + print(", ".join(lst) + "\n") + + +def main(): + import argparse, os + + class _CustomFormatter(argparse.RawTextHelpFormatter): + def __init__(self, prog, **kwargs): + kwargs['max_help_position'] = 32 + 
super(_CustomFormatter, self).__init__(prog, **kwargs) + + def _format_action_invocation(self, action): + if not action.option_strings: + metavar, = self._metavar_formatter(action, action.dest)(1) + return metavar + else: + return ", ".join(action.option_strings) + + descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \ + "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \ + .format(__version__, __author__, __email__, __copyright__, __license__, __source__) + examples = "usage examples:\n- " + "\n- ".join([ + "codext search bitcoin", + "codext decode base32 -i file.b32", + "codext encode morse < to_be_encoded.txt", + "echo \"test\" | codext encode base100", + "echo -en \"test\" | codext encode braille -o test.braille", + "codext encode base64 < to_be_encoded.txt > text.b64", + "echo -en \"test\" | codext encode base64 | codext encode base32", + "echo -en \"mrdvm6teie6t2cq=\" | codext encode upper | codext decode base32 | codext decode base64", + "echo -en \"test\" | codext encode upper reverse base32 | codext decode base32 reverse lower", + "echo -en \"test\" | codext encode upper reverse base32 base64 morse", + "echo -en \"test\" | codext encode base64 gzip | codext guess", + "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", + ]) + kw = {'formatter_class': _CustomFormatter} + parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw) + sparsers = parser.add_subparsers(dest="command", help="command to be executed", required=True) + parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") + parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") + parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", + help="strip newlines from input (default: False)") + encode = sparsers.add_parser("encode", 
help="encode input using the specified codecs", **kw) + encode.add_argument("encoding", nargs="+", help="list of encodings to apply") + encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], + help="error handling (default: strict)") + decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw) + decode.add_argument("encoding", nargs="+", help="list of encodings to apply") + decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], + help="error handling (default: strict)") + guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw) + guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)") + guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + guess.add_argument("-E", "--extended", action="store_true", + help="while using the scoring heuristic, also consider null scores (default: False)") + lng = "lang_%s" % LANG + def_func = lng if getattr(stopfunc, lng, None) else "text" + guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function " + "(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-" + "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression" + % def_func) + guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" + " the search but may be more accurate (default: False)") + guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR 
depth:[category|codec|encoding]") + guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true", + help="while using the regex stop function, set it as case-insensitive (default: False)") + if len(stopfunc.LANG_BACKENDS) > 0: + _lb = stopfunc.LANG_BACKEND + guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], + help="natural language detection backend (default: %s)" % _lb) + guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT", + help="minimum codec search depth before triggering results (default: 0)") + guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT", + help="maximum codec search depth (default: 5)") + guess.add_argument("-s", "--do-not-stop", action="store_true", + help="do not stop if a valid output is found (default: False)") + guess.add_argument("-v", "--verbose", action="store_true", + help="show guessing information and steps (default: False)") + rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) + rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + rank.add_argument("-E", "--extended", action="store_true", + help="while using the scoring heuristic, also consider null scores (default: False)") + rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") + search = sparsers.add_parser("search", help="search for codecs") + search.add_argument("pattern", nargs="+", help="encoding pattern to search") + listi = sparsers.add_parser("list", 
help="list items") + lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", required=True) + liste = lsparsers.add_parser("encodings", help="list encodings") + liste.add_argument("category", nargs="+", help="selected categories") + listm = lsparsers.add_parser("macros", help="list macros") + addm = sparsers.add_parser("add-macro", help="add a macro to the registry") + addm.add_argument("name", help="macro's name") + addm.add_argument("encoding", nargs="+", help="list of encodings to chain") + remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") + remm.add_argument("name", help="macro's name") + args = parser.parse_args() + if args.command in ["guess", "rank"]: + args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) + try: + # if a search pattern is given, only handle it + if args.command == "search": + results = [] + for enc in args.pattern: + results.extend(codecs.search(enc)) + print(", ".join(results) or "No encoding found") + return 0 + # add/remove macros (not requiring to input a file or text) + elif args.command == "add-macro": + add_macro(args.name, *args.encoding) + return 0 + elif args.command == "remove-macro": + remove_macro(args.name) + return 0 + # list encodings or macros + elif args.command == "list": + if args.type == "encodings": + cats = args.category or list_categories() + for c in sorted(cats): + l = list_encodings(c) + if len(l) > 0: + if len(cats) > 0: + print(c.upper() + ":") + __print_tabular(l) + elif args.type == "macros": + l = list_macros() + if len(l) > 0: + __print_tabular(l) + return 0 + # handle input file or stdin + c =_input(args.infile) + c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") + # strip any other (CR)LF + if args.strip: + c = re.sub(r"\r?\n", "", c) if isinstance(c, str) else c.replace(b"\r\n", b"").replace(b"\n", b"") + if args.command in ["decode", "encode"]: + # encode or decode + for encoding in 
args.encoding: + c = getattr(codecs, ["encode", "decode"][args.command == "decode"])(c, encoding, args.errors) + # handle output file or stdout + if args.outfile: + with open(args.outfile, 'wb') as f: + f.write(c) + else: + print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") + elif args.command == "guess": + s, lb = args.stop_function, args.lang_backend + if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ + all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(lb) + r = codecs.guess(c, + getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, + args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show + not args.no_heuristic, args.extended, args.verbose) + for i, o in enumerate(r.items()): + e, out = o + if len(e) > 0: + if args.outfile: + n, ext = os.path.splitext(args.outfile) + fn = args.outfile if len(r) == 1 else "%s-%d%s" % (n, i+1, ext) + else: + print("Codecs: %s" % ", ".join(e)) + print(ensure_str(out)) + if len(r) == 0: + print("Could not decode :-(") + elif args.command == "rank": + for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude): + s = "[+] %.5f: %s" % (i[0], e) + print(s if len(s) <= 80 else s[:77] + "...") + except Exception as e: + raise e + m = str(e) + print("codext: " + m[0].lower() + m[1:]) + diff --git a/src/codext/base/_base.py b/src/codext/base/_base.py index fce8b9a..27a31e3 100755 --- a/src/codext/base/_base.py +++ b/src/codext/base/_base.py @@ -1,291 +1,290 @@ -# -*- coding: UTF-8 -*- -"""Generic baseN functions. 
- -""" -from argparse import ArgumentParser, RawTextHelpFormatter -from math import log -from six import integer_types, string_types -from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable -from textwrap import wrap as wraptext -from types import FunctionType, MethodType - -from ..__common__ import * -from ..__common__ import _set_exc -from ..__info__ import __version__ - - -_set_exc("BaseError") -_set_exc("BaseEncodeError") -_set_exc("BaseDecodeError") -""" -Curve fitting: - ->>> import matplotlib.pyplot as plt ->>> import pandas as pd ->>> import scipy.optimize ->>> from statistics import mean ->>> from tinyscript import random ->>> x, y = [], [] ->>> for i in range(2, 256): - v = [] - for j in range(16, 2048, 16): - s = random.randstr(j) - v.append(float(len(codext.encode(s, "base%d-generic" % i))) / len(s)) - x.append(i) - y.append(mean(v)) ->>> data = pd.DataFrame({'base': x, 'expf': y}) ->>> def fit(x, y, func, params): - params, cv = scipy.optimize.curve_fit(func, x, y, params) - print(params) - y2 = func(x, *params) - plt.clf() - plt.plot(x, y, ".", color="blue", alpha=.3) - plt.plot(x, y2, color="red", linewidth=3.0) - plt.show() ->>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (1, 1, 1, 1)) -[ 0.02841434 0.00512664 -0.99999984 0.01543879] ->>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (.028, .005, -1, .015)) -[ 0.02827357 0.00510124 -0.99999984 0.01536941] -""" -EXPANSION_FACTOR = lambda base: 0.02827357 / (base**0.00510124-0.99999984) + 0.01536941 -SIZE_LIMIT = 1024 * 1024 * 1024 - - -def _generate_charset(n): - """ Generate a characters set. - - :param n: size of charset - """ - if 1 < n <= len(printable): - return printable[:n] - elif len(printable) < n < 256: - return "".join(chr(i) for i in range(n)) - raise ValueError("Bad size of character set") - - -def _get_charset(charset, p=""): - """ Characters set selection function. 
It allows to define charsets in many different ways. - - :param charset: charset object, can be a string (the charset itself), a function (that chooses the right charset - depending on the input parameter) or a dictionary (either by exact key or by pattern matching) - :param p: the parameter for choosing the charset - """ - # case 1: charset is a function, so return its result - if isinstance(charset, FunctionType): - return charset(p) - # case 2: charset is a string, so return it - elif isinstance(charset, string_types): - return charset - # case 3: charset is a dict with keys '' and 'inv', typically for a charset using lowercase and uppercase characters - # that can be inverted - elif isinstance(charset, dict) and list(charset.keys()) == ["", "inv"]: - return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""] - # case 4: charset is a dict, but not with the specific keys '' and 'inv', so consider it as pattern-charset pairs - elif isinstance(charset, dict): - # try to handle [p]arameter as a simple key - try: - return charset[p] - except KeyError: - pass - # or handle [p]arameter as a pattern - default, n, best = None, None, None - for pattern, cset in charset.items(): - n = len(cset) - if re.match(pattern, ""): - default = cset - continue - m = re.match(pattern, p) - if m: # find the longest match from the patterns - s, e = m.span() - if e - s > len(best or ""): - best = pattern - if best: - return charset[best] - # special case: the given [p]arameter can be the charset itself if it has the right length - p = re.sub(r"^[-_]+", "", p) - if len(p) == n: - return p - # or simply rely on key '' - if default is not None: - return default - raise ValueError("Bad charset descriptor ('%s')" % p) - - -# generic base en/decoding functions -def base_encode(input, charset, errors="strict", exc=BaseEncodeError): - """ Base-10 to base-N encoding. 
- - :param input: input (str or int) to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - :param exc: exception to be raised in case of error - """ - i, n, r = input if isinstance(input, integer_types) else s2i(input), len(charset), "" - if n == 1: - if i > SIZE_LIMIT: - raise InputSizeLimitError("Input exceeded size limit") - return i * charset[0] - if n == 10: - return str(i) if charset == digits else "".join(charset[int(x)] for x in str(i)) - while i > 0: - i, c = divmod(i, n) - r = charset[c] + r - return r - - -def base_decode(input, charset, errors="strict", exc=BaseDecodeError): - """ Base-N to base-10 decoding. - - :param input: input to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - :param exc: exception to be raised in case of error - """ - i, n, dec = 0, len(charset), lambda n: base_encode(n, [chr(x) for x in range(256)], errors, exc) - if n == 1: - return i2s(len(input)) - if n == 10: - return i2s(int(input)) if charset == digits else "".join(str(charset.index(c)) for c in input) - for k, c in enumerate(input): - try: - i = i * n + charset.index(c) - except ValueError: - handle_error("base", errors, exc, decode=True)(c, k, dec(i), "base%d" % n) - return dec(i) - - -# base codec factory functions -def base(charset, pattern, pow2=False, encode_template=base_encode, decode_template=base_decode, name=None, **kwargs): - """ Base-N codec factory. 
- - :param charset: charset selection function - :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting - the charset) - :param pow2: whether the base codec's N is a power of 2 - """ - cs = _get_charset(charset) - n = len(cs) - nb = log(n, 2) - if pow2 and nb != int(nb): - raise BaseError("Bad charset ; {} is not a power of 2".format(n)) - - def encode(param="", *args): - a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) - def _encode(input, errors="strict"): - if len(input) == 0: - return "", 0 - return encode_template(input, a, errors), len(input) - return _encode - - def decode(param="", *args): - a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) - sl, sc = "\n" not in a, "\n" not in a and not "\r" in a - def _decode(input, errors="strict"): - if len(input) == 0: - return "", 0 - input = _stripl(input, sc, sl) - return decode_template(input, a, errors), len(input) - return _decode - - kwargs['len_charset'] = n - kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs) - kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05)) - n = "base{}".format(n) if name is None else name - try: - g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n] - except AttributeError: - g = [n] - kwargs['guess'] = kwargs.get('guess', g) - add(n, encode, decode, pattern, entropy=nb, **kwargs) - - -def base_generic(): - """ Base-N generic codec. 
""" - def encode(n): - a = _generate_charset(int(n)) - def _encode(input, errors="strict"): - return base_encode(input, a, errors), len(input) - return _encode - - def decode(n): - a = _generate_charset(int(n)) - sl, sc = "\n" not in a, "\n" not in a and not "\r" in a - def _decode(input, errors="strict"): - input = _stripl(input, sc, sl) - return base_decode(input, a, errors), len(input) - return _decode - - add("base", encode, decode, r"^base[-_]?([2-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:[-_]generic)?$", - guess=["base%d-generic" % i for i in range(2, 255)], entropy=lambda e, n: log(int(n.split("-")[0][4:]), 2), - len_charset=lambda n: int(n.split("-")[0][4:]), printables_rate=1., category="base-generic", penalty=.4, - expansion_factor=lambda f, n: (EXPANSION_FACTOR(int(n.split("-")[0][4:])), .05)) - - -def main(n, ref=None, alt=None, inv=True, swap=True, wrap=True): - base = str(n) + ("-" + alt.lstrip("-") if alt else "") - src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \ - {'base': base, 'reference': "\n" + ref if len(ref) > 20 else ref} if ref else "" - text = "%(source)sWhen decoding, the input may contain newlines in addition to the bytes of the formal base" \ - "%(base)s alphabet. Use --ignore-garbage to attempt to recover from any other non-alphabet bytes in the" \ - " encoded stream." % {'base': base, 'source': src} - text = "\n".join(x for x in wraptext(text, 74)) - descr = """Usage: base%(base)s [OPTION]... [FILE] -Base%(base)s encode or decode FILE, or standard input, to standard output. - -With no FILE, or when FILE is -, read standard input. - -Mandatory arguments to long options are mandatory for short options too. 
- -d, --decode decode data - -i, --ignore-garbage when decoding, ignore non-alphabet characters -%(inv)s%(swap)s%(wrap)s - - --help display this help and exit - --version output version information and exit - -%(text)s - -Report base%(base)s translation bugs to -Full documentation at: -""" % {'base': base, 'text': text, - 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. digits and letters)\n"][inv], - 'swap': ["", " -s, --swapcase swap the case\n"][swap], - 'wrap': ["", " -w, --wrap=COLS wrap encoded lines after COLS character (default 76).\n"+ 26 * " " + \ - "Use 0 to disable line wrapping"][wrap]} - - def _main(): - p = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) - p.format_help = MethodType(lambda s: s.description, p) - p.add_argument("file", nargs="?") - p.add_argument("-d", "--decode", action="store_true") - p.add_argument("-i", "--ignore-garbage", action="store_true") - if inv: - p.add_argument("-I", "--invert", action="store_true") - if swap: - p.add_argument("-s", "--swapcase", action="store_true") - if wrap: - p.add_argument("-w", "--wrap", type=int, default=76) - p.add_argument("--help", action="help") - p.add_argument("--version", action="version") - p.version = "CodExt " + __version__ - args = p.parse_args() - if args.decode: - args.wrap = 0 - args.invert = getattr(args, "invert", False) - c, f = _input(args.file), [encode, decode][args.decode] - if swap and args.swapcase and args.decode: - c = codecs.decode(c, "swapcase") - c = b(c).rstrip(b"\r\n") - try: - c = f(c, "base" + base + ["", "-inv"][getattr(args, "invert", False)], - ["strict", "ignore"][args.ignore_garbage]) - except Exception as err: - print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) - return 1 - c = ensure_str(c) - if swap and args.swapcase and not args.decode: - c = codecs.encode(c, "swapcase") - for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap else c.split("\n"): - print(l) - 
return 0 - return _main - +# -*- coding: UTF-8 -*- +"""Generic baseN functions. + +""" +from argparse import ArgumentParser, RawTextHelpFormatter +from math import log +from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable +from textwrap import wrap as wraptext +from types import FunctionType, MethodType + +from ..__common__ import * +from ..__common__ import _set_exc +from ..__info__ import __version__ + + +_set_exc("BaseError") +_set_exc("BaseEncodeError") +_set_exc("BaseDecodeError") +""" +Curve fitting: + +>>> import matplotlib.pyplot as plt +>>> import pandas as pd +>>> import scipy.optimize +>>> from statistics import mean +>>> from tinyscript import random +>>> x, y = [], [] +>>> for i in range(2, 256): + v = [] + for j in range(16, 2048, 16): + s = random.randstr(j) + v.append(float(len(codext.encode(s, "base%d-generic" % i))) / len(s)) + x.append(i) + y.append(mean(v)) +>>> data = pd.DataFrame({'base': x, 'expf': y}) +>>> def fit(x, y, func, params): + params, cv = scipy.optimize.curve_fit(func, x, y, params) + print(params) + y2 = func(x, *params) + plt.clf() + plt.plot(x, y, ".", color="blue", alpha=.3) + plt.plot(x, y2, color="red", linewidth=3.0) + plt.show() +>>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (1, 1, 1, 1)) +[ 0.02841434 0.00512664 -0.99999984 0.01543879] +>>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (.028, .005, -1, .015)) +[ 0.02827357 0.00510124 -0.99999984 0.01536941] +""" +EXPANSION_FACTOR = lambda base: 0.02827357 / (base**0.00510124-0.99999984) + 0.01536941 +SIZE_LIMIT = 1024 * 1024 * 1024 + + +def _generate_charset(n): + """ Generate a characters set. 
+ + :param n: size of charset + """ + if 1 < n <= len(printable): + return printable[:n] + elif len(printable) < n < 256: + return "".join(chr(i) for i in range(n)) + raise ValueError("Bad size of character set") + + +def _get_charset(charset, p=""): + """ Characters set selection function. It allows to define charsets in many different ways. + + :param charset: charset object, can be a string (the charset itself), a function (that chooses the right charset + depending on the input parameter) or a dictionary (either by exact key or by pattern matching) + :param p: the parameter for choosing the charset + """ + # case 1: charset is a function, so return its result + if isinstance(charset, FunctionType): + return charset(p) + # case 2: charset is a string, so return it + elif isinstance(charset, str): + return charset + # case 3: charset is a dict with keys '' and 'inv', typically for a charset using lowercase and uppercase characters + # that can be inverted + elif isinstance(charset, dict) and list(charset.keys()) == ["", "inv"]: + return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""] + # case 4: charset is a dict, but not with the specific keys '' and 'inv', so consider it as pattern-charset pairs + elif isinstance(charset, dict): + # try to handle [p]arameter as a simple key + try: + return charset[p] + except KeyError: + pass + # or handle [p]arameter as a pattern + default, n, best = None, None, None + for pattern, cset in charset.items(): + n = len(cset) + if re.match(pattern, ""): + default = cset + continue + m = re.match(pattern, p) + if m: # find the longest match from the patterns + s, e = m.span() + if e - s > len(best or ""): + best = pattern + if best: + return charset[best] + # special case: the given [p]arameter can be the charset itself if it has the right length + p = re.sub(r"^[-_]+", "", p) + if len(p) == n: + return p + # or simply rely on key '' + if default is not None: + return default + raise ValueError("Bad charset descriptor 
('%s')" % p) + + +# generic base en/decoding functions +def base_encode(input, charset, errors="strict", exc=BaseEncodeError): + """ Base-10 to base-N encoding. + + :param input: input (str or int) to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + :param exc: exception to be raised in case of error + """ + i, n, r = input if isinstance(input, int) else s2i(input), len(charset), "" + if n == 1: + if i > SIZE_LIMIT: + raise InputSizeLimitError("Input exceeded size limit") + return i * charset[0] + if n == 10: + return str(i) if charset == digits else "".join(charset[int(x)] for x in str(i)) + while i > 0: + i, c = divmod(i, n) + r = charset[c] + r + return r + + +def base_decode(input, charset, errors="strict", exc=BaseDecodeError): + """ Base-N to base-10 decoding. + + :param input: input to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + :param exc: exception to be raised in case of error + """ + i, n, dec = 0, len(charset), lambda n: base_encode(n, [chr(x) for x in range(256)], errors, exc) + if n == 1: + return i2s(len(input)) + if n == 10: + return i2s(int(input)) if charset == digits else "".join(str(charset.index(c)) for c in input) + for k, c in enumerate(input): + try: + i = i * n + charset.index(c) + except ValueError: + handle_error("base", errors, exc, decode=True)(c, k, dec(i), "base%d" % n) + return dec(i) + + +# base codec factory functions +def base(charset, pattern, pow2=False, encode_template=base_encode, decode_template=base_decode, name=None, **kwargs): + """ Base-N codec factory. 
+ + :param charset: charset selection function + :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting + the charset) + :param pow2: whether the base codec's N is a power of 2 + """ + cs = _get_charset(charset) + n = len(cs) + nb = log(n, 2) + if pow2 and nb != int(nb): + raise BaseError("Bad charset ; {} is not a power of 2".format(n)) + + def encode(param="", *args): + a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) + def _encode(input, errors="strict"): + if len(input) == 0: + return "", 0 + return encode_template(input, a, errors), len(input) + return _encode + + def decode(param="", *args): + a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) + sl, sc = "\n" not in a, "\n" not in a and not "\r" in a + def _decode(input, errors="strict"): + if len(input) == 0: + return "", 0 + input = _stripl(input, sc, sl) + return decode_template(input, a, errors), len(input) + return _decode + + kwargs['len_charset'] = n + kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs) + kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05)) + n = "base{}".format(n) if name is None else name + try: + g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n] + except AttributeError: + g = [n] + kwargs['guess'] = kwargs.get('guess', g) + add(n, encode, decode, pattern, entropy=nb, **kwargs) + + +def base_generic(): + """ Base-N generic codec. 
""" + def encode(n): + a = _generate_charset(int(n)) + def _encode(input, errors="strict"): + return base_encode(input, a, errors), len(input) + return _encode + + def decode(n): + a = _generate_charset(int(n)) + sl, sc = "\n" not in a, "\n" not in a and not "\r" in a + def _decode(input, errors="strict"): + input = _stripl(input, sc, sl) + return base_decode(input, a, errors), len(input) + return _decode + + add("base", encode, decode, r"^base[-_]?([2-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:[-_]generic)?$", + guess=["base%d-generic" % i for i in range(2, 255)], entropy=lambda e, n: log(int(n.split("-")[0][4:]), 2), + len_charset=lambda n: int(n.split("-")[0][4:]), printables_rate=1., category="base-generic", penalty=.4, + expansion_factor=lambda f, n: (EXPANSION_FACTOR(int(n.split("-")[0][4:])), .05)) + + +def main(n, ref=None, alt=None, inv=True, swap=True, wrap=True): + base = str(n) + ("-" + alt.lstrip("-") if alt else "") + src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \ + {'base': base, 'reference': "\n" + ref if len(ref) > 20 else ref} if ref else "" + text = "%(source)sWhen decoding, the input may contain newlines in addition to the bytes of the formal base" \ + "%(base)s alphabet. Use --ignore-garbage to attempt to recover from any other non-alphabet bytes in the" \ + " encoded stream." % {'base': base, 'source': src} + text = "\n".join(x for x in wraptext(text, 74)) + descr = """Usage: base%(base)s [OPTION]... [FILE] +Base%(base)s encode or decode FILE, or standard input, to standard output. + +With no FILE, or when FILE is -, read standard input. + +Mandatory arguments to long options are mandatory for short options too. 
+ -d, --decode decode data + -i, --ignore-garbage when decoding, ignore non-alphabet characters +%(inv)s%(swap)s%(wrap)s + + --help display this help and exit + --version output version information and exit + +%(text)s + +Report base%(base)s translation bugs to +Full documentation at: +""" % {'base': base, 'text': text, + 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. digits and letters)\n"][inv], + 'swap': ["", " -s, --swapcase swap the case\n"][swap], + 'wrap': ["", " -w, --wrap=COLS wrap encoded lines after COLS character (default 76).\n"+ 26 * " " + \ + "Use 0 to disable line wrapping"][wrap]} + + def _main(): + p = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) + p.format_help = MethodType(lambda s: s.description, p) + p.add_argument("file", nargs="?") + p.add_argument("-d", "--decode", action="store_true") + p.add_argument("-i", "--ignore-garbage", action="store_true") + if inv: + p.add_argument("-I", "--invert", action="store_true") + if swap: + p.add_argument("-s", "--swapcase", action="store_true") + if wrap: + p.add_argument("-w", "--wrap", type=int, default=76) + p.add_argument("--help", action="help") + p.add_argument("--version", action="version") + p.version = "CodExt " + __version__ + args = p.parse_args() + if args.decode: + args.wrap = 0 + args.invert = getattr(args, "invert", False) + c, f = _input(args.file), [encode, decode][args.decode] + if swap and args.swapcase and args.decode: + c = codecs.decode(c, "swapcase") + c = b(c).rstrip(b"\r\n") + try: + c = f(c, "base" + base + ["", "-inv"][getattr(args, "invert", False)], + ["strict", "ignore"][args.ignore_garbage]) + except Exception as err: + print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) + return 1 + c = ensure_str(c) + if swap and args.swapcase and not args.decode: + c = codecs.encode(c, "swapcase") + for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap else c.split("\n"): + print(l) + 
return 0 + return _main + diff --git a/src/codext/base/base100.py b/src/codext/base/base100.py index f5faa1d..2287463 100755 --- a/src/codext/base/base100.py +++ b/src/codext/base/base100.py @@ -1,56 +1,47 @@ -# -*- coding: UTF-8 -*- -"""Base100 Codec - base100 content encoding. - -Note: only works in Python3 ; strongly inspired from https://github.com/MasterGroosha/pybase100 - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import main -from ..__common__ import * - - -# no __examples__ ; handled manually in tests/test_base.py - - -def base100_encode(input, errors="strict"): - raise NotImplementedError - - -def base100_decode(input, errors="strict"): - raise NotImplementedError - - -if PY3: - class Base100DecodeError(ValueError): - __module__ = "builtins" - - def base100_encode(input, errors="strict"): - input = b(input) - r = [240, 159, 0, 0] * len(input) - for i, c in enumerate(input): - r[4*i+2] = (c + 55) // 64 + 143 - r[4*i+3] = (c + 55) % 64 + 128 - return bytes(r), len(input) - - def base100_decode(input, errors="strict"): - input = b(_stripl(input, True, True)) - if errors == "ignore": - input = input.replace(b"\n", "") - if len(input) % 4 != 0: - raise Base100DecodeError("Bad input (length should be multiple of 4)") - r = [None] * (len(input) // 4) - for i, c in enumerate(input): - if i % 4 == 2: - tmp = ((c - 143) * 64) % 256 - elif i % 4 == 3: - r[i//4] = (c - 128 + tmp - 55) & 0xff - return bytes(r), len(input) - - -add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$", expansion_factor=1.) -main100 = main(100, "") - +# -*- coding: UTF-8 -*- +"""Base100 Codec - base100 content encoding. 
+ +Note: only works in Python3 ; strongly inspired from https://github.com/MasterGroosha/pybase100 + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import main +from ..__common__ import * + +# no __examples__ ; handled manually in tests/test_base.py + +class Base100DecodeError(ValueError): + __module__ = "builtins" + + +def base100_encode(input, errors="strict"): + input = b(input) + r = [240, 159, 0, 0] * len(input) + for i, c in enumerate(input): + r[4*i+2] = (c + 55) // 64 + 143 + r[4*i+3] = (c + 55) % 64 + 128 + return bytes(r), len(input) + + +def base100_decode(input, errors="strict"): + input = b(_stripl(input, True, True)) + if errors == "ignore": + input = input.replace(b"\n", b"") + if len(input) % 4 != 0: + raise Base100DecodeError("Bad input (length should be multiple of 4)") + r = [None] * (len(input) // 4) + for i, c in enumerate(input): + if i % 4 == 2: + tmp = ((c - 143) * 64) % 256 + elif i % 4 == 3: + r[i//4] = (c - 128 + tmp - 55) & 0xff + return bytes(r), len(input) + + +add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$", expansion_factor=1.) +main100 = main(100, "") + diff --git a/src/codext/base/base122.py b/src/codext/base/base122.py index f580ff8..b326341 100755 --- a/src/codext/base/base122.py +++ b/src/codext/base/base122.py @@ -1,106 +1,98 @@ -# -*- coding: UTF-8 -*- -"""Base122 Codec - base122 content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import main -from ..__common__ import * - - -__examples__ = { - 'enc(base122|base-122)': { - 'this is a test': ":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", - b'This is another longer test string with d1g1t5 and sp3c141 characters !\n': \ - b"*\x1a\xca\x97\x19\x01Rs\x10\x18-f{QPe9\x08\xcb\x86{9Ne9\x08\x0eF+Mh 9]\x0e\xd3\x8b" - b"9N ;Z.FA\x01H13L.C)\x01Bn2\x08\x0e7\x01MF1\x1a\x0c$\x06\x1b!Br0XnF+If \x10B@" - }, - 'enc-dec(base_122)': ["@random"], -} if PY3 else {'enc(base122': None} - - -_BAD = [0, 10, 13, 34, 38, 92] -_i = lambda c: c if isinstance(c, int) else ord(c) - - -def base122_encode(input, errors='strict'): - raise NotImplementedError - - -def base122_decode(input, errors='strict'): - raise NotImplementedError - - -if PY3: - # inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js - def base122_encode(input, errors="strict"): - idx, bit, r, l = 0, 0, [], len(input) - - def _get_7bits(idx, bit): - if idx >= l: - return idx, bit, False - B1 = _i(input[idx]) - p1 = (((254 >> bit) & B1) << bit) >> 1 - bit += 7 - if bit < 8: - return idx, bit, p1 - bit -= 8 - idx += 1 - if idx >= l: - return idx, bit, p1 - B2 = _i(input[idx]) - p2 = (((65280 >> bit) & B2) & 255) >> (8 - bit) - return idx, bit, (p1 | p2) - - while True: - if idx >= l: - break - # get seven bits of input data - idx, bit, B = _get_7bits(idx, bit) - # check for illegal chars - try: - bad_idx = _BAD.index(B) - except ValueError: - r.append(B) - continue - idx, bit, nB = _get_7bits(idx, bit) - if nB is False: - nB, bad_idx = B, 7 - B1, B2 = 194, 128 - B1 |= (7 & bad_idx) << 2 - B1 |= int((nB & 64) > 0) - B2 |= nB & 63 - r.extend([B1, B2]) - return "".join(map(chr, r)).encode("latin-1"), len(input) - - # inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js - def 
base122_decode(input, errors="strict"): - currB, bob, r, input = 0, 0, [], list(map(ord, input)) - - def _get_7bits(currB, bob, B, decoded): - B <<= 1 - currB |= (B % 0x100000000) >> bob - bob += 7 - if bob >= 8: - decoded += [currB] - bob -= 8 - return (B << (7 - bob)) & 255, bob - - for i in range(len(input)): - if input[i] >= 128: - try: - currB, bob = _get_7bits(currB, bob, _BAD[(input[i] >> 8) & 7], r) - except IndexError: - pass - currB, bob = _get_7bits(currB, bob, input[i] & 127, r) - else: - currB, bob = _get_7bits(currB, bob, input[i], r) - return "".join(map(chr, r)).rstrip("\0"), len(input) - - -add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) -main122 = main(122, "", wrap=False) - +# -*- coding: UTF-8 -*- +"""Base122 Codec - base122 content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import main +from ..__common__ import * + + +__examples__ = { + 'enc(base122|base-122)': { + 'this is a test': ":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", + b'This is another longer test string with d1g1t5 and sp3c141 characters !\n': \ + b"*\x1a\xca\x97\x19\x01Rs\x10\x18-f{QPe9\x08\xcb\x86{9Ne9\x08\x0eF+Mh 9]\x0e\xd3\x8b" + b"9N ;Z.FA\x01H13L.C)\x01Bn2\x08\x0e7\x01MF1\x1a\x0c$\x06\x1b!Br0XnF+If \x10B@" + }, + 'enc-dec(base_122)': ["@random"], +} + + +_BAD = [0, 10, 13, 34, 38, 92] +_i = lambda c: c if isinstance(c, int) else ord(c) + + +# inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js +def base122_encode(input, errors="strict"): + idx, bit, r, l = 0, 0, [], len(input) + + def _get_7bits(idx, bit): + if idx >= l: + return idx, bit, False + B1 = _i(input[idx]) + p1 = (((254 >> bit) & B1) << bit) >> 1 + bit += 7 + if bit < 8: + return idx, bit, p1 + bit -= 8 + idx += 1 + if idx >= l: + return idx, bit, p1 + B2 = _i(input[idx]) + p2 = (((65280 >> 
bit) & B2) & 255) >> (8 - bit) + return idx, bit, (p1 | p2) + + while True: + if idx >= l: + break + # get seven bits of input data + idx, bit, B = _get_7bits(idx, bit) + # check for illegal chars + try: + bad_idx = _BAD.index(B) + except ValueError: + r.append(B) + continue + idx, bit, nB = _get_7bits(idx, bit) + if nB is False: + nB, bad_idx = B, 7 + B1, B2 = 194, 128 + B1 |= (7 & bad_idx) << 2 + B1 |= int((nB & 64) > 0) + B2 |= nB & 63 + r.extend([B1, B2]) + return "".join(map(chr, r)).encode("latin-1"), len(input) + + +# inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js +def base122_decode(input, errors="strict"): + currB, bob, r, input = 0, 0, [], list(map(ord, input)) + + def _get_7bits(currB, bob, B, decoded): + B <<= 1 + currB |= (B % 0x100000000) >> bob + bob += 7 + if bob >= 8: + decoded += [currB] + bob -= 8 + return (B << (7 - bob)) & 255, bob + + for i in range(len(input)): + if input[i] >= 128: + try: + currB, bob = _get_7bits(currB, bob, _BAD[(input[i] >> 8) & 7], r) + except IndexError: + pass + currB, bob = _get_7bits(currB, bob, input[i] & 127, r) + else: + currB, bob = _get_7bits(currB, bob, input[i], r) + return "".join(map(chr, r)).rstrip("\0"), len(input) + + +add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) +main122 = main(122, "", wrap=False) + diff --git a/src/codext/base/base85.py b/src/codext/base/base85.py index bc6d8b2..22aad28 100755 --- a/src/codext/base/base85.py +++ b/src/codext/base/base85.py @@ -1,186 +1,185 @@ -# -*- coding: UTF-8 -*- -"""Base85 Codec - base85 content encoding. - -This is a simple wrapper for adding base64.b85**code to the codecs. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import base64 -from six import integer_types - -from ._base import _get_charset, digits, lower, main, upper -from ..__common__ import * - - -__examples__ = { - 'enc-dec(base85|z85|base85-ipv6)': ["@random{512,1024,2048}"], - 'enc-dec(base85-btoa|base85-xbtoa)': ["@random{512,1024,2048}"], - 'enc(base85|ascii85)': {'this is a test': "FD,B0+DGm>@3BZ'F*%"}, - 'enc(base85-adobe)': {'this is a test': "<~FD,B0+DGm>@3BZ'F*%~>", - 'this is a test\0\0\0\0\0\0': "<~FD,B0+DGm>@3BZ'F*%B^z~>"}, - 'enc(z85|base85-z)': {'this is a test': "BzbxfazC)tvixV6B94"}, - 'enc(base85-ipv6|base85_rfc1924)': {'this is a test': "bZBXFAZc?TVIXv6b94"}, - 'enc(base85_btoa)': {'this is a test': "FD,B0+DGm>@3BZ'F*%B^"}, - 'enc(base85_btoa)': {'this\0\0\0\0test': "FD,B0+DGm>@3BZ'F*%B^"}, - 'enc(base85_btoa)': {'this is a test\0\0\0\0': "FD,B0+DGm>y@3BZ'F*%B^z"}, - 'enc(base85-xbtoa)': {'this is a test': "xbtoa Begin\nFD,B0+DGm>@3BZ'F*%B^\nxbtoa End N 14 e E 4b" \ - " S 523 R 1b132e"}, - 'dec(base85-xbtoa)': {'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End': None, - 'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End N 14 e E 4b S 523 R 000bad': - None}, - 'enc(base85-xml)': {'this is a test': "bZBXFAZc@TVIXv6b94"}, - 'enc(base85|ascii85)': {'this\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0test': "FD,B0zzz!!!\"@ATMq"}, -} -__guess__ = ["ascii85", "z85", "base85-ipv6", "base85-xml", "base85-adobe", "base85-xbtoa"] - - -B85 = { - r'(base[-_]?85([-_]ascii)?|ascii85)$': "!\"#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_`" + lower[:21], - r'(z85|base[-_]?85[-_]z(eromq)?)$': digits + lower + upper + ".-:+=^!/*?&<>()[]{}@%$#", - r'base[-_]?85[-_](rfc1924|ipv6)$': digits + upper + lower + "!#$%&()*+-;<=>?@^_`{|}~", - r'base[-_]?85[-_]xml$': digits + upper + lower[:-1] + "!#$()*+,-./:;=?@^`{|}~z_", -} -B85[r'(base[-_]?85[-_]adobe)$'] = 
B85[r'(base[-_]?85[-_]x?btoa)$'] = B85[r'(base[-_]?85([-_]ascii)?|ascii85)$'] -POW85 = [85 ** i for i in range(5)] - - -def __format(text, mode, decode=False, **kwargs): - if "adobe" in mode: - if decode: - if text.startswith("<~") and text.endswith("~>"): - text = text[2:-2] - else: - text = "<~" + text + "~>" - elif "xbtoa" in mode: - sp, ep = "xbtoa [bB]egin\n", "xbtoa [eE]nd" - if decode: - if re.match(r"^xbtoa\s+[bB]egin\n", text) and \ - re.search(r"\nxbtoa\s+[eE]nd N \d+{h} E{h} S{h} R{h}\s*$".format(h=" [0-9a-fA-F]+"), text): - text = "".join(text.split("\n")[1:-1]).replace(" ", "") - elif not decode: - l, t = kwargs['length'], "\n".join(text[i:i+78] for i in range(0, len(text), 78)) - text = "xbtoa Begin\n%s\nxbtoa End N %d %x E %x S %x R %x" % \ - (t, l, l, kwargs['c_xor'], kwargs['c_sum'], kwargs['c_rot']) - return text - - -def __xbtoa_values(text): - try: - hr = "[0-9a-fA-F]+" - return re.search(r"\nxbtoa\s+[eE]nd N (\d+) ({h}) E ({h}) S ({h}) R ({h})\s*$".format(h=hr), text).groups() - except: - raise Base85DecodeError("Bad or missing xbtoa parameters") - - -def base85_encode(mode): - b85 = _get_charset(B85, mode) - def encode(input, errors="strict"): - r, l, kw = "", len(input), {} - if l == 0: - return input, 0 - if "xbtoa" in mode: - kw['length'] = l - kw['c_xor'], kw['c_sum'], kw['c_rot'] = 0, 0, 0 - n_pad = (4 - l % 4) % 4 - for i in range(0, l, 4): - block = input[i:i+4] - if block == "\0\0\0\0" and b85[-3:] == "stu": - r += "z" - if block == "\x20\x20\x20\x20" and "btoa" in mode: - r += "y" - if "xbtoa" in mode: - for c in block: - k = ord(c) - kw['c_xor'] ^= k - kw['c_sum'] += k + 1 - kw['c_rot'] <<= 1 - if kw['c_rot'] & 0x80000000: - kw['c_rot'] += 1 - kw['c_rot'] += k - if block == "\0\0\0\0" and b85[-3:] == "stu" or block == "\x20\x20\x20\x20" and "btoa" in mode: - continue - if len(block) < 4: - block += n_pad * "\0" - n, bl = s2i(block), "" - for _ in range(5): - n, k = divmod(n, 85) - bl = b85[k] + bl - r += bl - if "btoa" not in mode 
and n_pad: - r = r[:-n_pad] - if b85[-3:] == "stu" and r[-5:] == "!!!!!": - r = r[:-5] + "z" - return __format(r, mode, **kw), l - return encode - - -def base85_decode(mode): - b85 = _get_charset(B85, mode) - def decode(input, errors="strict"): - r, l, i, n_pad = "", len(input), 0, 0 - if l == 0: - return input, 0 - if "xbtoa" in mode: - v = __xbtoa_values(input) - n_last = int(v[0]) % 4 - c_xor, c_sum, c_rot = 0, 0, 0 - input = __format(input, mode, True) - ehandler = handle_error("base85", errors, decode=True) - if b85[-3:] == "stu" and input[-1] == "z": - input = input[:-1] + "!!!!!" - l = len(input) - while i < l: - n, incr = 0, 5 - if input[i] == "z" and b85[-3:] == "stu": - bl, incr = "\0\0\0\0", 1 - elif input[i] == "y" and "btoa" in mode: - bl, incr = "\x20\x20\x20\x20", 1 - else: - block = input[i:i+5] - if len(block) < 5: - n_pad = 5 - len(block) % 5 - block += n_pad * "\0" - for k, c in enumerate(block[::-1]): - try: - n += (b85.index(c) if c != "\0" else 255) * POW85[k] - except ValueError: - r += ehandler(c, i + k, r) - bl = codecs.decode("{:0>8}".format(hex(n & 0xffffffff)[2:]), "hex") - if "xbtoa" in mode: - if i + 5 == l and n_last > 0: - bl = bl[:n_last] - for c in bl: - k = ord(c) - c_xor ^= k - c_sum += k + 1 - c_rot <<= 1 - if c_rot & 0x80000000: - c_rot += 1 - c_rot += k - r += bl - i += incr - if n_pad > 0: - r = r[:-n_pad] - if "xbtoa" in mode: - chkv = ["%d" % len(r), "%x" % len(r), "%x" % c_xor, "%x" % c_sum, "%x" % c_rot] - if any(v1 != v2 for v1, v2 in zip(v, chkv)) and errors == "strict": - raise Base85ValueError("A check value does not match (%s != %s)" % (str(list(v)).replace("'", ""), - str(chkv).replace("'", ""))) - return r, l - return decode - - -add("base85", base85_encode, base85_decode, expansion_factor=lambda f, ename: f if "xbtoa" in ename else 1.25, - pattern=r"^(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)$", - extra_exceptions=["Base85ValueError"]) -main85 = main(85, None) -main85adobe = 
main(85, None, "adobe") -main85xbtoa = main(85, None, "xbtoa", wrap=False) -main85rfc1924 = main(85, "RFC 1924", "ipv6") -main85xml = main(85, "", "xml") -main85zeromq = main(85, "", "zeromq") - +# -*- coding: UTF-8 -*- +"""Base85 Codec - base85 content encoding. + +This is a simple wrapper for adding base64.b85**code to the codecs. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import base64 + +from ._base import _get_charset, digits, lower, main, upper +from ..__common__ import * + + +__examples__ = { + 'enc-dec(base85|z85|base85-ipv6)': ["@random{512,1024,2048}"], + 'enc-dec(base85-btoa|base85-xbtoa)': ["@random{512,1024,2048}"], + 'enc(base85|ascii85)': {'this is a test': "FD,B0+DGm>@3BZ'F*%"}, + 'enc(base85-adobe)': {'this is a test': "<~FD,B0+DGm>@3BZ'F*%~>", + 'this is a test\0\0\0\0\0\0': "<~FD,B0+DGm>@3BZ'F*%B^z~>"}, + 'enc(z85|base85-z)': {'this is a test': "BzbxfazC)tvixV6B94"}, + 'enc(base85-ipv6|base85_rfc1924)': {'this is a test': "bZBXFAZc?TVIXv6b94"}, + 'enc(base85_btoa)': {'this is a test': "FD,B0+DGm>@3BZ'F*%B^"}, + 'enc(base85_btoa)': {'this\0\0\0\0test': "FD,B0+DGm>@3BZ'F*%B^"}, + 'enc(base85_btoa)': {'this is a test\0\0\0\0': "FD,B0+DGm>y@3BZ'F*%B^z"}, + 'enc(base85-xbtoa)': {'this is a test': "xbtoa Begin\nFD,B0+DGm>@3BZ'F*%B^\nxbtoa End N 14 e E 4b" \ + " S 523 R 1b132e"}, + 'dec(base85-xbtoa)': {'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End': None, + 'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End N 14 e E 4b S 523 R 000bad': + None}, + 'enc(base85-xml)': {'this is a test': "bZBXFAZc@TVIXv6b94"}, + 'enc(base85|ascii85)': {'this\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0test': "FD,B0zzz!!!\"@ATMq"}, +} +__guess__ = ["ascii85", "z85", "base85-ipv6", "base85-xml", "base85-adobe", "base85-xbtoa"] + + +B85 = { + r'(base[-_]?85([-_]ascii)?|ascii85)$': "!\"#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_`" + 
lower[:21], + r'(z85|base[-_]?85[-_]z(eromq)?)$': digits + lower + upper + ".-:+=^!/*?&<>()[]{}@%$#", + r'base[-_]?85[-_](rfc1924|ipv6)$': digits + upper + lower + "!#$%&()*+-;<=>?@^_`{|}~", + r'base[-_]?85[-_]xml$': digits + upper + lower[:-1] + "!#$()*+,-./:;=?@^`{|}~z_", +} +B85[r'(base[-_]?85[-_]adobe)$'] = B85[r'(base[-_]?85[-_]x?btoa)$'] = B85[r'(base[-_]?85([-_]ascii)?|ascii85)$'] +POW85 = [85 ** i for i in range(5)] + + +def __format(text, mode, decode=False, **kwargs): + if "adobe" in mode: + if decode: + if text.startswith("<~") and text.endswith("~>"): + text = text[2:-2] + else: + text = "<~" + text + "~>" + elif "xbtoa" in mode: + sp, ep = "xbtoa [bB]egin\n", "xbtoa [eE]nd" + if decode: + if re.match(r"^xbtoa\s+[bB]egin\n", text) and \ + re.search(r"\nxbtoa\s+[eE]nd N \d+{h} E{h} S{h} R{h}\s*$".format(h=" [0-9a-fA-F]+"), text): + text = "".join(text.split("\n")[1:-1]).replace(" ", "") + elif not decode: + l, t = kwargs['length'], "\n".join(text[i:i+78] for i in range(0, len(text), 78)) + text = "xbtoa Begin\n%s\nxbtoa End N %d %x E %x S %x R %x" % \ + (t, l, l, kwargs['c_xor'], kwargs['c_sum'], kwargs['c_rot']) + return text + + +def __xbtoa_values(text): + try: + hr = "[0-9a-fA-F]+" + return re.search(r"\nxbtoa\s+[eE]nd N (\d+) ({h}) E ({h}) S ({h}) R ({h})\s*$".format(h=hr), text).groups() + except: + raise Base85DecodeError("Bad or missing xbtoa parameters") + + +def base85_encode(mode): + b85 = _get_charset(B85, mode) + def encode(input, errors="strict"): + r, l, kw = "", len(input), {} + if l == 0: + return input, 0 + if "xbtoa" in mode: + kw['length'] = l + kw['c_xor'], kw['c_sum'], kw['c_rot'] = 0, 0, 0 + n_pad = (4 - l % 4) % 4 + for i in range(0, l, 4): + block = input[i:i+4] + if block == "\0\0\0\0" and b85[-3:] == "stu": + r += "z" + if block == "\x20\x20\x20\x20" and "btoa" in mode: + r += "y" + if "xbtoa" in mode: + for c in block: + k = ord(c) + kw['c_xor'] ^= k + kw['c_sum'] += k + 1 + kw['c_rot'] <<= 1 + if kw['c_rot'] & 0x80000000: + 
kw['c_rot'] += 1 + kw['c_rot'] += k + if block == "\0\0\0\0" and b85[-3:] == "stu" or block == "\x20\x20\x20\x20" and "btoa" in mode: + continue + if len(block) < 4: + block += n_pad * "\0" + n, bl = s2i(block), "" + for _ in range(5): + n, k = divmod(n, 85) + bl = b85[k] + bl + r += bl + if "btoa" not in mode and n_pad: + r = r[:-n_pad] + if b85[-3:] == "stu" and r[-5:] == "!!!!!": + r = r[:-5] + "z" + return __format(r, mode, **kw), l + return encode + + +def base85_decode(mode): + b85 = _get_charset(B85, mode) + def decode(input, errors="strict"): + r, l, i, n_pad = "", len(input), 0, 0 + if l == 0: + return input, 0 + if "xbtoa" in mode: + v = __xbtoa_values(input) + n_last = int(v[0]) % 4 + c_xor, c_sum, c_rot = 0, 0, 0 + input = __format(input, mode, True) + ehandler = handle_error("base85", errors, decode=True) + if b85[-3:] == "stu" and input[-1] == "z": + input = input[:-1] + "!!!!!" + l = len(input) + while i < l: + n, incr = 0, 5 + if input[i] == "z" and b85[-3:] == "stu": + bl, incr = "\0\0\0\0", 1 + elif input[i] == "y" and "btoa" in mode: + bl, incr = "\x20\x20\x20\x20", 1 + else: + block = input[i:i+5] + if len(block) < 5: + n_pad = 5 - len(block) % 5 + block += n_pad * "\0" + for k, c in enumerate(block[::-1]): + try: + n += (b85.index(c) if c != "\0" else 255) * POW85[k] + except ValueError: + r += ehandler(c, i + k, r) + bl = codecs.decode("{:0>8}".format(hex(n & 0xffffffff)[2:]), "hex") + if "xbtoa" in mode: + if i + 5 == l and n_last > 0: + bl = bl[:n_last] + for c in bl: + k = ord(c) + c_xor ^= k + c_sum += k + 1 + c_rot <<= 1 + if c_rot & 0x80000000: + c_rot += 1 + c_rot += k + r += bl + i += incr + if n_pad > 0: + r = r[:-n_pad] + if "xbtoa" in mode: + chkv = ["%d" % len(r), "%x" % len(r), "%x" % c_xor, "%x" % c_sum, "%x" % c_rot] + if any(v1 != v2 for v1, v2 in zip(v, chkv)) and errors == "strict": + raise Base85ValueError("A check value does not match (%s != %s)" % (str(list(v)).replace("'", ""), + str(chkv).replace("'", ""))) + return r, l 
+ return decode + + +add("base85", base85_encode, base85_decode, expansion_factor=lambda f, ename: f if "xbtoa" in ename else 1.25, + pattern=r"^(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)$", + extra_exceptions=["Base85ValueError"]) +main85 = main(85, None) +main85adobe = main(85, None, "adobe") +main85xbtoa = main(85, None, "xbtoa", wrap=False) +main85rfc1924 = main(85, "RFC 1924", "ipv6") +main85xml = main(85, "", "xml") +main85zeromq = main(85, "", "zeromq") + diff --git a/src/codext/binary/baudot.py b/src/codext/binary/baudot.py index a57e1ea..1cdd111 100755 --- a/src/codext/binary/baudot.py +++ b/src/codext/binary/baudot.py @@ -1,295 +1,281 @@ -# -*- coding: UTF-8 -*- -"""Baudot Codec - baudot content conversion to HTML. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_us"] -if PY3: - __CODES.extend(["ita2_meteo", "mtk2", "murray", "uk"]) -__guess__ = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]] -__examples1__ = { - 'enc(baudot-BAD_ALPHABET)': None, - 'enc(baudot_ccitt2_lsb)': {'TEST 1234': "00001100001010000001001001101111101110011000001010"}, - 'enc(baudot-ita1)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, - 'enc(baudot_ita2_msb)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, - 'enc(baudot-ita2-us)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, - 'enc(baudot)': {'\x01\x02': None}, - 'enc(baudot_ccitt1-lsb)': {'TEST ': None}, -} -__examples2__ = { - 'enc(baudot_spaced-BAD_ALPHABET)': None, - 'enc(baudot-spaced_ccitt2_lsb)': {'TEST 1234': "00001 10000 10100 00001 00100 11011 11101 11001 10000 01010"}, - 'enc(baudot_spaced-ita1)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 
00101"}, - 'enc(baudot-spaced_ita2_msb)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, - 'enc(baudot_spaced-ita2-us)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, -} -__examples3__ = { - 'enc(baudot_tape-BAD_ALPHABET)': None, - 'enc(baudot_tape-ita1)': { - 'TEST 1234': "***.**\n* *. *\n .* \n* *. \n* *. *\n* . \n * . \n . *\n .* \n *. \n *. *", - }, - 'dec(baudot-tape_ita2)': {'BAD_HEADER\n .* \n': None}, - 'dec(baudot-tape_ita2-us)': {'***.**\nBAD_TAPE\n': None}, - 'dec(baudot_tape-ccitt1_lsb)': {'***.**\n .* \n* . *\n* . \n': None}, -} -if PY3: - __examples1__.update({ - 'enc(baudot_ccitt1_lsb)': {'TEST1234': "101010001010001101010100000100000100000100101"}, - 'enc(baudot-fr)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, - }) - __examples2__.update({ - 'enc(baudot-spaced_ccitt1_lsb)': {'TEST1234': "10101 00010 10001 10101 01000 00100 00010 00001 00101"}, - 'enc(baudot_spaced-fr)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, - }) - - -PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us" + (r"|meteo" if PY3 else r"") + r")" + \ - (r"|mtk2|murray|uk" if PY3 else r"") + r"|us_tty)(?:[-_](?:lsb|msb))?)?$" -# reserved character -RES_CHR = "\xff" - -# sources: -# - http://rabbit.eng.miami.edu/info/baudot.html -# - https://en.wikipedia.org/wiki/Baudot_code -# - https://fr.qwe.wiki/wiki/Baudot_code -# all alphabets consider MSB by default -# CCITT-1 original Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) -CCITT1 = [ - "00001", "00010", - "\x00\xff\xff\xffA-JKEXGM/ZHLYSBRUTCQIWFNOVDP", - "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff£5'0+" if PY3 else \ - "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff$5'0+", -] -# CCITT-2 revised Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) -CCITT2 = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - 
"\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", -] -# Original Baudot (French/European ; sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -BAUDOT = EU = FR = [ - "10000", "01000", - "\x00AEÉYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP" if PY3 else "\x00AEeYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP", - "\x0012&34°5 67h89f0\xff.,:;!?'\x7f()=-/\u2116%" if PY3 else "\x0012&34o5 67h89f0\xff.,:;!?'\x7f()=-/\xff%", -] -# International Telegraphic Alphabet 1 (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA1 = [ - "10000", "01000", - "\x00AE\rYUIO\xffJGHBCFD \xffXZSTWV\x7fKMLRQNP", - "\x0012\r34\xff5 67+89\xff0\xff\n,:.\xff?'\x7f()=-/\xff%", -] -# International Telegraphic Alphabet 2 (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA2 = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "\x003\n- '87\r\x054\x07,!:(5+)2$6019?&\xff./=\xff", -] -# International Telegraphic Alphabet 2 - US TTY (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA2_US = US_TTY = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", -] -# International Telegraphic Alphabet 2 - Meteo version (source: https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - ITA2_METEO = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "-3\n\u2191 \x0787\r\u21974\u2199\u29b7\u2192\u25ef\u21905+\u21962\u21936019\u2295\u2198\xff./\u29b6\xff", - ] -# Russian MTK-2 alphabet (source: https://fr.qwe.wiki/wiki/Baudot_code) -if PY3: - MTK2 = [ - "11111", "11011", - "\x00Е\n\xff СИУ\r\xffРЙНФЦКТЗЛВХЫПЯОБГ\xffМЬЖ\xff", - "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", - ] -# Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - MURRAY = [ - "00100", "11011", - " 
E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*" if PY3 else \ - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,$)*", - ] -# English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - UK = [ - "10000", "01000", - "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", - "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+" if PY3 else \ - "\x0012\xff34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/$+", - ] - - -def _bits_from_tape(tape, trans={'*': "1", ' ': "0"}): - """ Converts a tape-like string with the given translation for ones and zeros to a series of bits. """ - bits = "" - trans_rev = {v: k for k, v in trans.items()} - for i, line in enumerate(tape.splitlines()): - if i == 0: - if line != trans_rev['1'] * 3 + "." + trans_rev['1'] * 2: - raise ValueError("Bad tape header '{}'".format(line)) - else: - line = line[:3] + line[4:] - if len(line) != 5: - raise ValueError("Bad tape line '{}'".format(line)) - bits += "".join(trans.get(c, "") for c in line) - return bits - - -def _bits_to_tape(bits, trans={'1': "*", '0': " "}): - """ Converts a series of bits to a tape-like string with the given translation for ones and zeros. """ - tape = [trans['1'] * 3 + "." + trans['1'] * 2] - for i in range(0, len(bits), 5): - group = "".join(trans[b] for b in bits[i:i+5]) - tape.append(group[:3] + "." + group[3:]) - return "\n".join(tape) - - -def _check_alphabet(alphabet): - """ Checks the length of letters and figures (must be 32 chars). """ - for chars in alphabet: - l = len(chars) - if l != 32: - raise ValueError("Bad length of alphabet (%d instead of 32)" % l) - - -def _handle_alphabet(alphabet): - """ Gets the given alphabet name and transforms it to its dictionary with letters and figures. 
""" - alphabet = (alphabet or "baudot").lower().replace("-", "_").strip("_") - if "_lsb" in alphabet: - alphabet = alphabet.replace("_lsb", "") - func = lambda x: x[::-1] - else: - alphabet = alphabet.replace("_msb", "") - func = lambda x: x - _ = globals()[alphabet.upper()] - st, a = _[:2], _[2:] - _check_alphabet(a) - alphabet = {n: {ch: bin(i)[2:].zfill(5) for i, ch in enumerate(alph) if ch != RES_CHR} for n, alph in \ - zip(["letters", "figures"], a)} - return alphabet, {'letters': st[0], 'figures': st[1]}, func - - -def baudot_encode(alphabet=None, spaced=False, tape=False): - ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") - alphabet, states, func = _handle_alphabet(alphabet) - def encode(text, errors="strict"): - text = text.upper() - s, l, state, seen_states = "", len(b(text)), None, [] - for i, c in enumerate(text): - # if the state is undefined yet, find the relevant alphabet - if state is None: - bits= None - for st in states.keys(): - try: - bits = func(alphabet[st][c]) - state = st - if st not in seen_states: - seen_states.append(st) - break - except KeyError: - pass - if bits is None: - bits = handle_error(ename, errors, "?", 5)(c, i) - s += bits - # otherwise, handle state change (when the current alphabet does not contain the character to encode but the - # other alphabet does - else: - try: - s += func(alphabet[state][c]) - continue - except KeyError: - state = list(set(states.keys()) - {state})[0] - try: - s += func(states[state]) + func(alphabet[state][c]) - if state not in seen_states: - seen_states.append(state) - except KeyError as e: - state = list(set(states.keys()) - {state})[0] # reset the state - s += handle_error(ename, errors, "?", 5)(c, i) - # by default, if no state is specified, the encoded string is handled as letters ; so if figures are used only, - # it is necessary to include the groups of bits for figures at the beginning of the encoded string - s = (states['figures'] if seen_states == ["figures"] else "") 
+ s - if spaced: - s = " ".join(s[i:i+5] for i in range(0, len(s), 5)) - elif tape: - s = _bits_to_tape(s) - return s, l - return encode - - -def baudot_decode(alphabet=None, spaced=False, tape=False): - ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") - alphabet, states, func = _handle_alphabet(alphabet) - alphabet = {st: {v: k for k, v in alph.items()} for st, alph in alphabet.items()} - states = {v: k for k, v in states.items()} - def decode(text, errors="strict"): - s, l = "", len(b(text)) - if spaced: - text = text.replace(" ", "") - elif tape: - text = _bits_from_tape(text) - # infer the starting state by searching for the first encountered groups of bits indicating a valid state ; - # by default, we assume letters - state = "letters" - for i in range(0, len(text), 5): - bits = func(text[i:i+5]) - # the following code handles a possible ambiguity ; e.g. when letters have a group of bits matching - # a state change - if bits in states.keys(): - error = False - # so, when we see the bits of a state, we parse previous groups in order to determine if they are valid - # groups in the corresponding state, that is, if no error occurs ; if an error occurs, then it is a - # valid state change and not simply a character, and we can set it as the starting state - for j in range(i-5, 0, -5): - try: - alphabet[states[bits]][text[j:j+5]] - except KeyError: - error = True - break - if error: - state = list(set(states.values()) - {states[bits]})[0] - break - # now parse the input text - for i in range(0, len(text), 5): - bits = func(text[i:i+5]) - try: - s += alphabet[state][bits] - except KeyError: - if bits in states.keys() and states[bits] != state: - state = states[bits] - else: - s += handle_error(ename, errors, decode=True, item="group")(bits, i//5) - return s, l - return decode - - -add("baudot", baudot_encode, baudot_decode, PATTERN % r"", examples=__examples1__, guess=[x % "" for x in __guess__], - entropy=1., printables_rate=1.) 
- - -baudot_spaced_encode = lambda a: baudot_encode(a, spaced=True) -baudot_spaced_decode = lambda a: baudot_decode(a, spaced=True) -add("baudot-spaced", baudot_spaced_encode, baudot_spaced_decode, PATTERN % r"[-_]spaced", examples=__examples2__, - guess=[x % "-spaced" for x in __guess__], entropy=1.48, printables_rate=1.) - - -baudot_tape_encode = lambda a: baudot_encode(a, tape=True) -baudot_tape_decode = lambda a: baudot_decode(a, tape=True) -add("baudot-tape", baudot_tape_encode, baudot_tape_decode, PATTERN % r"[-_]tape", examples=__examples3__, - guess=[x % "-tape" for x in __guess__], entropy=1.86, printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""Baudot Codec - baudot content conversion to HTML. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_meteo", "ita2_us", "mtk2", "murray", "uk"] +__guess__ = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]] +__examples1__ = { + 'enc(baudot-BAD_ALPHABET)': None, + 'enc(baudot_ccitt2_lsb)': {'TEST 1234': "00001100001010000001001001101111101110011000001010"}, + 'enc(baudot-ita1)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, + 'enc(baudot_ita2_msb)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, + 'enc(baudot-ita2-us)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, + 'enc(baudot)': {'\x01\x02': None}, + 'enc(baudot_ccitt1-lsb)': {'TEST ': None}, + 'enc(baudot_ccitt1_lsb)': {'TEST1234': "101010001010001101010100000100000100000100101"}, + 'enc(baudot-fr)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, +} +__examples2__ = { + 'enc(baudot_spaced-BAD_ALPHABET)': None, + 'enc(baudot-spaced_ccitt2_lsb)': {'TEST 1234': "00001 10000 10100 00001 00100 11011 11101 11001 10000 01010"}, + 
'enc(baudot_spaced-ita1)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, + 'enc(baudot-spaced_ita2_msb)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, + 'enc(baudot_spaced-ita2-us)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, + 'enc(baudot-spaced_ccitt1_lsb)': {'TEST1234': "10101 00010 10001 10101 01000 00100 00010 00001 00101"}, + 'enc(baudot_spaced-fr)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, +} +__examples3__ = { + 'enc(baudot_tape-BAD_ALPHABET)': None, + 'enc(baudot_tape-ita1)': { + 'TEST 1234': "***.**\n* *. *\n .* \n* *. \n* *. *\n* . \n * . \n . *\n .* \n *. \n *. *", + }, + 'dec(baudot-tape_ita2)': {'BAD_HEADER\n .* \n': None}, + 'dec(baudot-tape_ita2-us)': {'***.**\nBAD_TAPE\n': None}, + 'dec(baudot_tape-ccitt1_lsb)': {'***.**\n .* \n* . *\n* . \n': None}, +} + + +PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us|meteo)|mtk2|murray|uk|us_tty)" + \ + r"(?:[-_](?:lsb|msb))?)?$" +# reserved character +RES_CHR = "\xff" + +# sources: +# - http://rabbit.eng.miami.edu/info/baudot.html +# - https://en.wikipedia.org/wiki/Baudot_code +# - https://fr.qwe.wiki/wiki/Baudot_code +# all alphabets consider MSB by default +# CCITT-1 original Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) +CCITT1 = [ + "00001", "00010", + "\x00\xff\xff\xffA-JKEXGM/ZHLYSBRUTCQIWFNOVDP", + "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff£5'0+", +] +# CCITT-2 revised Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) +CCITT2 = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", +] +# Original Baudot (French/European ; sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +BAUDOT = EU = FR = [ + "10000", "01000", + "\x00AEÉYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP", + "\x0012&34°5 
67h89f0\xff.,:;!?'\x7f()=-/\u2116%", +] +# International Telegraphic Alphabet 1 (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA1 = [ + "10000", "01000", + "\x00AE\rYUIO\xffJGHBCFD \xffXZSTWV\x7fKMLRQNP", + "\x0012\r34\xff5 67+89\xff0\xff\n,:.\xff?'\x7f()=-/\xff%", +] +# International Telegraphic Alphabet 2 (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA2 = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- '87\r\x054\x07,!:(5+)2$6019?&\xff./=\xff", +] +# International Telegraphic Alphabet 2 - US TTY (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA2_US = US_TTY = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", +] +# International Telegraphic Alphabet 2 - Meteo version (source: https://en.wikipedia.org/wiki/Baudot_code) +ITA2_METEO = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "-3\n\u2191 \x0787\r\u21974\u2199\u29b7\u2192\u25ef\u21905+\u21962\u21936019\u2295\u2198\xff./\u29b6\xff", +] +# Russian MTK-2 alphabet (source: https://fr.qwe.wiki/wiki/Baudot_code) +MTK2 = [ + "11111", "11011", + "\x00Е\n\xff СИУ\r\xffРЙНФЦКТЗЛВХЫПЯОБГ\xffМЬЖ\xff", + "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", +] +# Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) +MURRAY = [ + "00100", "11011", + " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", + "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*", +] +# English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +UK = [ + "10000", "01000", + "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", + "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+", +] + + +def _bits_from_tape(tape, trans={'*': "1", 
' ': "0"}): + """ Converts a tape-like string with the given translation for ones and zeros to a series of bits. """ + bits = "" + trans_rev = {v: k for k, v in trans.items()} + for i, line in enumerate(tape.splitlines()): + if i == 0: + if line != trans_rev['1'] * 3 + "." + trans_rev['1'] * 2: + raise ValueError("Bad tape header '{}'".format(line)) + else: + line = line[:3] + line[4:] + if len(line) != 5: + raise ValueError("Bad tape line '{}'".format(line)) + bits += "".join(trans.get(c, "") for c in line) + return bits + + +def _bits_to_tape(bits, trans={'1': "*", '0': " "}): + """ Converts a series of bits to a tape-like string with the given translation for ones and zeros. """ + tape = [trans['1'] * 3 + "." + trans['1'] * 2] + for i in range(0, len(bits), 5): + group = "".join(trans[b] for b in bits[i:i+5]) + tape.append(group[:3] + "." + group[3:]) + return "\n".join(tape) + + +def _check_alphabet(alphabet): + """ Checks the length of letters and figures (must be 32 chars). """ + for chars in alphabet: + l = len(chars) + if l != 32: + raise ValueError("Bad length of alphabet (%d instead of 32)" % l) + + +def _handle_alphabet(alphabet): + """ Gets the given alphabet name and transforms it to its dictionary with letters and figures. 
""" + alphabet = (alphabet or "baudot").lower().replace("-", "_").strip("_") + if "_lsb" in alphabet: + alphabet = alphabet.replace("_lsb", "") + func = lambda x: x[::-1] + else: + alphabet = alphabet.replace("_msb", "") + func = lambda x: x + _ = globals()[alphabet.upper()] + st, a = _[:2], _[2:] + _check_alphabet(a) + alphabet = {n: {ch: bin(i)[2:].zfill(5) for i, ch in enumerate(alph) if ch != RES_CHR} for n, alph in \ + zip(["letters", "figures"], a)} + return alphabet, {'letters': st[0], 'figures': st[1]}, func + + +def baudot_encode(alphabet=None, spaced=False, tape=False): + ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") + alphabet, states, func = _handle_alphabet(alphabet) + def encode(text, errors="strict"): + text = text.upper() + s, l, state, seen_states = "", len(b(text)), None, [] + for i, c in enumerate(text): + # if the state is undefined yet, find the relevant alphabet + if state is None: + bits= None + for st in states.keys(): + try: + bits = func(alphabet[st][c]) + state = st + if st not in seen_states: + seen_states.append(st) + break + except KeyError: + pass + if bits is None: + bits = handle_error(ename, errors, "?", 5)(c, i) + s += bits + # otherwise, handle state change (when the current alphabet does not contain the character to encode but the + # other alphabet does + else: + try: + s += func(alphabet[state][c]) + continue + except KeyError: + state = list(set(states.keys()) - {state})[0] + try: + s += func(states[state]) + func(alphabet[state][c]) + if state not in seen_states: + seen_states.append(state) + except KeyError as e: + state = list(set(states.keys()) - {state})[0] # reset the state + s += handle_error(ename, errors, "?", 5)(c, i) + # by default, if no state is specified, the encoded string is handled as letters ; so if figures are used only, + # it is necessary to include the groups of bits for figures at the beginning of the encoded string + s = (states['figures'] if seen_states == ["figures"] else "") 
+ s + if spaced: + s = " ".join(s[i:i+5] for i in range(0, len(s), 5)) + elif tape: + s = _bits_to_tape(s) + return s, l + return encode + + +def baudot_decode(alphabet=None, spaced=False, tape=False): + ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") + alphabet, states, func = _handle_alphabet(alphabet) + alphabet = {st: {v: k for k, v in alph.items()} for st, alph in alphabet.items()} + states = {v: k for k, v in states.items()} + def decode(text, errors="strict"): + s, l = "", len(b(text)) + if spaced: + text = text.replace(" ", "") + elif tape: + text = _bits_from_tape(text) + # infer the starting state by searching for the first encountered groups of bits indicating a valid state ; + # by default, we assume letters + state = "letters" + for i in range(0, len(text), 5): + bits = func(text[i:i+5]) + # the following code handles a possible ambiguity ; e.g. when letters have a group of bits matching + # a state change + if bits in states.keys(): + error = False + # so, when we see the bits of a state, we parse previous groups in order to determine if they are valid + # groups in the corresponding state, that is, if no error occurs ; if an error occurs, then it is a + # valid state change and not simply a character, and we can set it as the starting state + for j in range(i-5, 0, -5): + try: + alphabet[states[bits]][text[j:j+5]] + except KeyError: + error = True + break + if error: + state = list(set(states.values()) - {states[bits]})[0] + break + # now parse the input text + for i in range(0, len(text), 5): + bits = func(text[i:i+5]) + try: + s += alphabet[state][bits] + except KeyError: + if bits in states.keys() and states[bits] != state: + state = states[bits] + else: + s += handle_error(ename, errors, decode=True, item="group")(bits, i//5) + return s, l + return decode + + +add("baudot", baudot_encode, baudot_decode, PATTERN % r"", examples=__examples1__, guess=[x % "" for x in __guess__], + entropy=1., printables_rate=1.) 
+ + +baudot_spaced_encode = lambda a: baudot_encode(a, spaced=True) +baudot_spaced_decode = lambda a: baudot_decode(a, spaced=True) +add("baudot-spaced", baudot_spaced_encode, baudot_spaced_decode, PATTERN % r"[-_]spaced", examples=__examples2__, + guess=[x % "-spaced" for x in __guess__], entropy=1.48, printables_rate=1.) + + +baudot_tape_encode = lambda a: baudot_encode(a, tape=True) +baudot_tape_decode = lambda a: baudot_decode(a, tape=True) +add("baudot-tape", baudot_tape_encode, baudot_tape_decode, PATTERN % r"[-_]tape", examples=__examples3__, + guess=[x % "-tape" for x in __guess__], entropy=1.86, printables_rate=1.) + diff --git a/src/codext/binary/rotate.py b/src/codext/binary/rotate.py index 944e2b2..fb0c697 100755 --- a/src/codext/binary/rotate.py +++ b/src/codext/binary/rotate.py @@ -1,52 +1,51 @@ -# -*- coding: UTF-8 -*- -"""Rotate-Bits Codec - rotate-N-bits content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(rotate-0|rotate-8|rotate-left-8)': None, - 'enc(rotate1|rotate-right-1|rotate_1)': {'This is a test': "*4\xb4\xb9\x10\xb4\xb9\x10\xb0\x10:\xb2\xb9:"}, - 'enc(rotate-left-1|rotate_left_1)': {'This is a test': "¨ÐÒæ@Òæ@Â@èÊæè"}, -} -__guess__ = ["rotate-%d" % i for i in range(1, 8)] + ["rotate-left-%d" % i for i in range(1, 8)] - - -if PY3: - def _getn(i): - m = 1 - if str(i).startswith("left"): - i = i[4:].lstrip("-_") - m = -1 - return m * int(i) - - - def _rotaten(text, n=1): - r = "" - for c in ensure_str(text): - b = bin(ord(c))[2:].zfill(8) - r += chr(int(b[-n:] + b[:-n], 2)) - return r - - - def rotate_encode(i): - def encode(text, errors="strict"): - return _rotaten(text, _getn(i)), len(text) - return encode - - - def rotate_decode(i): - def decode(text, errors="strict"): - return _rotaten(text, -_getn(i)), len(text) - return decode - 
- - add("rotate", rotate_encode, rotate_decode, r"rotate(?:[-_]?bits)?[-_]?((?:(?:left|right)[-_]?)?[1-7])$", - transitive=True) - +# -*- coding: UTF-8 -*- +"""Rotate-Bits Codec - rotate-N-bits content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(rotate-0|rotate-8|rotate-left-8)': None, + 'enc(rotate1|rotate-right-1|rotate_1)': {'This is a test': "*4\xb4\xb9\x10\xb4\xb9\x10\xb0\x10:\xb2\xb9:"}, + 'enc(rotate-left-1|rotate_left_1)': {'This is a test': "¨ÐÒæ@Òæ@Â@èÊæè"}, +} +__guess__ = ["rotate-%d" % i for i in range(1, 8)] + ["rotate-left-%d" % i for i in range(1, 8)] + + +def _getn(i): + m = 1 + if str(i).startswith("left"): + i = i[4:].lstrip("-_") + m = -1 + return m * int(i) + + +def _rotaten(text, n=1): + r = "" + for c in ensure_str(text): + b = bin(ord(c))[2:].zfill(8) + r += chr(int(b[-n:] + b[:-n], 2)) + return r + + +def rotate_encode(i): + def encode(text, errors="strict"): + return _rotaten(text, _getn(i)), len(text) + return encode + + +def rotate_decode(i): + def decode(text, errors="strict"): + return _rotaten(text, -_getn(i)), len(text) + return decode + + +add("rotate", rotate_encode, rotate_decode, r"rotate(?:[-_]?bits)?[-_]?((?:(?:left|right)[-_]?)?[1-7])$", + transitive=True) + diff --git a/src/codext/common/cases.py b/src/codext/common/cases.py index 8aa87e4..2f91ada 100644 --- a/src/codext/common/cases.py +++ b/src/codext/common/cases.py @@ -27,11 +27,12 @@ add("lowercase", lowercase, uppercase, r"^lower(?:case)?$", penalty=.2) slugify = lambda i, e="strict", d="-": (re.sub(r"[^0-9a-z]+", d, i.lower()).strip(d), len(i)) -add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|kebab(?:[-_]?case)?)$") +add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|(?:dash|kebab)(?:[-_]?case)?)$") 
add("snakecase", lambda i, e="strict": slugify(i, e, "_"), None, r"^snake(?:[-_]?case)?$") +add("screamingsnakecase", lambda i, e="strict": slugify(i, e, "_").upper(), None, r"^screaming[-_]snake(?:[-_]?case)?$") swapcase = lambda i, e="strict": (i.swapcase(), len(i)) -add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$", penalty=.2) +add("swapcase", swapcase, swapcase, r"^(?:(?:flip|swap)(?:[-_]?case)?|invert(?:case)?)$", penalty=.2) title = lambda i, e="strict": (i.title(), len(i)) untitle = lambda i, e="strict": (" ".join(w[0].lower() + w[1:] if len(w) > 0 else "" for w in i.split()), len(i)) diff --git a/src/codext/compressions/pkzip.py b/src/codext/compressions/pkzip.py index 47d9cd5..35ec94e 100755 --- a/src/codext/compressions/pkzip.py +++ b/src/codext/compressions/pkzip.py @@ -1,56 +1,55 @@ -# -*- coding: UTF-8 -*- -"""Pkzip Codec - pkzip content compression. - -NB: Not an encoding properly speaking. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import zipfile - -from ..__common__ import * - - -_str = ["test", "This is a test", "@random{512,1024,2048}"] -__examples1__ = {'enc-dec(pkzip-deflate|deflate)': _str} -__examples2__ = {'enc-dec(pkzip_bz2|bzip2)': _str} -__examples3__ = {'enc-dec(pkzip-lzma|lzma)': _str} - - -if PY3: - NULL = { - 8: b"\x03\x00", - 12: b"BZh9\x17rE8P\x90\x00\x00\x00\x00", - 14: b"\t\x04\x05\x00]\x00\x00\x80\x00\x00\x83\xff\xfb\xff\xff\xc0\x00\x00\x00", - } - - - def pkzip_encode(compression_type): - def _encode(text, errors="strict"): - c = zipfile._get_compressor(compression_type) - return c.compress(b(text)) + c.flush(), len(text) - return _encode - - - def pkzip_decode(compression_type, name): - def _decode(data, errors="strict"): - d = zipfile._get_decompressor(compression_type) - r = d.decompress(b(data)) - if len(r) == 0 and b(data) != NULL[compression_type]: - 
return handle_error(name, errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) - return r, len(r) - return _decode - - - add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate", - examples=__examples1__, guess=["deflate"]) - - add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2", - examples=__examples2__, guess=["bz2"]) - - add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma", - examples=__examples3__, guess=["lzma"]) - +# -*- coding: UTF-8 -*- +"""Pkzip Codec - pkzip content compression. + +NB: Not an encoding properly speaking. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import zipfile + +from ..__common__ import * + + +_str = ["test", "This is a test", "@random{512,1024,2048}"] +__examples1__ = {'enc-dec(pkzip-deflate|deflate)': _str} +__examples2__ = {'enc-dec(pkzip_bz2|bzip2)': _str} +__examples3__ = {'enc-dec(pkzip-lzma|lzma)': _str} + + +NULL = { + 8: b"\x03\x00", + 12: b"BZh9\x17rE8P\x90\x00\x00\x00\x00", + 14: b"\t\x04\x05\x00]\x00\x00\x80\x00\x00\x83\xff\xfb\xff\xff\xc0\x00\x00\x00", +} + + +def pkzip_encode(compression_type): + def _encode(text, errors="strict"): + c = zipfile._get_compressor(compression_type) + return c.compress(b(text)) + c.flush(), len(text) + return _encode + + +def pkzip_decode(compression_type, name): + def _decode(data, errors="strict"): + d = zipfile._get_decompressor(compression_type) + r = d.decompress(b(data)) + if len(r) == 0 and b(data) != NULL[compression_type]: + return handle_error(name, errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) + return r, len(r) + return _decode + + +add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate", + examples=__examples1__, guess=["deflate"]) + 
+add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2", + examples=__examples2__, guess=["bz2"]) + +add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma", + examples=__examples3__, guess=["lzma"]) + diff --git a/src/codext/crypto/railfence.py b/src/codext/crypto/railfence.py index 3d150c0..a25f27a 100644 --- a/src/codext/crypto/railfence.py +++ b/src/codext/crypto/railfence.py @@ -1,96 +1,96 @@ -# -*- coding: UTF-8 -*- -"""Rail Fence Cipher Codec - rail fence content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(rail_123|rail-2-123)': {'this is a test': None}, - 'enc(railfence|zigzag)': {'this is a test': "t ashsi etist"}, - 'enc(rail-5|zigzag_5)': {'this is a test': "tah istsiet s"}, - 'enc(rail_5-3|rail_5_3)': {'this is a test': "it sss etiath "}, - 'enc(rail-5-3-up|rail_5_3-up)': {'this is a test': "h tiats e ssit"}, - 'enc(rail-7-4|rail_7_4)': {'this is a test': "a stiet shsti"}, - 'dec(zigzag)': {'': ""}, -} -__guess__ = ["railfence-%d" % i for i in range(1, 11)] + ["railfence-%d-up" % i for i in range(1, 11)] - - -def __build(text, rails, offset, up): - l, rail = len(text), offset - # set the starting rail and direction - if up: - dr = -1 - rail = rails - offset - 1 - else: - dr = 1 - # create rails - f = [[None] * l for i in range(rails)] - # now zig-zag between rails - for x in range(l): - f[rail][x] = text[x] - if rail >= rails - 1: - dr = -1 - elif rail <= 0: - dr = 1 - rail += dr - return f - - -def __check(length, rails, offset): - if rails > length: - raise ParameterError("Bad parameter for encoding 'railfence': rails=%d (should be >%d)" % (rails, length)) - if offset > rails: - raise ParameterError("Bad parameter for encoding 'railfence': offset=%d (should be >%d)" % 
(offset, rails)) - - -def railfence_encode(rails, offset, up): - rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" - def encode(text, errors="strict"): - r, l = "", len(text) - __check(l, rails, offset) - f = __build(text, rails, offset, up) - for rail in range(rails): - for x in range(l): - if f[rail][x] is not None: - r += f[rail][x] - return r, l - return encode - - -def railfence_decode(rails, offset, up): - rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" - def decode(text, errors="strict"): - # this if block is particularly useful with Python2 ; see codecs.py at line 492 in comparison with codecs.py - # from Python3 at line 501: in Python2, a last block can be read while empty while in Python3 not - # as a consequence, in Python2, an error is triggered as an empty text cannot be decoded with Rail Fence with - # a rails parameter > 0 (see the __check(length, rails, offset)) function - if text == "": - return "", 0 - r, i, l = "", 0, len(text) - __check(l, rails, offset) - f = __build("." * len(text), rails, offset, up) - # put the characters in the right place - for rail in range(rails): - for x in range(l): - if f[rail][x] == ".": - f[rail][x] = text[i] - i += 1 - # read the characters in the right order - for x in range(l): - for rail in range(rails): - if f[rail][x] is not None: - r += f[rail][x] - return r, len(text) - return decode - - -add("railfence", railfence_encode, railfence_decode, - r"^(?:rail(?:[-_]?fence)?|zigzag)(?:[-_]([1-9]|[1-9]\d+)(?:[-_]([0-9]|[1-9]\d+))?(?:[-_](up))?)?$") - +# -*- coding: UTF-8 -*- +"""Rail Fence Cipher Codec - rail fence content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(rail_123|rail-2-123)': {'this is a test': None}, + 'enc(railfence|zigzag)': {'this is a test': "t ashsi etist"}, + 'enc(rail-5|zigzag_5)': {'this is a test': "tah istsiet s"}, + 'enc(rail_5-3|rail_5_3)': {'this is a test': "it sss etiath "}, + 'enc(rail-5-3-up|rail_5_3-up)': {'this is a test': "h tiats e ssit"}, + 'enc(rail-7-4|rail_7_4)': {'this is a test': "a stiet shsti"}, + 'dec(zigzag)': {'': ""}, +} +__guess__ = ["railfence-%d" % i for i in range(1, 11)] + ["railfence-%d-up" % i for i in range(1, 11)] + + +def __build(text, rails, offset, up): + l, rail = len(text), offset + # set the starting rail and direction + if up: + dr = -1 + rail = rails - offset - 1 + else: + dr = 1 + # create rails + f = [[None] * l for i in range(rails)] + # now zig-zag between rails + for x in range(l): + f[rail][x] = text[x] + if rail >= rails - 1: + dr = -1 + elif rail <= 0: + dr = 1 + rail += dr + return f + + +def __check(length, rails, offset): + if rails > length: + raise ParameterError("Bad parameter for encoding 'railfence': rails=%d (should be <= %d)" % (rails, length)) + if offset > rails: + raise ParameterError("Bad parameter for encoding 'railfence': offset=%d (should be <= %d)" % (offset, rails)) + + +def railfence_encode(rails, offset, up): + rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" + def encode(text, errors="strict"): + r, l = "", len(text) + __check(l, rails, offset) + f = __build(text, rails, offset, up) + for rail in range(rails): + for x in range(l): + if f[rail][x] is not None: + r += f[rail][x] + return r, l + return encode + + +def railfence_decode(rails, offset, up): + rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" + def decode(text, 
errors="strict"): + # this if block is particularly useful with Python2 ; see codecs.py at line 492 in comparison with codecs.py + # from Python3 at line 501: in Python2, a last block can be read while empty while in Python3 not + # as a consequence, in Python2, an error is triggered as an empty text cannot be decoded with Rail Fence with + # a rails parameter > 0 (see the __check(length, rails, offset)) function + if text == "": + return "", 0 + r, i, l = "", 0, len(text) + __check(l, rails, offset) + f = __build("." * len(text), rails, offset, up) + # put the characters in the right place + for rail in range(rails): + for x in range(l): + if f[rail][x] == ".": + f[rail][x] = text[i] + i += 1 + # read the characters in the right order + for x in range(l): + for rail in range(rails): + if f[rail][x] is not None: + r += f[rail][x] + return r, len(text) + return decode + + +add("railfence", railfence_encode, railfence_decode, + r"^(?:rail(?:[-_]?fence)?|zigzag)(?:[-_]([1-9]|[1-9]\d+)(?:[-_]([0-9]|[1-9]\d+))?(?:[-_](up))?)?$") + diff --git a/src/codext/hashing/blake.py b/src/codext/hashing/blake.py index 2fad090..6656c46 100644 --- a/src/codext/hashing/blake.py +++ b/src/codext/hashing/blake.py @@ -8,20 +8,18 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib +from ..__common__ import * -from ..__common__ import add, b, PY3 +def blake_hash(c): + def _hash_transform(l): + l = (l or "64" if c == "b" else "32").lstrip("_-") + def _encode(data, error="strict"): + return getattr(hashlib, "blake2%s" % c)(b(data), digest_size=int(l)).hexdigest(), len(data) + return _encode + return _hash_transform -if PY3: - def blake_hash(c): - def _hash_transform(l): - l = (l or "64" if c == "b" else "32").lstrip("_-") - def _encode(data, error="strict"): - return getattr(hashlib, "blake2%s" % c)(b(data), digest_size=int(l)).hexdigest(), len(data) - return _encode - return _hash_transform - add("blake2b", blake_hash("b"), 
pattern=r"^blake2b(|[-_](?:[1-9]|[1-5]\d|6[0-4]))$", guess=None) - add("blake2s", blake_hash("s"), pattern=r"^blake2s(|[-_](?:[1-9]|[1-2]\d|3[0-2]))$", guess=None) +add("blake2b", blake_hash("b"), pattern=r"^blake2b(|[-_](?:[1-9]|[1-5]\d|6[0-4]))$", guess=None) +add("blake2s", blake_hash("s"), pattern=r"^blake2s(|[-_](?:[1-9]|[1-2]\d|3[0-2]))$", guess=None) diff --git a/src/codext/hashing/crypt.py b/src/codext/hashing/crypt.py index caf8290..0d44d8e 100644 --- a/src/codext/hashing/crypt.py +++ b/src/codext/hashing/crypt.py @@ -8,10 +8,10 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -from ..__common__ import add, ensure_str, PY3, UNIX +from ..__common__ import add, ensure_str, UNIX -if PY3 and UNIX: +if UNIX: import crypt METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] diff --git a/src/codext/hashing/md.py b/src/codext/hashing/md.py index 6463722..521a01c 100644 --- a/src/codext/hashing/md.py +++ b/src/codext/hashing/md.py @@ -8,9 +8,7 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib - -from ..__common__ import add, b +from ..__common__ import * MD2_TABLE = [41, 46, 67, 201, 162, 216, 124, 1, 61, 54, 84, 161, 236, 240, 6, 19, 98, 167, 5, 243, 192, 199, 115, 140, diff --git a/src/codext/hashing/sha.py b/src/codext/hashing/sha.py index dd94002..1351fe8 100644 --- a/src/codext/hashing/sha.py +++ b/src/codext/hashing/sha.py @@ -8,9 +8,7 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib - -from ..__common__ import add, b, PY3 +from ..__common__ import * add("sha1", lambda s, error="strict": (hashlib.sha1(b(s)).hexdigest(), len(s)), guess=None) @@ -18,15 +16,12 @@ add("sha256", lambda s, error="strict": (hashlib.sha256(b(s)).hexdigest(), len(s)), guess=None) add("sha384", lambda s, error="strict": (hashlib.sha384(b(s)).hexdigest(), len(s)), guess=None) 
add("sha512", lambda s, error="strict": (hashlib.sha512(b(s)).hexdigest(), len(s)), guess=None) - - -if PY3: - add("sha3_224", lambda s, error="strict": (hashlib.sha3_224(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]224$", - guess=None) - add("sha3_256", lambda s, error="strict": (hashlib.sha3_256(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]256$", - guess=None) - add("sha3_384", lambda s, error="strict": (hashlib.sha3_384(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]384$", - guess=None) - add("sha3_512", lambda s, error="strict": (hashlib.sha3_512(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]512$", - guess=None) +add("sha3_224", lambda s, error="strict": (hashlib.sha3_224(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]224$", + guess=None) +add("sha3_256", lambda s, error="strict": (hashlib.sha3_256(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]256$", + guess=None) +add("sha3_384", lambda s, error="strict": (hashlib.sha3_384(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]384$", + guess=None) +add("sha3_512", lambda s, error="strict": (hashlib.sha3_512(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]512$", + guess=None) diff --git a/src/codext/hashing/shake.py b/src/codext/hashing/shake.py index af79dce..22c7b99 100644 --- a/src/codext/hashing/shake.py +++ b/src/codext/hashing/shake.py @@ -8,20 +8,18 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib +from ..__common__ import * -from ..__common__ import add, b, PY3 +def shake_hash(i): + def _hash_transform(l): + l = (l or str(i)).lstrip("_-") + def _encode(data, error="strict"): + return getattr(hashlib, "shake_%d" % i)(b(data)).hexdigest(int(l)), len(data) + return _encode + return _hash_transform -if PY3: - def shake_hash(i): - def _hash_transform(l): - l = (l or str(i)).lstrip("_-") - def _encode(data, error="strict"): - return getattr(hashlib, "shake_%d" % i)(b(data)).hexdigest(int(l)), len(data) - return _encode - return _hash_transform - 
add("shake_128", shake_hash(128), pattern=r"^shake[-_]?128(|[-_][1-9]\d*)$", guess=None) - add("shake_256", shake_hash(256), pattern=r"^shake[-_]?256(|[-_][1-9]\d*)$", guess=None) +add("shake_128", shake_hash(128), pattern=r"^shake[-_]?128(|[-_][1-9]\d*)$", guess=None) +add("shake_256", shake_hash(256), pattern=r"^shake[-_]?256(|[-_][1-9]\d*)$", guess=None) diff --git a/src/codext/languages/braille.py b/src/codext/languages/braille.py index b28c56e..775399c 100755 --- a/src/codext/languages/braille.py +++ b/src/codext/languages/braille.py @@ -1,34 +1,33 @@ -# -*- coding: UTF-8 -*- -"""Braille Codec - braille content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(braille)': {'this is a test': "⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞"}, -} - - -ENCMAP = { - # digits - '0': '⠴', '1': '⠂', '2': '⠆', '3': '⠒', '4': '⠲', '5': '⠢', '6': '⠖', '7': '⠶', '8': '⠦', '9': '⠔', - # letters - 'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑', 'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚', 'k': '⠅', - 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕', 'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞', 'u': '⠥', 'v': '⠧', - 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵', - # punctuation - ' ': '⠀', '!': '⠮', '"': '⠐', '#': '⠼', '$': '⠫', '%': '⠩', '&': '⠯', ':': '⠱', ';': '⠰', '<': '⠣', '=': '⠿', - '>': '⠜', '?': '⠹', '@': '⠈', "'": '⠄', '(': '⠷', ')': '⠾', '*': '⠡', '+': '⠬', ',': '⠠', '-': '⠤', '.': '⠨', - '/': '⠌', '[': '⠪', '\\': '⠳', ']': '⠻', '^': '⠘', '_': '⠸', -} - - -if PY3: - add_map("braille", ENCMAP, ignore_case="encode") - +# -*- coding: UTF-8 -*- +"""Braille Codec - braille content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(braille)': {'this is a test': "⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞"}, +} + + +ENCMAP = { + # digits + '0': '⠴', '1': '⠂', '2': '⠆', '3': '⠒', '4': '⠲', '5': '⠢', '6': '⠖', '7': '⠶', '8': '⠦', '9': '⠔', + # letters + 'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑', 'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚', 'k': '⠅', + 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕', 'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞', 'u': '⠥', 'v': '⠧', + 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵', + # punctuation + ' ': '⠀', '!': '⠮', '"': '⠐', '#': '⠼', '$': '⠫', '%': '⠩', '&': '⠯', ':': '⠱', ';': '⠰', '<': '⠣', '=': '⠿', + '>': '⠜', '?': '⠹', '@': '⠈', "'": '⠄', '(': '⠷', ')': '⠾', '*': '⠡', '+': '⠬', ',': '⠠', '-': '⠤', '.': '⠨', + '/': '⠌', '[': '⠪', '\\': '⠳', ']': '⠻', '^': '⠘', '_': '⠸', +} + + +add_map("braille", ENCMAP, ignore_case="encode") + diff --git a/src/codext/languages/galactic.py b/src/codext/languages/galactic.py index e77cb3a..26544b5 100644 --- a/src/codext/languages/galactic.py +++ b/src/codext/languages/galactic.py @@ -29,7 +29,6 @@ } -if PY3: - add_map("galactic", ENCMAP, ignore_case="encode", printables_rate=0., - pattern=r"^(?:galactic(?:[-_]alphabet)?|minecraft(?:[-_](?:enchantment|enchanting[-_]language))?)$") +add_map("galactic", ENCMAP, ignore_case="encode", printables_rate=0., + pattern=r"^(?:galactic(?:[-_]alphabet)?|minecraft(?:[-_](?:enchantment|enchanting[-_]language))?)$") diff --git a/src/codext/languages/tap.py b/src/codext/languages/tap.py index efd551d..ec7c15b 100644 --- a/src/codext/languages/tap.py +++ b/src/codext/languages/tap.py @@ -1,39 +1,38 @@ -# -*- coding: UTF-8 -*- -"""Tap code - Tap/knock code encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(tap|knock-code|tap_code)': {'this is a test' : ".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. ." - "⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ...."}, -} -__guess__ = ["tap", "tap-inv"] - - -def __build_encmap(a): - d, i = {}, 0 - for x in range(1,6): - for y in range(1,6): - d[a[i]] = x * "." + " " + y * "." - i += 1 - d['k'], d[' '] = d['c'], " " - return d - - - -ENCMAP = { - '': __build_encmap("abcdefghijlmnopqrstuvwxyz"), - 'inv': __build_encmap("abcdefghijlmnopqrstuvwxyz"[::-1]), -} - - -if PY3: - add_map("tap", ENCMAP, ignore_case="both", sep="⠀", pattern=r"^(?:tap|knock)(?:[-_]code)?(|inv)$") - +# -*- coding: UTF-8 -*- +"""Tap code - Tap/knock code encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(tap|knock-code|tap_code)': {'this is a test' : ".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. ." + "⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ...."}, +} +__guess__ = ["tap", "tap-inv"] + + +def __build_encmap(a): + d, i = {}, 0 + for x in range(1,6): + for y in range(1,6): + d[a[i]] = x * "." + " " + y * "." 
+ i += 1 + d['k'], d[' '] = d['c'], " " + return d + + + +ENCMAP = { + '': __build_encmap("abcdefghijlmnopqrstuvwxyz"), + 'inv': __build_encmap("abcdefghijlmnopqrstuvwxyz"[::-1]), +} + + +add_map("tap", ENCMAP, ignore_case="both", sep="⠀", pattern=r"^(?:tap|knock)(?:[-_]code)?(|inv)$") + diff --git a/src/codext/others/uuencode.py b/src/codext/others/uuencode.py index a2f2fb6..f1ecfc3 100644 --- a/src/codext/others/uuencode.py +++ b/src/codext/others/uuencode.py @@ -17,7 +17,7 @@ 'dec(uu-encode)': {'.=&AI': "<This is a test>"}, - 'dec(html|html_entity)': {'&DoesNotExist;': None}, - 'dec(html_entities|html-entity)': { - '<This is a test>': "", - '<This is a test>': "", - }, -} -if PY3: - __examples__['enc(html)'] = {'\u1234': "&1234;"} - - -# source: https://dev.w3.org/html5/html-author/charref -ENCMAP = { - '\t': " ", '\n': " ", '!': "!", '"': """, '#': "#", '$': "$", '%': "%", - '&': "&", '\'': "'", '(': "(", ')': ")", '*': "*", '+': "+", ',': ",", - '.': ".", '/': "/", ':': ":", ';': ";", '<': "<", '=': "=", '>': ">", - '?': "?", '@': "@", '[': "[", '\\': "\", ']': "]", '^': "^", '_': "_", - '`': "`", '{': "{", '|': "|", '}': "}", '¡': "¡", '¢': "¢", - '£': "£", '¤': "¤", '¥': "¥", '¦': "¦", '§': "§", '¨': "¨", '©': "©", - 'ª': "ª", '«': "«", '¬': "¬", '­': "­", '®': "®", '¯': "¯", '°': "°", - '±': "±", '²': "²", '³': "³", '´': "´", 'µ': "µ", '¶': "¶", '·': "·", - '¸': "¸", '¹': "¹", 'º': "º", '»': "»", '¼': "¼", '½': "½", '¾': "¾", - '¿': "¿", 'À': "À", 'Á': "Á", 'Â': "Â", 'Ã': "Ã", 'Ä': "Ä", 'Å': "Å", - 'Æ': "Æ", 'Ç': "Ç", 'È': "È", 'É': "É", 'Ê': "Ê", 'Ë': "Ë", 'Ì': "Ì", - 'Í': "Í", 'Î': "Î", 'Ï': "Ï", 'Ð': "Ð", 'Ñ': "Ñ", 'Ò': "Ò", 'Ó': "Ó", - 'Ô': "Ô", 'Õ': "Õ", 'Ö': "Ö", '×': "×", 'Ø': "Ø", 'Ù': "Ù", 'Ú': "Ú", - 'Û': "Û", 'Ü': "Ü", 'Ý': "Ý", 'Þ': "Þ", 'ß': "ß", 'à': "à", 'á': "á", - 'â': "â", 'ã': "ã", 'ä': "ä", 'å': "å", 'æ': "æ", 'ç': "ç", 'è': "è", - 'é': "é", 'ê': "ê", 'ë': "ë", 'ì': "ì", 'í': "í", 'î': "î", 'ï': "ï", - 'ð': "ð", 'ñ': "ñ", 'ò': "ò", 'ó': 
"ó", 'ô': "ô", 'õ': "õ", 'ö': "ö", - '÷': "÷", 'ø': "ø", 'ù': "ù", 'ú': "ú", 'û': "û", 'ü': "ü", 'ý': "ý", - 'þ': "þ", 'ÿ': "ÿ", 'Ā': "Ā", 'ā': "ā", 'Ă': "Ă", 'ă': "ă", 'Ą': "Ą", - 'ą': "ą", 'Ć': "Ć", 'ć': "ć", 'Ĉ': "Ĉ", 'ĉ': "ĉ", 'Ċ': "Ċ", 'ċ': "ċ", - 'Č': "Č", 'č': "č", 'Ď': "Ď", 'ď': "ď", 'Đ': "Đ", 'đ': "đ", - 'Ē': "Ē", 'ē': "ē", 'Ė': "Ė", 'ė': "ė", 'Ę': "Ę", 'ę': "ę", 'Ě': "Ě", - 'ě': "ě", 'Ĝ': "Ĝ", 'ĝ': "ĝ", 'Ğ': "Ğ", 'ğ': "ğ", 'Ġ': "Ġ", 'ġ': "ġ", - 'Ģ': "Ģ", 'Ĥ': "Ĥ", 'ĥ': "ĥ", 'Ħ': "Ħ", 'ħ': "ħ", 'Ĩ': "Ĩ", - 'ĩ': "ĩ", 'Ī': "Ī", 'ī': "ī", 'Į': "Į", 'į': "į", 'İ': "İ", 'ı': "ı", - 'IJ': "IJ", 'ij': "ij", 'Ĵ': "Ĵ", 'ĵ': "ĵ", 'Ķ': "Ķ", 'ķ': "ķ", 'ĸ': "ĸ", - 'Ĺ': "Ĺ", 'ĺ': "ĺ", 'Ļ': "Ļ", 'ļ': "ļ", 'Ľ': "Ľ", 'ľ': "ľ", - 'Ŀ': "Ŀ", 'ŀ': "ŀ", 'Ł': "Ł", 'ł': "ł", 'Ń': "Ń", 'ń': "ń", - 'Ņ': "Ņ", 'ņ': "ņ", 'Ň': "Ň", 'ň': "ň", 'ʼn': "ʼn", 'Ŋ': "Ŋ", 'ŋ': "ŋ", - 'Ō': "Ō", 'ō': "ō", 'Ő': "Ő", 'ő': "ő", 'Œ': "Œ", 'œ': "œ", 'Ŕ': "Ŕ", - 'ŕ': "ŕ", 'Ŗ': "Ŗ", 'ŗ': "ŗ", 'Ř': "Ř", 'ř': "ř", 'Ś': "Ś", - 'ś': "ś", 'Ŝ': "Ŝ", 'ŝ': "ŝ", 'Ş': "Ş", 'ş': "ş", 'Š': "Š", - 'š': "š", 'Ţ': "Ţ", 'ţ': "ţ", 'Ť': "Ť", 'ť': "ť", 'Ŧ': "Ŧ", - 'ŧ': "ŧ", 'Ũ': "Ũ", 'ũ': "ũ", 'Ū': "Ū", 'ū': "ū", 'Ŭ': "Ŭ", - 'ŭ': "ŭ", 'Ů': "Ů", 'ů': "ů", 'Ű': "Ű", 'ű': "ű", 'Ų': "Ų", 'ų': "ų", - 'Ŵ': "Ŵ", 'ŵ': "ŵ", 'Ŷ': "Ŷ", 'ŷ': "ŷ", 'Ÿ': "Ÿ", 'Ź': "Ź", 'ź': "ź", - 'Ż': "Ż", 'ż': "ż", 'Ž': "Ž", 'ž': "ž", 'ƒ': "ƒ", 'Ƶ': "Ƶ", 'ǵ': "ǵ", - 'ȷ': "ȷ", 'ˆ': "ˆ", 'ˇ': "ˇ", '˘': "˘", '˙': "˙", '˚': "˚", '˛': "˛", - '˜': "˜", '˝': "˝", '̑': "̑", '̲': "_", 'Α': "Α", 'Β': "Β", - 'Γ': "Γ", 'Δ': "Δ", 'Ε': "Ε", 'Ζ': "Ζ", 'Η': "Η", 'Θ': "Θ", 'Ι': "Ι", - 'Κ': "Κ", 'Λ': "Λ", 'Μ': "Μ", 'Ν': "Ν", 'Ξ': "Ξ", 'Ο': "Ο", 'Π': "Π", - 'Ρ': "Ρ", 'Σ': "Σ", 'Τ': "Τ", 'Υ': "Υ", 'Φ': "Φ", 'Χ': "Χ", 'Ψ': "Ψ", - 'Ω': "Ω", 'α': "α", 'β': "β", 'γ': "γ", 'δ': "δ", 'ε': "ϵ", 'ζ': "ζ", - 'η': "η", 'θ': "θ", 'ι': "ι", 'κ': "κ", 'λ': "λ", 'μ': "μ", 'ν': "ν", - 'ξ': "ξ", 'ο': "ο", 'π': "π", 'ρ': "ρ", 'ς': "ς", 'σ': "σ", 'τ': "τ", - 'υ': "υ", 'φ': "φ", 'χ': "χ", 
'ψ': "ψ", 'ω': "ω", 'ϑ': "ϑ", 'ϒ': "ϒ", - 'ϕ': "ϕ", 'ϖ': "ϖ", 'Ϝ': "Ϝ", 'ϝ': "ϝ", 'ϰ': "ϰ", 'ϱ': "ϱ", - 'ϵ': "ε", '϶': "϶", 'Ё': "Ё", 'Ђ': "Ђ", 'Ѓ': "Ѓ", 'Є': "Є", 'Ѕ': "Ѕ", - 'І': "І", 'Ї': "Ї", 'Ј': "Ј", 'Љ': "Љ", 'Њ': "Њ", 'Ћ': "Ћ", 'Ќ': "Ќ", - 'Ў': "Ў", 'Џ': "Џ", 'А': "А", 'Б': "Б", 'В': "В", 'Г': "Г", 'Д': "Д", 'Е': "Е", - 'Ж': "Ж", 'З': "З", 'И': "И", 'Й': "Й", 'К': "К", 'Л': "Л", 'М': "М", 'Н': "Н", - 'О': "О", 'П': "П", 'Р': "Р", 'С': "С", 'Т': "Т", 'У': "У", 'Ф': "Ф", 'Х': "Х", - 'Ц': "Ц", 'Ч': "Ч", 'Ш': "Ш", 'Щ': "Щ", 'Ъ': "Ъ", 'Ы': "Ы", 'Ь': "Ь", - 'Э': "Э", 'Ю': "Ю", 'Я': "Я", 'а': "а", 'б': "б", 'в': "в", 'г': "г", 'д': "д", - 'е': "е", 'ж': "ж", 'з': "з", 'и': "и", 'й': "й", 'к': "к", 'л': "л", 'м': "м", - 'н': "н", 'о': "о", 'п': "п", 'р': "р", 'с': "с", 'т': "т", 'у': "у", 'ф': "ф", - 'х': "х", 'ц': "ц", 'ч': "ч", 'ш': "ш", 'щ': "щ", 'ъ': "ъ", 'ы': "ы", - 'ь': "ь", 'э': "э", 'ю': "ю", 'я': "я", 'ё': "ё", 'ђ': "ђ", 'ѓ': "ѓ", - 'є': "є", 'ѕ': "ѕ", 'і': "і", 'ї': "ї", 'ј': "ј", 'љ': "љ", 'њ': "њ", - 'ћ': "ћ", 'ќ': "ќ", 'ў': "ў", 'џ': "џ", '\u2002': " ", '\u2003': " ", - '\u2004': " ", '\u2005': " ", '\u2007': " ", '\u2008': " ", '\u2009': " ", - '\u200a': " ", '​\u200b': "​", '\u200c': "‌", '\u200d': "‍", '\u200e': "‎", - '\u200f': "‏", '‐': "‐", '–': "–", '—': "—", - '―': "―", '‖': "‖", '‘': "‘", '’': "’", '‚': "‚", '“': "“", '”': "”", - '„': "„", '†': "†", '‡': "‡", '•': "•", '‥': "‥", '…': "…", '‰': "‰", - '‱': "‱", '′': "′", '″': "″", '‴': "‴", '‵': "‵", '‹': "‹", - '›': "›", '‾': "‾", '⁁': "⁁", '⁃': "⁃", '⁄': "⁄", '⁏': "⁏", '⁗': "⁗", - '\u205f': " ", '⁠': "⁠", '⁡': "⁡", '⁢': "⁢", '⁣': "⁣", - '€': "€", '⃛': "⃛", '⃜': "⃜", 'ℂ': "ℂ", '℅': "℅", 'ℊ': "ℊ", 'ℋ': "ℋ", - 'ℌ': "ℌ", 'ℍ': "ℍ", 'ℎ': "ℎ", 'ℏ': "ℏ", 'ℐ': "ℐ", 'ℑ': "ℑ", - 'ℒ': "ℒ", 'ℓ': "ℓ", 'ℕ': "ℕ", '№': "№", '℗': "℗", '℘': "℘", 'ℙ': "ℙ", - 'ℚ': "ℚ", 'ℛ': "ℛ", 'ℜ': "ℜ", 'ℝ': "ℝ", '℞': "℞", '™': "™", 'ℤ': "ℤ", - 'Ω': "Ω", '℧': "℧", 'ℨ': "ℨ", '℩': "℩", 'Å': "Å", 'ℬ': "ℬ", 'ℭ': "ℭ", - 'ℯ': "ℯ", 
'ℰ': "ℰ", 'ℱ': "ℱ", 'ℳ': "ℳ", 'ℴ': "ℴ", 'ℵ': "ℵ", 'ℶ': "ℶ", - 'ℷ': "ℷ", 'ℸ': "ℸ", 'ⅅ': "ⅅ", 'ⅆ': "ⅆ", 'ⅇ': "ⅇ", - 'ⅈ': "ⅈ", '⅓': "⅓", '⅔': "⅔", '⅕': "⅕", '⅖': "⅖", '⅗': "⅗", - '⅘': "⅘", '⅙': "⅙", '⅚': "⅚", '⅛': "⅛", '⅜': "⅜", '⅝': "⅝", - '⅞': "⅞", '←': "←", '↑': "↑", '→': "→", '↓': "↓", '↔': "↔", '↕': "↕", - '↖': "↖", '↗': "↗", '↘': "↘", '↙': "↙", '↚': "↚", '↛': "↛", '↝': "↝", - '↞': "↞", '↟': "↟", '↠': "↠", '↡': "↡", '↢': "↢", '↣': "↣", - '↤': "↤", '↥': "↥", '↦': "↦", '↧': "↧", '↩': "↩", '↪': "↪", - '↫': "↫", '↬': "↬", '↭': "↭", '↮': "↮", '↰': "↰", '↱': "↱", '↲': "↲", - '↳': "↳", '↵': "↵", '↶': "↶", '↷': "↷", '↺': "↺", '↻': "↻", '↼': "↼", - '↽': "↽", '↾': "↾", '↿': "↿", '⇀': "⇀", '⇁': "⇁", '⇂': "⇂", '⇃': "⇃", - '⇄': "⇄", '⇅': "⇅", '⇆': "⇆", '⇇': "⇇", '⇈': "⇈", '⇉': "⇉", '⇊': "⇊", - '⇋': "⇋", '⇌': "⇌", '⇍': "⇍", '⇎': "⇎", '⇏': "⇏", '⇐': "⇐", '⇑': "⇑", - '⇒': "⇒", '⇓': "⇓", '⇔': "⇔", '⇕': "⇕", '⇖': "⇖", '⇗': "⇗", '⇘': "⇘", - '⇙': "⇙", '⇚': "⇚", '⇛': "⇛", '⇝': "⇝", '⇤': "⇤", '⇥': "⇥", '⇵': "⇵", - '⇽': "⇽", '⇾': "⇾", '⇿': "⇿", '∀': "∀", '∁': "∁", '∂': "∂", '∃': "∃", - '∄': "∄", '∅': "∅", '∇': "∇", '∈': "∈", '∉': "∉", '∋': "∋", '∌': "∌", - '∏': "∏", '∐': "∐", '∑': "∑", '−': "−", '∓': "∓", '∔': "∔", '∖': "∖", - '∗': "∗", '∘': "∘", '√': "√", '∝': "∝", '∞': "∞", '∟': "∟", '∠': "∠", - '∡': "∡", '∢': "∢", '∣': "∣", '∤': "∤", '∥': "∥", '∦': "∦", '∧': "∧", - '∨': "∨", '∩': "∩", '∪': "∪", '∫': "∫", '∬': "∬", '∭': "∭", '∮': "∮", - '∯': "∯", '∰': "∰", '∱': "∱", '∲': "∲", '∳': "∳", '∴': "∴", - '∵': "∵", '∶': "∶", '∷': "∷", '∸': "∸", '∺': "∺", '∻': "∻", '∼': "∼", - '∽': "∽", '∾': "∾", '∿': "∿", '≀': "≀", '≁': "≁", '≂': "≂", '≃': "≃", - '≄': "≄", '≅': "≅", '≆': "≆", '≇': "≇", '≈': "≈", '≉': "≉", '≊': "≊", - '≋': "≋", '≌': "≌", '≍': "≍", '≎': "≎", '≏': "≏", '≐': "≐", '≑': "≑", - '≒': "≒", '≓': "≓", '≔': "≔", '≕': "≕", '≖': "≖", '≗': "≗", '≙': "≙", - '≚': "≚", '≜': "≜", '≟': "≟", '≠': "≠", '≡': "≡", '≢': "≢", '≤': "≤", - '≥': "≥", '≦': "≦", '≧': "≧", '≨': "≨", '≩': "≩", '≪': "≪", 
'≫': "≫", '≬': "≬", - '≭': "≭", '≮': "≮", '≯': "≯", '≰': "≰", '≱': "≱", '≲': "≲", '≳': "≳", - '≴': "≴", '≵': "≵", '≶': "≶", '≷': "≷", '≸': "≸", '≹': "≹", '≺': "≺", '≻': "≻", - '≼': "≼", '≽': "≽", '≾': "≾", '≿': "≿", '⊀': "⊀", '⊁': "⊁", '⊂': "⊂", - '⊃': "⊃", '⊄': "⊄", '⊅': "⊅", '⊆': "⊆", '⊇': "⊇", '⊈': "⊈", '⊉': "⊉", - '⊊': "⊊", '⊋': "⊋", '⊍': "⊍", '⊎': "⊎", '⊏': "⊏", '⊐': "⊐", '⊑': "⊑", - '⊒': "⊒", '⊓': "⊓", '⊔': "⊔", '⊕': "⊕", '⊖': "⊖", '⊗': "⊗", '⊘': "⊘", - '⊙': "⊙", '⊚': "⊚", '⊛': "⊛", '⊝': "⊝", '⊞': "⊞", '⊟': "⊟", '⊠': "⊠", - '⊡': "⊡", '⊢': "⊢", '⊣': "⊣", '⊤': "⊤", '⊥': "⊥", '⊧': "⊧", '⊨': "⊨", - '⊩': "⊩", '⊪': "⊪", '⊫': "⊫", '⊬': "⊬", '⊭': "⊭", '⊮': "⊮", - '⊯': "⊯", '⊰': "⊰", '⊲': "⊲", '⊳': "⊳", '⊴': "⊴", '⊵': "⊵", '⊶': "⊶", - '⊷': "⊷", '⊸': "⊸", '⊹': "⊹", '⊺': "⊺", '⊻': "⊻", '⊽': "⊽", - '⊾': "⊾", '⊿': "⊿", '⋀': "⋀", '⋁': "⋁", '⋂': "⋂", '⋃': "⋃", '⋄': "⋄", - '⋅': "⋅", '⋆': "⋆", '⋇': "⋇", '⋈': "⋈", '⋉': "⋉", '⋊': "⋊", - '⋋': "⋋", '⋌': "⋌", '⋍': "⋍", '⋎': "⋎", '⋏': "⋏", '⋐': "⋐", '⋑': "⋑", - '⋒': "⋒", '⋓': "⋓", '⋔': "⋔", '⋕': "⋕", '⋖': "⋖", '⋗': "⋗", '⋘': "⋘", '⋙': "⋙", - '⋚': "⋚", '⋛': "⋛", '⋞': "⋞", '⋟': "⋟", '⋠': "⋠", '⋡': "⋡", '⋢': "⋢", - '⋣': "⋣", '⋦': "⋦", '⋧': "⋧", '⋨': "⋨", '⋩': "⋩", '⋪': "⋪", '⋫': "⋫", - '⋬': "⋬", '⋭': "⋭", '⋮': "⋮", '⋯': "⋯", '⋰': "⋰", '⋱': "⋱", '⋲': "⋲", - '⋳': "⋳", '⋴': "⋴", '⋵': "⋵", '⋶': "⋶", '⋷': "⋷", '⋹': "⋹", - '⋺': "⋺", '⋻': "⋻", '⋼': "⋼", '⋽': "⋽", '⋾': "⋾", '⌅': "⌅", '⌆': "⌆", - '⌈': "⌈", '⌉': "⌉", '⌊': "⌊", '⌋': "⌋", '⌌': "⌌", '⌍': "⌍", - '⌎': "⌎", '⌏': "⌏", '⌐': "⌐", '⌒': "⌒", '⌓': "⌓", '⌕': "⌕", - '⌖': "⌖", '⌜': "⌜", '⌝': "⌝", '⌞': "⌞", '⌟': "⌟", '⌢': "⌢", - '⌣': "⌣", '⌭': "⌭", '⌮': "⌮", '⌶': "⌶", '⌽': "⌽", '⌿': "⌿", - '⍼': "⍼", '⎰': "⎰", '⎱': "⎱", '⎴': "⎴", '⎵': "⎵", '⎶': "⎶", - '⏜': "⏜", '⏝': "⏝", '⏞': "⏞", '⏟': "⏟", '⏢': "⏢", - '⏧': "⏧", '␣': "␣", 'Ⓢ': "Ⓢ", '─': "─", '│': "│", '┌': "┌", '┐': "┐", - '└': "└", '┘': "┘", '├': "├", '┤': "┤", '┬': "┬", '┴': "┴", '┼': "┼", - '═': "═", '║': "║", '╒': "╒", '╓': "╓", '╔': "╔", 
'╕': "╕", '╖': "╖", - '╗': "╗", '╘': "╘", '╙': "╙", '╚': "╚", '╛': "╛", '╜': "╜", '╝': "╝", - '╞': "╞", '╟': "╟", '╠': "╠", '╡': "╡", '╢': "╢", '╣': "╣", '╤': "╤", - '╥': "╥", '╦': "╦", '╧': "╧", '╨': "╨", '╩': "╩", '╪': "╪", '╫': "╫", - '╬': "╬", '▀': "▀", '▄': "▄", '█': "█", '░': "░", '▒': "▒", '▓': "▓", - '□': "□", '▪': "▪", '▫': "▫", '▭': "▭", '▮': "▮", '▱': "▱", - '△': "△", '▴': "▴", '▵': "▵", '▸': "▸", '▹': "▹", '▽': "▽", '▾': "▾", - '▿': "▿", '◂': "◂", '◃': "◃", '◊': "◊", '○': "○", '◬': "◬", '◯': "◯", - '◸': "◸", '◹': "◹", '◺': "◺", '◻': "◻", '◼': "◼", - '★': "★", '☆': "☆", '☎': "☎", '♀': "♀", '♂': "♂", '♠': "♠", '♣': "♣", - '♥': "♥", '♦': "♦", '♪': "♪", '♭': "♭", '♮': "♮", '♯': "♯", '✓': "✓", - '✗': "✗", '✠': "✠", '✶': "✶", '❘': "❘", '❲': "❲", '❳': "❳", - '⟦': "⟦", '⟧': "⟧", '⟨': "⟨", '⟩': "⟩", '⟪': "⟪", '⟫': "⟫", '⟬': "⟬", - '⟭': "⟭", '⟵': "⟵", '⟶': "⟶", '⟷': "⟷", '⟸': "⟸", '⟹': "⟹", '⟺': "⟺", - '⟼': "⟼", '⟿': "⟿", '⤂': "⤂", '⤃': "⤃", '⤄': "⤄", '⤅': "⤅", '⤌': "⤌", - '⤍': "⤍", '⤎': "⤎", '⤏': "⤏", '⤐': "⤐", '⤑': "⤑", '⤒': "⤒", - '⤓': "⤓", '⤖': "⤖", '⤙': "⤙", '⤚': "⤚", '⤛': "⤛", '⤜': "⤜", - '⤝': "⤝", '⤞': "⤞", '⤟': "⤟", '⤠': "⤠", '⤣': "⤣", '⤤': "⤤", - '⤥': "⤥", '⤦': "⤦", '⤧': "⤧", '⤨': "⤨", '⤩': "⤩", '⤪': "⤪", - '⤳': "⤳", '⤵': "⤵", '⤶': "⤶", '⤷': "⤷", '⤸': "⤸", '⤹': "⤹", - '⤼': "⤼", '⤽': "⤽", '⥅': "⥅", '⥈': "⥈", '⥉': "⥉", '⥊': "⥊", - '⥋': "⥋", '⥎': "⥎", '⥏': "⥏", '⥐': "⥐", - '⥑': "⥑", '⥒': "⥒", '⥓': "⥓", '⥔': "⥔", - '⥕': "⥕", '⥖': "⥖", '⥗': "⥗", '⥘': "⥘", - '⥙': "⥙", '⥚': "⥚", '⥛': "⥛", '⥜': "⥜", - '⥝': "⥝", '⥞': "⥞", '⥟': "⥟", '⥠': "⥠", - '⥡': "⥡", '⥢': "⥢", '⥣': "⥣", '⥤': "⥤", '⥥': "⥥", '⥦': "⥦", - '⥧': "⥧", '⥨': "⥨", '⥩': "⥩", '⥪': "⥪", '⥫': "⥫", '⥬': "⥬", - '⥭': "⥭", '⥮': "⥮", '⥯': "⥯", '⥰': "⥰", '⥱': "⥱", '⥲': "⥲", - '⥳': "⥳", '⥴': "⥴", '⥵': "⥵", '⥶': "⥶", '⥸': "⥸", '⥹': "⥹", - '⥻': "⥻", '⥼': "⥼", '⥽': "⥽", '⥾': "⥾", '⥿': "⥿", '⦅': "⦅", - '⦆': "⦆", '⦋': "⦋", '⦌': "⦌", '⦍': "⦍", '⦎': "⦎", '⦏': "⦏", - '⦐': "⦐", '⦑': "⦑", '⦒': "⦒", '⦓': "⦓", '⦔': "⦔", '⦕': 
"⦕", - '⦖': "⦖", '⦚': "⦚", '⦜': "⦜", '⦝': "⦝", '⦤': "⦤", '⦥': "⦥", - '⦦': "⦦", '⦧': "⦧", '⦨': "⦨", '⦩': "⦩", '⦪': "⦪", '⦫': "⦫", - '⦬': "⦬", '⦭': "⦭", '⦮': "⦮", '⦯': "⦯", '⦰': "⦰", '⦱': "⦱", - '⦲': "⦲", '⦳': "⦳", '⦴': "⦴", '⦵': "⦵", '⦶': "⦶", '⦷': "⦷", - '⦹': "⦹", '⦻': "⦻", '⦼': "⦼", '⦾': "⦾", '⦿': "⦿", '⧀': "⧀", '⧁': "⧁", - '⧂': "⧂", '⧃': "⧃", '⧄': "⧄", '⧅': "⧅", '⧉': "⧉", '⧍': "⧍", '⧎': "⧎", - '⧏': "⧏", '⧐': "⧐", '⧚': "∽̱", '⧜': "⧜", '⧝': "⧝", - '⧞': "⧞", '⧣': "⧣", '⧤': "⧤", '⧥': "⧥", '⧫': "⧫", '⧴': "⧴", - '⧶': "⧶", '⨀': "⨀", '⨁': "⨁", '⨂': "⨂", '⨄': "⨄", '⨆': "⨆", '⨌': "⨌", - '⨍': "⨍", '⨐': "⨐", '⨑': "⨑", '⨒': "⨒", '⨓': "⨓", '⨔': "⨔", - '⨕': "⨕", '⨖': "⨖", '⨗': "⨗", '⨢': "⨢", '⨣': "⨣", '⨤': "⨤", - '⨥': "⨥", '⨦': "⨦", '⨧': "⨧", '⨩': "⨩", '⨪': "⨪", '⨭': "⨭", - '⨮': "⨮", '⨯': "⨯", '⨰': "⨰", '⨱': "⨱", '⨳': "⨳", '⨴': "⨴", - '⨵': "⨵", '⨶': "⨶", '⨷': "⨷", '⨸': "⨸", '⨹': "⨹", '⨺': "⨺", - '⨻': "⨻", '⨼': "⨼", '⨿': "⨿", '⩀': "⩀", '⩂': "⩂", '⩃': "⩃", '⩄': "⩄", - '⩅': "⩅", '⩆': "⩆", '⩇': "⩇", '⩈': "⩈", '⩉': "⩉", '⩊': "⩊", - '⩋': "⩋", '⩌': "⩌", '⩍': "⩍", '⩐': "⩐", '⩓': "⩓", '⩔': "⩔", '⩕': "⩕", - '⩖': "⩖", '⩗': "⩗", '⩘': "⩘", '⩚': "⩚", '⩛': "⩛", '⩜': "⩜", '⩝': "⩝", - '⩟': "⩟", '⩦': "⩦", '⩪': "⩪", '⩭': "⩭", '⩮': "⩮", '⩯': "⩯", '⩰': "⩰", - '⩱': "⩱", '⩲': "⩲", '⩳': "⩳", '⩴': "⩴", '⩵': "⩵", '⩷': "⩷", '⩸': "⩸", - '⩹': "⩹", '⩺': "⩺", '⩻': "⩻", '⩼': "⩼", '⩽': "⩽", '⩾': "⩾", '⩿': "⩿", - '⪀': "⪀", '⪁': "⪁", '⪂': "⪂", '⪃': "⪃", '⪄': "⪄", '⪅': "⪅", - '⪆': "⪆", '⪇': "⪇", '⪈': "⪈", '⪉': "⪉", '⪊': "⪊", '⪋': "⪋", '⪌': "⪌", '⪍': "⪍", - '⪎': "⪎", '⪏': "⪏", '⪐': "⪐", '⪑': "⪑", '⪒': "⪒", '⪓': "⪓", '⪔': "⪔", - '⪕': "⪕", '⪖': "⪖", '⪗': "⪗", '⪘': "⪘", '⪙': "⪙", '⪚': "⪚", '⪝': "⪝", - '⪞': "⪞", '⪟': "⪟", '⪠': "⪠", '⪡': "⪡", '⪢': "⪢", '⪤': "⪤", - '⪥': "⪥", '⪦': "⪦", '⪧': "⪧", '⪨': "⪨", '⪩': "⪩", '⪪': "⪪", '⪫': "⪫", - '⪬': "⪬", '⪭': "⪭", '⪮': "⪮", '⪯': "⪯", '⪰': "⪰", '⪳': "⪳", '⪴': "⪴", - '⪵': "⪵", '⪶': "⪶", '⪷': "⪷", '⪸': "⪸", '⪹': "⪹", '⪺': "⪺", '⪻': "⪻", - '⪼': "⪼", '⪽': "⪽", '⪾': "⪾", '⪿': "⪿", 
'⫀': "⫀", '⫁': "⫁", - '⫂': "⫂", '⫃': "⫃", '⫄': "⫄", '⫅': "⫅", '⫆': "⫆", '⫇': "⫇", - '⫈': "⫈", '⫋': "⫋", '⫌': "⫌", '⫏': "⫏", '⫐': "⫐", '⫑': "⫑", '⫒': "⫒", - '⫓': "⫓", '⫔': "⫔", '⫕': "⫕", '⫖': "⫖", '⫗': "⫗", '⫘': "⫘", - '⫙': "⫙", '⫚': "⫚", '⫛': "⫛", '⫤': "⫤", '⫦': "⫦", '⫧': "⫧", '⫨': "⫨", - '⫩': "⫩", '⫫': "⫫", '⫬': "⫬", '⫭': "⫭", '⫮': "⫮", '⫯': "⫯", '⫰': "⫰", - '⫱': "⫱", '⫲': "⫲", '⫳': "⫳", '⫽': "⫽", 'ff': "ff", 'fi': "fi", 'fl': "fl", - 'ffi': "ffi", 'ffl': "ffl", '𝒜': "𝒜", '𝒞': "𝒞", '𝒟': "𝒟", '𝒢': "𝒢", '𝒥': "𝒥", - '𝒦': "𝒦", '𝒩': "𝒩", '𝒪': "𝒪", '𝒫': "𝒫", '𝒬': "𝒬", '𝒮': "𝒮", '𝒯': "𝒯", - '𝒰': "𝒰", '𝒱': "𝒱", '𝒲': "𝒲", '𝒳': "𝒳", '𝒴': "𝒴", '𝒵': "𝒵", '𝒶': "𝒶", - '𝒷': "𝒷", '𝒸': "𝒸", '𝒹': "𝒹", '𝒻': "𝒻", '𝒽': "𝒽", '𝒾': "𝒾", '𝒿': "𝒿", - '𝓀': "𝓀", '𝓁': "𝓁", '𝓂': "𝓂", '𝓃': "𝓃", '𝓅': "𝓅", '𝓆': "𝓆", '𝓇': "𝓇", - '𝓈': "𝓈", '𝓉': "𝓉", '𝓊': "𝓊", '𝓋': "𝓋", '𝓌': "𝓌", '𝓍': "𝓍", '𝓎': "𝓎", - '𝓏': "𝓏", '𝔄': "𝔄", '𝔅': "𝔅", '𝔇': "𝔇", '𝔈': "𝔈", '𝔉': "𝔉", '𝔊': "𝔊", '𝔍': "𝔍", - '𝔎': "𝔎", '𝔏': "𝔏", '𝔐': "𝔐", '𝔑': "𝔑", '𝔒': "𝔒", '𝔓': "𝔓", '𝔔': "𝔔", '𝔖': "𝔖", - '𝔗': "𝔗", '𝔘': "𝔘", '𝔙': "𝔙", '𝔚': "𝔚", '𝔛': "𝔛", '𝔜': "𝔜", '𝔞': "𝔞", '𝔟': "𝔟", - '𝔠': "𝔠", '𝔡': "𝔡", '𝔢': "𝔢", '𝔣': "𝔣", '𝔤': "𝔤", '𝔥': "𝔥", '𝔦': "𝔦", '𝔧': "𝔧", - '𝔨': "𝔨", '𝔩': "𝔩", '𝔪': "𝔪", '𝔫': "𝔫", '𝔬': "𝔬", '𝔭': "𝔭", '𝔮': "𝔮", '𝔯': "𝔯", - '𝔰': "𝔰", '𝔱': "𝔱", '𝔲': "𝔲", '𝔳': "𝔳", '𝔴': "𝔴", '𝔵': "𝔵", '𝔶': "𝔶", '𝔷': "𝔷", - '𝔸': "𝔸", '𝔹': "𝔹", '𝔻': "𝔻", '𝔼': "𝔼", '𝔽': "𝔽", '𝔾': "𝔾", '𝕀': "𝕀", - '𝕁': "𝕁", '𝕂': "𝕂", '𝕃': "𝕃", '𝕄': "𝕄", '𝕆': "𝕆", '𝕊': "𝕊", '𝕋': "𝕋", - '𝕌': "𝕌", '𝕍': "𝕍", '𝕎': "𝕎", '𝕏': "𝕏", '𝕐': "𝕐", '𝕒': "𝕒", '𝕓': "𝕓", - '𝕔': "𝕔", '𝕕': "𝕕", '𝕖': "𝕖", '𝕗': "𝕗", '𝕘': "𝕘", '𝕙': "𝕙", '𝕚': "𝕚", - '𝕛': "𝕛", '𝕜': "𝕜", '𝕝': "𝕝", '𝕞': "𝕞", '𝕟': "𝕟", '𝕠': "𝕠", '𝕡': "𝕡", - '𝕢': "𝕢", '𝕣': "𝕣", '𝕤': "𝕤", '𝕥': "𝕥", '𝕦': "𝕦", '𝕧': "𝕧", '𝕨': "𝕨", - '𝕩': "𝕩", '𝕪': "𝕪", '𝕫': "𝕫", -} -DECMAP = {v: k for k, v in ENCMAP.items()} - - -class HtmlEntityDecodeError(ValueError): - pass - - -def htmlentity_encode(text, errors="strict"): - s = "" - for 
c in text: - try: - s += ENCMAP[c] - except KeyError: - i = ord(c) - s += "&" + hex(i)[2:].zfill(0) + ";" if i > 0xff else c - return s, len(text) - - -def htmlentity_decode(text, errors="strict"): - s = "" - i = 0 - while i < len(text): - m = re.match(r"&(?:(?:[A-Za-z][A-Za-z0-9]{1,6}){1,4}|[0-9]{4});", text[i:i+30]) - if m: - entity = m.group() - c = unichr(int(entity[1:5], 16)) if entity[1:5].isdigit() and len(entity) == 6 else \ - " " if entity == " " else None - if c: - s += c - else: - try: - s += DECMAP[entity] - except KeyError: - s += handle_error("html-entity", errors, HtmlEntityDecodeError, decode=True)(text[i], i) - i += len(entity) - else: - s += text[i] - i += 1 - return s, len(text) - - -add("html", htmlentity_encode, htmlentity_decode, r"^html(?:[-_]?entit(?:y|ies))?$", - extra_exceptions=["HtmlEntityDecodeError"]) - +# -*- coding: UTF-8 -*- +"""HTML entity Codec - html entity content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(html_entities|html-entity)': {'': "<This is a test>"}, + 'enc(html)': {'\u1234': "&1234;"}, + 'dec(html|html_entity)': {'&DoesNotExist;': None}, + 'dec(html_entities|html-entity)': { + '<This is a test>': "", + '<This is a test>': "", + }, +} + + +# source: https://dev.w3.org/html5/html-author/charref +ENCMAP = { + '\t': " ", '\n': " ", '!': "!", '"': """, '#': "#", '$': "$", '%': "%", + '&': "&", '\'': "'", '(': "(", ')': ")", '*': "*", '+': "+", ',': ",", + '.': ".", '/': "/", ':': ":", ';': ";", '<': "<", '=': "=", '>': ">", + '?': "?", '@': "@", '[': "[", '\\': "\", ']': "]", '^': "^", '_': "_", + '`': "`", '{': "{", '|': "|", '}': "}", '¡': "¡", '¢': "¢", + '£': "£", '¤': "¤", '¥': "¥", '¦': "¦", '§': "§", '¨': "¨", '©': "©", + 'ª': "ª", '«': "«", '¬': "¬", '­': "­", '®': "®", '¯': "¯", '°': "°", + '±': "±", '²': 
"²", '³': "³", '´': "´", 'µ': "µ", '¶': "¶", '·': "·", + '¸': "¸", '¹': "¹", 'º': "º", '»': "»", '¼': "¼", '½': "½", '¾': "¾", + '¿': "¿", 'À': "À", 'Á': "Á", 'Â': "Â", 'Ã': "Ã", 'Ä': "Ä", 'Å': "Å", + 'Æ': "Æ", 'Ç': "Ç", 'È': "È", 'É': "É", 'Ê': "Ê", 'Ë': "Ë", 'Ì': "Ì", + 'Í': "Í", 'Î': "Î", 'Ï': "Ï", 'Ð': "Ð", 'Ñ': "Ñ", 'Ò': "Ò", 'Ó': "Ó", + 'Ô': "Ô", 'Õ': "Õ", 'Ö': "Ö", '×': "×", 'Ø': "Ø", 'Ù': "Ù", 'Ú': "Ú", + 'Û': "Û", 'Ü': "Ü", 'Ý': "Ý", 'Þ': "Þ", 'ß': "ß", 'à': "à", 'á': "á", + 'â': "â", 'ã': "ã", 'ä': "ä", 'å': "å", 'æ': "æ", 'ç': "ç", 'è': "è", + 'é': "é", 'ê': "ê", 'ë': "ë", 'ì': "ì", 'í': "í", 'î': "î", 'ï': "ï", + 'ð': "ð", 'ñ': "ñ", 'ò': "ò", 'ó': "ó", 'ô': "ô", 'õ': "õ", 'ö': "ö", + '÷': "÷", 'ø': "ø", 'ù': "ù", 'ú': "ú", 'û': "û", 'ü': "ü", 'ý': "ý", + 'þ': "þ", 'ÿ': "ÿ", 'Ā': "Ā", 'ā': "ā", 'Ă': "Ă", 'ă': "ă", 'Ą': "Ą", + 'ą': "ą", 'Ć': "Ć", 'ć': "ć", 'Ĉ': "Ĉ", 'ĉ': "ĉ", 'Ċ': "Ċ", 'ċ': "ċ", + 'Č': "Č", 'č': "č", 'Ď': "Ď", 'ď': "ď", 'Đ': "Đ", 'đ': "đ", + 'Ē': "Ē", 'ē': "ē", 'Ė': "Ė", 'ė': "ė", 'Ę': "Ę", 'ę': "ę", 'Ě': "Ě", + 'ě': "ě", 'Ĝ': "Ĝ", 'ĝ': "ĝ", 'Ğ': "Ğ", 'ğ': "ğ", 'Ġ': "Ġ", 'ġ': "ġ", + 'Ģ': "Ģ", 'Ĥ': "Ĥ", 'ĥ': "ĥ", 'Ħ': "Ħ", 'ħ': "ħ", 'Ĩ': "Ĩ", + 'ĩ': "ĩ", 'Ī': "Ī", 'ī': "ī", 'Į': "Į", 'į': "į", 'İ': "İ", 'ı': "ı", + 'IJ': "IJ", 'ij': "ij", 'Ĵ': "Ĵ", 'ĵ': "ĵ", 'Ķ': "Ķ", 'ķ': "ķ", 'ĸ': "ĸ", + 'Ĺ': "Ĺ", 'ĺ': "ĺ", 'Ļ': "Ļ", 'ļ': "ļ", 'Ľ': "Ľ", 'ľ': "ľ", + 'Ŀ': "Ŀ", 'ŀ': "ŀ", 'Ł': "Ł", 'ł': "ł", 'Ń': "Ń", 'ń': "ń", + 'Ņ': "Ņ", 'ņ': "ņ", 'Ň': "Ň", 'ň': "ň", 'ʼn': "ʼn", 'Ŋ': "Ŋ", 'ŋ': "ŋ", + 'Ō': "Ō", 'ō': "ō", 'Ő': "Ő", 'ő': "ő", 'Œ': "Œ", 'œ': "œ", 'Ŕ': "Ŕ", + 'ŕ': "ŕ", 'Ŗ': "Ŗ", 'ŗ': "ŗ", 'Ř': "Ř", 'ř': "ř", 'Ś': "Ś", + 'ś': "ś", 'Ŝ': "Ŝ", 'ŝ': "ŝ", 'Ş': "Ş", 'ş': "ş", 'Š': "Š", + 'š': "š", 'Ţ': "Ţ", 'ţ': "ţ", 'Ť': "Ť", 'ť': "ť", 'Ŧ': "Ŧ", + 'ŧ': "ŧ", 'Ũ': "Ũ", 'ũ': "ũ", 'Ū': "Ū", 'ū': "ū", 'Ŭ': "Ŭ", + 'ŭ': "ŭ", 'Ů': "Ů", 'ů': "ů", 'Ű': "Ű", 'ű': "ű", 'Ų': "Ų", 'ų': "ų", + 'Ŵ': "Ŵ", 'ŵ': "ŵ", 'Ŷ': "Ŷ", 'ŷ': "ŷ", 'Ÿ': "Ÿ", 'Ź': "Ź", 'ź': "ź", + 
'Ż': "Ż", 'ż': "ż", 'Ž': "Ž", 'ž': "ž", 'ƒ': "ƒ", 'Ƶ': "Ƶ", 'ǵ': "ǵ", + 'ȷ': "ȷ", 'ˆ': "ˆ", 'ˇ': "ˇ", '˘': "˘", '˙': "˙", '˚': "˚", '˛': "˛", + '˜': "˜", '˝': "˝", '̑': "̑", '̲': "_", 'Α': "Α", 'Β': "Β", + 'Γ': "Γ", 'Δ': "Δ", 'Ε': "Ε", 'Ζ': "Ζ", 'Η': "Η", 'Θ': "Θ", 'Ι': "Ι", + 'Κ': "Κ", 'Λ': "Λ", 'Μ': "Μ", 'Ν': "Ν", 'Ξ': "Ξ", 'Ο': "Ο", 'Π': "Π", + 'Ρ': "Ρ", 'Σ': "Σ", 'Τ': "Τ", 'Υ': "Υ", 'Φ': "Φ", 'Χ': "Χ", 'Ψ': "Ψ", + 'Ω': "Ω", 'α': "α", 'β': "β", 'γ': "γ", 'δ': "δ", 'ε': "ϵ", 'ζ': "ζ", + 'η': "η", 'θ': "θ", 'ι': "ι", 'κ': "κ", 'λ': "λ", 'μ': "μ", 'ν': "ν", + 'ξ': "ξ", 'ο': "ο", 'π': "π", 'ρ': "ρ", 'ς': "ς", 'σ': "σ", 'τ': "τ", + 'υ': "υ", 'φ': "φ", 'χ': "χ", 'ψ': "ψ", 'ω': "ω", 'ϑ': "ϑ", 'ϒ': "ϒ", + 'ϕ': "ϕ", 'ϖ': "ϖ", 'Ϝ': "Ϝ", 'ϝ': "ϝ", 'ϰ': "ϰ", 'ϱ': "ϱ", + 'ϵ': "ε", '϶': "϶", 'Ё': "Ё", 'Ђ': "Ђ", 'Ѓ': "Ѓ", 'Є': "Є", 'Ѕ': "Ѕ", + 'І': "І", 'Ї': "Ї", 'Ј': "Ј", 'Љ': "Љ", 'Њ': "Њ", 'Ћ': "Ћ", 'Ќ': "Ќ", + 'Ў': "Ў", 'Џ': "Џ", 'А': "А", 'Б': "Б", 'В': "В", 'Г': "Г", 'Д': "Д", 'Е': "Е", + 'Ж': "Ж", 'З': "З", 'И': "И", 'Й': "Й", 'К': "К", 'Л': "Л", 'М': "М", 'Н': "Н", + 'О': "О", 'П': "П", 'Р': "Р", 'С': "С", 'Т': "Т", 'У': "У", 'Ф': "Ф", 'Х': "Х", + 'Ц': "Ц", 'Ч': "Ч", 'Ш': "Ш", 'Щ': "Щ", 'Ъ': "Ъ", 'Ы': "Ы", 'Ь': "Ь", + 'Э': "Э", 'Ю': "Ю", 'Я': "Я", 'а': "а", 'б': "б", 'в': "в", 'г': "г", 'д': "д", + 'е': "е", 'ж': "ж", 'з': "з", 'и': "и", 'й': "й", 'к': "к", 'л': "л", 'м': "м", + 'н': "н", 'о': "о", 'п': "п", 'р': "р", 'с': "с", 'т': "т", 'у': "у", 'ф': "ф", + 'х': "х", 'ц': "ц", 'ч': "ч", 'ш': "ш", 'щ': "щ", 'ъ': "ъ", 'ы': "ы", + 'ь': "ь", 'э': "э", 'ю': "ю", 'я': "я", 'ё': "ё", 'ђ': "ђ", 'ѓ': "ѓ", + 'є': "є", 'ѕ': "ѕ", 'і': "і", 'ї': "ї", 'ј': "ј", 'љ': "љ", 'њ': "њ", + 'ћ': "ћ", 'ќ': "ќ", 'ў': "ў", 'џ': "џ", '\u2002': " ", '\u2003': " ", + '\u2004': " ", '\u2005': " ", '\u2007': " ", '\u2008': " ", '\u2009': " ", + '\u200a': " ", '​\u200b': "​", '\u200c': "‌", '\u200d': "‍", '\u200e': "‎", + '\u200f': "‏", '‐': "‐", '–': "–", '—': "—", + '―': "―", '‖': "‖", '‘': "‘", 
'’': "’", '‚': "‚", '“': "“", '”': "”", + '„': "„", '†': "†", '‡': "‡", '•': "•", '‥': "‥", '…': "…", '‰': "‰", + '‱': "‱", '′': "′", '″': "″", '‴': "‴", '‵': "‵", '‹': "‹", + '›': "›", '‾': "‾", '⁁': "⁁", '⁃': "⁃", '⁄': "⁄", '⁏': "⁏", '⁗': "⁗", + '\u205f': " ", '⁠': "⁠", '⁡': "⁡", '⁢': "⁢", '⁣': "⁣", + '€': "€", '⃛': "⃛", '⃜': "⃜", 'ℂ': "ℂ", '℅': "℅", 'ℊ': "ℊ", 'ℋ': "ℋ", + 'ℌ': "ℌ", 'ℍ': "ℍ", 'ℎ': "ℎ", 'ℏ': "ℏ", 'ℐ': "ℐ", 'ℑ': "ℑ", + 'ℒ': "ℒ", 'ℓ': "ℓ", 'ℕ': "ℕ", '№': "№", '℗': "℗", '℘': "℘", 'ℙ': "ℙ", + 'ℚ': "ℚ", 'ℛ': "ℛ", 'ℜ': "ℜ", 'ℝ': "ℝ", '℞': "℞", '™': "™", 'ℤ': "ℤ", + 'Ω': "Ω", '℧': "℧", 'ℨ': "ℨ", '℩': "℩", 'Å': "Å", 'ℬ': "ℬ", 'ℭ': "ℭ", + 'ℯ': "ℯ", 'ℰ': "ℰ", 'ℱ': "ℱ", 'ℳ': "ℳ", 'ℴ': "ℴ", 'ℵ': "ℵ", 'ℶ': "ℶ", + 'ℷ': "ℷ", 'ℸ': "ℸ", 'ⅅ': "ⅅ", 'ⅆ': "ⅆ", 'ⅇ': "ⅇ", + 'ⅈ': "ⅈ", '⅓': "⅓", '⅔': "⅔", '⅕': "⅕", '⅖': "⅖", '⅗': "⅗", + '⅘': "⅘", '⅙': "⅙", '⅚': "⅚", '⅛': "⅛", '⅜': "⅜", '⅝': "⅝", + '⅞': "⅞", '←': "←", '↑': "↑", '→': "→", '↓': "↓", '↔': "↔", '↕': "↕", + '↖': "↖", '↗': "↗", '↘': "↘", '↙': "↙", '↚': "↚", '↛': "↛", '↝': "↝", + '↞': "↞", '↟': "↟", '↠': "↠", '↡': "↡", '↢': "↢", '↣': "↣", + '↤': "↤", '↥': "↥", '↦': "↦", '↧': "↧", '↩': "↩", '↪': "↪", + '↫': "↫", '↬': "↬", '↭': "↭", '↮': "↮", '↰': "↰", '↱': "↱", '↲': "↲", + '↳': "↳", '↵': "↵", '↶': "↶", '↷': "↷", '↺': "↺", '↻': "↻", '↼': "↼", + '↽': "↽", '↾': "↾", '↿': "↿", '⇀': "⇀", '⇁': "⇁", '⇂': "⇂", '⇃': "⇃", + '⇄': "⇄", '⇅': "⇅", '⇆': "⇆", '⇇': "⇇", '⇈': "⇈", '⇉': "⇉", '⇊': "⇊", + '⇋': "⇋", '⇌': "⇌", '⇍': "⇍", '⇎': "⇎", '⇏': "⇏", '⇐': "⇐", '⇑': "⇑", + '⇒': "⇒", '⇓': "⇓", '⇔': "⇔", '⇕': "⇕", '⇖': "⇖", '⇗': "⇗", '⇘': "⇘", + '⇙': "⇙", '⇚': "⇚", '⇛': "⇛", '⇝': "⇝", '⇤': "⇤", '⇥': "⇥", '⇵': "⇵", + '⇽': "⇽", '⇾': "⇾", '⇿': "⇿", '∀': "∀", '∁': "∁", '∂': "∂", '∃': "∃", + '∄': "∄", '∅': "∅", '∇': "∇", '∈': "∈", '∉': "∉", '∋': "∋", '∌': "∌", + '∏': "∏", '∐': "∐", '∑': "∑", '−': "−", '∓': "∓", '∔': "∔", '∖': "∖", + '∗': "∗", '∘': "∘", '√': "√", '∝': "∝", '∞': "∞", '∟': "∟", '∠': "∠", + '∡': "∡", '∢': "∢", '∣': "∣", '∤': 
"∤", '∥': "∥", '∦': "∦", '∧': "∧", + '∨': "∨", '∩': "∩", '∪': "∪", '∫': "∫", '∬': "∬", '∭': "∭", '∮': "∮", + '∯': "∯", '∰': "∰", '∱': "∱", '∲': "∲", '∳': "∳", '∴': "∴", + '∵': "∵", '∶': "∶", '∷': "∷", '∸': "∸", '∺': "∺", '∻': "∻", '∼': "∼", + '∽': "∽", '∾': "∾", '∿': "∿", '≀': "≀", '≁': "≁", '≂': "≂", '≃': "≃", + '≄': "≄", '≅': "≅", '≆': "≆", '≇': "≇", '≈': "≈", '≉': "≉", '≊': "≊", + '≋': "≋", '≌': "≌", '≍': "≍", '≎': "≎", '≏': "≏", '≐': "≐", '≑': "≑", + '≒': "≒", '≓': "≓", '≔': "≔", '≕': "≕", '≖': "≖", '≗': "≗", '≙': "≙", + '≚': "≚", '≜': "≜", '≟': "≟", '≠': "≠", '≡': "≡", '≢': "≢", '≤': "≤", + '≥': "≥", '≦': "≦", '≧': "≧", '≨': "≨", '≩': "≩", '≪': "≪", '≫': "≫", '≬': "≬", + '≭': "≭", '≮': "≮", '≯': "≯", '≰': "≰", '≱': "≱", '≲': "≲", '≳': "≳", + '≴': "≴", '≵': "≵", '≶': "≶", '≷': "≷", '≸': "≸", '≹': "≹", '≺': "≺", '≻': "≻", + '≼': "≼", '≽': "≽", '≾': "≾", '≿': "≿", '⊀': "⊀", '⊁': "⊁", '⊂': "⊂", + '⊃': "⊃", '⊄': "⊄", '⊅': "⊅", '⊆': "⊆", '⊇': "⊇", '⊈': "⊈", '⊉': "⊉", + '⊊': "⊊", '⊋': "⊋", '⊍': "⊍", '⊎': "⊎", '⊏': "⊏", '⊐': "⊐", '⊑': "⊑", + '⊒': "⊒", '⊓': "⊓", '⊔': "⊔", '⊕': "⊕", '⊖': "⊖", '⊗': "⊗", '⊘': "⊘", + '⊙': "⊙", '⊚': "⊚", '⊛': "⊛", '⊝': "⊝", '⊞': "⊞", '⊟': "⊟", '⊠': "⊠", + '⊡': "⊡", '⊢': "⊢", '⊣': "⊣", '⊤': "⊤", '⊥': "⊥", '⊧': "⊧", '⊨': "⊨", + '⊩': "⊩", '⊪': "⊪", '⊫': "⊫", '⊬': "⊬", '⊭': "⊭", '⊮': "⊮", + '⊯': "⊯", '⊰': "⊰", '⊲': "⊲", '⊳': "⊳", '⊴': "⊴", '⊵': "⊵", '⊶': "⊶", + '⊷': "⊷", '⊸': "⊸", '⊹': "⊹", '⊺': "⊺", '⊻': "⊻", '⊽': "⊽", + '⊾': "⊾", '⊿': "⊿", '⋀': "⋀", '⋁': "⋁", '⋂': "⋂", '⋃': "⋃", '⋄': "⋄", + '⋅': "⋅", '⋆': "⋆", '⋇': "⋇", '⋈': "⋈", '⋉': "⋉", '⋊': "⋊", + '⋋': "⋋", '⋌': "⋌", '⋍': "⋍", '⋎': "⋎", '⋏': "⋏", '⋐': "⋐", '⋑': "⋑", + '⋒': "⋒", '⋓': "⋓", '⋔': "⋔", '⋕': "⋕", '⋖': "⋖", '⋗': "⋗", '⋘': "⋘", '⋙': "⋙", + '⋚': "⋚", '⋛': "⋛", '⋞': "⋞", '⋟': "⋟", '⋠': "⋠", '⋡': "⋡", '⋢': "⋢", + '⋣': "⋣", '⋦': "⋦", '⋧': "⋧", '⋨': "⋨", '⋩': "⋩", '⋪': "⋪", '⋫': "⋫", + '⋬': "⋬", '⋭': "⋭", '⋮': "⋮", '⋯': "⋯", '⋰': "⋰", '⋱': "⋱", '⋲': "⋲", + '⋳': "⋳", '⋴': "⋴", '⋵': 
"⋵", '⋶': "⋶", '⋷': "⋷", '⋹': "⋹", + '⋺': "⋺", '⋻': "⋻", '⋼': "⋼", '⋽': "⋽", '⋾': "⋾", '⌅': "⌅", '⌆': "⌆", + '⌈': "⌈", '⌉': "⌉", '⌊': "⌊", '⌋': "⌋", '⌌': "⌌", '⌍': "⌍", + '⌎': "⌎", '⌏': "⌏", '⌐': "⌐", '⌒': "⌒", '⌓': "⌓", '⌕': "⌕", + '⌖': "⌖", '⌜': "⌜", '⌝': "⌝", '⌞': "⌞", '⌟': "⌟", '⌢': "⌢", + '⌣': "⌣", '⌭': "⌭", '⌮': "⌮", '⌶': "⌶", '⌽': "⌽", '⌿': "⌿", + '⍼': "⍼", '⎰': "⎰", '⎱': "⎱", '⎴': "⎴", '⎵': "⎵", '⎶': "⎶", + '⏜': "⏜", '⏝': "⏝", '⏞': "⏞", '⏟': "⏟", '⏢': "⏢", + '⏧': "⏧", '␣': "␣", 'Ⓢ': "Ⓢ", '─': "─", '│': "│", '┌': "┌", '┐': "┐", + '└': "└", '┘': "┘", '├': "├", '┤': "┤", '┬': "┬", '┴': "┴", '┼': "┼", + '═': "═", '║': "║", '╒': "╒", '╓': "╓", '╔': "╔", '╕': "╕", '╖': "╖", + '╗': "╗", '╘': "╘", '╙': "╙", '╚': "╚", '╛': "╛", '╜': "╜", '╝': "╝", + '╞': "╞", '╟': "╟", '╠': "╠", '╡': "╡", '╢': "╢", '╣': "╣", '╤': "╤", + '╥': "╥", '╦': "╦", '╧': "╧", '╨': "╨", '╩': "╩", '╪': "╪", '╫': "╫", + '╬': "╬", '▀': "▀", '▄': "▄", '█': "█", '░': "░", '▒': "▒", '▓': "▓", + '□': "□", '▪': "▪", '▫': "▫", '▭': "▭", '▮': "▮", '▱': "▱", + '△': "△", '▴': "▴", '▵': "▵", '▸': "▸", '▹': "▹", '▽': "▽", '▾': "▾", + '▿': "▿", '◂': "◂", '◃': "◃", '◊': "◊", '○': "○", '◬': "◬", '◯': "◯", + '◸': "◸", '◹': "◹", '◺': "◺", '◻': "◻", '◼': "◼", + '★': "★", '☆': "☆", '☎': "☎", '♀': "♀", '♂': "♂", '♠': "♠", '♣': "♣", + '♥': "♥", '♦': "♦", '♪': "♪", '♭': "♭", '♮': "♮", '♯': "♯", '✓': "✓", + '✗': "✗", '✠': "✠", '✶': "✶", '❘': "❘", '❲': "❲", '❳': "❳", + '⟦': "⟦", '⟧': "⟧", '⟨': "⟨", '⟩': "⟩", '⟪': "⟪", '⟫': "⟫", '⟬': "⟬", + '⟭': "⟭", '⟵': "⟵", '⟶': "⟶", '⟷': "⟷", '⟸': "⟸", '⟹': "⟹", '⟺': "⟺", + '⟼': "⟼", '⟿': "⟿", '⤂': "⤂", '⤃': "⤃", '⤄': "⤄", '⤅': "⤅", '⤌': "⤌", + '⤍': "⤍", '⤎': "⤎", '⤏': "⤏", '⤐': "⤐", '⤑': "⤑", '⤒': "⤒", + '⤓': "⤓", '⤖': "⤖", '⤙': "⤙", '⤚': "⤚", '⤛': "⤛", '⤜': "⤜", + '⤝': "⤝", '⤞': "⤞", '⤟': "⤟", '⤠': "⤠", '⤣': "⤣", '⤤': "⤤", + '⤥': "⤥", '⤦': "⤦", '⤧': "⤧", '⤨': "⤨", '⤩': "⤩", '⤪': "⤪", + '⤳': "⤳", '⤵': "⤵", '⤶': "⤶", '⤷': "⤷", '⤸': "⤸", '⤹': "⤹", + '⤼': "⤼", '⤽': "⤽", '⥅': "⥅", '⥈': 
"⥈", '⥉': "⥉", '⥊': "⥊", + '⥋': "⥋", '⥎': "⥎", '⥏': "⥏", '⥐': "⥐", + '⥑': "⥑", '⥒': "⥒", '⥓': "⥓", '⥔': "⥔", + '⥕': "⥕", '⥖': "⥖", '⥗': "⥗", '⥘': "⥘", + '⥙': "⥙", '⥚': "⥚", '⥛': "⥛", '⥜': "⥜", + '⥝': "⥝", '⥞': "⥞", '⥟': "⥟", '⥠': "⥠", + '⥡': "⥡", '⥢': "⥢", '⥣': "⥣", '⥤': "⥤", '⥥': "⥥", '⥦': "⥦", + '⥧': "⥧", '⥨': "⥨", '⥩': "⥩", '⥪': "⥪", '⥫': "⥫", '⥬': "⥬", + '⥭': "⥭", '⥮': "⥮", '⥯': "⥯", '⥰': "⥰", '⥱': "⥱", '⥲': "⥲", + '⥳': "⥳", '⥴': "⥴", '⥵': "⥵", '⥶': "⥶", '⥸': "⥸", '⥹': "⥹", + '⥻': "⥻", '⥼': "⥼", '⥽': "⥽", '⥾': "⥾", '⥿': "⥿", '⦅': "⦅", + '⦆': "⦆", '⦋': "⦋", '⦌': "⦌", '⦍': "⦍", '⦎': "⦎", '⦏': "⦏", + '⦐': "⦐", '⦑': "⦑", '⦒': "⦒", '⦓': "⦓", '⦔': "⦔", '⦕': "⦕", + '⦖': "⦖", '⦚': "⦚", '⦜': "⦜", '⦝': "⦝", '⦤': "⦤", '⦥': "⦥", + '⦦': "⦦", '⦧': "⦧", '⦨': "⦨", '⦩': "⦩", '⦪': "⦪", '⦫': "⦫", + '⦬': "⦬", '⦭': "⦭", '⦮': "⦮", '⦯': "⦯", '⦰': "⦰", '⦱': "⦱", + '⦲': "⦲", '⦳': "⦳", '⦴': "⦴", '⦵': "⦵", '⦶': "⦶", '⦷': "⦷", + '⦹': "⦹", '⦻': "⦻", '⦼': "⦼", '⦾': "⦾", '⦿': "⦿", '⧀': "⧀", '⧁': "⧁", + '⧂': "⧂", '⧃': "⧃", '⧄': "⧄", '⧅': "⧅", '⧉': "⧉", '⧍': "⧍", '⧎': "⧎", + '⧏': "⧏", '⧐': "⧐", '⧚': "∽̱", '⧜': "⧜", '⧝': "⧝", + '⧞': "⧞", '⧣': "⧣", '⧤': "⧤", '⧥': "⧥", '⧫': "⧫", '⧴': "⧴", + '⧶': "⧶", '⨀': "⨀", '⨁': "⨁", '⨂': "⨂", '⨄': "⨄", '⨆': "⨆", '⨌': "⨌", + '⨍': "⨍", '⨐': "⨐", '⨑': "⨑", '⨒': "⨒", '⨓': "⨓", '⨔': "⨔", + '⨕': "⨕", '⨖': "⨖", '⨗': "⨗", '⨢': "⨢", '⨣': "⨣", '⨤': "⨤", + '⨥': "⨥", '⨦': "⨦", '⨧': "⨧", '⨩': "⨩", '⨪': "⨪", '⨭': "⨭", + '⨮': "⨮", '⨯': "⨯", '⨰': "⨰", '⨱': "⨱", '⨳': "⨳", '⨴': "⨴", + '⨵': "⨵", '⨶': "⨶", '⨷': "⨷", '⨸': "⨸", '⨹': "⨹", '⨺': "⨺", + '⨻': "⨻", '⨼': "⨼", '⨿': "⨿", '⩀': "⩀", '⩂': "⩂", '⩃': "⩃", '⩄': "⩄", + '⩅': "⩅", '⩆': "⩆", '⩇': "⩇", '⩈': "⩈", '⩉': "⩉", '⩊': "⩊", + '⩋': "⩋", '⩌': "⩌", '⩍': "⩍", '⩐': "⩐", '⩓': "⩓", '⩔': "⩔", '⩕': "⩕", + '⩖': "⩖", '⩗': "⩗", '⩘': "⩘", '⩚': "⩚", '⩛': "⩛", '⩜': "⩜", '⩝': "⩝", + '⩟': "⩟", '⩦': "⩦", '⩪': "⩪", '⩭': "⩭", '⩮': "⩮", '⩯': "⩯", '⩰': "⩰", + '⩱': "⩱", '⩲': "⩲", '⩳': "⩳", '⩴': "⩴", '⩵': "⩵", '⩷': "⩷", '⩸': "⩸", + '⩹': "⩹", '⩺': 
"⩺", '⩻': "⩻", '⩼': "⩼", '⩽': "⩽", '⩾': "⩾", '⩿': "⩿", + '⪀': "⪀", '⪁': "⪁", '⪂': "⪂", '⪃': "⪃", '⪄': "⪄", '⪅': "⪅", + '⪆': "⪆", '⪇': "⪇", '⪈': "⪈", '⪉': "⪉", '⪊': "⪊", '⪋': "⪋", '⪌': "⪌", '⪍': "⪍", + '⪎': "⪎", '⪏': "⪏", '⪐': "⪐", '⪑': "⪑", '⪒': "⪒", '⪓': "⪓", '⪔': "⪔", + '⪕': "⪕", '⪖': "⪖", '⪗': "⪗", '⪘': "⪘", '⪙': "⪙", '⪚': "⪚", '⪝': "⪝", + '⪞': "⪞", '⪟': "⪟", '⪠': "⪠", '⪡': "⪡", '⪢': "⪢", '⪤': "⪤", + '⪥': "⪥", '⪦': "⪦", '⪧': "⪧", '⪨': "⪨", '⪩': "⪩", '⪪': "⪪", '⪫': "⪫", + '⪬': "⪬", '⪭': "⪭", '⪮': "⪮", '⪯': "⪯", '⪰': "⪰", '⪳': "⪳", '⪴': "⪴", + '⪵': "⪵", '⪶': "⪶", '⪷': "⪷", '⪸': "⪸", '⪹': "⪹", '⪺': "⪺", '⪻': "⪻", + '⪼': "⪼", '⪽': "⪽", '⪾': "⪾", '⪿': "⪿", '⫀': "⫀", '⫁': "⫁", + '⫂': "⫂", '⫃': "⫃", '⫄': "⫄", '⫅': "⫅", '⫆': "⫆", '⫇': "⫇", + '⫈': "⫈", '⫋': "⫋", '⫌': "⫌", '⫏': "⫏", '⫐': "⫐", '⫑': "⫑", '⫒': "⫒", + '⫓': "⫓", '⫔': "⫔", '⫕': "⫕", '⫖': "⫖", '⫗': "⫗", '⫘': "⫘", + '⫙': "⫙", '⫚': "⫚", '⫛': "⫛", '⫤': "⫤", '⫦': "⫦", '⫧': "⫧", '⫨': "⫨", + '⫩': "⫩", '⫫': "⫫", '⫬': "⫬", '⫭': "⫭", '⫮': "⫮", '⫯': "⫯", '⫰': "⫰", + '⫱': "⫱", '⫲': "⫲", '⫳': "⫳", '⫽': "⫽", 'ff': "ff", 'fi': "fi", 'fl': "fl", + 'ffi': "ffi", 'ffl': "ffl", '𝒜': "𝒜", '𝒞': "𝒞", '𝒟': "𝒟", '𝒢': "𝒢", '𝒥': "𝒥", + '𝒦': "𝒦", '𝒩': "𝒩", '𝒪': "𝒪", '𝒫': "𝒫", '𝒬': "𝒬", '𝒮': "𝒮", '𝒯': "𝒯", + '𝒰': "𝒰", '𝒱': "𝒱", '𝒲': "𝒲", '𝒳': "𝒳", '𝒴': "𝒴", '𝒵': "𝒵", '𝒶': "𝒶", + '𝒷': "𝒷", '𝒸': "𝒸", '𝒹': "𝒹", '𝒻': "𝒻", '𝒽': "𝒽", '𝒾': "𝒾", '𝒿': "𝒿", + '𝓀': "𝓀", '𝓁': "𝓁", '𝓂': "𝓂", '𝓃': "𝓃", '𝓅': "𝓅", '𝓆': "𝓆", '𝓇': "𝓇", + '𝓈': "𝓈", '𝓉': "𝓉", '𝓊': "𝓊", '𝓋': "𝓋", '𝓌': "𝓌", '𝓍': "𝓍", '𝓎': "𝓎", + '𝓏': "𝓏", '𝔄': "𝔄", '𝔅': "𝔅", '𝔇': "𝔇", '𝔈': "𝔈", '𝔉': "𝔉", '𝔊': "𝔊", '𝔍': "𝔍", + '𝔎': "𝔎", '𝔏': "𝔏", '𝔐': "𝔐", '𝔑': "𝔑", '𝔒': "𝔒", '𝔓': "𝔓", '𝔔': "𝔔", '𝔖': "𝔖", + '𝔗': "𝔗", '𝔘': "𝔘", '𝔙': "𝔙", '𝔚': "𝔚", '𝔛': "𝔛", '𝔜': "𝔜", '𝔞': "𝔞", '𝔟': "𝔟", + '𝔠': "𝔠", '𝔡': "𝔡", '𝔢': "𝔢", '𝔣': "𝔣", '𝔤': "𝔤", '𝔥': "𝔥", '𝔦': "𝔦", '𝔧': "𝔧", + '𝔨': "𝔨", '𝔩': "𝔩", '𝔪': "𝔪", '𝔫': "𝔫", '𝔬': "𝔬", '𝔭': "𝔭", '𝔮': "𝔮", '𝔯': "𝔯", + '𝔰': "𝔰", '𝔱': "𝔱", '𝔲': "𝔲", '𝔳': "𝔳", '𝔴': 
"𝔴", '𝔵': "𝔵", '𝔶': "𝔶", '𝔷': "𝔷", + '𝔸': "𝔸", '𝔹': "𝔹", '𝔻': "𝔻", '𝔼': "𝔼", '𝔽': "𝔽", '𝔾': "𝔾", '𝕀': "𝕀", + '𝕁': "𝕁", '𝕂': "𝕂", '𝕃': "𝕃", '𝕄': "𝕄", '𝕆': "𝕆", '𝕊': "𝕊", '𝕋': "𝕋", + '𝕌': "𝕌", '𝕍': "𝕍", '𝕎': "𝕎", '𝕏': "𝕏", '𝕐': "𝕐", '𝕒': "𝕒", '𝕓': "𝕓", + '𝕔': "𝕔", '𝕕': "𝕕", '𝕖': "𝕖", '𝕗': "𝕗", '𝕘': "𝕘", '𝕙': "𝕙", '𝕚': "𝕚", + '𝕛': "𝕛", '𝕜': "𝕜", '𝕝': "𝕝", '𝕞': "𝕞", '𝕟': "𝕟", '𝕠': "𝕠", '𝕡': "𝕡", + '𝕢': "𝕢", '𝕣': "𝕣", '𝕤': "𝕤", '𝕥': "𝕥", '𝕦': "𝕦", '𝕧': "𝕧", '𝕨': "𝕨", + '𝕩': "𝕩", '𝕪': "𝕪", '𝕫': "𝕫", +} +DECMAP = {v: k for k, v in ENCMAP.items()} + + +class HtmlEntityDecodeError(ValueError): + pass + + +def htmlentity_encode(text, errors="strict"): + s = "" + for c in text: + try: + s += ENCMAP[c] + except KeyError: + i = ord(c) + s += "&" + hex(i)[2:].zfill(0) + ";" if i > 0xff else c + return s, len(text) + + +def htmlentity_decode(text, errors="strict"): + s = "" + i = 0 + while i < len(text): + m = re.match(r"&(?:(?:[A-Za-z][A-Za-z0-9]{1,6}){1,4}|[0-9]{4});", text[i:i+30]) + if m: + entity = m.group() + c = chr(int(entity[1:5], 16)) if entity[1:5].isdigit() and len(entity) == 6 else \ + " " if entity == " " else None + if c: + s += c + else: + try: + s += DECMAP[entity] + except KeyError: + s += handle_error("html-entity", errors, HtmlEntityDecodeError, decode=True)(text[i], i) + i += len(entity) + else: + s += text[i] + i += 1 + return s, len(text) + + +add("html", htmlentity_encode, htmlentity_decode, r"^html(?:[-_]?entit(?:y|ies))?$", + extra_exceptions=["HtmlEntityDecodeError"]) + diff --git a/tests/test_base.py b/tests/test_base.py index 7b3dae0..a37d1a6 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,236 +1,235 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Base codecs tests. 
- -""" -import os -import sys -from unittest import TestCase - -from codext.__common__ import * -from codext.base._base import _generate_charset -from codext.base.baseN import base, main2, main32, main64url - - -class TestCodecsBase(TestCase): - def setUp(self): - global STR - STR = "this is a test" - - def test_new_base_codec(self): - for i in [0, 1, 256]: - self.assertRaises(ValueError, _generate_charset, i) - b10 = lambda *a: "0123456789" - base(b10, "base10") - B10 = "2361031878030638688519054699098996" - self.assertEqual(codecs.encode(STR, "base10"), B10) - self.assertEqual(codecs.encode(b(STR), "base10"), b(B10)) - self.assertEqual(codecs.decode(B10, "base10"), STR) - self.assertEqual(codecs.decode(b(B10), "base10"), b(STR)) - self.assertRaises(ValueError, base, 1, "test") - b11 = "0123456789a" - base(b11, "base11") - B11 = "113342054335735319526632a26972419" - self.assertEqual(codecs.encode(STR, "base11"), B11) - self.assertEqual(codecs.decode(B11, "base11"), STR) - self.assertRaises(ValueError, base, object(), "test") - self.assertIsNone(base({'': "01234"}, r"^base5(test)?$")) - self.assertIsNotNone(codecs.encode(STR, "base5test")) - self.assertRaises(ValueError, base, {'': "01234"}, "base5-test", pow2=True) - self.assertEqual("", codecs.decode("", "base5test")) - - def test_codec_base1(self): - C = "A" - for i in range(3): - self.assertIsNotNone(codecs.encode(i * C, "base1")) - self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") - self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05") - - def test_codec_base2(self): - STR = "test" - B2 = "01110100011001010111001101110100" - self.assertEqual(codecs.encode(STR, "base2"), B2) - self.assertEqual(codecs.encode(b(STR), "base2"), b(B2)) - self.assertEqual(codecs.decode(B2, "base2"), STR) - self.assertEqual(codecs.decode(b(B2), "base2"), b(STR)) - B2 = "10001011100110101000110010001011" - self.assertEqual(codecs.encode(STR, "base2-inv"), B2) - self.assertEqual(codecs.decode(B2, "base2-inv"), STR) - B2 
= "abbbabaaabbaabababbbaabbabbbabaa" - self.assertEqual(codecs.encode(STR, "base2-ab"), B2) - self.assertEqual(codecs.decode(B2, "base2-ab"), STR) - B2 = "CDDDCDCCCDDCCDCDCDDDCCDDCDDDCDCC" - self.assertEqual(codecs.encode(STR, "base2-CD"), B2) - self.assertEqual(codecs.decode(B2, "base2-CD"), STR) - B2 = "34443433344334343444334434443433" - self.assertEqual(codecs.encode(STR, "base2-34"), B2) - self.assertEqual(codecs.decode(B2, "base2-34"), STR) - - def test_codec_base3(self): - STR = "test" - B3 = "23112113223321323322" - self.assertEqual(codecs.encode(STR, "base3"), B3) - self.assertEqual(codecs.encode(b(STR), "base3"), b(B3)) - self.assertEqual(codecs.decode(B3, "base3"), STR) - self.assertEqual(codecs.decode(b(B3), "base3"), b(STR)) - B3 = "21332331221123121122" - self.assertEqual(codecs.encode(STR, "base3-inv"), B3) - self.assertEqual(codecs.decode(B3, "base3-inv"), STR) - B3 = "bcaabaacbbccbacbccbb" - self.assertEqual(codecs.encode(STR, "base3-abc"), B3) - self.assertEqual(codecs.decode(B3, "base3-abc"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base3-ab") - self.assertRaises(LookupError, codecs.encode, "test", "base3-abcd") - - def test_codec_base4(self): - STR = "test" - B4 = "2421232224142421" - self.assertEqual(codecs.encode(STR, "base4"), B4) - self.assertEqual(codecs.encode(b(STR), "base4"), b(B4)) - self.assertEqual(codecs.decode(B4, "base4"), STR) - self.assertEqual(codecs.decode(b(B4), "base4"), b(STR)) - B4 = "3134323331413134" - self.assertEqual(codecs.encode(STR, "base4-inv"), B4) - self.assertEqual(codecs.decode(B4, "base4-inv"), STR) - B4 = "bdbabcbbbdadbdba" - self.assertEqual(codecs.encode(STR, "base4-abcd"), B4) - self.assertEqual(codecs.decode(B4, "base4-abcd"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base4-abc") - self.assertRaises(LookupError, codecs.encode, "test", "base4-abcde") - - def test_codec_base8(self): - STR = "test" - B8 = "dfagcfgddfa=====" - self.assertEqual(codecs.encode(STR, 
"base8"), B8) - self.assertEqual(codecs.encode(b(STR), "base8"), b(B8)) - self.assertEqual(codecs.decode(B8, "base8"), STR) - self.assertEqual(codecs.decode(b(B8), "base8"), b(STR)) - B8 = "echbfcbeech=====" - self.assertEqual(codecs.encode(STR, "base8-inv"), B8) - self.assertEqual(codecs.decode(B8, "base8-inv"), STR) - B8 = "35062563350=====" - self.assertEqual(codecs.encode(STR, "base8-01234567"), B8) - self.assertEqual(codecs.decode(B8, "base8-01234567"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base8-0123456") - self.assertRaises(LookupError, codecs.encode, "test", "base8-012345678") - - def test_codec_base16(self): - B16 = "7468697320697320612074657374" - self.assertEqual(codecs.encode(STR, "base16"), B16) - self.assertEqual(codecs.encode(b(STR), "base16"), b(B16)) - self.assertEqual(codecs.decode(B16, "base16"), STR) - self.assertEqual(codecs.decode(b(B16), "base16"), b(STR)) - B16 += "?" - self.assertRaises(ValueError, codecs.decode, B16, "base16") - self.assertEqual(codecs.decode(B16, "base16", "ignore"), STR) - self.assertEqual(codecs.decode(B16, "base16", "replace"), STR + "\x00") - self.assertRaises(ValueError, codecs.decode, B16, "base16", "BAD") - STR2 = "=:;" - B16_1 = "3d3a3b" - B16_2 = "3D3A3B" - B16_3 = "3D3a3B" # mixed case: should fail - self.assertEqual(codecs.encode(STR2, "hex"), B16_2) - self.assertEqual(codecs.decode(B16_1, "hex"), STR2) - self.assertEqual(codecs.decode(B16_2, "hex"), STR2) - self.assertRaises(ValueError, codecs.decode, B16_3, "hex") - - def test_codec_base32(self): - for b32, enc in zip(["ORUGS4ZANFZSAYJAORSXG5A=", "qtwg1h3ypf31yajyqt1zg7y=", "EHK6ISP0D5PI0O90EHIN6T0=", - "fjn6kwt0e5tk0s90fjkr6x0=", "EHM6JWS0D5SJ0R90EHJQ6X0="], - ["base32", "zbase32", "base32-hex", "geohash", "crockford"]): - self.assertEqual(codecs.encode(STR, enc), b32) - self.assertEqual(codecs.encode(b(STR), enc), b(b32)) - self.assertEqual(codecs.decode(b32, enc), STR) - self.assertEqual(codecs.decode(b(b32), enc), b(STR)) - 
self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc) - self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc, "BAD") - - def test_codec_base36(self): - B36 = "4WMHTK6UZL044O91NKCEB8" - self.assertEqual(codecs.encode(STR, "base36"), B36) - self.assertEqual(codecs.encode(b(STR), "base36"), b(B36)) - self.assertEqual(codecs.decode(B36, "base36"), STR) - self.assertEqual(codecs.decode(b(B36), "base36"), b(STR)) - B36 = "E6WR3UG49VAEEYJBXUMOLI" - self.assertEqual(codecs.encode(STR, "base36-inv"), B36) - self.assertEqual(codecs.decode(B36, "base36-inv"), STR) - self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36-inv") - self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36", "BAD") - self.assertEqual(codecs.decode(B36 + "?", "base36-inv", "ignore"), STR) - - def test_codec_base58(self): - B58 = "jo91waLQA1NNeBmZKUF" - self.assertEqual(codecs.encode(STR, "base58"), B58) - self.assertEqual(codecs.encode(b(STR), "base58"), b(B58)) - self.assertEqual(codecs.decode(B58, "base58"), STR) - self.assertEqual(codecs.decode(b(B58), "base58"), b(STR)) - B58 = "jo9rA2LQwr44eBmZK7E" - self.assertEqual(codecs.encode(STR, "base58-ripple"), B58) - self.assertEqual(codecs.decode(B58, "base58-rp"), STR) - B58 = "JN91Wzkpa1nnDbLyjtf" - self.assertEqual(codecs.encode(STR, "base58-flickr"), B58) - self.assertEqual(codecs.encode(STR, "base58-shorturl"), B58) - self.assertEqual(codecs.decode(B58, "base58-fl"), STR) - self.assertEqual(codecs.encode(STR, "base58-short-url"), B58) - self.assertEqual(codecs.encode(STR, "base58-url"), B58) - - def test_codec_base62(self): - for b62, enc in zip(["CsoB4HQ5gmgMyCenF7E", "M2yLERaFqwqW8MoxPHO"], ["base62", "base62-inv"]): - self.assertEqual(codecs.encode(STR, enc), b62) - self.assertEqual(codecs.encode(b(STR), enc), b(b62)) - self.assertEqual(codecs.decode(b62, enc), STR) - self.assertEqual(codecs.decode(b(b62), enc), b(STR)) - - def test_codec_base64(self): - for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", 
"T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): - self.assertEqual(codecs.encode(STR, enc), b64) - self.assertEqual(codecs.encode(b(STR), enc), b(b64)) - self.assertEqual(codecs.decode(b64, enc), STR) - self.assertEqual(codecs.decode(b(b64), enc), b(STR)) - - def test_codec_base91(self): - for b91, enc in zip([",X,<:WRT%yxth90oZB", ",N,<:MHJ%onjXzqeP1", "Jx&[jv4S3Wg>,71@Jk", "yJy^\\IDFsdc?Tof:L#"], - ["base91", "base91-inv", "base91-alt", "base91-alt-inv"]): - self.assertEqual(codecs.encode(STR, enc), b91) - self.assertEqual(codecs.encode(b(STR), enc), b(b91)) - self.assertEqual(codecs.decode(b91, enc), STR) - self.assertEqual(codecs.decode(b(b91), enc), b(STR)) - self.assertIsNotNone(codecs.encode("\x00\x00", "base91")) - self.assertIsNotNone(codecs.decode("abc", "base91")) - self.assertIsNotNone(codecs.decode("AD", "base91")) - self.assertRaises(ValueError, codecs.decode, "\xff", "base91") - self.assertRaises(ValueError, codecs.decode, "a\xff", "base91") - self.assertIsNotNone(codecs.encode("\x00\x00", "base91-alt")) - - def test_codec_base100(self): - if PY3: - B100 = "\U0001f46b\U0001f45f\U0001f460\U0001f46a\U0001f417\U0001f460\U0001f46a\U0001f417\U0001f458" \ - "\U0001f417\U0001f46b\U0001f45c\U0001f46a\U0001f46b" - self.assertEqual(codecs.encode(STR, "base100"), B100) - self.assertEqual(codecs.encode(b(STR), "base100"), b(B100)) - self.assertEqual(codecs.decode(B100, "base100"), STR) - self.assertEqual(codecs.decode(b(B100), "base100"), b(STR)) - self.assertRaises(ValueError, codecs.decode, b(B100)[1:], "base100") - - def test_codec_base_generic(self): - for n in range(2, 255): - bn = "base{}_generic".format(n) - self.assertEqual(codecs.decode(codecs.encode(STR, bn), bn), STR) - self.assertRaises(LookupError, codecs.decode, "test", "base0-generic") - self.assertRaises(LookupError, codecs.decode, "test", "base1-generic") - self.assertRaises(LookupError, codecs.decode, "test", "base256-generic") - - def test_base_main(self): - tmp = sys.argv[:] - tfile = 
"test-base-main.txt" - with open(tfile, 'w') as f: - f.write("This is a long test string for the sake of causing line wrapping based on default parameters.") - for swap_arg in [[], ["-s"]]: - sys.argv = [tmp[0], tfile] + swap_arg - for m in main32, main64url: - self.assertEqual(m(), 0) - sys.argv = [tmp[0], tfile, "-d"] + swap_arg - self.assertEqual(main2(), 1) - os.remove(tfile) - sys.argv[:] = tmp - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Base codecs tests. + +""" +import sys +from unittest import TestCase + +from codext.__common__ import * +from codext.base._base import _generate_charset +from codext.base.baseN import base, main2, main32, main64url + + +class TestCodecsBase(TestCase): + def setUp(self): + global STR + STR = "this is a test" + + def test_new_base_codec(self): + for i in [0, 1, 256]: + self.assertRaises(ValueError, _generate_charset, i) + b10 = lambda *a: "0123456789" + base(b10, "base10") + B10 = "2361031878030638688519054699098996" + self.assertEqual(codecs.encode(STR, "base10"), B10) + self.assertEqual(codecs.encode(b(STR), "base10"), b(B10)) + self.assertEqual(codecs.decode(B10, "base10"), STR) + self.assertEqual(codecs.decode(b(B10), "base10"), b(STR)) + self.assertRaises(ValueError, base, 1, "test") + b11 = "0123456789a" + base(b11, "base11") + B11 = "113342054335735319526632a26972419" + self.assertEqual(codecs.encode(STR, "base11"), B11) + self.assertEqual(codecs.decode(B11, "base11"), STR) + self.assertRaises(ValueError, base, object(), "test") + self.assertIsNone(base({'': "01234"}, r"^base5(test)?$")) + self.assertIsNotNone(codecs.encode(STR, "base5test")) + self.assertRaises(ValueError, base, {'': "01234"}, "base5-test", pow2=True) + self.assertEqual("", codecs.decode("", "base5test")) + + def test_codec_base1(self): + C = "A" + for i in range(3): + self.assertIsNotNone(codecs.encode(i * C, "base1")) + self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") + self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05") + + 
def test_codec_base2(self): + STR = "test" + B2 = "01110100011001010111001101110100" + self.assertEqual(codecs.encode(STR, "base2"), B2) + self.assertEqual(codecs.encode(b(STR), "base2"), b(B2)) + self.assertEqual(codecs.decode(B2, "base2"), STR) + self.assertEqual(codecs.decode(b(B2), "base2"), b(STR)) + B2 = "10001011100110101000110010001011" + self.assertEqual(codecs.encode(STR, "base2-inv"), B2) + self.assertEqual(codecs.decode(B2, "base2-inv"), STR) + B2 = "abbbabaaabbaabababbbaabbabbbabaa" + self.assertEqual(codecs.encode(STR, "base2-ab"), B2) + self.assertEqual(codecs.decode(B2, "base2-ab"), STR) + B2 = "CDDDCDCCCDDCCDCDCDDDCCDDCDDDCDCC" + self.assertEqual(codecs.encode(STR, "base2-CD"), B2) + self.assertEqual(codecs.decode(B2, "base2-CD"), STR) + B2 = "34443433344334343444334434443433" + self.assertEqual(codecs.encode(STR, "base2-34"), B2) + self.assertEqual(codecs.decode(B2, "base2-34"), STR) + + def test_codec_base3(self): + STR = "test" + B3 = "23112113223321323322" + self.assertEqual(codecs.encode(STR, "base3"), B3) + self.assertEqual(codecs.encode(b(STR), "base3"), b(B3)) + self.assertEqual(codecs.decode(B3, "base3"), STR) + self.assertEqual(codecs.decode(b(B3), "base3"), b(STR)) + B3 = "21332331221123121122" + self.assertEqual(codecs.encode(STR, "base3-inv"), B3) + self.assertEqual(codecs.decode(B3, "base3-inv"), STR) + B3 = "bcaabaacbbccbacbccbb" + self.assertEqual(codecs.encode(STR, "base3-abc"), B3) + self.assertEqual(codecs.decode(B3, "base3-abc"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base3-ab") + self.assertRaises(LookupError, codecs.encode, "test", "base3-abcd") + + def test_codec_base4(self): + STR = "test" + B4 = "2421232224142421" + self.assertEqual(codecs.encode(STR, "base4"), B4) + self.assertEqual(codecs.encode(b(STR), "base4"), b(B4)) + self.assertEqual(codecs.decode(B4, "base4"), STR) + self.assertEqual(codecs.decode(b(B4), "base4"), b(STR)) + B4 = "3134323331413134" + self.assertEqual(codecs.encode(STR, 
"base4-inv"), B4) + self.assertEqual(codecs.decode(B4, "base4-inv"), STR) + B4 = "bdbabcbbbdadbdba" + self.assertEqual(codecs.encode(STR, "base4-abcd"), B4) + self.assertEqual(codecs.decode(B4, "base4-abcd"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base4-abc") + self.assertRaises(LookupError, codecs.encode, "test", "base4-abcde") + + def test_codec_base8(self): + STR = "test" + B8 = "dfagcfgddfa=====" + self.assertEqual(codecs.encode(STR, "base8"), B8) + self.assertEqual(codecs.encode(b(STR), "base8"), b(B8)) + self.assertEqual(codecs.decode(B8, "base8"), STR) + self.assertEqual(codecs.decode(b(B8), "base8"), b(STR)) + B8 = "echbfcbeech=====" + self.assertEqual(codecs.encode(STR, "base8-inv"), B8) + self.assertEqual(codecs.decode(B8, "base8-inv"), STR) + B8 = "35062563350=====" + self.assertEqual(codecs.encode(STR, "base8-01234567"), B8) + self.assertEqual(codecs.decode(B8, "base8-01234567"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base8-0123456") + self.assertRaises(LookupError, codecs.encode, "test", "base8-012345678") + + def test_codec_base16(self): + B16 = "7468697320697320612074657374" + self.assertEqual(codecs.encode(STR, "base16"), B16) + self.assertEqual(codecs.encode(b(STR), "base16"), b(B16)) + self.assertEqual(codecs.decode(B16, "base16"), STR) + self.assertEqual(codecs.decode(b(B16), "base16"), b(STR)) + B16 += "?" 
+ self.assertRaises(ValueError, codecs.decode, B16, "base16") + self.assertEqual(codecs.decode(B16, "base16", "ignore"), STR) + self.assertEqual(codecs.decode(B16, "base16", "replace"), STR + "\x00") + self.assertRaises(ValueError, codecs.decode, B16, "base16", "BAD") + STR2 = "=:;" + B16_1 = "3d3a3b" + B16_2 = "3D3A3B" + B16_3 = "3D3a3B" # mixed case: should fail + self.assertEqual(codecs.encode(STR2, "hex"), B16_2) + self.assertEqual(codecs.decode(B16_1, "hex"), STR2) + self.assertEqual(codecs.decode(B16_2, "hex"), STR2) + self.assertRaises(ValueError, codecs.decode, B16_3, "hex") + + def test_codec_base32(self): + for b32, enc in zip(["ORUGS4ZANFZSAYJAORSXG5A=", "qtwg1h3ypf31yajyqt1zg7y=", "EHK6ISP0D5PI0O90EHIN6T0=", + "fjn6kwt0e5tk0s90fjkr6x0=", "EHM6JWS0D5SJ0R90EHJQ6X0="], + ["base32", "zbase32", "base32-hex", "geohash", "crockford"]): + self.assertEqual(codecs.encode(STR, enc), b32) + self.assertEqual(codecs.encode(b(STR), enc), b(b32)) + self.assertEqual(codecs.decode(b32, enc), STR) + self.assertEqual(codecs.decode(b(b32), enc), b(STR)) + self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc) + self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc, "BAD") + + def test_codec_base36(self): + B36 = "4WMHTK6UZL044O91NKCEB8" + self.assertEqual(codecs.encode(STR, "base36"), B36) + self.assertEqual(codecs.encode(b(STR), "base36"), b(B36)) + self.assertEqual(codecs.decode(B36, "base36"), STR) + self.assertEqual(codecs.decode(b(B36), "base36"), b(STR)) + B36 = "E6WR3UG49VAEEYJBXUMOLI" + self.assertEqual(codecs.encode(STR, "base36-inv"), B36) + self.assertEqual(codecs.decode(B36, "base36-inv"), STR) + self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36-inv") + self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36", "BAD") + self.assertEqual(codecs.decode(B36 + "?", "base36-inv", "ignore"), STR) + + def test_codec_base58(self): + B58 = "jo91waLQA1NNeBmZKUF" + self.assertEqual(codecs.encode(STR, "base58"), B58) + 
self.assertEqual(codecs.encode(b(STR), "base58"), b(B58)) + self.assertEqual(codecs.decode(B58, "base58"), STR) + self.assertEqual(codecs.decode(b(B58), "base58"), b(STR)) + B58 = "jo9rA2LQwr44eBmZK7E" + self.assertEqual(codecs.encode(STR, "base58-ripple"), B58) + self.assertEqual(codecs.decode(B58, "base58-rp"), STR) + B58 = "JN91Wzkpa1nnDbLyjtf" + self.assertEqual(codecs.encode(STR, "base58-flickr"), B58) + self.assertEqual(codecs.encode(STR, "base58-shorturl"), B58) + self.assertEqual(codecs.decode(B58, "base58-fl"), STR) + self.assertEqual(codecs.encode(STR, "base58-short-url"), B58) + self.assertEqual(codecs.encode(STR, "base58-url"), B58) + + def test_codec_base62(self): + for b62, enc in zip(["CsoB4HQ5gmgMyCenF7E", "M2yLERaFqwqW8MoxPHO"], ["base62", "base62-inv"]): + self.assertEqual(codecs.encode(STR, enc), b62) + self.assertEqual(codecs.encode(b(STR), enc), b(b62)) + self.assertEqual(codecs.decode(b62, enc), STR) + self.assertEqual(codecs.decode(b(b62), enc), b(STR)) + + def test_codec_base64(self): + for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): + self.assertEqual(codecs.encode(STR, enc), b64) + self.assertEqual(codecs.encode(b(STR), enc), b(b64)) + self.assertEqual(codecs.decode(b64, enc), STR) + self.assertEqual(codecs.decode(b(b64), enc), b(STR)) + + def test_codec_base91(self): + for b91, enc in zip([",X,<:WRT%yxth90oZB", ",N,<:MHJ%onjXzqeP1", "Jx&[jv4S3Wg>,71@Jk", "yJy^\\IDFsdc?Tof:L#"], + ["base91", "base91-inv", "base91-alt", "base91-alt-inv"]): + self.assertEqual(codecs.encode(STR, enc), b91) + self.assertEqual(codecs.encode(b(STR), enc), b(b91)) + self.assertEqual(codecs.decode(b91, enc), STR) + self.assertEqual(codecs.decode(b(b91), enc), b(STR)) + self.assertIsNotNone(codecs.encode("\x00\x00", "base91")) + self.assertIsNotNone(codecs.decode("abc", "base91")) + self.assertIsNotNone(codecs.decode("AD", "base91")) + self.assertRaises(ValueError, codecs.decode, "\xff", "base91") + 
self.assertRaises(ValueError, codecs.decode, "a\xff", "base91") + self.assertIsNotNone(codecs.encode("\x00\x00", "base91-alt")) + + def test_codec_base100(self): + B100 = "\U0001f46b\U0001f45f\U0001f460\U0001f46a\U0001f417\U0001f460\U0001f46a\U0001f417\U0001f458\U0001f417" \ + "\U0001f46b\U0001f45c\U0001f46a\U0001f46b" + self.assertEqual(codecs.encode(STR, "base100"), B100) + self.assertEqual(codecs.encode(b(STR), "base100"), b(B100)) + self.assertEqual(codecs.decode(B100, "base100"), STR) + self.assertEqual(codecs.decode(b(B100), "base100"), b(STR)) + self.assertRaises(ValueError, codecs.decode, b(B100)[1:], "base100") + self.assertIsNotNone(codecs.decode(b(B100) + b"\n", "base100", "ignore")) + + def test_codec_base_generic(self): + for n in range(2, 255): + bn = "base{}_generic".format(n) + self.assertEqual(codecs.decode(codecs.encode(STR, bn), bn), STR) + self.assertRaises(LookupError, codecs.decode, "test", "base0-generic") + self.assertRaises(LookupError, codecs.decode, "test", "base1-generic") + self.assertRaises(LookupError, codecs.decode, "test", "base256-generic") + + def test_base_main(self): + tmp = sys.argv[:] + tfile = "test-base-main.txt" + with open(tfile, 'w') as f: + f.write("This is a long test string for the sake of causing line wrapping based on default parameters.") + for swap_arg in [[], ["-s"]]: + sys.argv = [tmp[0], tfile] + swap_arg + for m in main32, main64url: + self.assertEqual(m(), 0) + sys.argv = [tmp[0], tfile, "-d"] + swap_arg + self.assertEqual(main2(), 1) + os.remove(tfile) + sys.argv[:] = tmp + diff --git a/tests/test_common.py b/tests/test_common.py index 8bbf410..407997c 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,256 +1,237 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Codecs added assets' tests. 
- -""" -import codecs -import codext -import json -import random -import sys -from codext.__common__ import CODECS_OVERWRITTEN, PERS_MACROS, PERS_MACROS_FILE -from six import b, binary_type, text_type -from unittest import TestCase - - -PY3 = sys.version[0] == "3" - - -def dummy_encode(input, errors="strict"): - return input, len(input) - - -def dummy_decode(input, errors="strict"): - return input, len(input) - - -def dummy_errored_decode(useless): - raise AttributeError - def decode(input, errors="strict"): - return input, len(input) - return decode - - -def ensure_str(s, encoding='utf-8', errors='strict'): - """ Similar to six.ensure_str. Adapted here to avoid messing up with six version errors. """ - if not PY3 and isinstance(s, text_type): - return s.encode(encoding, errors) - elif PY3 and isinstance(s, binary_type): - try: - return s.decode(encoding, errors) - except: - return s.decode("latin-1") - return s - - -def getregentry(encoding): - if encoding == "dummy3": - return codecs.CodecInfo(name="dummy3", encode=dummy_encode, decode=dummy_decode) - - -class TestCommon(TestCase): - def setUp(self): - codext.reset() - - def test_add_codec(self): - self.assertRaises(ValueError, codext.add, "test") - self.assertRaises(ValueError, codext.add, "test", "BAD") - self.assertRaises(ValueError, codext.add, "test", lambda: None, "BAD") - self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) - self.assertEqual(codext.encode("test", "dummy"), "test") - ci = codext.lookup("dummy") - for k in ["add_to_codecs", "category", "examples", "name", "pattern", "text"]: - self.assertIn(k, ci.parameters.keys()) - self.assertIsNotNone(codext.add("dummy_errored", None, dummy_errored_decode, r"dummy_errored(\d+)$")) - self.assertRaises(AttributeError, codext.lookup, "dummy_errored1") - - def test_add_map_codec(self): - ENCMAP = [{'a': "A", 'b': "B", 'c': "C"}, {'d': "D", 'e': "E", 'f': "F"}, {'g': "G", 'h': "H", 'i': "I"}] - self.assertIsNotNone(codext.add_map("dummy2", 
ENCMAP, pattern=r"^dummy2(?:[-_]?(\d))?$")) - self.assertRaises(ValueError, codext.add_map, "dummy2", "BAD_ENCMAP") - self.assertEqual(codext.encode("abc", "dummy2"), "ABC") - self.assertEqual(codext.encode("abc", "dummy2-1"), "ABC") - self.assertEqual(codext.encode("def", "dummy2-2"), "DEF") - self.assertEqual(codext.encode("ghi", "dummy2-3"), "GHI") - self.assertRaises(LookupError, codext.encode, "test", "dummy2-4") - ENCMAP = {'': {'a': "A", 'b': "B"}, r'bad': {'a': "B", 'b': "A"}} - self.assertIsNotNone(codext.add_map("dummy3", ENCMAP, pattern=r"^dummy3([-_]inverted)?$")) - self.assertRaises(LookupError, codext.encode, "test", "dummy3_inverted") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, ignore_case="BAD") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, intype="BAD") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, outype="BAD") - ci = codext.lookup("dummy2") - for k in ["category", "encmap", "ignore_case", "intype", "no_error", "outype", "repl_char", "sep", "text"]: - self.assertIn(k, ci.parameters.keys()) - - def test_list_codecs(self): - self.assertTrue(len(codext.list()) > 0) - self.assertTrue(len(codext.list("other")) > 0) - self.assertTrue(len(codext.list("native")) > 0) - self.assertTrue(len(codext.list("non-native")) > 0) - self.assertTrue(len(codext.list("native", "non-native", "crypto", "base")) > 0) - self.assertTrue(len(codext.list("native", "language", "crypto")) > 0) - self.assertTrue(len(codext.list("~crypto")) > 0) - self.assertEqual(set(codext.list("~native")), set(codext.list("non-native"))) - self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native"))) - self.assertRaises(ValueError, codext.list, "BAD_CATEGORY") - self.assertTrue(codext.is_native("base64_codec")) - self.assertFalse(codext.is_native("base64")) - - def test_remove_codec(self): - self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) - self.assertEqual(codext.encode("test", 
"dummy"), "test") - self.assertIsNone(codext.remove("dummy")) - self.assertRaises(LookupError, codext.encode, "test", "dummy") - # special case, when adding a new codec also to the native codecs registry, then it won't be possible to remove - # it afterwards - self.assertIsNotNone(codecs.add("dummy2", dummy_encode, dummy_decode)) - self.assertEqual(codecs.encode("test", "dummy2"), "test") - self.assertIsNone(codecs.remove("dummy2")) - self.assertEqual(codecs.encode("test", "dummy2"), "test") - self.assertIsNone(codecs.register(getregentry)) - self.assertEqual(codecs.encode("test", "dummy3"), "test") - self.assertIsNone(codecs.remove("dummy3")) - self.assertEqual(codecs.encode("test", "dummy3"), "test") - - def test_clear_codecs(self): - self.assertIsNotNone(codecs.encode("test", "morse")) - self.assertIsNone(codecs.clear()) - self.assertRaises(LookupError, codecs.encode, "test", "morse") - - def test_reset_codecs(self): - self.assertIsNone(codext.reset()) - self.assertIsNotNone(codext.encode("test", "morse")) - self.assertRaises(LookupError, codext.encode, "test", "dummy") - self.assertTrue(len(CODECS_OVERWRITTEN) > 0) - self.assertIsNotNone(str(CODECS_OVERWRITTEN[0])) - - def test_search_codecs(self): - self.assertIsNotNone(codext.search("morse")) - self.assertIsNotNone(codext.search("geohash")) - self.assertIsNotNone(codext.examples("morse")) - self.assertIsNotNone(codext.examples("cp")) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[ab]{1,3}"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=ab)cd"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=-)\w+"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"([^\s])\1"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^\\]"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^a]"))) - - def test_encode_multiple_rounds(self): - if PY3: - self.assertRaises(TypeError, codext.encode, b"test", "utf-8[2]") - 
s = "test" - for i in range(3): - s = codext.encode(s, "morse") - self.assertEqual(s, codext.encode("test", "morse[3]")) - self.assertIsNotNone(codext.encode("test", "base64[10]")) - - def test_guess_decode(self): - self.assertIsNone(codext.stopfunc._reload_lang()) - self.assertIsNotNone(codext.stopfunc._validate("flag")) - _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None - codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) - self.assertIn("test-codec", codext.list_encodings("test")) - self.assertEqual(codext.decode("TEST=", "test"), "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, - scoring_heuristic=False).items())[0][1], "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"], - max_depth=2).items())[0][1], "TEST") - STR = "This is a test" - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1))) - self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "a test", found=["base62"]))) - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True, - exclude=["base100"]))) - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"]))) - self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0) - self.assertIn("F1@9", _l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, max_depth=1, stop=False, - show=True))) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", - exclude=("base64", "base64-url"))), 0) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", - scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0) - 
self.assertRaises(ValueError, codext.guess, STR, max_depth=0) - self.assertRaises(ValueError, codext.guess, STR, exclude=42) - for c in ["base", "language", "native", "stegano"]: - e = codext.list(c) - random.shuffle(e) - for ename in e[:10]: - for encoding in codext.lookup(ename).parameters.get('guess', [ename])[:10]: - try: - enc = codext.encode(STR, encoding) - except (NotImplementedError, ValueError): - continue - except TypeError: - enc = codext.encode(b(STR), encoding) - if codext.decode(enc, encoding) == STR: - continue - for found_encodings, found_dec in codext.guess(enc, "a test", 0, 1, [c], - scoring_heuristic=True, debug=True).items(): - self.assertEqual(ensure_str(STR).lower(), ensure_str(found_dec).lower()) - if c != "base": - # do not check for base as the guessed encoding name can be different, e.g.: - # actual: base2 - # guessed: base2-generic - if "-icase" in encoding: - self.assertEqual(encoding.lower(), found_encodings[0].lower()) - else: - self.assertEqual(encoding, found_encodings[0]) - txt = "".join(chr(i) for i in range(256)) - b64 = codext.encode(txt, "base64") - self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base"))) - self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") - - def test_rank_input(self): - codext.remove("test_codec") - self.assertRaises(LookupError, codext.encode, "TEST", "test") - codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.) - STR = "This is a test string !" 
- ENC = codext.encode(STR, "base64") - self.assertTrue(len(codext.rank(ENC)) > 20) - self.assertEqual(len(codext.rank(ENC, limit=20)), 20) - self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"]) - self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) - self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) - self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) - self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST") - - def test_handle_macros(self): - MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2" - STR = "this is a test" - ENC = "H4sIAMrbkmEC/0txzyhIrnQC4QxPj6CcZONAWwAMIDOIFAAAAA==" - codext.remove(MACRO) - l = codext.list_macros() - self.assertTrue(len(l) > 0) - cm = codext.lookup("example-macro") - self.assertIsNotNone(cm) - self.assertRaises(LookupError, codext.lookup, "example-macro", False) - self.assertRaises(ValueError, codext.add_macro, "example-macro", "base64") - self.assertRaises(ValueError, codext.add_macro, "base64", "base91") - self.assertIsNotNone(repr(cm)) - self.assertTrue(hasattr(cm, "parameters")) - self.assertRaises(LookupError, codext.lookup, MACRO) - self.assertIsNone(codext.add_macro(MACRO, "base64", "gzip", "base64")) - self.assertIn(MACRO, codext.list_macros()) - self.assertIsNotNone(codext.encode(STR, MACRO)) - self.assertEqual(codext.decode(ENC, MACRO), STR) - # insert a bad entry for the list of encodings in the JSON file - PERS_MACROS[MACRO] = "not a list or tuple..." 
- with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f) - codext.reset() - self.assertRaises(ValueError, codext.lookup, MACRO) - self.assertIsNone(codext.remove(MACRO)) - self.assertRaises(LookupError, codext.lookup, MACRO) - self.assertNotIn(MACRO, codext.list_macros()) - self.assertIsNone(codext.remove("THIS-MACRO-DOES-NOT-EXIST")) - self.assertIsNone(codext.remove("VALID-MACRO")) - self.assertIsNone(codext.add_macro("VALID-MACRO", "gzip", "base64")) - self.assertIsNone(codext.remove("VALID-MACRO")) - if PY3: - self.assertIsNone(codext.add_macro("VALID-MACRO", "lzma", "base64")) - self.assertIsNone(codext.remove("VALID-MACRO")) - self.assertRaises(ValueError, codext.add_macro, "SHALL-FAIL", "base26", "sms", "letter-indices") - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Codecs added assets' tests. + +""" +import codext +import json +import random +import sys +from codext.__common__ import * +from codext.__common__ import CODECS_OVERWRITTEN, PERS_MACROS, PERS_MACROS_FILE +from unittest import TestCase + + +def dummy_encode(input, errors="strict"): + return input, len(input) + + +def dummy_decode(input, errors="strict"): + return input, len(input) + + +def dummy_errored_decode(useless): + raise AttributeError + def decode(input, errors="strict"): + return input, len(input) + return decode + + +def getregentry(encoding): + if encoding == "dummy3": + return codecs.CodecInfo(name="dummy3", encode=dummy_encode, decode=dummy_decode) + + +class TestCommon(TestCase): + def setUp(self): + codext.reset() + + def test_add_codec(self): + self.assertRaises(ValueError, codext.add, "test") + self.assertRaises(ValueError, codext.add, "test", "BAD") + self.assertRaises(ValueError, codext.add, "test", lambda: None, "BAD") + self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertEqual(codext.encode("test", "dummy"), "test") + ci = codext.lookup("dummy") + for k in ["add_to_codecs", "category", "examples", "name", "pattern", "text"]: + 
self.assertIn(k, ci.parameters.keys()) + self.assertIsNotNone(codext.add("dummy_errored", None, dummy_errored_decode, r"dummy_errored(\d+)$")) + self.assertRaises(AttributeError, codext.lookup, "dummy_errored1") + + def test_add_map_codec(self): + ENCMAP = [{'a': "A", 'b': "B", 'c': "C"}, {'d': "D", 'e': "E", 'f': "F"}, {'g': "G", 'h': "H", 'i': "I"}] + self.assertIsNotNone(codext.add_map("dummy2", ENCMAP, pattern=r"^dummy2(?:[-_]?(\d))?$")) + self.assertRaises(ValueError, codext.add_map, "dummy2", "BAD_ENCMAP") + self.assertEqual(codext.encode("abc", "dummy2"), "ABC") + self.assertEqual(codext.encode("abc", "dummy2-1"), "ABC") + self.assertEqual(codext.encode("def", "dummy2-2"), "DEF") + self.assertEqual(codext.encode("ghi", "dummy2-3"), "GHI") + self.assertRaises(LookupError, codext.encode, "test", "dummy2-4") + ENCMAP = {'': {'a': "A", 'b': "B"}, r'bad': {'a': "B", 'b': "A"}} + self.assertIsNotNone(codext.add_map("dummy3", ENCMAP, pattern=r"^dummy3([-_]inverted)?$")) + self.assertRaises(LookupError, codext.encode, "test", "dummy3_inverted") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, ignore_case="BAD") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, intype="BAD") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, outype="BAD") + ci = codext.lookup("dummy2") + for k in ["category", "encmap", "ignore_case", "intype", "no_error", "outype", "repl_char", "sep", "text"]: + self.assertIn(k, ci.parameters.keys()) + + def test_list_codecs(self): + self.assertTrue(len(codext.list()) > 0) + self.assertTrue(len(codext.list("other")) > 0) + self.assertTrue(len(codext.list("native")) > 0) + self.assertTrue(len(codext.list("non-native")) > 0) + self.assertTrue(len(codext.list("native", "non-native", "crypto", "base")) > 0) + self.assertTrue(len(codext.list("native", "language", "crypto")) > 0) + self.assertTrue(len(codext.list("~crypto")) > 0) + self.assertEqual(set(codext.list("~native")), set(codext.list("non-native"))) 
+ self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native"))) + self.assertRaises(ValueError, codext.list, "BAD_CATEGORY") + self.assertTrue(codext.is_native("base64_codec")) + self.assertFalse(codext.is_native("base64")) + + def test_remove_codec(self): + self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertEqual(codext.encode("test", "dummy"), "test") + self.assertIsNone(codext.remove("dummy")) + self.assertRaises(LookupError, codext.encode, "test", "dummy") + # special case, when adding a new codec also to the native codecs registry, then it won't be possible to remove + # it afterwards + self.assertIsNotNone(codecs.add("dummy2", dummy_encode, dummy_decode)) + self.assertEqual(codecs.encode("test", "dummy2"), "test") + self.assertIsNone(codecs.remove("dummy2")) + self.assertEqual(codecs.encode("test", "dummy2"), "test") + self.assertIsNone(codecs.register(getregentry)) + self.assertEqual(codecs.encode("test", "dummy3"), "test") + self.assertIsNone(codecs.remove("dummy3")) + self.assertEqual(codecs.encode("test", "dummy3"), "test") + + def test_clear_codecs(self): + self.assertIsNotNone(codecs.encode("test", "morse")) + self.assertIsNone(codecs.clear()) + self.assertRaises(LookupError, codecs.encode, "test", "morse") + + def test_reset_codecs(self): + self.assertIsNone(codext.reset()) + self.assertIsNotNone(codext.encode("test", "morse")) + self.assertRaises(LookupError, codext.encode, "test", "dummy") + self.assertTrue(len(CODECS_OVERWRITTEN) > 0) + self.assertIsNotNone(str(CODECS_OVERWRITTEN[0])) + + def test_search_codecs(self): + self.assertIsNotNone(codext.search("morse")) + self.assertIsNotNone(codext.search("geohash")) + self.assertIsNotNone(codext.examples("morse")) + self.assertIsNotNone(codext.examples("cp")) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[ab]{1,3}"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=ab)cd"))) + 
self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=-)\w+"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"([^\s])\1"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^\\]"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^a]"))) + + def test_encode_multiple_rounds(self): + s = "test" + for i in range(3): + s = codext.encode(s, "morse") + self.assertEqual(s, codext.encode("test", "morse[3]")) + self.assertIsNotNone(codext.encode("test", "base64[10]")) + + def test_guess_decode(self): + self.assertIsNone(codext.stopfunc._reload_lang()) + self.assertIsNotNone(codext.stopfunc._validate("flag")) + _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None + codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), + "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + self.assertIn("test-codec", codext.list_encodings("test")) + self.assertEqual(codext.decode("TEST=", "test"), "TEST") + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, + scoring_heuristic=False).items())[0][1], "TEST") + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"], + max_depth=2).items())[0][1], "TEST") + STR = "This is a test" + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1))) + self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "a test", found=["base62"]))) + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True, + exclude=["base100"]))) + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"]))) + self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0) + self.assertIn("F1@9", _l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, 
max_depth=1, stop=False, + show=True))) + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", + exclude=("base64", "base64-url"))), 0) + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", + scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0) + self.assertRaises(ValueError, codext.guess, STR, max_depth=0) + self.assertRaises(ValueError, codext.guess, STR, exclude=42) + for c in ["base", "language", "native", "stegano"]: + e = codext.list(c) + random.shuffle(e) + for ename in e[:10]: + for encoding in codext.lookup(ename).parameters.get('guess', [ename])[:10]: + try: + enc = codext.encode(STR, encoding) + except (NotImplementedError, ValueError): + continue + except TypeError: + enc = codext.encode(b(STR), encoding) + if codext.decode(enc, encoding) == STR: + continue + for found_encodings, found_dec in codext.guess(enc, "a test", 0, 1, [c], + scoring_heuristic=True, debug=True).items(): + self.assertEqual(ensure_str(STR).lower(), ensure_str(found_dec).lower()) + if c != "base": + # do not check for base as the guessed encoding name can be different, e.g.: + # actual: base2 + # guessed: base2-generic + if "-icase" in encoding: + self.assertEqual(encoding.lower(), found_encodings[0].lower()) + else: + self.assertEqual(encoding, found_encodings[0]) + txt = "".join(chr(i) for i in range(256)) + b64 = codext.encode(txt, "base64") + self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base"))) + self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") + + def test_rank_input(self): + codext.remove("test_codec") + self.assertRaises(LookupError, codext.encode, "TEST", "test") + codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), + "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.) + STR = "This is a test string !" 
+ ENC = codext.encode(STR, "base64") + self.assertTrue(len(codext.rank(ENC)) > 20) + self.assertEqual(len(codext.rank(ENC, limit=20)), 20) + self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"]) + self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) + self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) + self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) + self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST") + + def test_handle_macros(self): + MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2" + STR = "this is a test" + ENC = "H4sIAMrbkmEC/0txzyhIrnQC4QxPj6CcZONAWwAMIDOIFAAAAA==" + codext.remove(MACRO) + l = codext.list_macros() + self.assertTrue(len(l) > 0) + cm = codext.lookup("example-macro") + self.assertIsNotNone(cm) + self.assertRaises(LookupError, codext.lookup, "example-macro", False) + self.assertRaises(ValueError, codext.add_macro, "example-macro", "base64") + self.assertRaises(ValueError, codext.add_macro, "base64", "base91") + self.assertIsNotNone(repr(cm)) + self.assertTrue(hasattr(cm, "parameters")) + self.assertRaises(LookupError, codext.lookup, MACRO) + self.assertIsNone(codext.add_macro(MACRO, "base64", "gzip", "base64")) + self.assertIn(MACRO, codext.list_macros()) + self.assertIsNotNone(codext.encode(STR, MACRO)) + self.assertEqual(codext.decode(ENC, MACRO), STR) + # insert a bad entry for the list of encodings in the JSON file + PERS_MACROS[MACRO] = "not a list or tuple..." 
+ with open(PERS_MACROS_FILE, 'w') as f: + json.dump(PERS_MACROS, f) + codext.reset() + self.assertRaises(ValueError, codext.lookup, MACRO) + self.assertIsNone(codext.remove(MACRO)) + self.assertRaises(LookupError, codext.lookup, MACRO) + self.assertNotIn(MACRO, codext.list_macros()) + self.assertIsNone(codext.remove("THIS-MACRO-DOES-NOT-EXIST")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertIsNone(codext.add_macro("VALID-MACRO", "gzip", "base64")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertIsNone(codext.add_macro("VALID-MACRO", "lzma", "base64")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertRaises(ValueError, codext.add_macro, "SHALL-FAIL", "base26", "sms", "letter-indices") + diff --git a/tests/test_generated.py b/tests/test_generated.py index 614562f..e8eaf10 100644 --- a/tests/test_generated.py +++ b/tests/test_generated.py @@ -1,139 +1,158 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Automatically generated codec tests. - -""" -import os -import re -from itertools import chain -from random import randint -from string import printable -from unittest import TestCase - -from codext.__common__ import * - - -def make_test(**params): - """ Test factory function for auto-creating tests for encodings having __examples__ defined. """ - def _template(self): - tfile = "test-codec-%s.txt" % params['name'] - icase = params.get('ignore_case') - icdec = lambda s: s.lower() if icase in ["decode", "both"] else s - icenc = lambda s: s.lower() if icase in ["encode", "both"] else s - # first, define if only encode is used ; if so, decoding must occur right after encode tests, otherwise just - # execute the defined decode tests - dec = True - for k in params['examples'].keys(): - if k.startswith("dec"): - dec = False - # now execute tests relying on the given examples - for k, examples in params['examples'].items(): - # multiple encoding names can be given, e.g. 
'enc(morse|morse-AB|...)' - m = re.match(r"(?:dec|enc|enc-dec)\((.*?)(?:\|(.*?))*\)", k) - if m: - f1 = getattr(codecs, ["decode", "encode"][k.startswith("enc")]) - f2 = getattr(codecs, ["encode", "decode"][k.startswith("enc")]) - for ename in m.groups(): - if ename is None: - continue - # buggy generated encoding names - try: - lookup(ename) - except LookupError: - continue - # erroneous encoding name test - if examples is None: - self.assertRaises(LookupError, f1, "test", ename) - continue - # unhandled character error tests - encmap = params.get('encmap') - if encmap and params['intype'] not in ["bin", "ord"] and not params['no_error']: - if not isinstance(encmap, list): - encmap = [encmap] - for em in encmap: - if k.startswith("dec"): - em = {v: k for k, v in em.items()} - # find one handled character and one unhandled - c1, c2 = None, None - p = list(map(ord, printable)) - for i in chain(p, set(range(256)) - set(p)): - if chr(i) in em.keys(): - c1 = chr(i) - break - for i in chain(set(range(256)) - set(p), p): - if chr(i) not in em.keys(): - c2 = chr(i) - break - # now check that it raises the right error or not given the selected errors handling - if c1 and c2: - sep = params['sep'][0] if len(params['sep']) > 0 else "" - self.assertRaises(ValueError, f1, c2, ename) - self.assertRaises(ValueError, f1, c2, ename, "BAD_ERRORS") - if not k.startswith("enc-dec"): - self.assertEqual(f1(c1 + c2, ename, "ignore"), f1(c1, ename)) - self.assertEqual(f1(c1 + c2, ename, "leave"), f1(c1, ename) + sep + c2) - self.assertEqual(f1(c1 + c2, ename, "replace"), f1(c1, ename) + sep + \ - params.get('repl_minlen', 1) * params['repl_char']) - # examples validation tests - if k.startswith("enc-dec") and isinstance(examples, list): - for e in examples[:]: - rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) - if rd: - examples.remove(e) - for n in (rd.group(2) or "512").split(","): - s = "".join(chr(randint(0, 255)) for i in range(int(n))) - examples.append(s.lower() if 
rd.group(1) else s) - for s in [""] + examples: - self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) - self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) - # file tests - with codecs.open(tfile, 'wb', encoding=ename) as f: - f.write(b(s)) - with codecs.open(tfile, 'rb', encoding=ename) as f: - s2 = f.read() if PY3 else f.read().rstrip("\x00") - self.assertEqual(b(icdec(s2)), b(icdec(s))) - os.remove(tfile) - else: - for s1, s2 in examples.items(): - # willingly erroneous tests - if s2 is None: - self.assertRaises((ValueError, NotImplementedError), f1, s1, ename) - continue - # raw text tests - self.assertEqual(icenc(f1(s1, ename)), icenc(s2)) - self.assertEqual(b(icenc(f1(s1, ename))), b(icenc(s2))) - self.assertIsNotNone(f1(s1, ename, "replace")) - self.assertIsNotNone(f1(s1, ename, "ignore")) - if dec: - self.assertEqual(icdec(f2(s2, ename)), icdec(s1)) - self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s1))) - self.assertIsNotNone(f2(s2, ename, "replace")) - self.assertIsNotNone(f2(s2, ename, "ignore")) - if k.startswith("enc"): - # file tests - with codecs.open(tfile, 'wb', encoding=ename) as f: - f.write(b(s1)) - with codecs.open(tfile, 'rb', encoding=ename) as f: - s = f.read() - if not PY3 and re.search("[^\x00]\x00$", s): - s = s[:-1] - self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s))) - os.remove(tfile) - return _template - - -class GeneratedTestCase(TestCase): - pass - - -for encoding in list_encodings(): - try: - ci = lookup(encoding) - except LookupError: - continue - # only consider codecs with __examples__ defined in their globals for dynamic tests generation - if ci.parameters.get('examples') is not None: - f = make_test(**ci.parameters) - f.__name__ = n = "test_" + encoding.replace("-", "_") - setattr(GeneratedTestCase, n, f) - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Automatically generated codec tests. 
+ +""" +from itertools import chain +from random import randint +from string import printable +from unittest import TestCase + +from codext.__common__ import * + + +def make_test(**params): + """ Test factory function for auto-creating tests for encodings having __examples__ defined. """ + def _template(self): + tfile = "test-codec-%s.txt" % params['name'] + icase = params.get('ignore_case') + icdec = lambda s: s.lower() if icase in ["decode", "both"] else s + icenc = lambda s: s.lower() if icase in ["encode", "both"] else s + # first, define if only encode is used ; if so, decoding must occur right after encode tests, otherwise just + # execute the defined decode tests + dec = True + for k in params['examples'].keys(): + if k.startswith("dec"): + dec = False + # now execute tests relying on the given examples + for k, examples in params['examples'].items(): + # multiple encoding names can be given, e.g. 'enc(morse|morse-AB|...)' + m = re.match(r"(?:dec|enc|enc-dec)\((.*?)(?:\|(.*?))*\)(\*)?", k) + if m: + f1 = getattr(codecs, ["decode", "encode"][k.startswith("enc")]) + f2 = getattr(codecs, ["encode", "decode"][k.startswith("enc")]) + for ename in m.groups(): + #FIXME + if ename == "*": + # ignore mode only + continue + if ename is None: + continue + # buggy generated encoding names + try: + lookup(ename) + except LookupError: + continue + # erroneous encoding name test + if examples is None: + self.assertRaises(LookupError, f1, "test", ename) + continue + # unhandled character error tests + encmap = params.get('encmap') + if encmap and params['intype'] not in ["bin", "ord"] and not params['no_error']: + if not isinstance(encmap, list): + encmap = [encmap] + for em in encmap: + if k.startswith("dec"): + em = {v: k for k, v in em.items()} + # find one handled character and one unhandled + c1, c2 = None, None + p = list(map(ord, printable)) + for i in chain(p, set(range(256)) - set(p)): + if chr(i) in em.keys(): + c1 = chr(i) + break + for i in chain(set(range(256)) 
- set(p), p): + if chr(i) not in em.keys(): + c2 = chr(i) + break + # now check that it raises the right error or not given the selected errors handling + if c1 and c2: + sep = params['sep'][0] if len(params['sep']) > 0 else "" + self.assertRaises(ValueError, f1, c2, ename) + self.assertRaises(ValueError, f1, c2, ename, "BAD_ERRORS") + if not k.startswith("enc-dec"): + self.assertEqual(f1(c1 + c2, ename, "ignore"), f1(c1, ename)) + self.assertEqual(f1(c1 + c2, ename, "leave"), f1(c1, ename) + sep + c2) + self.assertEqual(f1(c1 + c2, ename, "replace"), f1(c1, ename) + sep + \ + params.get('repl_minlen', 1) * params['repl_char']) + # examples validation tests + incr_f1 = codecs.getincrementalencoder(ename)().encode + incr_f2 = codecs.getincrementaldecoder(ename)().decode + # - "enc-dec" tests (uses a list of values that shall remain the same after encoding and decoding, + # no matter what the encoded value is + if k.startswith("enc-dec") and isinstance(examples, list): + for e in examples[:]: + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + if rd: + examples.remove(e) + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + examples.append(s.lower() if rd.group(1) else s) + for s in [""] + examples: + self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) + self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) + # important note: with respect to the original design, + # IncrementalEncoder(...).encode(...) gives bytes + # IncrementalDecoder(...).encode(...) 
gives str + self.assertEqual(icdec(incr_f2(icenc(incr_f1(s, ename)), ename)), icdec(s)) + self.assertEqual(icdec(incr_f2(icenc(incr_f1(b(s), ename)), ename)), icdec(s)) + # file tests + with codecs.open(tfile, 'wb', encoding=ename) as f: + f.write(b(s)) + with codecs.open(tfile, 'rb', encoding=ename) as f: + s2 = f.read() + self.assertEqual(b(icdec(s2)), b(icdec(s))) + os.remove(tfile) + # - "enc" and "dec" tests (uses a dictionary with the value to be encoded and the expected encoded + # value) + else: + for s1, s2 in examples.items(): + # willingly erroneous tests + if s2 is None: + self.assertRaises((ValueError, NotImplementedError), f1, s1, ename) + continue + # raw text tests + self.assertEqual(icenc(f1(s1, ename)), icenc(s2)) + self.assertEqual(b(icenc(f1(s1, ename))), b(icenc(s2))) + # important note: with respect to the original design, + # IncrementalEncoder(...).encode(...) gives bytes + #self.assertEqual(icenc(incr_f1(s1, ename)), b(icenc(s2))) + #self.assertEqual(icenc(incr_f1(b(s1), ename)), b(icenc(s2))) + self.assertIsNotNone(f1(s1, ename, "replace")) + self.assertIsNotNone(f1(s1, ename, "ignore")) + if dec: + self.assertEqual(icdec(f2(s2, ename)), icdec(s1)) + self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s1))) + # important note: with respect to the original design, + # IncrementalDecoder(...).encode(...) 
gives str + #self.assertEqual(icdec(incr_f2(s2, ename)), icdec(s1)) + #self.assertEqual(icdec(incr_f2(b(s2), ename)), icdec(s1)) + self.assertIsNotNone(f2(s2, ename, "replace")) + self.assertIsNotNone(f2(s2, ename, "ignore")) + if k.startswith("enc"): + # file tests + with codecs.open(tfile, 'wb', encoding=ename) as f: + f.write(b(s1)) + with codecs.open(tfile, 'rb', encoding=ename) as f: + s = f.read() + self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s))) + os.remove(tfile) + return _template + + +class GeneratedTestCase(TestCase): + pass + + +for encoding in list_encodings(): + try: + ci = lookup(encoding) + except LookupError: + continue + # only consider codecs with __examples__ defined in their globals for dynamic tests generation + if ci.parameters.get('examples') is not None: + f = make_test(**ci.parameters) + f.__name__ = n = "test_" + encoding.replace("-", "_") + setattr(GeneratedTestCase, n, f) + diff --git a/tests/test_manual.py b/tests/test_manual.py index 6a1d09f..bed4884 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -1,172 +1,168 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Manual codec tests. 
- -""" -import hashlib -import os -import random -from six import binary_type, string_types -from unittest import TestCase - -from codext.__common__ import * -from codext.binary.baudot import _check_alphabet -from codext.hashing.checksums import CRC - - -class ComplementaryTestCase(TestCase): - def test_codec_baudot(self): - self.assertRaises(ValueError, _check_alphabet, ["BAD_ALPHABET"]) - - def test_codec_dna(self): - self.assertEqual(codecs.decode("ABC", "dna-1", errors="ignore"), "\x02") - self.assertEqual(codecs.decode("ABC", "dna-2", errors="replace"), "[00??01]") - - def test_codec_morse(self): - self.assertRaises(LookupError, codecs.encode, "test", "morse-AAB") - - def test_codec_sms(self): - self.assertEqual(codecs.decode("A-B-222-3-4-5", "sms", "leave"), "ABcdgj") - - -class ManualTestCase(TestCase): - def test_codec_affine(self): - STR = "this is a test" - AFF1 = "vjkubkubcbvguv" - self.assertRaises(LookupError, codecs.encode, STR, "affine-BAD") - self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u-BAD") - # uses by default an alphabet with lowercase, uppercase, whitespace and parameters a=1 and b=2 - self.assertEqual(codecs.encode(STR, "affine"), codecs.encode(STR, "affine-?l?u?s-1,2")) - self.assertEqual(codecs.encode(STR, "affine"), AFF1) - self.assertEqual(codecs.encode(b(STR), "affine"), b(AFF1)) - self.assertEqual(codecs.decode(AFF1, "affine"), STR) - self.assertEqual(codecs.decode(b(AFF1), "affine"), b(STR)) - AFF2 = "ORWJdWJdidOCJO" - self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-5,8"), AFF2) - self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-5,8"), b(AFF2)) - self.assertEqual(codecs.decode(AFF2, "affine-?l?u?d?s-5,8"), STR) - self.assertEqual(codecs.decode(b(AFF2), "affine-?l?u?d?s-5,8"), b(STR)) - AFF3 = "QsuOcuOcecQmOQ" - self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-2,4"), AFF3) - self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-2,4"), b(AFF3)) - self.assertEqual(codecs.decode(AFF3, 
"affine-?l?u?d?s-2,4"), STR) - self.assertEqual(codecs.decode(b(AFF3), "affine-?l?u?d?s-2,4"), b(STR)) - self.assertRaises(ValueError, codecs.decode, ".BAD.", "affine-?l?u?d?s-2,4") - self.assertIsNotNone(codecs.encode("TEST", "affine_?u-1,2")) - # example of parameters that cause mapping collisions - self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u?d?s-6,8") - - def test_codec_atbash(self): - STR = "This is a test" - ATB1 = "Gsrh rh z gvhg" - self.assertIsNotNone(codecs.encode("test", "atbash-whatevers")) - # uses by default an alphabet with lowercase and uppercase - self.assertEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-?l?u")) - self.assertNotEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-[?l?u]")) - self.assertEqual(codecs.encode(STR, "atbash_cipher"), ATB1) - self.assertEqual(codecs.encode(b(STR), "atbash-cipher"), b(ATB1)) - self.assertEqual(codecs.decode(ATB1, "atbash"), STR) - self.assertEqual(codecs.decode(b(ATB1), "atbash"), b(STR)) - ATB2 = "N^]/a]/a a.{/." 
- self.assertEqual(codecs.encode(STR, "atbash-[?l?u?p?s]"), ATB2) - self.assertEqual(codecs.encode(b(STR), "atbash_cipher-[?l?u?p?s]"), b(ATB2)) - self.assertEqual(codecs.decode(ATB2, "atbash-[?l?u?p?s]"), STR) - self.assertEqual(codecs.decode(b(ATB2), "atbash_cipher-[?l?u?p?s]"), b(STR)) - - def test_codec_case_related_manips(self): - STR = "This is a test" - self.assertEqual(codecs.encode(STR, "lower"), "this is a test") - self.assertEqual(codecs.encode(b(STR), "uppercase"), b("THIS IS A TEST")) - self.assertEqual(codecs.encode(STR, "capitalize"), "This is a test") - self.assertEqual(codecs.decode(b(STR), "capitalize"), b("this is a test")) - self.assertEqual(codecs.encode(STR, "title"), "This Is A Test") - self.assertEqual(codecs.decode(b(STR), "title"), b("this is a test")) - self.assertEqual(codecs.encode(b(STR), "swapcase"), b("tHIS IS A TEST")) - self.assertEqual(codecs.encode(b(STR), "camelcase"), b("thisIsATest")) - self.assertEqual(codecs.encode(b(STR), "kebabcase"), b("this-is-a-test")) - self.assertEqual(codecs.encode(b(STR), "pascalcase"), b("ThisIsATest")) - self.assertEqual(codecs.encode(b(STR), "slugify"), b("this-is-a-test")) - self.assertEqual(codecs.encode(b(STR), "snakecase"), b("this_is_a_test")) - self.assertRaises(NotImplementedError, codecs.decode, STR, "camel") - self.assertRaises(NotImplementedError, codecs.decode, STR, "pascal") - self.assertRaises(NotImplementedError, codecs.decode, STR, "slug") - self.assertRaises(NotImplementedError, codecs.decode, STR, "snake") - - def test_codec_dummy_str_manips(self): - STR = "this is a test" - self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht") - self.assertEqual(codecs.decode(STR, "reverse_words"), "siht si a tset") - self.assertEqual(codecs.decode(STR.split()[0], "reverse"), codecs.decode(STR.split()[0], "reverse-words")) - self.assertEqual(codecs.encode(STR, "replace-i1"), STR.replace("i", "1")) - self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) - 
self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) - self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) - self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is i s a te st") - self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") - - def test_codec_hash_functions(self): - STR = b"This is a test string!" - for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - if PY3: - self.assertEqual(len(codecs.encode(STR, "blake2b_64")), 128) - self.assertRaises(LookupError, codecs.encode, STR, "blake2b_0") - self.assertRaises(LookupError, codecs.encode, STR, "blake2b-65") - self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2b") - self.assertEqual(len(codecs.encode(STR, "blake2s_32")), 64) - self.assertRaises(LookupError, codecs.encode, STR, "blake2s_0") - self.assertRaises(LookupError, codecs.encode, STR, "blake2s-33") - self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2s") - self.assertIsNotNone(codecs.encode(STR, "shake128")) - self.assertRaises(LookupError, codecs.encode, STR, "shake128_0") - self.assertRaises(NotImplementedError, codecs.decode, STR, "shake128") - self.assertIsNotNone(codecs.encode(STR, "shake256")) - self.assertRaises(LookupError, codecs.encode, STR, "shake256-0") - self.assertRaises(NotImplementedError, codecs.decode, STR, "shake256") - for h in ["sha3_224", "sha3_256", "sha3_384", "sha3_512"]: - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - if UNIX: - import crypt - METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] - for m in METHODS: - h = "crypt-" + m - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - # CRC checks - STR = "123456789" - for n, 
variants in CRC.items(): - for name, params in variants.items(): - enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-") - print(enc) - self.assertEqual(codecs.encode(STR, enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5]) - - def test_codec_markdown(self): - HTM = "

Test title

\n\n

Test paragraph

\n" - MD = "# Test title\n\nTest paragraph" - TFILE = "test-codec-markdown.html" - self.assertTrue(isinstance(codecs.encode(MD, "markdown"), string_types)) - self.assertTrue(not PY3 or isinstance(codecs.encode(b(MD), "markdown"), binary_type)) - self.assertEqual(codecs.encode(MD, "markdown"), HTM) - self.assertRaises(NotImplementedError, codecs.decode, MD, "markdown") - with codecs.open(TFILE, 'w', encoding="markdown") as f: - f.write(b(MD)) - with codecs.open(TFILE) as f: - s = f.read() - self.assertEqual(HTM, ensure_str(s)) - os.remove(TFILE) - - def test_codec_whitespace_after_before(self): - STR = "test" - for i in range(100): - c = "whitespace{}{}*after{}{}*before".format("-+"[random.randint(0, 1)], random.randint(1, 3), - "-+"[random.randint(0, 1)], random.randint(1, 3)) - self.assertEqual(codecs.decode("\n" + codecs.encode(STR, c) + "\n", c), STR) - # in this special case, the whitespaces between words cannot be encoded because: - # - ord(" ") == 32 - # - the next minimal value in the printable characters excluding the latest 6 is ord("!") == 33 - # and therefore ord(" ")-random(0,20)-random(0,20) will never fall into the valid ordinals ! - self.assertRaises(ValueError, codecs.encode, "this is a test", "whitespace-after-before") - self.assertIn("\x00", codecs.encode("this is a test", "whitespace-after-before", "replace")) - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Manual codec tests. 
+ +""" +import os +import random +from unittest import TestCase + +from codext.__common__ import * +from codext.binary.baudot import _check_alphabet +from codext.hashing.checksums import CRC + + +class ComplementaryTestCase(TestCase): + def test_codec_baudot(self): + self.assertRaises(ValueError, _check_alphabet, ["BAD_ALPHABET"]) + + def test_codec_dna(self): + self.assertEqual(codecs.decode("ABC", "dna-1", errors="ignore"), "\x02") + self.assertEqual(codecs.decode("ABC", "dna-2", errors="replace"), "[00??01]") + + def test_codec_morse(self): + self.assertRaises(LookupError, codecs.encode, "test", "morse-AAB") + + def test_codec_sms(self): + self.assertEqual(codecs.decode("A-B-222-3-4-5", "sms", "leave"), "ABcdgj") + + +class ManualTestCase(TestCase): + def test_codec_affine(self): + STR = "this is a test" + AFF1 = "vjkubkubcbvguv" + self.assertRaises(LookupError, codecs.encode, STR, "affine-BAD") + self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u-BAD") + # uses by default an alphabet with lowercase, uppercase, whitespace and parameters a=1 and b=2 + self.assertEqual(codecs.encode(STR, "affine"), codecs.encode(STR, "affine-?l?u?s-1,2")) + self.assertEqual(codecs.encode(STR, "affine"), AFF1) + self.assertEqual(codecs.encode(b(STR), "affine"), b(AFF1)) + self.assertEqual(codecs.decode(AFF1, "affine"), STR) + self.assertEqual(codecs.decode(b(AFF1), "affine"), b(STR)) + AFF2 = "ORWJdWJdidOCJO" + self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-5,8"), AFF2) + self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-5,8"), b(AFF2)) + self.assertEqual(codecs.decode(AFF2, "affine-?l?u?d?s-5,8"), STR) + self.assertEqual(codecs.decode(b(AFF2), "affine-?l?u?d?s-5,8"), b(STR)) + AFF3 = "QsuOcuOcecQmOQ" + self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-2,4"), AFF3) + self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-2,4"), b(AFF3)) + self.assertEqual(codecs.decode(AFF3, "affine-?l?u?d?s-2,4"), STR) + self.assertEqual(codecs.decode(b(AFF3), 
"affine-?l?u?d?s-2,4"), b(STR)) + self.assertRaises(ValueError, codecs.decode, ".BAD.", "affine-?l?u?d?s-2,4") + self.assertIsNotNone(codecs.encode("TEST", "affine_?u-1,2")) + # example of parameters that cause mapping collisions + self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u?d?s-6,8") + + def test_codec_atbash(self): + STR = "This is a test" + ATB1 = "Gsrh rh z gvhg" + self.assertIsNotNone(codecs.encode("test", "atbash-whatevers")) + # uses by default an alphabet with lowercase and uppercase + self.assertEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-?l?u")) + self.assertNotEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-[?l?u]")) + self.assertEqual(codecs.encode(STR, "atbash_cipher"), ATB1) + self.assertEqual(codecs.encode(b(STR), "atbash-cipher"), b(ATB1)) + self.assertEqual(codecs.decode(ATB1, "atbash"), STR) + self.assertEqual(codecs.decode(b(ATB1), "atbash"), b(STR)) + ATB2 = "N^]/a]/a a.{/." + self.assertEqual(codecs.encode(STR, "atbash-[?l?u?p?s]"), ATB2) + self.assertEqual(codecs.encode(b(STR), "atbash_cipher-[?l?u?p?s]"), b(ATB2)) + self.assertEqual(codecs.decode(ATB2, "atbash-[?l?u?p?s]"), STR) + self.assertEqual(codecs.decode(b(ATB2), "atbash_cipher-[?l?u?p?s]"), b(STR)) + + def test_codec_case_related_manips(self): + STR = "This is a test" + self.assertEqual(codecs.encode(STR, "lower"), "this is a test") + self.assertEqual(codecs.encode(b(STR), "uppercase"), b("THIS IS A TEST")) + self.assertEqual(codecs.encode(STR, "capitalize"), "This is a test") + self.assertEqual(codecs.decode(b(STR), "capitalize"), b("this is a test")) + self.assertEqual(codecs.encode(STR, "title"), "This Is A Test") + self.assertEqual(codecs.decode(b(STR), "title"), b("this is a test")) + self.assertEqual(codecs.encode(b(STR), "swapcase"), b("tHIS IS A TEST")) + self.assertEqual(codecs.encode(b(STR), "camelcase"), b("thisIsATest")) + self.assertEqual(codecs.encode(b(STR), "kebabcase"), b("this-is-a-test")) + 
self.assertEqual(codecs.encode(b(STR), "pascalcase"), b("ThisIsATest")) + self.assertEqual(codecs.encode(b(STR), "slugify"), b("this-is-a-test")) + self.assertEqual(codecs.encode(b(STR), "snakecase"), b("this_is_a_test")) + self.assertRaises(NotImplementedError, codecs.decode, STR, "camel") + self.assertRaises(NotImplementedError, codecs.decode, STR, "pascal") + self.assertRaises(NotImplementedError, codecs.decode, STR, "slug") + self.assertRaises(NotImplementedError, codecs.decode, STR, "snake") + + def test_codec_dummy_str_manips(self): + STR = "this is a test" + self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht") + self.assertEqual(codecs.decode(STR, "reverse_words"), "siht si a tset") + self.assertEqual(codecs.decode(STR.split()[0], "reverse"), codecs.decode(STR.split()[0], "reverse-words")) + self.assertEqual(codecs.encode(STR, "replace-i1"), STR.replace("i", "1")) + self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) + self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) + self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) + self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is i s a te st") + self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") + + def test_codec_hash_functions(self): + STR = b"This is a test string!" 
+ for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + self.assertEqual(len(codecs.encode(STR, "blake2b_64")), 128) + self.assertRaises(LookupError, codecs.encode, STR, "blake2b_0") + self.assertRaises(LookupError, codecs.encode, STR, "blake2b-65") + self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2b") + self.assertEqual(len(codecs.encode(STR, "blake2s_32")), 64) + self.assertRaises(LookupError, codecs.encode, STR, "blake2s_0") + self.assertRaises(LookupError, codecs.encode, STR, "blake2s-33") + self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2s") + self.assertIsNotNone(codecs.encode(STR, "shake128")) + self.assertRaises(LookupError, codecs.encode, STR, "shake128_0") + self.assertRaises(NotImplementedError, codecs.decode, STR, "shake128") + self.assertIsNotNone(codecs.encode(STR, "shake256")) + self.assertRaises(LookupError, codecs.encode, STR, "shake256-0") + self.assertRaises(NotImplementedError, codecs.decode, STR, "shake256") + for h in ["sha3_224", "sha3_256", "sha3_384", "sha3_512"]: + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + if UNIX: + import crypt + METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] + for m in METHODS: + h = "crypt-" + m + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + # CRC checks + STR = "123456789" + for n, variants in CRC.items(): + for name, params in variants.items(): + enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-") + print(enc) + self.assertEqual(codecs.encode(STR, enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5]) + + def test_codec_markdown(self): + HTM = "

Test title

\n\n

Test paragraph

\n" + MD = "# Test title\n\nTest paragraph" + TFILE = "test-codec-markdown.html" + self.assertTrue(isinstance(codecs.encode(MD, "markdown"), str)) + self.assertEqual(codecs.encode(MD, "markdown"), HTM) + self.assertRaises(NotImplementedError, codecs.decode, MD, "markdown") + with codecs.open(TFILE, 'w', encoding="markdown") as f: + f.write(b(MD)) + with codecs.open(TFILE) as f: + s = f.read() + self.assertEqual(HTM, ensure_str(s)) + os.remove(TFILE) + + def test_codec_whitespace_after_before(self): + STR = "test" + for i in range(100): + c = "whitespace{}{}*after{}{}*before".format("-+"[random.randint(0, 1)], random.randint(1, 3), + "-+"[random.randint(0, 1)], random.randint(1, 3)) + self.assertEqual(codecs.decode("\n" + codecs.encode(STR, c) + "\n", c), STR) + # in this special case, the whitespaces between words cannot be encoded because: + # - ord(" ") == 32 + # - the next minimal value in the printable characters excluding the latest 6 is ord("!") == 33 + # and therefore ord(" ")-random(0,20)-random(0,20) will never fall into the valid ordinals ! 
+ self.assertRaises(ValueError, codecs.encode, "this is a test", "whitespace-after-before") + self.assertIn("\x00", codecs.encode("this is a test", "whitespace-after-before", "replace")) + From c1d268d7be7d58cdd315f1f69047652af811a7ce Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 27 Apr 2023 23:46:30 +0000 Subject: [PATCH 14/62] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 78f9f98..3033e1b 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.03%coverage99.03% \ No newline at end of file +coverage: 99.16%coverage99.16% \ No newline at end of file From b643181673d14e41ba41399bdd315a7cacf71692 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 27 May 2023 23:13:33 +0200 Subject: [PATCH 15/62] Refined documentation --- docs/mkdocs.yml | 112 ++++++++++++++++++++------------------- docs/pages/css/extra.css | 26 +++++++++ 2 files changed, 83 insertions(+), 55 deletions(-) create mode 100644 docs/pages/css/extra.css diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index a39ccb0..387710b 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -1,55 +1,57 @@ -site_author: dhondta -site_name: "Codext - Extension of native codecs for Python" -repo_url: https://github.com/dhondta/python-codext -copyright: Copyright © 2021-2023 Alexandre D'Hondt -docs_dir: pages -nav: - - Introduction: index.md - - Features: features.md - - 'Guess mode': guessing.md - - Encodings: - - Base: enc/base.md - - Binary: enc/binary.md - - Common: enc/common.md - - Compressions: enc/compressions.md - - Cryptography: enc/crypto.md - - Hashing: enc/hashing.md - - Languages: enc/languages.md - - Others: enc/others.md - - Steganography: enc/stegano.md - - 'String manipulations': manipulations.md - - 'CLI tool': cli.md - - 'Create your codec': howto.md -extra: - generator: false - social: - - icon: fontawesome/solid/paper-plane - link: 
mailto:alexandre.dhondt@gmail.com - name: Contact Alex - - icon: fontawesome/brands/github - link: https://github.com/dhondta - name: Alex on GitHub - - icon: fontawesome/brands/linkedin - link: https://www.linkedin.com/in/alexandre-d-2ab2aa14/ - name: Alex on LinkedIn - - icon: fontawesome/brands/twitter - link: https://twitter.com/alex_dhondt - name: Alex on Twitter -theme: - name: material - palette: - - scheme: default - toggle: - icon: material/brightness-7 - name: Switch to dark mode - - scheme: slate - toggle: - icon: material/brightness-4 - name: Switch to light mode - logo: img/logo.png - favicon: img/icon.png -use_directory_urls: false -markdown_extensions: - - toc: - permalink: true - - admonition +site_author: dhondta +site_name: "Codext - Extension of native codecs for Python" +repo_url: https://github.com/dhondta/python-codext +copyright: Copyright © 2021-2023 Alexandre D'Hondt +docs_dir: pages +nav: + - Introduction: index.md + - Features: features.md + - 'Guess mode': guessing.md + - Encodings: + - Base: enc/base.md + - Binary: enc/binary.md + - Common: enc/common.md + - Compressions: enc/compressions.md + - Cryptography: enc/crypto.md + - Hashing: enc/hashing.md + - Languages: enc/languages.md + - Others: enc/others.md + - Steganography: enc/stegano.md + - 'String manipulations': manipulations.md + - 'CLI tool': cli.md + - 'Create your codec': howto.md +extra: + generator: false + social: + - icon: fontawesome/solid/paper-plane + link: mailto:alexandre.dhondt@gmail.com + name: Contact Alex + - icon: fontawesome/brands/github + link: https://github.com/dhondta + name: Alex on GitHub + - icon: fontawesome/brands/linkedin + link: https://www.linkedin.com/in/alexandre-d-2ab2aa14/ + name: Alex on LinkedIn + - icon: fontawesome/brands/twitter + link: https://twitter.com/alex_dhondt + name: Alex on Twitter +extra_css: + - css/extra.css +theme: + name: material + palette: + - scheme: default + toggle: + icon: material/brightness-7 + name: Switch to dark 
mode + - scheme: slate + toggle: + icon: material/brightness-4 + name: Switch to light mode + logo: img/logo.png + favicon: img/icon.png +use_directory_urls: false +markdown_extensions: + - toc: + permalink: true + - admonition diff --git a/docs/pages/css/extra.css b/docs/pages/css/extra.css new file mode 100644 index 0000000..c78f454 --- /dev/null +++ b/docs/pages/css/extra.css @@ -0,0 +1,26 @@ +/* Full width (only works for some themes, including 'material') */ +@media only screen and (min-width: 76.25em) { + .md-main__inner { + max-width: none; + } + .md-sidebar--primary { + left: 0; + } + .md-sidebar--secondary { + right: 0; + margin-left: 0; + -webkit-transform: none; + transform: none; + } +} + +/* See https://github.com/mkdocs/mkdocs/wiki/MkDocs-Recipes */ +/* Add Support for Checkbox Lists */ +.task-list-item { + list-style-type: none; +} + +.task-list-item input { + margin: 0 4px 0.25em -20px; + vertical-align: middle; +} From 8eed486279e6f2a531e30fabf8772d06a56fcc50 Mon Sep 17 00:00:00 2001 From: dhondta Date: Fri, 8 Sep 2023 16:06:21 +0200 Subject: [PATCH 16/62] Fixed #7 --- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 3 +-- src/codext/__init__.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index d3fbbb2..37e98a8 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.0 +1.15.1 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index a2ff0ef..cb32c75 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -1398,8 +1398,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, except TypeError: expf = expf(f) if isinstance(expf, (int, float)): - tmp = expf - expf = (1/f - .1 <= 1/expf <= 1/f + .1) + expf = 1/f - .1 <= 1/expf <= 1/f + .1 elif isinstance(expf, (tuple, list)) and len(expf) == 2: expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] s += [-1., .1][expf] diff --git 
a/src/codext/__init__.py b/src/codext/__init__.py index 67d6b5a..2a37ebe 100644 --- a/src/codext/__init__.py +++ b/src/codext/__init__.py @@ -227,7 +227,7 @@ def _format_action_invocation(self, action): else: print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") elif args.command == "guess": - s, lb = args.stop_function, args.lang_backend + s, lb = args.stop_function, getattr(args, "lang_backend", "none") if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): stopfunc._reload_lang(lb) From 10487853b55c0434bf52465ca92cee0616965cc9 Mon Sep 17 00:00:00 2001 From: dhondta Date: Thu, 28 Sep 2023 23:59:51 +0200 Subject: [PATCH 17/62] Fixed documentation --- .readthedocs.yml | 5 +++++ docs/pages/img/logo.png | Bin 21838 -> 15408 bytes 2 files changed, 5 insertions(+) diff --git a/.readthedocs.yml b/.readthedocs.yml index 0e991f8..aca74b8 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,5 +1,10 @@ version: 2 +build: + os: "ubuntu-22.04" + tools: + python: "3.11" + mkdocs: configuration: docs/mkdocs.yml diff --git a/docs/pages/img/logo.png b/docs/pages/img/logo.png index d14178df4385b6f0b9635f3fb79ce7d2a4361d3d..a1827f847b39fc1358e580c291e151d0f89b3bff 100644 GIT binary patch literal 15408 zcmYkj2VBkX7dYM$GBOh)BO%n?L!aXAz4x`cd+&WIDLXWbP?@1*luf9tj1Yy$&KAjv zME`fc@9+Qm|L^N|pL3sap0l3woOABGRct0{K-j3TfPjDj6f&L*w3~bW`h)_Xkr&2| z4+t1~)JYIKOMaDxBJxge0K7&39f&*Zq&Y%Jsl1gE)^ej=@WuADQ z1(*(o$D`t5Ko?GHx0rxQL^L=a6c2+TA@N{D&%po32@wp9N5uE=!;`6HcHRGPx1LUw zQ>OVJgI=MY!N5T9N{S53=HmTutB&wL#@s5qLkCFZ|H$b7-MeS8(`QxvuUDyZ>l6SX zkV#MiAX}wQrm@RRz!cs86`zVvMRVm^vz*W1AXzS%ScZ1}54%|gaPWUnvjGXt0RImO zQfL&*Cqb}In!ybNM1+N491aEsEx{nzRsxrY$5M$5mDA{B^2pE zx}XlC2CHMrrEZp8YEjwj2&V+jRf7aHFb;_nTU{KJ9;FdsJ#GP!Er)4v7%v`2*E@V{ zDS?9F0_Zq!3a}{-4uBFY0&CU--*jl0m!)**F*3KAqc9;P3P;bdorLFt$sDnsi1OQe zD3!w@I49(R0GqHFq+JSLZjep z2|dho^^Xv`Bwiv=L_xp`)f+iro{Q6euKqg7el 
zG%p>`AgVD!V4e&G7U}`hB&sC}saj4$$jw?fgib~ZL3lNvEtJSPY)lWm8YGo!k|20c zu^me%La-nWO{f?0kwgz#1N92fObs96QwU`uwB1E=&>>JFg3Bj+%t#AJZ37I(1}6(? zcrpq=r4;FDRx?_OL0JGHVo^PPYAuQB;ln^!5`qWPx=3&oU+q+rfDyp-)IH`b;G>y5 z76C65GT9P2f=v@h0AXQGDkw!rg(IOpm|6=HtMOJq8(1c=Km$kckrX2v)gwCxMrmOa zpgy-B!$$R}NsdM1Y*49FidI++Xt$V#XF=r>rCtc9L)BiU2?2qVl|m^6tJAquWVl!? zb?8ZUnh6U9h$504vL5LxMR*#WX&_3_fT*#0j~_4@U=$=a;!Q}vkRg66oWns#y)qjV zE2R;wBr*n$pvh?>g~@^w!C?$wHkPf&BaJeG1P6qb&8A> zC7tX<*(ekiTuNmb$t)Xzj$t!lXs*(#anj`sm5xnxf{09%*g?We7#c6w;86Pc%pQ&i zV244eu^6Bdhgu<^%B4yUUk11;BGYT78E6WJ3a?RNVSX)_NCY!Ma0^P}(h*QxGgk!T zB0WaG55f0feMqWDE%A^M1|9~a1e3fzy56tj(=`eeL1bd;2x1P5Ds{pPE`^3j(xBV| zw@9G@X&?Z9G^1LA(Lp?971+ZfNRU{J0_N*+hDIU}AkgA~u}yregbHUFU~D5@CuX}W z7#BfdaRNq-RV)2o3J9&o!2ug1Qt28fkO+*Trgm1eL_< zkw94D9@VPd2nyg|v1*XP#Xw*+Jg^CC6e7fK4BLls2-!9P2J0cS99SL0MAC}@^n55u zqZbkc5I2|Ya=MUA8c*R@DBV&%O+m!L6)q!HDduqKXbgZBFA;GCE*Jx(rLd(~tqkIm z8xTqxPc0#PWJ;}_fYf6oP=*zw2mBjRq|_qGfQzxJoB(Y^gHnaiX`xm!1dfFW*c_Ty zPa?}8a*$iWVT*}+kzQ+%GiYG1+C}AaAt<>=OxAONg=&qRf)Y7tasLF zqm%|3AORwm2*Io65VW7ba4?y8qgY6_5n*|2p%mR;_4KT*hATV?&)}qvF$OaFKX=5>Qz z-k>JC1Qr35GNQ~2#d(B$ImN(GDUCKcgu^x(Su&wn&XEa;Toxb6 zmkX={ECs0+YaI%qmnKtlWK4;J0}{bG0vgtc;lSxAD%Ys+i;Z>}-OhLD;1&elCUcnJ zd?`RA)PhziSwaMvha&dSqXo_c*6348by|*8f??axLblvu7a(XNGTCB80+)-@is4(? 
z2tUE6QJP(Fv)`;hQkZnFhbYl7!~&0=Ec9~8CacTfkt3O zV899m7;N&}EC!84=alk!P^Zq}QQJ&(BHAIb$c07&O#-qpgf2IMB2p~Jk>EVAP-x*h%oa9;%JP`}207h= zaTx6kHJ}Z5=00joZoXVFhzJH+(K3|!C)iQ zAyfG62$V_9bXzbQyHamgVGUrMSgsNKgiI>Zs)55v0-{4~)-z~m7agjA0K%6*$yT~m zNpgBYSfdnY0BJE|lF-iep%f|!-h{)*JustMqbI8DV57-vws`C)wq9f>sO)O4S*(^~ zr3kB<4uiOqJQvc;P~(s^KZY*w;4yrbU(2(Rh%Ob!B{tg;G^*67kPBTl2t+AD868#@ zQ%F)%G(-+psk0$WJ~J2MLqf=a2?`lh7u`iwBj6AQyC+r>!yyDFOY2mdFc6|uO5mcL z5*yvcra75Tl9`T2DY06J%jISHxe&b526%-Yzssg+fGZ9QrcvQ|3POsCKXohJtgPRA6&A4<)kFuy`4qY!EW7Js||jZSf+cRF{uPHn=%lnZaei zQe_G-6_`MyU|~v;OO2y5p-Kpoit~x!a-&+KA` zMMK9is6G!$52D!FNQhjfv3a~Olh30f*=!D*4P>!X%vunM%Jh1TA`ej`=2*mfmER6* zEkYtOzyT+iSx}^e!+ri~C)Mf`($U)#eW{E?ha`3St zm5oJkOFS}&Q*ALq6<#J%D|DevN|KAmg=no(nFq`qg&F+>kXyj#p>-l5Pi(}pg?7Nu!7?P4;MV!>z8n zGJ=i3rP2_7qsQmP_861Y=K-OlAUVh)@OV6Y23bP{Ni0SXLFk(V7j#|1xV?$_C(xNfrx}L!N7P11T6E3>>>t3L{w667L=KSql%;~ zpN7dJ%1s8bP)DYlts1q;PX!KAPq4za8f*lH97EuF9cGE#4+jBP3eYd^34Ub+7$0s0 zapgXTmWwo);eNZxiuLnxCauUT(it^EAiu+sEd;t2Xi;RQnMMMe=vuzigtz!88oPjO zbSMaPB1{A#6HF+!OD@nlDIT+5r!Zl82tSrX)!QLDuTMmQLkt`_4T2=nlys&8;$vc6 zY!VkGGxVHoAcQ1}Jqj>LB<4Bje1u+SWeUkk4v6NIS-34V?eP)YKNwZMQ zJT0B=SAwN3E5Ra%3AjLljHP?YOryahMgg#2&*dWpDuvIa(NY;85J#-0vW;Au4N8|1 zg%G9?Ok{{aQZ5%rcZeWvPig`}s@%Y3&gJNlT$ljpW@?paBNEvIxs`T^+od8ZH8Q6H z=)hx*2C@{);u|RrtO#SG`CTrI4(YO>P!_OB=VBqHZW7WdqA+w=p${R_Fb!O!k%eM2 zoL~i%>eng(svsYzn~rVu4$kDA(XT{+=X7X~0v&Kqrw;LFx6tMXH1H7;;LFR)AiLx5r;I z-6XX~j@Ci6IuykKvH}B8J|Flr@s)Ie*kMFlSRS%X&jqNL$uKN6mqkLdfpH)y&?}%~ zH{R>=Fx_;nAIWt06eqB7l@kfqLWM}MK!PBOU1$}{tny)vXq%m8)6$=z_NMeRU={2-N^ z&*>>00MoHNH4@|2(y&xBOe2OVq$ZnzZE>miW|YI`_V`Gg9=vP>lR*XmJ;kFD09iZH z(IW>Q8f3yKtxO95w!6T70;+a5(QK@kBD~S)L<1(rSA#3^g@pX4WP2KxLOjdM>t9!7)>K!$Qq6l z5HgV?f#`e=x(Ch0qeM0~jHbq^JU~u})tXEOhr?@gAn4$pEQDZpNYN&P$jU-1+(x}# zN9*x`77~Gt!qO~$6s(6gEQtuBYsn~XkJ7DV5FTi3aw>>M?QxwrKMh6G_CQ=V8YEWR zKu`ciNhnw^tcO@BNPB4C=Sg(-B{3 zaDP1=y}Pc+T~!Emyr~;GaKc`=;r^vpuU>sWar9`G(`xsnwLAH&^x)@(ude34A2@JeYFAg+ z)*DJlZ~X@8!O+^IDUW`7K8_CG!!515e*MfIeq8qV_YIn)THA{LflNzvqu#ae?Af!W 
zVaw!lm1>WNKv=fn@a7#yPM%!Bk6VVHAj{EZsL8>mw#{iF8I_dyDBm=g2InWZ$6#0qp+~>`MY<0 zw$Njb&7YIpFFJ4;cFB<=kPVwQL7q>sMeTEYJhwZ(eti`BoVKoo`SHW@aR6yQUIH&P z@JfFD`ZayP%*4ctH*c=3QPILX`V3MoTefV<)Ty#DfUrcJzkas!*6@ZUYNYEAR*M=g2$cy2xJ+Rd9YW`RIC!oqK;b%_fW z79R`RD=eE6B;LMh)BQswC0C1LKLiE_T3xQ=Ymb>in)(bIK79PdiJ3D(=gyngFC#Ov z@1gfmwNL7!c6A2)89!r&iJX>pKEJ4FOh!hAG44*NuxRkOja#;DTX+3B)$8?M&CJa7 zIHHiHm>y~DTC--&h_)w{Z&`WMVqz|Z<_|dJC#8?0Q7E#!1C;f#TY#OT9Z@F>7w=pi z!YjM?>9zfL_EbPb-?3Q{Gn%Nmvs>2gIWenj&E69>FKFPg3v>oU)iPwrki6E{ui@)j zPB}s(rs`x0nY{TRdrDLR!W6c6@#5yTL#B|BwX@ag*w0VT(S*y7G&MB|qH52vgF(x% zcMwphgG3|}$1&c_Ij&dzVH~H#os0Q#-0-03)vR&jo}Z|y>b7p7yLU=60J}Z6v9Pc! zVIy^t?#UTAZ|&mzOBT(l;|DKWj%v*(yT)9}-3`K6rshp;+HuVK{T!>&{L{1OhQG~>g+e}zT(ps(Q- zC-Y~&N$E}6GDbO2(0w~`op$uB4EoUXEAEU>FBZ?AKXzox`rW&CUmV$EuFb24PX0F~ zAtC>`u3`D6!ti?ug(n!EjnuUnyX?vvH`wC2fI@l~RLY%CZ>`HL$VystF6zZ+{el5w z7a*!?rW5CF;uF)-lH(Vgr#Dkpk4^IbcOjtpeCi)fe0+TOj=z_Gerle>T{yr@GnKr{ zeDi8*uI=zr$|Ofjo;8qB_@d=KySC`PjJxmcts=#rO_wiU-VUukVQO~Yx_R^9JgXJ- zuHrs_bX*>Hq-4*j0)h14)9HIGL)PJVvnGf9Lf^|Vhi58jG+J;Ha3bE0Oj+3Rc?P@e zo;l{+<0nsMydK@hOfr?f`W;sMIDW{yPkqff+eT7~mWQ0VaHC~O<$8Fgkj)OrlSkDK zw{Ktcm4a6)l|#^du3WzS9u0eQ^HARiMm6tlR!+f!;|-RB*P3z5bH<5smbQ%Ob-yGz zJUY29D)O{wzE7o}JNI$u)+Wzqdh)XSx!zOc0kKc6k|?@mF4)|ZY4Zw;S8CtaH12~_ zk2dUMWyW?rzOby{vsGDH3)dd_Sy53DRGDBtakAqnqGa~Quqo4pAO3zmW%Zz%`hGp$ zSZSf-*Vl)i=zHtjzs`@doUKhZxz~$=mT#R1k4*`lc(%N4 zb(HM$Pw#}g!S}~q)qahRXjKc+b8CKd{`mO(skdeA41mTK-NKb)9mQ8dTH6PvP2uui zcF<1u`Bq-LKKJu;{pa5WiayT!57GA@>~D-Qjl6SlhUAFS!RI%IaI+Wp9!^QW`C)K+ zM|iP-VK55dVC1bZ{npUq_v#9Xrxc6WZWJF5igOl!kFUj`g z3IB}&p)c6SA4ODaXXTCkyr8@D!CpW}p<<`=*!+L9-z(ja8BcRF)=%xF zZJBYg<;9_h_q`uXU0yUw`L9jBwsPtX3Y~r+sP*`R`G0l_%7jDv9@@M)($NPLIj!dD z=#i_FM6XcZ87$2oxFBZxmaZxDo70BwYX}?sdeX&~gEMZAxb-Ne>2lTxOiu2Y)~bWj z<6H6RxjXMgoqxkJG2qmU_x;4Q1)uia>6o7jy}Ik1DssddIVbPJqnOB%(vc~E$Lh3r zy}ycqgRu2xUILzQ2G#Up+4&`3JdX6r-NSmRhX3l=275e%pAj&P&);==Uw=sH(wn7- zu8n>3=-|Nk%?)KEql%-hPrE1feIGakvU>Cer+MIzvyD$_`(Vbp{`X^R`jqz@5nq0A 
zSKGx47h;E|e0p>onl-s>P+vsM-F`Dt3(LPYjrlom#K1mSvkiTJTb(kUv+MMR#&s8K zvzN(kJe(d?nVa3sG-P@@ipT#G4&L>*2z;0|?v43eY$S-*-ugPQG99!+F+ST2oWZ=J^-*z@euOwaU z>PxKeul=-a!O$bQaiSdyS0)y=PWshl9iGxW`Q%*o#0NPMh&TP{cM$36VCs7d2)vI*yg(x3R1mdT}()M zcz42@gF{YEepCH$en$Aeg#Mz>8=p7K+8=NuF0Pie!iWZHgT$HBVT^_>;X#W0b^1X}qigk{w%J7JA2x&6xM|jX552*z=S28-Ky}f>2q5vuKkDEP{_L;+m)4et|nj_R|r}4#%NhFi`TFGw=v>l z80LrZu-dwTx`SU@{_^UWm?sBUI|K`c#MNx7N)pvu+sk|PI+?K|KfG&e2;|py%Kgx$ zEl|zu@Q%lh+ot|uVg4hu{P=qVSFBB{7#_4aIr?JLS$Xi@gGryqKGlwSIQEXIvTp`s zX!(_Y$;%r_*4&2iNXyf+Yinw1`h)y+x$1wL)f>jpTM@|6l&z*%9dXBCYsj_}#iv&f ztE}2PrLD1)w_`;?qUZDS9XocE&Z~6y0(e{Oss06?EO>O*s?C;?A*td3u$fO zNmCIiSG~tp?-;jUaP3h*qVH?WzpigA>+R$}t)1$dTNx8i`Mz&#?6J1~ zC%WRA4oxZ=(SM$mZ0H{i-&o2p1h-yVHFVt7QxA`74u9+D*#7jWhIKR$@p19{P4M1F z^PgXK@9npJcKN(Y``8h;w>S13U3#;2-+$SIGiR31i!fAV4o<1fj_!>#+&^Fm3JOwx z|8TEg$QE=SIQ`SMp;;Br=E}|WKlY4E7ztJ7l)k)dx<@IJw6CJ^K^+ z>=(_sPm`}cm3GGV>!Y2KK6q~Jx-r2WjXYLrOIzEl-JZ6mJ4khxw$E!E9zl60%sp|U zZ=i8*;;duso-56jyk4%`Hz=zUNV=oEQXBfTK9SnGVUvG&ZFt*k|Nft;+|{FxD6b6( z-6CE!%c5JId5)AlwpY^QH6x+-5vE7qOOsPO)5n5`1QKd8m&K-b)tm{feDQu;U4dU%Y+$_U5=_=V^Hydi}a6bddkl`-XjoO?Qj? 
z2Dj#t#)2P3ufYzg$4x$Dc^Zc`&)>qV+Rus~wxfJ*$Cg>5Eo06X4Y_up49x z{W=k~3d2zFlnYs=<%_M)Bbq}e)Ls~5A%!;n`kAJE^P>91#-%ab+=OGU2M^6W-_pDC z^bahP{kAEgwH7u7AF( zcJrl+g8e1J zCE&JNLuhd9so>bF-h^)vh}3;!j`U)6klvnnNyOxNe}onvYHiGm>`m(f@}HW@+k4>W zw{HdaPFfb!E3Tdhm{q%L&i%M)0@T9N4-tK+*z12_JJ~rQ6RXwke$}lr!dqF*+gH_G z8jxS1)b@)ywRJ^Ne%XiETHK?8x@iYNVW9D!kb>e1o#Vd!a9=(mBGo*+hN=G*X4-zq zGpI4AQB00})Qwv)aLHKn*H_odQd3hkKR!MP|NEx{`OA}=KR^HQ+nwUuvtzK6B7Mix z{wQPScmIuzS8dv1IW_qo9JRUh>R`@OZ_MMN3EhQj^6bO6W%UawZS-{P9ocJhP4I-f zS(<+{S7t8k7VSH>GqpM8Lz8*JJ$qGTX=2R$Btovd_rR#?n#|rcjIFTuhUn<+`}!UV z3=WNaoRT*m*0HMUSM}Bz$3Q#NIlfOp^R|eVjndCOEn0mF6#n1uc|)#W7kKwq&N_3_ zx*|;fIB(jQ7d3P0{#>X>Z?6AYF!R|Jc-xDoPquy|<;^#2Vnv+OuWCz4Uthj(;#$g| zj<(*ID~v)^yCEUr@IHVDEhwdHe>+jS6VrZr>3*_e=7HOluxpEx@7tLXcZ$24k9U_g zEE}QuF*G&)%C(^C(71tPua3p#t5Vkw06m!~EG$?Fgn0Xj#I3E1JKu<(Bz)YG>9B+|-&lsFEd1FTnDlrv zSP_|mTi!n~sWTQ*lDGQ5g;Q&vZ!SSpJG1@#%~18NLdoQo#H_IdvLmuEOz_S$}bD6#6SIJUknH zaYcCVq|O*_wz7LdL&)NrtCQPCPpFZ4(J%krnE{sl-Sc6;yx??-`*whO?w;>|-!DWt zld{JQ+?jgy=aP8iv{y=fpfLJ&<3k~B4uQM9PdRUL_>0{v(Y?r*%nHo78|UAxTAyAV zDfuy={drS`qW8NO#5Iu>7jE7(eO@_uBFO%VeC`qDZq}wFn*QafI$;bx*d*)GA&zAAnyo!4c3Ssej}>5j z!HW+cX68SKgp?ak2TeWMe@>E+R-Q$ea<}3kWAuZeiJh{Kr;Uq}5obq%9u533_~osj zYwhnVCr_+!isBAw@-Ngdg9yvQ%#g&;rU$i9BI5k*y9 z&{$_}`uvy6Fs~oWYtW&iievi%US}t6McRv(FCi8{jo$BfNG^BA?b*{v5LV3G+|j8W zxAz4Fw0!(w-SBdS&a^+U?8)=mN1Yij#dS~o=cL1#pY9IZtw_zyn{(v?Y_N5}BwFx+ zk}1&(7M~ZtJ^8!qpzX_7>-{Y(YiQzR^4VzFo3PZihpPVm9G&*}hsc^4I-v9My?d_Y zK!pFhw#GGdZpfCMSMH69w`48S@6)nAZ~edPIFp&9*IsNwJ$Gcl?4QUc|%J$w{%&*5s>5iZd~5trwr{n8#L+AzKA}5RVB#n73qH$ z;`@AlHBHi*aC1)dWS2gpDPY9eiiykq{@E3XxtegWI-gXAoqaFgTfOG<=9Rsc!cSi8 z`u*)=^u;}j%Yy6FeRc{|j%~^W;xkRd)ESuJYKFkdc!hv%W-hc9JJo_leJ_ zYf|=4Dax2z(Ryc-alWe?E4?2T_Icq?;ovOV9q`rR_@>{9gkK-T%Xgq7DLsYGz23Gb zCvD&g^OQJl_Tf6PFuM=(XlBe<fX_EiU=;#Ui&kO7E+mWaAR1gDLH1AwM z*?_H0=?Blw?f*MzqRW}kq8rsw=<@X z1PPu}rg!US@z(FY05U7<(C!VmqVCB*qbsjBzWQ#P2j0<ncm1%ErTPe49OFdR2MLxv2~^AddF-_I?x+*ekitOh0cA3v3c^`!aeDaW}43q`jNk5LiGN#Ro~;d8zNKQ 
z1ke`@FE#L&RtvdvS29hRktzER%saEXtNGvDVI?r)?dfsccQMcE47ChnLvE8$7n?{-7iw|gRE`jhai%e1(8Et=Nb zqwmpcg7@C6*BeQr{8h!pgPt9jyIR%#yHDW1WVWVF{cwhkAf3Cj*{HZ0**>l6(d^S? z)$(EcIuh03W2;}^j5~I6#O>(XovE{AFE34voasy{LPsXNuIu~c%*Wc4xU0w3HsqvF zIp!QbYrf&rnYtqae8H0uKl-enI;^)+0*dt}<_41{S{`}Jx4dNIEG-tYarkG;T& z`+AU-{In$fj(EU;mBf;=!xa~spZ^)smYiNhYY~6-4VRR1v)&xqZE$LT{JJTkl^sp} z`A}mOL_PQ+&75@@C|Q(B#~xYUSiV z!raDj;B^Q3j@)>u-(>yL+`^fzxXvqK&j^FKD?|azg`wfEeMaz*{ zn0_M~W?!l8XMj~--M#TZ``TWShsXN*$mR0#6zQ5s`bsHL5scVzGji_P)Uf7ThjzXm zNDRF)?yBS!o7oY3H+Lj{J2^45jc|9w`0j+6Ph;=z%wB%pZu)Wc?cZN(?|iXBi=xkr zf3+nTq<{N+&g3s2Pyfi7o$;bSBmuYe@ZX|8E0W(mKbVYRuL)Z7>`TPQCMj;Wt-Y&n z)FTn{`q%Y}nY$NcTiZjqOY`b_MZ%MkKlMqg`ZA^!HSl7=ae_l*i$DH0`{4T)3g#0y z43xI(S>)a@$f3Ra?kBAp^W=JTLQ;b$WcsCP7YfrW(6;O2ThIOqE*{dC2s31raGR$oQ&Zeqgh?Sw6 zBgexAaI!a*#JU2BSMInmNfloQ^@0V>RgK+TuNrvhS>?)xsf+m+V>UKToLSz_zkI-z zO}uHjyhZu1^Q}Yb-d7HrHZY#pZ~OgsgC38bwf*nj`7-tNltq0QM+2{np1XEH&;irZ zBcGE_6})}E?V#)PiMbi@h;q4A**wxHkb-TFMl3-9k*`}dNpR9d1vsKmMwzK^}cxfYG_6GSC;l=#7fP}h?acE#~kY)7QO9s zAHenrX}WwlyInn`@0Atc2M_*Dic9JGudeaU>MCYy z&4SvF#>lI8G8G_yj>3fA-MPrlLj-1^iJA^h7e{E_%*3Y3^Ja`)KM8N2Fg zO2*A+Y_&I&N2iS072KZv{^etE{HDzfy*5`Z9Q3Gn$Av>#1)vyzkXgN8NztW&^KgF@ z7u&SCb^ux9>ZeV9#m;_e&lrCiH6k zCxCQ*o~1v5fmM5_s_RtZGs#~*eODTrj~rp1usq#f^6?(=@9FsJK9M7~Bp%6v)K4mo zE{vL+vte+yE@=crSIia3vH7Rj!VNF>{W>1m{9ac`OT5D0lgVGcOVw@XgV~cP)===oN{% z7hlr*ev~hN31aty3C92ajeivSV|K@;;9TB@RhJSq^$BuV*mb+@*9Z6GI;LEoGuN>z zXr|)NG7ET${y+1z^)~dxr!naxUvA@7j^9NEJ$_X0zVtG=Jxw<(A3r<{4M<^aDi{1!hPXEG}T^d+(uKtWJ(U^QZh!FNSa|Vzu{pv>< z06w&pPWZ~LtDm-U*`kX#R_!Za{NT+fNl{U2#rMlQzqB7&lYXu@E|F>bos_294ol`D< zNciy7jfuX3xKLBe{4$OBFSg{;Hrewg!!P-)vx;P`>8lDm)lAG3o;t3Q%-gTJfh`Ksv z{<*-?Y3t{9dJiGuQure+BX-jUn~O&rsXTj`j$A*mrQ_M+gS;5ih#6a-JUfshnV8~4 zBHuERZ|rr73pu9ore_aghO9Zfa)DrV-%V;sLRA;!$kC%WgGN>@eQ^KekA2T@6-Qp& zo1a}?)2Aa#y=|`2KF*NL_^`e8e88;BhcUSmqN66)w4SI<)U|Q3d~DmkBYAmw<;I7Qg9=)n z@#XF@^b@S9?FXJsKk~0Df6AfP^LIZa2fbakq`PMFpZOSLiU{Tkmp|P%Ci74Aou{We z3nzBpY8;#J;&#EGktvFqw=NW&9rmVz`g=|%p(2$&FP-u6yZ)Ty*PNM4i8C5WDVzBj 
z?;8fCe*3!Z7|DI;(xaQ19$Vh8^Vq1G$Bt2tp5Kq^tyzeqo9DP1oIj8O0ezALEpgXcdLTGwZw5EWS*Dm?`tD3BM!eWE%H{ImGdsN%zHC_ zM#rd`AwRC%Y`uZX`3~jn8!%~fN@$m~eaM>d*^@&c`n>EixtcgY`pcBQDj;%r zu6xe91Vj|!#98&@f1Rr*H)RCR%hR%Et{S< zuaDj(=7VrKK6h>MnRDq4r7N1c4xCp8(_^ZC$$ySLy?T2`%#LI9 z?$NlbzT8&b`BBZx^T~(4{P0_QyYV-ckBRybxTY}tZ&JpV_2W$$4KdwUk3Va7Zhuhm z`+L~14cE&8C$7{knZ5o`Q?>cj23zRU__U^l?`uO(<{Hyi^^Y7b;!4*=r=c?y8gBn7 z=U#}Mr+fG9RrmB(AnT_%y^WdHU$rdb^rtnSe!=1d8&8S61(#`_pBJ>ZJqs)-S>aHcZc6u{I)Pc?s+Mfh$1=l5roep5UbGS*h^}^+Cp+jc&hVEz%Jtq&YOP4`=rG&z8AEzOz$M(O0lU*CpuYbjL za0JLRIROtdEoj3?Q=4_`Q##j0Rv#KtJitObOC?T8aq0`Ew;vlp5&l_yJ+AUe{pMBL z=Ci%eC$*1nwiKuRG$n<7W)FHNcPxITuzJ2@?ZOH1k7r&R zxMEP@qCVf2FRu@6dIl`Ue|-Nc^XqkcL^YnnS^c)ND$BBNhN6ZwB`kZzdSt&9I{J}7 zb@jWX{&9EH&7;4s9smknelv9Q7tGPX*hL5SJEV{l!}@G&>5;+$uPJ?GK;&ZmwIMf$ z&1twbZVpE|`!M}m&yp(}!~Of>QN52(Xhcv_ZhwV z&91FEF#b+o`|JS%;XRXLQ{MHs8<;EismHHR*s5e!C;xgLya=cTJrAg_SRbywy>0Cz z+Ww3Nsega6N5To&*i}M+cMv&@mIE_yfRJZNiIS-*rJ z{kO8Im@zTB*Ac<1ht^C6W2f}p-;CcPJz-o0JdqO`xrY|;sOx95<8bygNBWUww&+z*hoUj-e(YkIHwmq$H$#Q5k~Uz^Pq8e#fRq~-!^uj8f4Uqo3 zDm?z>%ZrPfQMt?V_e~>9wLw?<9|B6xHD_D{qTe0cbSq3Zc?q$;Jv+zad_8{UueY)d z>;XR(Us*G<>0H6c>T4yPn_^RrZ7p7@UKd_Dtf3eA)Ak3GKm9X*c{d6<;60(e>3vXU z6r=iGYV73$cVhRJeLJzeAT7k(JR;#-$(2XxcdEeaFJC;i3--7Ek-lDxo*z2sa`s9@ zb<^rR$bos45f_&aIJ$P5DP&aClA=V9{7v1KMPq~1B3!zljC$m=<;j_Q`_~>uAn1uf zO8%HxY5yI4WY|xfmwxBAZYugobMxrY2xB=${isd3anI1Xz>hvSvpN3w;-vJEt_bo> z|A}7J-aO?r;eql}TG^#)*xK0RD9h9T=G#rZK$tm z-Lpr(0&2&ksfgzTSpLjFCk$?Pd`SJ@br25x)OJTC3%C|tnq4&5Bx*Z@dc z$hTg1<>TVw!p4su|K;1a%&aWh--7FY=lxChj!m3BXFl-g&DHDIu~ceU{Jb-lSWkbi z*0uO&oPJjSGduYpco^vBOtNcT$m|#0n>&x~Ss@Q056Eka_y!Vg( zJ30VQ&znBJHa1fVjd-q6Ws$}yB{u$0?0-xC5RhDBF#1Dr{tyUWDG?d<8Z}S^>Qx$X zoHR}%{_i&k2m)CQ0xt$bm|&1(sQ)(*JO~5~FrNzs`$I86 zi4vzaS%9}C7_kNcTL6^MTCGyTkqF6gMt=wa4n+n*5u;tm5lkw@AA$wS$~c7tc!f(u z3e9L2v0ji66sG~2gW*BJK`@{SE7xmOKqCSQ90Ur2!H|$3Fk|F;qXKqDe(bUo1m zsX!n1|C?>JQeqUy{<}b)UZ{x|Y7}N2M}#nG)f7z}{J#q`OZ0{~fGhvu(f`do+S!<> zmHf9>EHTH4074-CP&~j}ahyP=7pQ$n!QhD!ig5d}*y 
zVH0pzyo_p!6yk&NQD!uh6sfWX6O|}*gdP${gRud9z{IiS;)nzWmtat-DO89KB?HkI zU;@!1rILYh%)nYP96iUF$l`*;MsqY1ficBl*id)`GCDy7vPxJ9Iy@a{iD4+CnR=i# zmrh`^)M|600ws&4KoU4iIEGA@U?a>rRk#=>0CRQ3L^Tu&HA|I9<|qw%kQ^9*sY|3I z%mlL}3eDDy4rGW%TX9@8L8_9{I6Md!3PDD4A<-r^97dKZED$Kl0@bJxibxF-#01)- z*-E7~N=QUP!7?QTfhJRN6bwWk45*t$57SCbFct?(74ji$tr=(@9A#7)kx@)NSRq6M z^GkG5ql1w7kz9=w9~>!W$`O16P-PUsHJC&|%>l+@tSABs&(l&zS8N6(O#r%LpyW8I z#>!!$FjfXETB8$?El?Pa%!27qqx>_Im~aaW8W$|JXoS*uKA;L1nmG!Iz=23gvvqWi zU~0IAjt3?DXA%OIM2!bZLJSG5p#fDWnALA0I zdN`T|=VJtVsGLtEXvyJdVB=#bLY+XQ;$b;(A`xf=>~?|~jU-6Id0Gn-ql3qxwHy)O z2*_8##-c$=DiS3StK=G!7EQ;}{Jpc%)jOPy?zOuI33Wk+6htnw1Ai6k#Q`L*c={Au)sicD2$v) zC+MR`H;jyAh(mLXAi&~d^bx=W5qzvc3Sx+5TnNf6vfwBxh6y7GqG**G6duo^;AsRCi-48! zqwtZV%7ta|xl}M*0Et6@kVKOpQif3JNhTJA#}kVYbgr7Il*mD5jMM;&l4?b24o|}h z=b&*2Gn)xPkin60O1T9=k{}{P+s09=*>rW+%X(W6wzI3XRNM2^Ku%oYP7GT2Oj0w9N=(#udL zr2=Q<0DOst$14~_3!RDr!S&&ZDzJ`AQ<~tT4uh$U;3?zPN+uW`A7|D?({NygH5{eY z^B@$io|S-s)A1;X9F_8a4pwpkswu3YN0-S)Xk8@6dgD+GLA|ia#RW;76u3< z5*iLm0$S9Ip2L_qq1t=()Vv<;> zID=k{VW6U|aE;2UkBZVIvgjb`sG7z_i0D|r=V4%0 zCJdxiAk^&O$S4g|2UJB$!SQhhoGG3g%~638Wpo2#6r7MK7!DY4fkY17@P|0L2uuh-`#Xks5GfbR@`#qi`YsphCcCS#Tl%7!5ipJtCS%1gtv{)36bo z-~=$)K%uI1NIKrcQz#KS9@h|^sEZEPkpxDm(7@4id1z%YM=X*`Kxh($AZ15E%`yx| zVi42#R+c4!C{Xa(!BiMW#gQWsph$eYF#@Ryk0cnNG%PzJK8~o1h!zVW;z(s&Fcuf3 zjEWY9xsDw_|gQPTs(>lb#iVv3Ss5S)fyBt z0w;?C(0=@=ALL?qoM^?UI%5(*BrFSymW-NDe4>hH62(U(64=2|06Qo^bV{N%-V#iU z0BJc!)|AX-i$if6&-67(b@Hv(@#GsspF z1VqwBQACh9fg&0%LQ!#}JAhAuih&oM0j4Sw((ZCd3Im#+f(6lOiu!yNhAagkaGT?V5APF@Zu-70gMabdu^h^@psN(Pe ziNO-+IFU>ahmLxOc#|nEI*}_E00IY-bQ)w_xSXvQ2t`_)#(<4r2S@R_aFk9GO@b&B zVDJQ#MV<&E^YsuS-jKjU@)-!cF)|_^MU{h;rbLC2YE|=*MzIP%8oP=)7HA|0g=dn) z7`#A48Aa7hgPsN^E91G?I1<+?A;Jv?1rX{H06-`hjkg4HYd8yn*OC;fI4lyb1!-t( z4pOMmnJi?JMjHo-C+lc{U&X|c>5vE^7!IfkS;6Jev1+7&qBnEsW;g@|d2ZBmo^|;7AC1 zT(lL?X0sL>O`@rpDl>#DWMNb)9vDg0@B#QmNYL;j6e2O%jOQR2a=skEvIL7N(hN5y z04NQElbSdnJX?-8sp${|PYO!pYXyK_6BKkM4uS@A1au}1$>iaz0!$*59}LB-f`uwU 
zB1|$G4&aRN=*S4T@;`0_N&(TurbvWYN*$vM{I!YoZAFqEr%s5ebPAi2q;! zEqas-fG%U@5hytnAvMT&5fY|=o&ZV2nwU}@a`Zb0VM1D1Vrv3Qf+rIJLNLKl1d#-i zDfA#V1As4hyf6xiFo4Y@j!_(^!$Y;Bco!n4Ak0DveKcrg>tT2S@CHNC3Gr0ER4Fn_ zqvdD}3{PQ=&Ja%181&d+I%`zy1RY>Y^0fpV z)6CYxc`#TUV5R8y6Q;W=-T9RF%~w;5d0{!bPUvJ$gNW)im`EAX@Viq zjMZ?ZqhUNjOn@ScfDi%35yG)R@D%|h03xukQRcy@67Fb63RW>HX61idaj+t!i7J;O zt;k@J9IGNQWM;Vgqi$55(pv^TnZBc zDI%ch|7D5(lQIH+|1Tp4#c#MGDYLQhu_5BnOv~yohsLj)(%4k<*e~uGOm+FmGlWF( z(#d^(#dyaQ){2#GeG972&YyVabn={Ij+;|LKk!zrz)xyL+x<8znzVCP+8p=H6){Iu zq~D7lhV~CXIzJ^~S8g38@BGj8UpItp_zJz2+j^=0h@p4+EB~Xq#m+X?$A?0GyPRxD zobZR<2fWNqqs9FovGbIPPi z-_jUG%(G$0yoYev`zX%VKEqt3U-N$^tWm0q3!)2AIq35R%3ZsB!xoR zx_R^FozFOB&Epcfn-=FTcfGcHVtaf0^q7n4*LBtRva_>mGG=eEWYSI=t{@)WI{;UV zl)qFiUC>cw`S$K*MMZ^^c}=zM?Hhx^aL&oeN!weV68G%*UuOp_ z*`#Fs`(;}b$=JBdj=N#Q$g$ZQ1_bTjKRjq}sZ_MSEGqri-nwPVynuUm?)>Z|oje_L z@ywNVe=72f2ZD9))-(Q^dtQJ$d#`*we2h^9>rpg6f9}5F@0W|+Rg>T6XLui&JagvE zFWow4FT|otN&iMfm!|l@w%_w!+f!FlQ}ZN!``GIVLj&KA{QCMPlI(wC{I=AxvfG!i z)U=!L*jqjo9H)#;_GnwNYs`z?K=U1or)G7xo4>ZrN_U>FSN`m3gnI6*cP>48GZcjRjJeI zqAyMRZ7cSkI3elQn(R9JibAY|O^36?t*O5ZJIC_SXxlS8ksZIIL|9M`XiBFB)}{GiH#Sr%npZV8 zHZIZmU`<=HpKhDrdFJ9L`$a8@+h>J13KjJE^A$g)^3TnV!)@>SlQbq* zy!QC>KW;m_&ii#g$;WPdDD{e-IGz9ZtH{Ria(UP3b47aR#DAx=que+37c5$|=<0Zz zUvHo7UhFYHbE)d$tMK!H?KwDn>BY_cC96MN^%OGFzJAyz3d|Wl{m3M*M~9bR(Y>$T z`TJbwoWr(T7c-&~j=6fYra#yt_9N0%$3NGI7QbIeJi!e7y}&(_z6oPHup@59wgBqM zhC=8+6(VoSqNxuUna5fWHacyZ6XNG5t_F6nt<88FM5}3ZnJaSMKXk|IV0%t}Y`;Ff z0x$xrTdpT1ogB&-=KxCX`>lw$vl7?M(WL6YP?C=i?jiJKMC| z9DSS8;kTgt6|tOqh|X`P{GEGl>9xx(ww3I8xbR8iiHERzi`~;`MLsJE=d37Trw|J6 zH^rO!V>ZQ6XKc$k9D1^$sN~F<)r1_sYcKFmX1b2c_Je#HpS$@Iwy8HqLp#98$^Ve_ zwq~OV`YZ9g77dMP6dFSDjkmW|qnHG_A6~1)o;8N)0ImJIU_Ak@MPw~9S z2x^0;zdjnX8BFc#MI5k&)TNVpW5`kNmm!w{GfAm@sjLb);U5vV$z?0Pa{eL8$tBF| ziGBs|ocpz#YT}&51FzWv+skJcJbm!o7PpLa5Kx`HXU|X9k#NAqK;I;V)M^cry}Yit+YH*Q zuxZ=e*p)U#=ESJ4+{=7sSnS7J`e!mkn&M$+`!tHbIjc#Mk=~UHKRC;VHhtUOq;I(6 zZ#fdY!0{A0w4i7u_Cf}JzYfw>Mrms{RAK$sgz{%1CI 
zP3_ExT%EC_8a1)9k~SzMyq>&g@|%F{z2erptc}Qd89ymBXEc2|KXK3U^~1j~t!_`L z#h2fcAibLo`H8~`X^+0X2|2(no3AC4$+IuMRLUoPyg)z)$nNb3Te+m}$@?2FvhT@T zuWg5UI{&CzmFyo7u=>rFgtD0I!%w^#7|#nyg)Ot)+}u)-DU$<#^Ie8^@;+aX3J-6e zS$lhMZ~{c?(!ALB=8qre)kmWO^D8PKjq#Nyf)AONIF-N`i1qbze=3ht_k(=hZ>?RK zEbvqUjCA-zbegJE*Ur!El0b4#9C*0*d1(*n6#UA~{^iU!Z(Ja)^NsP-#*n4I?XwC4 z8r2t1zW8!#`9JUM{O4X_8-{ew-RK%mFHeGhP|mr+pB~~!&BEoYL~~2_e~zB)_5{vY z&OL2JD*3aRjcW>jVFL>>`4gGxs0gR|C37oS*^rGPe+w%g^^l;vo5RbPC7Tbn`h$Gm zrq?}a+*A@Yyr8akwnsne&6zz{yr?lSN6gqSMN_l<3)^mgyO=Ew+|y-~-e2zT>EgM4 zwX1l{h9zxsj&Ic8*6omroA2HyzF)S`ZT7n3oSMpeQ^vk}RyOS<+(Rt9!VVILt=m6_ z#rp7I(Nr?SP*x{CMdoY2jcKx}9T-||$z9AYF%@65{P}3!joGvbv10Jr>o)kz4VUPy z`J}2rOLL9+{&ib%syZ|MWy9sqBob-0Kk@t5G1IRly4%%fWG7PZ%SkofJyY!A?0L!% zj)T?S^x??sF|Ve03v`Da*hxj?q8w#Iu{$X5;Rg>756`r>XA2ai*=d@u>W57P6;7c4^}kp;(=Nr#?z=0&>Q zwq0_Tw=IRDc0Txo%ZQE^+)8(O>%d)~-5y%fxOBVM+TJC^bmZ>KCGU2e;J-g-i`~Ea zjmdt)^_AlVPPK>I7C#XoN!N>JCW#HF9@t)sk&PQSE^L;YTX{p?54RjK#?^G@&YNl3 z;E@R7-L2C%9a;CC2%DGL2oQQTNBR7I1D@t|Y^eI?IGdo@fxHP9KHl8f8tBNGwD0hz zVgZKQn%>Zw{j+=emX1qY77L!7_{VEb-SGo^XH3YQP!2%Siy&y)Jj&%dpDV)!4S#It zA7kVzV$2E4daCKJ*H;_K4j->go*nCX7M`SVJy^wendH>ZDu|9%Z9^a$M58?9{9C#{z+d&0c+)v-A6kg*$>Y`<x7%KG+O3r?^b}Uip(g>9u`~9htd2e%!gidWn zie>lWkbCJ;x{Yk>)=1upl{xfZk6nwS^L>kV-whhNL}*CI+_f#egGH!Ys}$3xPrsoI zd7(j}P`}43Hx4_!6y;@|-Mcqt`QqNxS$jNP$Dcl5-W9tl;%U|bRerrVhHBZa_`PHMSgtN2f>_t1%=bh9wVJnuhTU|iTp}Q5@{yV+jbHDgM3S7vVrdafB zX?R?IjM2>#0|y;l?|ASkd+h9y!;xbz{9HNR#r?v#K^*!@--;KyGwz)y&ZpMBiCuPN zD|@2nhJwNs2t|Bzp&zkJiE#dtpjd1R32oRs0dzNbFlN26WWVjhB{`JLe2;wM=gF>N zM;2^c_<>z@Db@SDtHE4q@hHRX z$D5`ji;gwE+qc75*c>l&qMZK`W0bAke?EvjH5QYtYAer_p0B6~d@S}IIhy=vQOfN{ z-+&}K-XDke@`2p9nelKN6_0K7znn=4hz0^SkAQH);x7Z|w(S)&`n|jU3iiX0-G^Lw zvtqfj^H;B41>(2U(5%eN)|o}2DO-_(eJ;7~HG`ECHs)fTe+a#OFRzS0F>T0@^yq|b z$yrp~mZ|PX(+W7nvBO2e!S=b#NLTQRqa<5IPTGzQOOi{wD~}E496h>hK>1D7N?U3A z)Bb&$-{)0!iz+H@-?bR+*30eeG6)}Lg_b0uAN5YQ#g2{Nuwlct6~5I0nC$3b zFc5M$UEJjfxfa(Dz}rNt`m^1FMpd5PU(K-D^}}!d^92$1kD5>(6QMTcOCF?C0%O{Oyuqo*7hiV2d|_cgQcbe0*;)0$Z~gJ< 
z&gr}!Vz$rZReQeprRl30y=Gk*+*&7|n|gfN(1}l}$;%t@V_Sbto>oy7dgz{Fo?^h$ zBVfJw=EykjgXn5 zl0W|(3fd2}=L;l*{{ zubDqiAlDp!Qc~SrbQmOfh+68*KUL|{{N#i)j8)&VZu%J(7wQBAkGld?6Xf#}!9K|2q3f#>pQR#A!D9Q66$f z`qnr*^vdVmKOYw{1DjD4AeD3OrE(OHKEc2rQ6_xi@BuU&a0L zTAQg36v|2R9x+$8(8s6dcw_uWMI}FNtGYFZQ2#Mv)_FAUR1DmWb|pqZq;Ae zGja8|!@<$-ObH9ZSLy#Ae!R@}`q$y{O2kFz?N8u2V)EWaQ8>c08yUuq=kp4}rW8xq zZ^D)@T2#*(Nxl@mY}KZDUn}ynw~P&N$i2us59F6FGbdjjtSD>TKXUG3?8wWkfNA2M zcxb)f>A?q1aPy=$vFq=}Zkt?f_1k%&ZOtOYtwXQp#{TL;K%%I`S903L;tC*bN z4;;<(&y~kyr=W|xA zx&q|RZug6`k#-d(%AJ6ex*ApybKES~j8{9?>2FG({_>bPa~(aOZbZf$%KpQ7z4y?u z8e^Xrz-tU!>}QxN)_K{b{Gir!ADau0KaKx4@NVYSr4`H1kB?5Ro(^H)xnx zn&ULR2@2%&CNkOV#UZ*AFI2j%*}87sS^w0THQuv5CqNNHAFO+0{^iSE zw(Ehh_x?ZzHJ+Bu)pbtiu!vasxV07M?tZz`*LnY2C;J_Vrcpq8me}!PZFoby3lkb0 zK4V@Ve7G%LY1w$@a?{y*C8GajuOGIPUA^T?*3h?mvp?*;Y#yGEk6LAO^g-V2!_R-x zoUo{yTG)a+Z|zw0sHmswHadTIzHnsM z#_ZFdTW!b`%A)%F_cJ@IGR0^h+wu59d}G%(55=QA*Mp@CQGw@!J5x&o7Icv_k7j+p zu=%dTR?V!J&vTqv9|p@lK_RUzUzTsx&}K`Pc93hCPk*OY{GI_@vl5g`3F|`#FxTa7 z>;Y0Y`QCfi6?wS+X%f%amfjY0ec;!TGjpo+J|(@ADqY>(p3eDnb4>NJy+fUQ#p8Z& z7<-cT)R%om>Ig3^v;45kC{h>n+lxtxUkT_<{|4F6oHNHIj~_l<9>_eg!ntB@`1XU} zZ78%&*dVbZH?h#;(V8>gK^W3b@nRQ{FL{FjBQ2^=c^vw;-UYO>CkZb42OE0A$u4?$ z@|p2$YQr9Rwats%ygaYcr}Ea!AxNrQitlkat5K=iBw!_2%16}_`kV>|0}y)h%-c6av^ z5<+&wB~_=^TZ5=?x?L8%-mu4KbJ({Q%*NwmvY!^_RV6>2C$6iTjHx}(|6-eS+&B+P zUWI%tOLI?Mw75Z7kt*9ZD2>kjNt}%lsa0iMy#DilN>bV?FHWo*%w(IcQ1PD zPF6(Dq!kCokdTj>pZ8pA=Z)`bXqo)yMM*1l%t$h#w->gmd}LaR&joPU{IQVLHJ)$% zIKju~lSGI-*~gr#XwuG47p~mKT(x5Isy8$(06^}0kTsny!2X(mj>@G|B=HUF4_4@)py?6V84Y>DN-rTrz zA*+0lo^K&cvat(vRMdQ))0|HOur+0HAhDrYv*Chy?4wI1KR-_z_n0=lt34iDmc{%#YvN@R>Q(k-L6d@$@D3AmfDB4Fw*{TFc;Rawe#A z*JbQdw_`t}FWyoxfo>E13S!YR_xBu+Ol9s-f2S-ykg`BZD{Qi5}!1nUb#x2cr z7RXQR_bkwsYG*v;ADknCx-AZ4N2rKu?Tq+(yts`%7@JAq7kP-T!OKI$R>->QDxYHlxB z(%odQZ(X2Yh5uA_*b?nNlpQr@Hs#a%G0v~isY^e7#^@=32cP1&SNok-27*)hMK&d= zX>Qwc9C{j33Bl&dAT;L`*bF+esrr)7t7=n!ThD}XnklnjHp`Y)&pzHd^CQ$mO#5Ls z2jp7HUZ{a&)32T7os&vR*IWOH=r+haWUp0IP`zLmv&XGflbF39}e@8x}hP#q7t*OW(+lk1t$ 
z_G{LCzl^MY`)0yj%>LEgA*Q0|DRbVjYaAyye`tT;)KP-$G_xm-16>_Y%r0t-fByKy z@9L_zQx`mSBL>`K`Pq54#CZ2kbGc}C+LZXOZX=_0U&RdX>%VVqMyz10XqN49+V3Xr z__{f9O;K7?PxpW!;ZOIYB{sVjr15t=0)svt@l0nmSB2K4pQ&??DtMXSlH+3-y9kB4 zG~+?fb23P}Zguy%W4b+l3kEurMd88QSJu9-${et{b!YhMt5=xhoaDl_XV9ll2U#W! z@BL6|UOM0w^Uv(t_xQ$H(C}Zcr=II3ZtV_ymr99X{GCuy{4T8h!IJClcFv6*o6)^D zZdNBdAM4#R4udZto=Ds!RwskM#m=nh3&ZM@>k91t=GM~eZso!FuSLDkA zKarQ8teu^0iZ2FuFu@I%_I>;yEmSwsL+4e0>v-*b``%L@n+ISa&ReuH_DPuU3{N) z=;l+v<*mZ+@#nPvnYOyW>Sy!a!y8Ivuk=d^uV-gBhI8C!Y$Ke%p-cV(M)`aqWoG%Hec0Et zp!t(ULSbh7Y1*B+Mf=?!t&uWZ#UbCnXfI&+EeANk>z-`v4*YH}4qjx2eeJV7t)@{N zt(!tt`qn;SF?mL?>G!X%jOrHST;lJuBto>Sf)@=rZgMPXu+a5phK$%PTeyZDTc4xx12*{w%BCAUi zaQu1>kH;?`2%PDJP7lb;(Z>v4O@OW%e{kC#@fD|@K3U>}Y$4{%vCg!uw)NH_^Zwuc z*Sdq&oX%YFGkEQwOGoi_pXjt-;5ckZ{X{1x>yhF6PUyQc)9_P6y@fd|XWp4bL7JvuLdoYK7!dI5n~=eKDeen?8l3v#x)yoh;aZT%BYdrk@M zC=T9HwJbbY(-k;Bzj;CGU1#~vilxa_QAhI+7e0&syHS<#fDM)3@q&HPe4d2Z;Zsvk zeyEYQ>ZPvq$LYXb4IgmVWZII??Kc8B@4%S5Fx1=H&kxe;W9E$aDz6{vE63-pO{yF! zym37v{DPr~d?D@6j{mz0AicWoSLxkFL*rF3J}GM*mtMP+eTnz|PvFc+JtMxoj9li1bt(z z^?3UXXO`cEdDlI!4s;Y{Om*4_AU;Pb%xy5#D47kra9{5+o)zq#(lhw8dv-Ci_~gI) zMJdGxye9PjTarDjKV^jPy7MJVH0(BYLAJ}M*Grdg9bCB0w&#nASg_`D_o7*CU7Ffc zM|&6G2Rwi=qFsgBJoG>F@2($H$G!xQy^&m!NK_FD|}w*1(bNSD$R`>Rjk< zg7vsPTXnwp**FVDDe}*~ zc(gkFYxmCAC||o8-UQmbq7npe?@qrf>73VJ7CO#Zz6e%ppZiitE`Kp|O!lOgzL#tM zkP2_w^CXq1#?OBCK8CLv$SX!EJ7kGie>eZB-SN-o<7YtHzAX*h7&!QnH9g~R#ehpi zd3EgP%lL{g21xZ#4huRqzl+mb^RkHOj&DEq=fa(ZG%Uq$Cva#~UgraCWH$ezbqy#n zf6M1|tCugQ&R`0<_0HM5||){&+rC_~v2I+R1X8~ozJqWo|AuUW%a zhir2L^}+KsgVVPXI&jWcI=f7;6RN-=VQva;a?Ll$nB-LkHu6`zFdEN{MD#;f#fv;KYKHf5GA z{rt;<;v>k~6jE5sLviZ>c2(zrLq=z2?E6(0=cicTWqo>D+X5|F?ElKOim9TP{c5Rw z+CQIedC~o1y~na)W8k)XkfZrs40nhs@5g5It?Y7@9s0>PUd}2!1y{l@Ssstb(?}=M z9U-ULc_Ymp;Vm(AIN_{_Ed7{kz-Yu|Gb&+PJ8#E@mmi8M4E)Ye%#A z$m-C^>XYtNw!fUIOo(vY{yju29~&JnYPfalR`*uzPpV^uKY8!E^7zJL*=G|-B=;!R z){dLb|Nd|)Wca23$=B<4jxhI$pB_LG*P6=IudOxYCD$Uxrf<{e+E!+teh~V;)YRhN zi0as-cKM?;DkpB9xc$dMapvZH8`3Q9qP_pQad8bDz)yaZw`b~-?a3&j(=YF!AAR`2 
zv5@=v#jU){b$h<`jr{$JE&rL_9s0+|;P5j}wf(?I=$c6@lZX{d4BM0Id{?Jl_jUWW zUZ%YlrTz3hXfU!sw96;7YR?;=Xk42s<=wb7XQ!O;>u0yS-8&c$@?<*}k?owl_Wqcf zx#oM}Wxod_hIw(1%zO4Ge#*}vslQ^|PJMWtOnNYqTv}wCHDO1_Y}z_XSQa?lPhQqP zQM~_h`ZnTFpF{3$rws3T*tZiwT{#O_+mi+TNb$*}HQ-J911s+}0M2Fj&!enV?2MSD z&zeV%Q5L+ud+X&;NBJN3CYQNyd#n89UheG%B5*48Zt<1dmpn%S+nG6soib9gfja<8 zn1Opr0``abmJ}5gulifp?Hy8zpZGrPW&WG`ef-Jdhqj)NhCg4u)n~ZNrzelO`y_kQ zU)PEk9~m=s=b~r1j`Kd=e&X~dAK>cC6EP4Be@pSsZM$>lPUL*|%*LMc@FaBl-NyCT zhd!NkzCk%K?cTnBomtOzR_BNx+I!L-)s7TEF6e)jL~ z>gr;uTV{*|%@%8}5C3Y}x7xae=DL&7o;|JNz<~q5&rIW-oFne(RG9uW+>LB+@pw#8 z*4qymTiWfb1804nP}{Wn0Wt1AKD0Q(9J2GuMDxL6SEiGFtK^ftgKECZ^j9PIdY9aJ zSJpM`GQZ&K#+x^9Mr0nVc6)KW9^&b&%u4Btf?zJVobkK$j0zlsCCo&zeCnoc8N2IB zA7@hK0wF$Z^|jwW-nV_Z{{V<7X9v%IEjKUiGOyU6uF85pCOs_C@9(r1hTd`XNlsNE zXQthb#{&1@>-%o|G)yhq5F~%nG$)|FV1vtcpG&j;M34i2*E+4P>06h(;04dFVr7os zz4kaCcf8BZ_zy?c^`8`HTwGb3mdNrH-@#t^IMh(Tq@#bo zs6NVgx24H$IO$U1w6|{y578BdC3}DO)qE7lI)BrnVw|Ar^)Bp@uH4TL>+#u;8Wc)U zo+m@Qic7~BD_p~d`)*Bqv-+~}GZ_~9`bxr*(1hNV?|}ng{j^2gC*i_P*da9@hr2yy z4{1-9KkV%IJLiFnpO=tc6 zv>WZ%ly-CM@R1ZUUe=E5)dcNlX4-t3)I8fEVdi+R?PV#9=+~bYIALrfUDi0y9=>eT zaH4+E1Jfg;FshUFa5O$&u`6KL%oz_W0a?E zkUVQ$09x;TWf^jN8Oo=>Gpnar5DakEF*K}v>*1Rj4x!`j8Ghf`^X&TDN5_dz0+wyu zOKC1SV@tXKO$-+>pQV;})&R-&5|-9k&fG{>6&-Cqk2}sb+ zEq=dBmj14L<#SfFY9qSZ`9NoKecr6njQrV)Z&WWI3_ZW?lik>k=-7_gM{33nOEE=7 zZqFWTQkr~qeUAkb-k#jLn*F%<44d>yNV!ohZ~kmJPdbEp zXBv2av3b>-_5wG&w0`0I8{V^E6ZU@HuyHRt()rND*~5b~mc2*{I2w2C>V^Hi z)26zQmzlCr~rU7dcVcAgM+=Ggok|6Inz1G|@9ea|X*8l1NTcOWvSz386B(v*+Q zu&%Ajl-BRpt#7}%Nqz4JiM6j@z};8IUVCud=Ly8UNnaq!w(|QT*XAIvok4}9Bdtg- z?fk+kWyl&g&qIWX{&2#lHv&6(TI&v$my1}=j z3(aR0vu0R^_U;ojR^y+*1E$kw%sD08297SDLOj0WMAtRf{kO_*V~%g~c~D=!%nCr& z4%^D}xnm-q&G4E$`TMU;6Q+TJ3XrlVs~i18){VQq@IzH>&pcVepU>mUtTS`AfAkOi zUW-LubgymtFtcIHrA)WtHr2Qrvg=C<(RoY7LF*Fbs2}kmj%r%fERw>>_uSiwo=;7S zn7sS3|JKC#p7hINr+OY}Sbkx)a=3qCRl(mdpRysyLFD1xcAn;(G1Cidf23Y~Yi*&q z;a1h~4y|%ujyRmb{$BbrD(B9aa=J+n8lZ7X^@i$sRWAi&Q(-Suo}7GwbQ4REGP(WotX0!`EQUq5Z+fzpng>D6^FI 
z9SnIK)*O~!b;^Im-p>64mD~K54H<=?wD_1C$pe$luOb&Y+GHM^xXk9)=r7RD+ z;!+f&hLg=ZE#(fkkJ<5OtMl2JJoum5yn&3)^wJNPGM_&eeM?>DR~(zal;t@8X2uiU z8rO_e&e7Urif4H`2DrW((9!PPb8hbXqM$#|{Z=O_k1Xt;Jv|3|WP9Vm>_meHF-lS0 zQ^2P0i8RhU@$LcnZK|`-n`n!v3pM8ZW`BWtI?n|$x3YJR9p^eU^66>8#O*Ut`j5S@ zuY481Il7X_W-@=IGK#LgQJOHFYktgl+z4E?RTF|&+hNx`M=`rNM;6;>-RUu_D(~~o z|JbDYvePZ`&dgqa`atM|yf}t$#{);1PwD|t z(W?u?J>cs%X2^aW8$Z22Zwf?n=Klpc62K|QdKlb<)?@~QfxZ=D)TmL^u3fum!0;bm2&~a)9L`r1Md-MhqN1Yjl}hD< ze3PZoXz&zOm&L!bvNF}_^emNxWtBogLNFG|RfB?-P9{^zv(G-8T2fM%aKdg)0U4I` zHk-`}7K_EBQqS5G%fMt4-|Y?RI1)NV^`P|!njFxYkBj<8Z=MaCPJRY8)=D4<1 zfHtP0)&D|a25o@7H+eFsRe|(w^FOD*q^nf598J`Fh30iauv1MEgJVsue2RB16MNtT1*yry~efFpQr6YwV z7Y;%KzU_sZBK|< zU!-VmB;SJ-IDw1zKnMIjyT4S(7{IVT*w=qf5QPkXqibyCx6?;l`Z==a3)A@yi{kF4 z@H-8KObTF$F@5Mbu3B3S?8;mZ!@J%~JFoE{w=sZa7oUR%`#%j=%PvBnwzoh|$$8kD zwS`uvVl5~j1Rm-&$3w&T6S9JW0{X-?gTX-C16>iV!-fs}>HF`$AImdf4_A?hu}$ch zH=jY)p+g4@p$KCN&`E&|?+GQ^9m9I+hYal1JxCkicALI}%$uu-JGZ#58s2p*oj5x> zX%gtv!SL7hKWPUxHU_X7OC{XieFE(e$2HjP-Nw@v{?C4YQ`Ev=4;WsmNg+dk2MieS zM?paW9+eKbGq@E178{MmT!D2~i-9aED(YpGN~K}|!`VAp{_w=L+bbR}DR0IE*#n!!k|^S&f}DfhDb8Gt|AOLM29s-Nkr&cLweRGm@#Z2e05|wn5@+< zz%E|@lMYV2R(6?=1w-HD!vj;{#Fc&Y`b?;?0P`DUd`l_(F=pxI5tA>&6(Hs|@S7yl zXfzi?Lqiv^D;(ktVSTG@Cv>S>cAo*W#r-&CQ9V-sM+{z6kk#j$mK#qjBG_KkCQ?z= zzQyq=t_e3zQ09wmm*XnJ;KP{Ec65*-I>B*~|NF&lFn#E&u>QnHP-QN60frFa z1_2lMJ>v$#IpZG?p^Jh&=QfJEkvM=gHDt8l;!XN!$&g`b{ zN~6-kn4S;Qp*o*=rCJY#kqM)Dv6D*(Sxq)U?mk1bb_TZ^-$Uuh0 zc(3M{!+{eykdc`WQdOrn9M}G9Y>rq0g;c=2{SNbY_AU}t;`zdpaPrE2FoboW7ya+; zc~SIIDuudYN}u^*Je?Y~CX)zR9bj6m_K4YR9vB-NyTWKRx@Bog&GN<@@bSWb1Y3s?UZs?`3cn=J(+R#Qx}xFS#M~9(?dYHetd9IDGgp5V5hbSw^F=E7y`Q z02yO5k88FxoqTwE~YDiV@5%>|iMSJB8G4?tX4 zJe?C+T2n;3iqIB@M%KMqn`!tqom>qup>65^AG3=v`+LKQl`tSag-%(+L=bO{Z3p8D z3tvG70OJe>1489$XnmM4VZvluMM`lMaY67mWMaD>y2cZ$|I4|JP*hz&XL_Rn7M<8nk61!WC@dFnTjDs-2ZqDt;!OB6?+Be7f^ae=%*7z% z8e07X8E?2&Vq#)pR#sNDWLHs3flRGd*8~Lxy`uO%+Eojqr9dXtRfI{jVcJ$;ciQO! 
zCRT?Tp1W-+tU34=UHk;A@Puk3=yI5iX?&LaCWMU6w{<1nqO+SRKz_z1#=bn1#QuE%}Xs3%@JiD)QI``bb@J zB^jM>>ya1fZSCYwNyeAJ$-Q~T&`K3=HwBb#Y!qa2c8pT`wlu&*Ttz~V@zr1T*u04> zk=T%l1BRf9kj=OQXai*Z4`SR%d!DyW*6z|9WNwyx%2uQ+{MU2L32OM;awVA*aF~1Y z)_1vW;sz%W+Qv#AAAVoVRpbGgi2J7@kYV8ALSovqX*p}wtm!ag#*D&w^X7HqRydR@ zKE?c4-k^~KB~#5u>@X~!L1sGmCO4#746={&O769SI>Cp4H1<;7Tt$9^Z0OLT>n~rv zJi{$*O{l_QOM(mlzCbBPRb>EWEuJZ(CNZSP|#~?1?9yfY)Dt9k0(@O~+r5F)`eHwbz9tpIWW{0uzVH zVps8(76F;XX@@Vf4`DIUCOy(mgk~3DULcd>kCdb?Q&Ur6!-fq&plbZ&lTV^KXB9Om zWOCJOQBl!zN~LlIIBI|_cO4$vA|T7IDS^G$&spsxeYSbCHm)zSDIs%N@mi3g7E`YLHa0N z++YF-_iPpbxTE7Fk8FJ(h3|+;GIrV;))1g$ZxJvAjn%do42E&s*U(?;Qm!N`EiJ|J zj?3xsQ6x~+XMnOATZAOTAxVwGz`l5loa8ai}nH~PK+z;*1$`t|EgLx&D^ z3%(P|8fhtz8I8uNN~LlQmX2Z?F8+zC&=A02E-%Ohfo0#k_1Yv zAw4WC3~fPi6%g@j6txza1ipAUcK%#zPe?i7~-8OC7Je-%8_caPDH^2rC9Jp}v z=FKP-xHscw_hKfi1wlraHDD?&GUHPca3r|ogEt*lKLl{r6=Cs(3BxCl=|?6w+Ptt7 zw4HQ7C|vwGozCxDw{GorVv3wlW-&jpOvrMrR;$~=91>&k@m}aW*uH)HUPHj}ETB{M zclZ;=fueDk^V%)92T=gfnKNoh!UZzke>ZF3x3eLif;K+Fk%)w*FI<(!N;Hn*e znlNZ}S+g@;6+aJ@G55PvpxJC*N|L0`+i9{sglutKT%7AnDlZFvNe3^`D-?=iul2BY z7bcr5W~!2)MINOV{%^R*TkyAnj9vo}1lEuJsa301pQ6c(fyz`1h&axxAy!mv+M0S&HCsW6z5UL1(z&Ie(t>{D5S(Sxw#g3(MF68IucjHrmeA32O zknshRO4UXIaNRQ!RL@kghs*w(oKSZ;>e8FqW~}!(mKR=l;lR7^zB`m%8IDV?b@~dg9Z(9 z+3SuTJqm5wv{}mrj5&}2=MUjR$RtS+nQ3lr(n(DH`4ch#;2vZy7@j0#3;ZvG%+28~ zwkG47kMQvDz2)WQG&2u8@BpNx)twQ~1`M~A>{C8aO}bSOk^2@1t;ysBE&3C(#;ON@ zN;09kxZczhXT2v!lJQDZk@YOw+`$%3uxHO6=+UDG4&@5BNWZ8Q!*Bc#vZhm)ayBb+ zs)5U0S9>X?iAyWSECE`r_UPrym+3evf*?}4fT0v~MW$lCin(*=zRfkv{T(2aBc0^* z6;YRbNy&{~lld7k)MIoNYF3jh$ucuDVa%8@wRXEbsQ&v{(-U2fE=N4y!A;45fq_RY z7K<9cgyODKr%vJANZ0>C<}S&6WSo}#aPis5RhL$)HD2PG>|V#Qe*JowHER}!O~4Q8 z7GL#^_hhh@D_8EEIdkSH?1K(Vyebvwvkn00(>=TbhWdQfs#R;=eDh7VVGWJ3 zD`Y?>dODlOKnP-LY;3H{3XOl4x=HY|HuU;7x=jcte3+S3z8YKq?T}fmR<7nkb&Bis z4xH!53m5kBcD5lJVA&N`a94>xfERQ%xr;_QCrLB|WU_$7myR8mVMWaX9 zhnRHYvv3QQ^Yn!T;Vz=Fo&rgdHQ#;rUFxPyn=Y{@ZNY*Cqmz@9=W!=8BO~LjB}mLG=1YqgFHXI9@gn>55?Hx%<-F+VXgqP))mY>Boc+0+ 
z0}@065k0&m_0Q$L-+ue8)1*n0up$dC*fIT#ubr+}iNa;BB}g##h4IIbX#7ELyclEk z&zLl6(r|;pfRfMEOeRy|#~*+EF?W=G`}Uot(P-MSKX*Evj`QcwXH->Hxjg`b!4T7- zLx)7BFZ#Xk-~XE644@F8?U(@p0ji{=q-3pDi-lw8bv4?l3JMA`zWeSwT Date: Mon, 13 Nov 2023 18:34:54 +0100 Subject: [PATCH 18/62] Fixed #8 --- pytest.ini | 2 +- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 46 ++++++++++++++++++++-------------------- src/codext/base/_base.py | 6 +++++- tests/test_generated.py | 1 - 5 files changed, 30 insertions(+), 27 deletions(-) diff --git a/pytest.ini b/pytest.ini index fcccae1..ab4c198 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -pythonpath = src +python_paths = src diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 37e98a8..318dd9d 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.1 +1.15.3 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index cb32c75..275dafd 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -104,9 +104,9 @@ def __new__(cls, name): try: self.codecs = MACROS[name] except KeyError: - raise LookupError("unknown macro: %s" % name) + raise LookupError(f"unknown macro: {name}") if not isinstance(self.codecs, (tuple, list)): - raise ValueError("bad macro list: %s" % str(self.codecs)) + raise ValueError(f"bad macro list: {self.codecs}") self.codecs = [lookup(e, False) for e in self.codecs] # lookup(e, False) self.parameters = {'name': name, 'category': "macro"} # ^ means that macros won't be nestable # test examples to check that the chain of encodings works @@ -158,7 +158,7 @@ def encode(self, input, error="strict"): return input, l def __repr__(self): - return "" % (self.name, id(self)) + return f"" # inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python @@ -172,7 +172,7 @@ def __call__(self, *args, **kwargs): return self.__func(*args, **kwargs) def __repr__(self): - return "" % (self.__name, id(self)) + return f"" def 
__stdin_pipe(): @@ -200,7 +200,7 @@ def _input(infile): def _set_exc(name, etype="ValueError"): if not hasattr(builtins, name): - exec("class %s(%s): __module__ = 'builtins'" % (name, etype)) + exec(f"class {name}({etype}): __module__ = 'builtins'") setattr(builtins, name, locals()[name]) _set_exc("InputSizeLimitError") _set_exc("ParameterError") @@ -237,11 +237,11 @@ def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs= if encode: if not isinstance(encode, FunctionType): raise ValueError("Bad 'encode' function") - _set_exc("%sEncodeError" % exc_name(ename)) # create the custom encode exception as a builtin + _set_exc(f"{exc_name(ename)}EncodeError") # create the custom encode exception as a builtin if decode: if not isinstance(decode, FunctionType): raise ValueError("Bad 'decode' function") - _set_exc("%sDecodeError" % exc_name(ename)) # create the custom decode exception as a builtin + _set_exc(f"{exc_name(ename)}DecodeError") # create the custom decode exception as a builtin if not encode and not decode: raise ValueError("At least one en/decoding function must be defined") for exc in kwargs.get('extra_exceptions', []): @@ -375,7 +375,7 @@ def add_macro(mname, *encodings): raise ValueError("Macro name already exists") try: ci = lookup(mname, False) - raise ValueError("Macro name clashes with codec '%s'" % ci.name) + raise ValueError(f"Macro name clashes with codec '{ci.name}'") except LookupError: pass try: @@ -463,7 +463,7 @@ def _wrapper(param): isinstance(mapdict, dict) and p in mapdict.keys(): smapdict = {k: v for k, v in mapdict[p].items()} else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + raise LookupError(f"Bad parameter for encoding '{ename}': '{p}'") # case 3: dictionary of regex-selected encoding mappings elif isinstance(mapdict, dict) and isinstance(list(mapdict.values())[0], dict): tmp = None @@ -474,7 +474,7 @@ def _wrapper(param): tmp = d break if tmp is None: - raise LookupError("Bad 
parameter for encoding '{}': '{}'".format(ename, p)) + raise LookupError(f"Bad parameter for encoding '{ename}': '{p}'") smapdict = tmp # case 4: encoding characters translation else: @@ -494,7 +494,7 @@ def _wrapper(param): for k, v in smapdict.items(): smapdict[k] = [x.translate(t) for x in v] if isinstance(v, list) else v.translate(t) else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + raise LookupError(f"Bad parameter for encoding '{ename}': '{p}'") if ignore_case is not None: cases = ["upper", "lower"] case_d = cases[any(c in str(list(smapdict.values())) for c in "abcdefghijklmnopqrstuvwxyz")] @@ -538,7 +538,7 @@ def code(text, errors="strict"): text = ensure_str(text) if not decode: if intype == "bin": - text = "".join("{:0>8}".format(bin(ord(c))[2:]) for c in text) + text = "".join(f"{bin(ord(c))[2:]:0>8}" for c in text) elif intype == "ord": text = "".join(str(ord(c)).zfill(3) for c in text) r = "" @@ -720,7 +720,7 @@ def list_encodings(*categories): enc.append(name) for category in categories: if category not in CODECS_CATEGORIES: - raise ValueError("Category '%s' does not exist" % category) + raise ValueError(f"Category '{category}' does not exist") return sorted(list(set(enc)), key=_human_keys) @@ -755,7 +755,7 @@ def remove(name): pass for s in ["En", "De"]: try: - delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) + delattr(builtins, f"{name.capitalize()}{s}codeError") except AttributeError: pass codecs.remove = remove @@ -801,7 +801,7 @@ def b(s): return s -def ensure_str(s, encoding='utf-8', errors='strict'): +def ensure_str(s, encoding="utf-8", errors='strict'): """ Dummy str conversion function. """ if isinstance(s, bytes): try: @@ -859,7 +859,7 @@ def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=Fal :param decode: whether we are encoding or decoding :param item: position item description (for describing the error ; e.g. 
"group" or "token") """ - exc = "%s%scodeError" % (exc_name(ename), ["En", "De"][decode]) + exc = f"{exc_name(ename)}{['En','De'][decode]}codeError" def _handle_error(token, position, output="", eename=None): """ This handles an encoding/decoding error according to the selected handling mode. @@ -883,7 +883,7 @@ def _handle_error(token, position, output="", eename=None): elif errors == "ignore": return "" else: - raise ValueError("Unsupported error handling '{}'".format(errors)) + raise ValueError(f"Unsupported error handling '{errors}'") return _handle_error @@ -950,7 +950,7 @@ def lookup(encoding, macro=True): try: return CodecMacro(encoding) except LookupError: - e = LookupError("unknown encoding: %s" % encoding) + e = LookupError(f"unknown encoding: {encoding}") e.__cause__ = e # stop exception chaining raise e codecs.lookup = lookup @@ -1112,7 +1112,7 @@ def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False) __groups[value[0]] = result tokens.append(result) else: - raise NotImplementedError("Unhandled code '{}'".format(code)) + raise NotImplementedError(f"Unhandled code '{code}'") if len(tokens) == 0: tokens = [""] i = 0 @@ -1231,11 +1231,11 @@ def _load_lang_backend(backend=None): stopfunc.CLD3_LANGUAGES if _lb == "cld3" else \ stopfunc.TEXTBLOB_LANGUAGES if _lb == "textblob" else \ []): - n = "lang_%s" % lang + n = f"lang_{lang}" setattr(stopfunc, n, _lang(lang)) getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n if LANG: - flng = "lang_%s" % LANG + flng = f"lang_{LANG}" if getattr(stopfunc, flng, None): stopfunc.default = getattr(stopfunc, flng) stopfunc._reload_lang = _load_lang_backend @@ -1263,7 +1263,7 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings if not stop and (show or debug) and found not in result: s = repr(input) s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s - s = "[+] %s: %s" % (", ".join(found), s) + s = "[+] {', '.join(found)}: {s}" print(s if 
len(s) <= 80 else s[:77] + "...") result[found] = input if depth >= max_depth or len(result) > 0 and stop: @@ -1274,7 +1274,7 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings if len(result) > 0 and stop: return if debug: - print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) + print(f"[*] Depth %0{len(str(max_depth))}d/%d: {encoding}" % (depth+1, max_depth)) __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), stop, show, scoring_heuristic, extended, debug) diff --git a/src/codext/base/_base.py b/src/codext/base/_base.py index 27a31e3..f41df0b 100755 --- a/src/codext/base/_base.py +++ b/src/codext/base/_base.py @@ -5,6 +5,7 @@ from argparse import ArgumentParser, RawTextHelpFormatter from math import log from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable +from sys import stdout from textwrap import wrap as wraptext from types import FunctionType, MethodType @@ -280,8 +281,11 @@ def _main(): except Exception as err: print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) return 1 + if args.decode: + stdout.buffer.write(c) + return 0 c = ensure_str(c) - if swap and args.swapcase and not args.decode: + if swap and args.swapcase: c = codecs.encode(c, "swapcase") for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap else c.split("\n"): print(l) diff --git a/tests/test_generated.py b/tests/test_generated.py index e8eaf10..57b7b4e 100644 --- a/tests/test_generated.py +++ b/tests/test_generated.py @@ -32,7 +32,6 @@ def _template(self): f1 = getattr(codecs, ["decode", "encode"][k.startswith("enc")]) f2 = getattr(codecs, ["encode", "decode"][k.startswith("enc")]) for ename in m.groups(): - #FIXME if ename == "*": # ignore mode only continue From 3e837f45d799fdea61dad4f43b7c1f90701b29d7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 13 Nov 2023 23:14:41 +0000 
Subject: [PATCH 19/62] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 3033e1b..f637fde 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.16%coverage99.16% \ No newline at end of file +coverage: 99.13%coverage99.13% \ No newline at end of file From f415240a94e7d9c9f3dbb21244028eaa063c3759 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 16:43:19 +0000 Subject: [PATCH 20/62] Bump tj-actions/verify-changed-files from 12 to 17 in /.github/workflows Bumps [tj-actions/verify-changed-files](https://github.com/tj-actions/verify-changed-files) from 12 to 17. - [Release notes](https://github.com/tj-actions/verify-changed-files/releases) - [Changelog](https://github.com/tj-actions/verify-changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/verify-changed-files/compare/v12...v17) --- updated-dependencies: - dependency-name: tj-actions/verify-changed-files dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 62476a7..de73aff 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -60,7 +60,7 @@ jobs: pytest --cov=$package --cov-report=xml genbadge coverage -i coverage.xml -o $cov_badge_path - name: Verify Changed files - uses: tj-actions/verify-changed-files@v12 + uses: tj-actions/verify-changed-files@v17 id: changed_files with: files: ${{ env.cov_badge_path }} From 58789428a760399fb02cb9dd1c9fbe9e892db306 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 3 Jan 2024 07:14:25 +0000 Subject: [PATCH 21/62] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index f637fde..fa2dd63 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.13%coverage99.13% \ No newline at end of file +coverage: 98.92%coverage98.92% \ No newline at end of file From 3f1733f5b9e8b29565c6ce00d21af5f5dc165a20 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 10 Jan 2024 22:17:48 +0100 Subject: [PATCH 22/62] Fixed requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ffe2fce..b5db972 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -six +markdown2>=2.4.0 From 2a48f1a25f3293096f5bf831401bdcf74be31d8d Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 7 Jul 2024 15:27:19 +0200 Subject: [PATCH 23/62] Added support for Python 3.12 --- .github/workflows/python-package.yml | 2 +- pyproject.toml | 1 + src/codext/VERSION.txt | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index de73aff..4947463 100644 --- 
a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/pyproject.toml b/pyproject.toml index b204596..7c5d1b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 318dd9d..701a6a4 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.3 +1.15.4 From 98ea759553b3cbf7a97a6299ffe3cb721c0cef6a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 7 Jul 2024 13:29:15 +0000 Subject: [PATCH 24/62] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index fa2dd63..8cef4a9 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 98.92%coverage98.92% \ No newline at end of file +coverage: 99.04%coverage99.04% \ No newline at end of file From 06d7ca58336fa5479bdd0ca53a56605f6b2ad1a4 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 6 Jan 2025 23:49:03 +0100 Subject: [PATCH 25/62] Fixed #10 --- .github/workflows/python-package.yml | 2 +- pyproject.toml | 5 ----- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 5 +++-- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4947463..bdf7f9c 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -19,7 
+19,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/pyproject.toml b/pyproject.toml index 7c5d1b6..f8ad01b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,11 +23,6 @@ classifiers = [ "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 701a6a4..be2c181 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.4 +1.15.5 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index 275dafd..7ad45d9 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -200,8 +200,9 @@ def _input(infile): def _set_exc(name, etype="ValueError"): if not hasattr(builtins, name): - exec(f"class {name}({etype}): __module__ = 'builtins'") - setattr(builtins, name, locals()[name]) + ns = {} + exec(f"class {name}({etype}): __module__ = 'builtins'", {}, ns) + setattr(builtins, name, ns[name]) _set_exc("InputSizeLimitError") _set_exc("ParameterError") From 6b4da6a6ef5affc1083dbeb3b60613d5c4293be1 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 7 Jan 2025 00:09:11 +0100 Subject: [PATCH 26/62] Fixed dependency to removed crypt module --- pyproject.toml | 1 + src/codext/hashing/crypt.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f8ad01b..2323ece 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -26,6 +26,7 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ + "crypt-r; python_version >= '3.13'", "markdown2>=2.4.0", ] dynamic = ["version"] diff --git a/src/codext/hashing/crypt.py b/src/codext/hashing/crypt.py index 0d44d8e..eddc668 100644 --- a/src/codext/hashing/crypt.py +++ b/src/codext/hashing/crypt.py @@ -12,7 +12,10 @@ if UNIX: - import crypt + try: + import crypt + except ImportError: + import crypt_r as crypt METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] From 9811df6922b7abdb2252289c104ff09a508b3fbb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 6 Jan 2025 23:11:03 +0000 Subject: [PATCH 27/62] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 8cef4a9..1006657 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.04%coverage99.04% \ No newline at end of file +coverage: 99.11%coverage99.11% \ No newline at end of file From 46748f226a37f67f6a9e8f5048ed25d30d89d257 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 9 Jun 2025 17:49:32 +0200 Subject: [PATCH 28/62] Fixed dependency to removed crypt module (2) --- pyproject.toml | 2 +- src/codext/hashing/crypt.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2323ece..849d94a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ - "crypt-r; python_version >= '3.13'", + "legacycrypt; python_version >= '3.13'", "markdown2>=2.4.0", ] dynamic = ["version"] diff --git a/src/codext/hashing/crypt.py b/src/codext/hashing/crypt.py index eddc668..9ef8ed5 100644 --- a/src/codext/hashing/crypt.py +++ b/src/codext/hashing/crypt.py @@ -15,7 +15,7 @@ try: import crypt except ImportError: - import crypt_r as 
crypt + import legacycrypt as crypt METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] From cc05c071ec0b769da8bf6bcc428986d7a1b5f0ba Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 9 Jun 2025 18:04:15 +0200 Subject: [PATCH 29/62] Fixed dependency to removed crypt module (2) --- ...python-package.yml => publish-package.yml} | 29 ++++++++++++++- .github/workflows/pypi-publish.yml | 37 ------------------- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 8 ++-- tests/test_manual.py | 5 ++- 5 files changed, 37 insertions(+), 44 deletions(-) rename .github/workflows/{python-package.yml => publish-package.yml} (71%) delete mode 100644 .github/workflows/pypi-publish.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/publish-package.yml similarity index 71% rename from .github/workflows/python-package.yml rename to .github/workflows/publish-package.yml index bdf7f9c..8d9914d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/publish-package.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} @@ -77,3 +77,30 @@ jobs: with: github_token: ${{ secrets.github_token }} branch: ${{ github.ref }} + deploy: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'success' }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Check for version change + uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + version: + - '**/VERSION.txt' + - if: steps.filter.outputs.version == 'true' + name: Cleanup README + run: | + sed -ri 's/^(##*)\s*:.*:\s*/\1 /g' README.md + awk '{if (match($0,"## Supporters")) exit; print}' README.md > README + mv -f README README.md + - run: python3 -m pip install --upgrade build && python3 -m build + - 
name: Upload ${{ env.package }} to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + verbose: true + verify_metadata: false diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml deleted file mode 100644 index 392e026..0000000 --- a/.github/workflows/pypi-publish.yml +++ /dev/null @@ -1,37 +0,0 @@ -# This workflow will deploy the Python package to PyPi.org - -name: deploy - -env: - package: codext - -on: - push: - branches: - - main - paths: - - '**/VERSION.txt' - workflow_run: - workflows: ["build"] - types: [completed] - -jobs: - deploy: - runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'success' }} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Cleanup README - run: | - sed -ri 's/^(##*)\s*:.*:\s*/\1 /g' README.md - awk '{if (match($0,"## Supporters")) exit; print}' README.md > README - mv -f README README.md - - run: python3 -m pip install --upgrade build && python3 -m build - - name: Upload ${{ env.package }} to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - verbose: true - verify_metadata: false diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index be2c181..ab826b5 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.5 +1.15.6 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index 7ad45d9..3042950 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -370,7 +370,7 @@ def add_macro(mname, *encodings): :param mname: macro name :param encodings: encoding names of the encodings to be chained with the macro """ - global PERS_MACROS + global PERS_MACROS # noqa: F824 # check for name clash with alreday existing macros and codecs if mname in MACROS or mname in PERS_MACROS: raise ValueError("Macro name already exists") @@ -630,7 +630,7 @@ def __get_value(token, position, case_changed=False): def 
clear(): """ Clear codext's local registry of search functions. """ - global __codecs_registry, MACROS, PERS_MACROS + global __codecs_registry, MACROS, PERS_MACROS # noqa: F824 __codecs_registry, MACROS, PERS_MACROS = [], {}, {} codecs.clear = clear @@ -733,7 +733,7 @@ def list_macros(): def remove(name): """ Remove all search functions matching the input encoding name from codext's local registry or any macro with the given name. """ - global __codecs_registry, MACROS, PERS_MACROS + global __codecs_registry, MACROS, PERS_MACROS # noqa: F824 tbr = [] for search_function in __codecs_registry: if search_function(name) is not None: @@ -764,7 +764,7 @@ def remove(name): def reset(): """ Reset codext's local registry of search functions and macros. """ - global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS + global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS # noqa: F824 clear() d = os.path.dirname(__file__) for pkg in sorted(os.listdir(d)): diff --git a/tests/test_manual.py b/tests/test_manual.py index bed4884..c6e3c74 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -125,7 +125,10 @@ def test_codec_hash_functions(self): self.assertIsNotNone(codecs.encode(STR, h)) self.assertRaises(NotImplementedError, codecs.decode, STR, h) if UNIX: - import crypt + try: + import crypt + except ImportError: + import legacycrypt as crypt METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] for m in METHODS: h = "crypt-" + m From f1b9b6b250effd29010fde717ea5801716df40bf Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 9 Jun 2025 16:12:13 +0000 Subject: [PATCH 30/62] Updated coverage.svg --- ...publish-package.yml => python-package.yml} | 25 +++++++++++-------- docs/coverage.svg | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) rename .github/workflows/{publish-package.yml => python-package.yml} (79%) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/python-package.yml similarity 
index 79% rename from .github/workflows/publish-package.yml rename to .github/workflows/python-package.yml index 8d9914d..bbf5a50 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/python-package.yml @@ -26,18 +26,14 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} + - name: Install pandoc + run: sudo apt-get install -y pandoc - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest pytest-cov pytest-pythonpath coverage + python -m pip install pytest pytest-cov pytest-pythonpath coverage pip install -r requirements.txt pip install . - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test ${{ env.package }} with pytest run: | pytest --cov=$package @@ -48,6 +44,12 @@ jobs: cov_badge_path: docs/coverage.svg steps: - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: "3.12" + - name: Install pandoc + run: sudo apt-get install -y pandoc notification-daemon - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip @@ -79,7 +81,7 @@ jobs: branch: ${{ github.ref }} deploy: runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'success' }} + needs: coverage steps: - uses: actions/checkout@v3 with: @@ -97,8 +99,11 @@ jobs: sed -ri 's/^(##*)\s*:.*:\s*/\1 /g' README.md awk '{if (match($0,"## Supporters")) exit; print}' README.md > README mv -f README README.md - - run: python3 -m pip install --upgrade build && python3 -m build - - name: Upload ${{ env.package }} to PyPI + - if: steps.filter.outputs.version == 'true' + name: Build ${{ 
env.package }} package + run: python3 -m pip install --upgrade build && python3 -m build + - if: steps.filter.outputs.version == 'true' + name: Upload ${{ env.package }} to PyPi uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/docs/coverage.svg b/docs/coverage.svg index 1006657..4d30c44 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.11%coverage99.11% \ No newline at end of file +coverage: 98.90%coverage98.90% \ No newline at end of file From dac31601077326f54a3dad91c3d279a0a91c01ee Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 9 Jun 2025 18:20:01 +0200 Subject: [PATCH 31/62] New release --- .github/workflows/python-package.yml | 4 ---- src/codext/VERSION.txt | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index bbf5a50..85432f3 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -26,8 +26,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install pandoc - run: sudo apt-get install -y pandoc - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip @@ -48,8 +46,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: "3.12" - - name: Install pandoc - run: sudo apt-get install -y pandoc notification-daemon - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index ab826b5..d86159f 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.6 +1.15.7 From 3859ebe131ccffb4805b4747af3b174014271449 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 16 Jun 2025 23:50:52 +0200 Subject: [PATCH 32/62] Fixed #11 --- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codext/VERSION.txt 
b/src/codext/VERSION.txt index d86159f..51c7561 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.7 +1.15.8 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index 3042950..ae92325 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -261,7 +261,7 @@ def getregentry(encoding): while True: try: g = m.group(i) or "" - if g.isdigit() and not g.startswith("0") and "".join(set(g)) != "01": + if g.isdigit() and not g.startswith("0") and (re.match(r"10+", g) or "".join(set(g)) != "01"): g = int(g) args += [g] i += 1 From a1c0eea919b8ab8b13e1c91e84f3d724536c7c75 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 18 Jun 2025 00:17:09 +0200 Subject: [PATCH 33/62] Fixed README --- README.md | 686 +++++++++++++++++++++++++++--------------------------- 1 file changed, 343 insertions(+), 343 deletions(-) diff --git a/README.md b/README.md index 35aa6c2..58c1c9d 100644 --- a/README.md +++ b/README.md @@ -1,343 +1,343 @@ -

-

CodExt Tweet

-

Encode/decode anything.

- -[![PyPi](https://img.shields.io/pypi/v/codext.svg)](https://pypi.python.org/pypi/codext/) -[![Read The Docs](https://readthedocs.org/projects/python-codext/badge/?version=latest)](https://python-codext.readthedocs.io/en/latest/?badge=latest) -[![Build Status](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml/badge.svg)](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml) -[![Coverage Status](https://raw.githubusercontent.com/dhondta/python-codext/main/docs/coverage.svg)](#) -[![Python Versions](https://img.shields.io/pypi/pyversions/codext.svg)](https://pypi.python.org/pypi/codext/) -[![Known Vulnerabilities](https://snyk.io/test/github/dhondta/python-codext/badge.svg?targetFile=requirements.txt)](https://snyk.io/test/github/dhondta/python-codext?targetFile=requirements.txt) -[![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) -[![License](https://img.shields.io/pypi/l/codext.svg)](https://pypi.python.org/pypi/codext/) - -[**CodExt**](https://github.com/dhondta/python-codext) is a (Python2-3 compatible) library that extends the native [`codecs`](https://docs.python.org/3/library/codecs.html) library (namely for adding new custom encodings and character mappings) and provides **120+ new codecs**, hence its name combining *CODecs EXTension*. It also features a **guess mode** for decoding multiple layers of encoding and **CLI tools** for convenience. - -```sh -$ pip install codext -``` - -Want to contribute a new codec ? | Want to contribute a new macro ? -:----------------------------------:|:------------------------------------: -Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.html) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/main/codext/macros.json) - -## :mag: Demonstrations - -

Using CodExt from the command line

-

Using base tools from the command line

-

Using the unbase command line tool

- -## :computer: Usage (main CLI tool) Tweet on codext - -```session -$ codext -i test.txt encode dna-1 -GTGAGCGGGTATGTGA - -$ echo -en "test" | codext encode morse -- . ... - - -$ echo -en "test" | codext encode braille -⠞⠑⠎⠞ - -$ echo -en "test" | codext encode base100 -👫👜👪👫 -``` - -### Chaining codecs - -```sh -$ echo -en "Test string" | codext encode reverse -gnirts tseT - -$ echo -en "Test string" | codext encode reverse morse ---. -. .. .-. - ... / - ... . - - -$ echo -en "Test string" | codext encode reverse morse dna-2 -AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC - -$ echo -en "Test string" | codext encode reverse morse dna-2 octal -101107124103101107124103101107124107101107101101101107124103101107124107101107101101101107124107101107124107101107101101101107124107101107124103101107124107101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124124101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124107101107101101101107124103 - -$ echo -en "AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC" | codext -d dna-2 morse reverse -test string -``` - -### Using macros - -```sh -$ codext add-macro my-encoding-chain gzip base63 lzma base64 - -$ codext list macros -example-macro, my-encoding-chain - -$ echo -en "Test string" | codext encode my-encoding-chain -CQQFAF0AAIAAABuTgySPa7WaZC5Sunt6FS0ko71BdrYE8zHqg91qaqadZIR2LafUzpeYDBalvE///ug4AA== - -$ codext remove-macro my-encoding-chain - -$ codext list macros -example-macro -``` - -## :computer: Usage (base CLI tool) Tweet on unbase - -```session -$ echo "Test string !" | base122 -*.7!ft9�-f9Â - -$ echo "Test string !" | base91 -"ONK;WDZM%Z%xE7L - -$ echo "Test string !" | base91 | base85 -B2P|BJ6A+nO(j|-cttl% - -$ echo "Test string !" 
| base91 | base85 | base36 | base58-flickr -QVx5tvgjvCAkXaMSuKoQmCnjeCV1YyyR3WErUUErFf - -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | base58-flickr -d | base36 -d | base85 -d | base91 -d -Test string ! -``` - -```session -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -m 3 -Test string ! - -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -f Test -Test string ! -``` - -## :computer: Usage (Python) - -Getting the list of available codecs: - -```python ->>> import codext - ->>> codext.list() -['ascii85', 'base85', 'base100', 'base122', ..., 'tomtom', 'dna', 'html', 'markdown', 'url', 'resistor', 'sms', 'whitespace', 'whitespace-after-before'] - ->>> codext.encode("this is a test", "base58-bitcoin") -'jo91waLQA1NNeBmZKUF' - ->>> codext.encode("this is a test", "base58-ripple") -'jo9rA2LQwr44eBmZK7E' - ->>> codext.encode("this is a test", "base58-url") -'JN91Wzkpa1nnDbLyjtf' - ->>> codecs.encode("this is a test", "base100") -'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' - ->>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") -'this is a test' - ->>> for i in range(8): - print(codext.encode("this is a test", "dna-%d" % (i + 1))) -GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA -CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA -ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG -AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC -TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG -TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC -GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT -CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT ->>> codext.decode("GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA", "dna-1") -'this is a test' - ->>> codecs.encode("this is a test", "morse") -'- .... .. ... / .. ... / .- / - . ... -' - ->>> codecs.decode("- .... .. ... / .. ... / .- / - . ... 
-", "morse") -'this is a test' - ->>> with open("morse.txt", 'w', encoding="morse") as f: - f.write("this is a test") -14 - ->>> with open("morse.txt",encoding="morse") as f: - f.read() -'this is a test' - ->>> codext.decode(""" - = - X - : - x - n - r - y - Y - y - p - a - ` - n - | - a -o - h - ` - g - o - z """, "whitespace-after+before") -'CSC{not_so_invisible}' - ->>> print(codext.encode("An example test string", "baudot-tape")) -***.** - . * -***.* -* . - .* -* .* - . * -** .* -***.** -** .** - .* -* . -* *. * - .* -* *. -* *. * -* . -* *. -* *. * -***. - *.* -***.* - * .* -``` - -## :page_with_curl: List of codecs - -#### [BaseXX](https://python-codext.readthedocs.io/en/latest/enc/base.html) - -- [X] `base1`: useless, but for the sake of completeness -- [X] `base2`: simple conversion to binary (with a variant with a reversed alphabet) -- [X] `base3`: conversion to ternary (with a variant with a reversed alphabet) -- [X] `base4`: conversion to quarternary (with a variant with a reversed alphabet) -- [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet) -- [X] `base10`: simple conversion to decimal -- [X] `base11`: conversion to digits with a "*a*" -- [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted) -- [X] `base26`: conversion to alphabet letters -- [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html)) -- [X] `base36`: [Base36](https://en.wikipedia.org/wiki/Base36) conversion to letters and digits (with a variant inverting both groups) -- [X] `base45`: [Base45](https://datatracker.ietf.org/doc/html/draft-faltstrom-base45-04.txt) DRAFT algorithm (with a variant inverting letters and digits) -- [X] `base58`: multiple versions of 
[Base58](https://en.bitcoinwiki.org/wiki/Base58) (bitcoin, flickr, ripple) -- [X] `base62`: [Base62](https://en.wikipedia.org/wiki/Base62) conversion to lower- and uppercase letters and digits (with a variant with letters and digits inverted) -- [X] `base63`: similar to `base62` with the "`_`" added -- [X] `base64`: classical conversion according to RFC4648 with its variant URL (or *file*) (it also holds a variant with letters and digits inverted) -- [X] `base67`: custom conversion using some more special characters (also with a variant with letters and digits inverted) -- [X] `base85`: all variants of Base85 ([Ascii85](https://fr.wikipedia.org/wiki/Ascii85), [z85](https://rfc.zeromq.org/spec/32), [Adobe](https://dencode.com/string/ascii85), [(x)btoa](https://dencode.com/string/ascii85), [RFC1924](https://datatracker.ietf.org/doc/html/rfc1924), [XML](https://datatracker.ietf.org/doc/html/draft-kwiatkowski-base85-for-xml-00)) -- [X] `base91`: [Base91](http://base91.sourceforge.net) custom conversion -- [X] `base100` (or *emoji*): [Base100](https://github.com/AdamNiederer/base100) custom conversion -- [X] `base122`: [Base100](http://blog.kevinalbs.com/base122) custom conversion -- [X] `base-genericN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) ; supports any possible base - -This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `base85` codec. - -#### [Binary](https://python-codext.readthedocs.io/en/latest/enc/binary.html) - -- [X] `baudot`: supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... 
-- [X] `baudot-spaced`: variant of `baudot` ; groups of 5 bits are whitespace-separated -- [X] `baudot-tape`: variant of `baudot` ; outputs a string that looks like a perforated tape -- [X] `bcd`: _Binary Coded Decimal_, encodes characters from their (zero-left-padded) ordinals -- [X] `bcd-extended0`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `0000` -- [X] `bcd-extended1`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `1111` -- [X] `excess3`: uses Excess-3 (aka Stibitz code) binary encoding to convert characters from their ordinals -- [X] `gray`: aka reflected binary code -- [X] `manchester`: XORes each bit of the input with `01` -- [X] `manchester-inverted`: variant of `manchester` ; XORes each bit of the input with `10` -- [X] `rotateN`: rotates characters by the specified number of bits (*N* belongs to [1, 7] ; Python 3 only) - -#### [Common](https://python-codext.readthedocs.io/en/latest/enc/common.html) - -- [X] `a1z26`: keeps words whitespace-separated and uses a custom character separator -- [X] `cases`: set of case-related encodings (including camel-, kebab-, lower-, pascal-, upper-, snake- and swap-case, slugify, capitalize, title) -- [X] `dummy`: set of simple encodings (including integer, replace, reverse, word-reverse, substite and strip-spaces) -- [X] `octal`: dummy octal conversion (converts to 3-digits groups) -- [X] `octal-spaced`: variant of `octal` ; dummy octal conversion, handling whitespace separators -- [X] `ordinal`: dummy character ordinals conversion (converts to 3-digits groups) -- [X] `ordinal-spaced`: variant of `ordinal` ; dummy character ordinals conversion, handling whitespace separators - -#### [Compression](https://python-codext.readthedocs.io/en/latest/enc/compressions.html) - -- [X] `gzip`: standard Gzip compression/decompression -- [X] `lz77`: compresses the given data with the algorithm of Lempel and Ziv of 1977 -- [X] `lz78`: 
compresses the given data with the algorithm of Lempel and Ziv of 1978 -- [X] `pkzip_deflate`: standard Zip-deflate compression/decompression -- [X] `pkzip_bzip2`: standard BZip2 compression/decompression -- [X] `pkzip_lzma`: standard LZMA compression/decompression - -> :warning: Compression functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. - -#### [Cryptography](https://python-codext.readthedocs.io/en/latest/enc/crypto.html) - -- [X] `affine`: aka Affine Cipher -- [X] `atbash`: aka Atbash Cipher -- [X] `bacon`: aka Baconian Cipher -- [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) -- [X] `citrix`: aka Citrix CTX1 password encoding -- [X] `railfence`: aka Rail Fence Cipher -- [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) -- [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) -- [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) -- [X] `xorN`: XOR with a single byte (*N* belongs to [1,255]) - -> :warning: Crypto functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. 
- -#### [Hashing](https://python-codext.readthedocs.io/en/latest/enc/hashing.html) - -- [X] `blake`: includes BLAKE2b and BLAKE2s (Python 3 only ; relies on `hashlib`) -- [X] `checksums`: includes Adler32 and CRC32 (relies on `zlib`) -- [X] `crypt`: Unix's crypt hash for passwords (Python 3 and Unix only ; relies on `crypt`) -- [X] `md`: aka Message Digest ; includes MD4 and MD5 (relies on `hashlib`) -- [X] `sha`: aka Secure Hash Algorithms ; includes SHA1, 224, 256, 384, 512 (Python2/3) but also SHA3-224, -256, -384 and -512 (Python 3 only ; relies on `hashlib`) -- [X] `shake`: aka SHAKE hashing (Python 3 only ; relies on `hashlib`) - -> :warning: Hash functions are of course definitely **NOT** encoding functions ; they are implemented for convenience with the `.encode(...)` API from `codecs` and useful for chaning codecs. - -#### [Languages](https://python-codext.readthedocs.io/en/latest/enc/languages.html) - -- [X] `braille`: well-known braille language (Python 3 only) -- [X] `ipsum`: aka lorem ipsum -- [X] `galactic`: aka galactic alphabet or Minecraft enchantment language (Python 3 only) -- [X] `leetspeak`: based on minimalistic elite speaking rules -- [X] `morse`: uses whitespace as a separator -- [X] `navajo`: only handles letters (not full words from the Navajo dictionary) -- [X] `radio`: aka NATO or radio phonetic alphabet -- [X] `southpark`: converts letters to Kenny's language from Southpark (whitespace is also handled) -- [X] `southpark-icase`: case insensitive variant of `southpark` -- [X] `tap`: converts text to tap/knock code, commonly used by prisoners -- [X] `tomtom`: similar to `morse`, using slashes and backslashes - -#### [Others](https://python-codext.readthedocs.io/en/latest/enc/others.html) - -- [X] `dna`: implements the 8 rules of DNA sequences (N belongs to [1,8]) -- [X] `letter-indices`: encodes consonants and/or vowels with their corresponding indices -- [X] `markdown`: unidirectional encoding from Markdown to HTML - -#### 
[Steganography](https://python-codext.readthedocs.io/en/latest/enc/stegano.html) - -- [X] `hexagram`: uses Base64 and encodes the result to a charset of [I Ching hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) (as implemented [here](https://github.com/qntm/hexagram-encode)) -- [X] `klopf`: aka Klopf code ; Polybius square with trivial alphabetical distribution -- [X] `resistor`: aka resistor color codes -- [X] `rick`: aka Rick cipher (in reference to Rick Astley's song "*Never gonna give you up*") -- [X] `sms`: also called _T9 code_ ; uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding -- [X] `whitespace`: replaces bits with whitespaces and tabs -- [X] `whitespace_after_before`: variant of `whitespace` ; encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`") - -#### [Web](https://python-codext.readthedocs.io/en/latest/enc/web.html) - -- [X] `html`: implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) -- [X] `url`: aka URL encoding - - -## :clap: Supporters - -[![Stargazers repo roster for @dhondta/python-codext](https://reporoster.com/stars/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/stargazers) - -[![Forkers repo roster for @dhondta/python-codext](https://reporoster.com/forks/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/network/members) - -

Back to top

+

+

CodExt Tweet

+

Encode/decode anything.

+ +[![PyPi](https://img.shields.io/pypi/v/codext.svg)](https://pypi.python.org/pypi/codext/) +[![Read The Docs](https://readthedocs.org/projects/python-codext/badge/?version=latest)](https://python-codext.readthedocs.io/en/latest/?badge=latest) +[![Build Status](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml/badge.svg)](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml) +[![Coverage Status](https://raw.githubusercontent.com/dhondta/python-codext/main/docs/coverage.svg)](#) +[![Python Versions](https://img.shields.io/pypi/pyversions/codext.svg)](https://pypi.python.org/pypi/codext/) +[![Known Vulnerabilities](https://snyk.io/test/github/dhondta/python-codext/badge.svg?targetFile=requirements.txt)](https://snyk.io/test/github/dhondta/python-codext?targetFile=requirements.txt) +[![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) +[![License](https://img.shields.io/pypi/l/codext.svg)](https://pypi.python.org/pypi/codext/) + +[**CodExt**](https://github.com/dhondta/python-codext) is a (Python2-3 compatible) library that extends the native [`codecs`](https://docs.python.org/3/library/codecs.html) library (namely for adding new custom encodings and character mappings) and provides **120+ new codecs**, hence its name combining *CODecs EXTension*. It also features a **guess mode** for decoding multiple layers of encoding and **CLI tools** for convenience. + +```sh +$ pip install codext +``` + +Want to contribute a new codec ? | Want to contribute a new macro ? +:----------------------------------:|:------------------------------------: +Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.html) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/main/codext/macros.json) + +## :mag: Demonstrations + +

Using CodExt from the command line

+

Using base tools from the command line

+

Using the unbase command line tool

+ +## :computer: Usage (main CLI tool) Tweet on codext + +```session +$ codext -i test.txt encode dna-1 +GTGAGCGGGTATGTGA + +$ echo -en "test" | codext encode morse +- . ... - + +$ echo -en "test" | codext encode braille +⠞⠑⠎⠞ + +$ echo -en "test" | codext encode base100 +👫👜👪👫 +``` + +### Chaining codecs + +```sh +$ echo -en "Test string" | codext encode reverse +gnirts tseT + +$ echo -en "Test string" | codext encode reverse morse +--. -. .. .-. - ... / - ... . - + +$ echo -en "Test string" | codext encode reverse morse dna-2 +AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC + +$ echo -en "Test string" | codext encode reverse morse dna-2 octal +101107124103101107124103101107124107101107101101101107124103101107124107101107101101101107124107101107124107101107101101101107124107101107124103101107124107101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124124101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124107101107101101101107124103 + +$ echo -en "AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC" | codext -d dna-2 morse reverse +test string +``` + +### Using macros + +```sh +$ codext add-macro my-encoding-chain gzip base63 lzma base64 + +$ codext list macros +example-macro, my-encoding-chain + +$ echo -en "Test string" | codext encode my-encoding-chain +CQQFAF0AAIAAABuTgySPa7WaZC5Sunt6FS0ko71BdrYE8zHqg91qaqadZIR2LafUzpeYDBalvE///ug4AA== + +$ codext remove-macro my-encoding-chain + +$ codext list macros +example-macro +``` + +## :computer: Usage (base CLI tool) Tweet on unbase + +```session +$ echo "Test string !" | base122 +*.7!ft9�-f9Â + +$ echo "Test string !" | base91 +"ONK;WDZM%Z%xE7L + +$ echo "Test string !" | base91 | base85 +B2P|BJ6A+nO(j|-cttl% + +$ echo "Test string !" 
| base91 | base85 | base36 | base58-flickr +QVx5tvgjvCAkXaMSuKoQmCnjeCV1YyyR3WErUUErFf + +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | base58-flickr -d | base36 -d | base85 -d | base91 -d +Test string ! +``` + +```session +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -m 3 +Test string ! + +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -f Test +Test string ! +``` + +## :computer: Usage (Python) + +Getting the list of available codecs: + +```python +>>> import codext + +>>> codext.list() +['ascii85', 'base85', 'base100', 'base122', ..., 'tomtom', 'dna', 'html', 'markdown', 'url', 'resistor', 'sms', 'whitespace', 'whitespace-after-before'] + +>>> codext.encode("this is a test", "base58-bitcoin") +'jo91waLQA1NNeBmZKUF' + +>>> codext.encode("this is a test", "base58-ripple") +'jo9rA2LQwr44eBmZK7E' + +>>> codext.encode("this is a test", "base58-url") +'JN91Wzkpa1nnDbLyjtf' + +>>> codecs.encode("this is a test", "base100") +'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' + +>>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") +'this is a test' + +>>> for i in range(8): + print(codext.encode("this is a test", "dna-%d" % (i + 1))) +GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA +CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA +ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG +AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC +TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG +TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC +GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT +CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT +>>> codext.decode("GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA", "dna-1") +'this is a test' + +>>> codecs.encode("this is a test", "morse") +'- .... .. ... / .. ... / .- / - . ... -' + +>>> codecs.decode("- .... .. ... / .. ... / .- / - . ... 
-", "morse") +'this is a test' + +>>> with open("morse.txt", 'w', encoding="morse") as f: + f.write("this is a test") +14 + +>>> with open("morse.txt",encoding="morse") as f: + f.read() +'this is a test' + +>>> codext.decode(""" + = + X + : + x + n + r + y + Y + y + p + a + ` + n + | + a +o + h + ` + g + o + z """, "whitespace-after+before") +'CSC{not_so_invisible}' + +>>> print(codext.encode("An example test string", "baudot-tape")) +***.** + . * +***.* +* . + .* +* .* + . * +** .* +***.** +** .** + .* +* . +* *. * + .* +* *. +* *. * +* . +* *. +* *. * +***. + *.* +***.* + * .* +``` + +## :page_with_curl: List of codecs + +#### [BaseXX](https://python-codext.readthedocs.io/en/latest/enc/base.html) + +- [X] `base1`: useless, but for the sake of completeness +- [X] `base2`: simple conversion to binary (with a variant with a reversed alphabet) +- [X] `base3`: conversion to ternary (with a variant with a reversed alphabet) +- [X] `base4`: conversion to quarternary (with a variant with a reversed alphabet) +- [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet) +- [X] `base10`: simple conversion to decimal +- [X] `base11`: conversion to digits with a "*a*" +- [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted) +- [X] `base26`: conversion to alphabet letters +- [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html)) +- [X] `base36`: [Base36](https://en.wikipedia.org/wiki/Base36) conversion to letters and digits (with a variant inverting both groups) +- [X] `base45`: [Base45](https://datatracker.ietf.org/doc/html/draft-faltstrom-base45-04.txt) DRAFT algorithm (with a variant inverting letters and digits) +- [X] `base58`: multiple versions of 
[Base58](https://en.bitcoinwiki.org/wiki/Base58) (bitcoin, flickr, ripple) +- [X] `base62`: [Base62](https://en.wikipedia.org/wiki/Base62) conversion to lower- and uppercase letters and digits (with a variant with letters and digits inverted) +- [X] `base63`: similar to `base62` with the "`_`" added +- [X] `base64`: classical conversion according to RFC4648 with its variant URL (or *file*) (it also holds a variant with letters and digits inverted) +- [X] `base67`: custom conversion using some more special characters (also with a variant with letters and digits inverted) +- [X] `base85`: all variants of Base85 ([Ascii85](https://fr.wikipedia.org/wiki/Ascii85), [z85](https://rfc.zeromq.org/spec/32), [Adobe](https://dencode.com/string/ascii85), [(x)btoa](https://dencode.com/string/ascii85), [RFC1924](https://datatracker.ietf.org/doc/html/rfc1924), [XML](https://datatracker.ietf.org/doc/html/draft-kwiatkowski-base85-for-xml-00)) +- [X] `base91`: [Base91](http://base91.sourceforge.net) custom conversion +- [X] `base100` (or *emoji*): [Base100](https://github.com/AdamNiederer/base100) custom conversion +- [X] `base122`: [Base122](http://blog.kevinalbs.com/base122) custom conversion +- [X] `base-genericN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) ; supports any possible base + +This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `base85` codec. + +#### [Binary](https://python-codext.readthedocs.io/en/latest/enc/binary.html) + +- [X] `baudot`: supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... 
+- [X] `baudot-spaced`: variant of `baudot` ; groups of 5 bits are whitespace-separated +- [X] `baudot-tape`: variant of `baudot` ; outputs a string that looks like a perforated tape +- [X] `bcd`: _Binary Coded Decimal_, encodes characters from their (zero-left-padded) ordinals +- [X] `bcd-extended0`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `0000` +- [X] `bcd-extended1`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `1111` +- [X] `excess3`: uses Excess-3 (aka Stibitz code) binary encoding to convert characters from their ordinals +- [X] `gray`: aka reflected binary code +- [X] `manchester`: XORes each bit of the input with `01` +- [X] `manchester-inverted`: variant of `manchester` ; XORes each bit of the input with `10` +- [X] `rotateN`: rotates characters by the specified number of bits (*N* belongs to [1, 7] ; Python 3 only) + +#### [Common](https://python-codext.readthedocs.io/en/latest/enc/common.html) + +- [X] `a1z26`: keeps words whitespace-separated and uses a custom character separator +- [X] `cases`: set of case-related encodings (including camel-, kebab-, lower-, pascal-, upper-, snake- and swap-case, slugify, capitalize, title) +- [X] `dummy`: set of simple encodings (including integer, replace, reverse, word-reverse, substitute and strip-spaces) +- [X] `octal`: dummy octal conversion (converts to 3-digits groups) +- [X] `octal-spaced`: variant of `octal` ; dummy octal conversion, handling whitespace separators +- [X] `ordinal`: dummy character ordinals conversion (converts to 3-digits groups) +- [X] `ordinal-spaced`: variant of `ordinal` ; dummy character ordinals conversion, handling whitespace separators + +#### [Compression](https://python-codext.readthedocs.io/en/latest/enc/compressions.html) + +- [X] `gzip`: standard Gzip compression/decompression +- [X] `lz77`: compresses the given data with the algorithm of Lempel and Ziv of 1977 +- [X] `lz78`: 
compresses the given data with the algorithm of Lempel and Ziv of 1978 +- [X] `pkzip_deflate`: standard Zip-deflate compression/decompression +- [X] `pkzip_bzip2`: standard BZip2 compression/decompression +- [X] `pkzip_lzma`: standard LZMA compression/decompression + +> :warning: Compression functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. + +#### [Cryptography](https://python-codext.readthedocs.io/en/latest/enc/crypto.html) + +- [X] `affine`: aka Affine Cipher +- [X] `atbash`: aka Atbash Cipher +- [X] `bacon`: aka Baconian Cipher +- [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) +- [X] `citrix`: aka Citrix CTX1 password encoding +- [X] `railfence`: aka Rail Fence Cipher +- [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) +- [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) +- [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) +- [X] `xorN`: XOR with a single byte (*N* belongs to [1,255]) + +> :warning: Crypto functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. 
+ +#### [Hashing](https://python-codext.readthedocs.io/en/latest/enc/hashing.html) + +- [X] `blake`: includes BLAKE2b and BLAKE2s (Python 3 only ; relies on `hashlib`) +- [X] `checksums`: includes Adler32 and CRC32 (relies on `zlib`) +- [X] `crypt`: Unix's crypt hash for passwords (Python 3 and Unix only ; relies on `crypt`) +- [X] `md`: aka Message Digest ; includes MD4 and MD5 (relies on `hashlib`) +- [X] `sha`: aka Secure Hash Algorithms ; includes SHA1, 224, 256, 384, 512 (Python2/3) but also SHA3-224, -256, -384 and -512 (Python 3 only ; relies on `hashlib`) +- [X] `shake`: aka SHAKE hashing (Python 3 only ; relies on `hashlib`) + +> :warning: Hash functions are of course definitely **NOT** encoding functions ; they are implemented for convenience with the `.encode(...)` API from `codecs` and useful for chaining codecs. + +#### [Languages](https://python-codext.readthedocs.io/en/latest/enc/languages.html) + +- [X] `braille`: well-known braille language (Python 3 only) +- [X] `ipsum`: aka lorem ipsum +- [X] `galactic`: aka galactic alphabet or Minecraft enchantment language (Python 3 only) +- [X] `leetspeak`: based on minimalistic elite speaking rules +- [X] `morse`: uses whitespace as a separator +- [X] `navajo`: only handles letters (not full words from the Navajo dictionary) +- [X] `radio`: aka NATO or radio phonetic alphabet +- [X] `southpark`: converts letters to Kenny's language from Southpark (whitespace is also handled) +- [X] `southpark-icase`: case insensitive variant of `southpark` +- [X] `tap`: converts text to tap/knock code, commonly used by prisoners +- [X] `tomtom`: similar to `morse`, using slashes and backslashes + +#### [Others](https://python-codext.readthedocs.io/en/latest/enc/others.html) + +- [X] `dna`: implements the 8 rules of DNA sequences (N belongs to [1,8]) +- [X] `letter-indices`: encodes consonants and/or vowels with their corresponding indices +- [X] `markdown`: unidirectional encoding from Markdown to HTML + +#### 
[Steganography](https://python-codext.readthedocs.io/en/latest/enc/stegano.html) + +- [X] `hexagram`: uses Base64 and encodes the result to a charset of [I Ching hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) (as implemented [here](https://github.com/qntm/hexagram-encode)) +- [X] `klopf`: aka Klopf code ; Polybius square with trivial alphabetical distribution +- [X] `resistor`: aka resistor color codes +- [X] `rick`: aka Rick cipher (in reference to Rick Astley's song "*Never gonna give you up*") +- [X] `sms`: also called _T9 code_ ; uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding +- [X] `whitespace`: replaces bits with whitespaces and tabs +- [X] `whitespace_after_before`: variant of `whitespace` ; encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`") + +#### [Web](https://python-codext.readthedocs.io/en/latest/enc/web.html) + +- [X] `html`: implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) +- [X] `url`: aka URL encoding + + +## :clap: Supporters + +[![Stargazers repo roster for @dhondta/python-codext](https://reporoster.com/stars/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/stargazers) + +[![Forkers repo roster for @dhondta/python-codext](https://reporoster.com/forks/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/network/members) + +

Back to top

From 935cd8c9100e7009bac2c67b406501937a1abac5 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 17 Sep 2025 22:41:28 +0200 Subject: [PATCH 34/62] Fixed some vulnerabilities --- .github/workflows/python-package.yml | 4 ++-- pyproject.toml | 4 ++-- pytest.ini | 2 +- requirements.txt | 3 ++- src/codext/VERSION.txt | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 85432f3..96f267f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -29,7 +29,7 @@ jobs: - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip - python -m pip install pytest pytest-cov pytest-pythonpath coverage + python -m pip install pytest pytest-cov coverage pip install -r requirements.txt pip install . - name: Test ${{ env.package }} with pytest @@ -49,7 +49,7 @@ jobs: - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip - python -m pip install pytest pytest-cov pytest-pythonpath + python -m pip install pytest pytest-cov pip install -r requirements.txt pip install . 
- name: Make coverage badge for ${{ env.package }} diff --git a/pyproject.toml b/pyproject.toml index 849d94a..1644aee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=61.0", "setuptools-scm"] +requires = ["setuptools>=80.0.0", "setuptools-scm"] build-backend = "setuptools.build_meta" [tool.setuptools.dynamic] @@ -27,7 +27,7 @@ classifiers = [ ] dependencies = [ "legacycrypt; python_version >= '3.13'", - "markdown2>=2.4.0", + "markdown2>=2.5.4", ] dynamic = ["version"] diff --git a/pytest.ini b/pytest.ini index ab4c198..fcccae1 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -python_paths = src +pythonpath = src diff --git a/requirements.txt b/requirements.txt index b5db972..51e438c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -markdown2>=2.4.0 +markdown2>=2.5.4 +legacycrypt; python_version >= '3.13' diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 51c7561..054a2bd 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.8 +1.15.9 From 5aaec705c823ea2f906a50b643ca3608684ac918 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Sep 2025 20:43:39 +0000 Subject: [PATCH 35/62] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 4d30c44..9f90515 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 98.90%coverage98.90% \ No newline at end of file +coverage: 99.04%coverage99.04% \ No newline at end of file From 4f6be48b92fdc6212765f37cae3973773e128613 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 12 Jan 2026 00:05:47 +0100 Subject: [PATCH 36/62] Applied some minor improvements --- .github/workflows/python-package.yml | 3 ++- requirements.txt | 2 +- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 20 +++++++++++--------- src/codext/hashing/__init__.py | 17 +++++++++-------- 
src/codext/hashing/blake.py | 2 +- src/codext/hashing/checksums.py | 2 +- src/codext/hashing/crypt.py | 2 +- src/codext/hashing/md.py | 3 +-- src/codext/hashing/mmh3.py | 18 ++++++++++++++++++ src/codext/hashing/sha.py | 2 +- src/codext/hashing/shake.py | 2 +- 12 files changed, 48 insertions(+), 27 deletions(-) create mode 100644 src/codext/hashing/mmh3.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 96f267f..2d88e53 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -31,6 +31,7 @@ jobs: python -m pip install --upgrade pip python -m pip install pytest pytest-cov coverage pip install -r requirements.txt + pip install tinyscript>=1.31 pip install . - name: Test ${{ env.package }} with pytest run: | @@ -45,7 +46,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: - python-version: "3.12" + python-version: "3.13" - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip diff --git a/requirements.txt b/requirements.txt index 51e438c..dcaadfd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -markdown2>=2.5.4 legacycrypt; python_version >= '3.13' +markdown2>=2.5.4 diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 054a2bd..e8018a2 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.9 +1.15.10 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index ae92325..861d342 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -6,7 +6,6 @@ import os import random import re -import sre_parse import sys from encodings.aliases import aliases as ALIASES from functools import reduce, update_wrapper, wraps @@ -37,8 +36,12 @@ from importlib import reload except ImportError: pass +try: + import re._parser as sre_parse +except ImportError: + import sre_parse -# from Python 3.11, it seems that 'sre_parse' is not bound to 're' anymore 
+# from Python 3.11, 'sre_parse' is bound as '_parser' ; monkey-patch it for backward-compatibility re.sre_parse = sre_parse @@ -870,10 +873,9 @@ def _handle_error(token, position, output="", eename=None): :param output: output, as decoded up to the position of the error """ if errors == "strict": - msg = "'%s' codec can't %scode %s '%s' in %s %d" - token = ensure_str(token) - token = token[:7] + "..." if len(token) > 10 else token - err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position)) + token = f"{token[:7]}..." if len(token := ensure_str(token)) > 10 else token + err = getattr(builtins, exc)(f"'{eename or ename}' codec can't {['en','de'][decode]}code {kind} '{token}' " + f"in {item} {position}") err.output = output err.__cause__ = err raise err @@ -1264,8 +1266,8 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings if not stop and (show or debug) and found not in result: s = repr(input) s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s - s = "[+] {', '.join(found)}: {s}" - print(s if len(s) <= 80 else s[:77] + "...") + s = f"[+] {', '.join(found)}: {s}" + print(s if len(s) <= 80 else f"{s[:77]}...") result[found] = input if depth >= max_depth or len(result) > 0 and stop: return @@ -1275,7 +1277,7 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings if len(result) > 0 and stop: return if debug: - print(f"[*] Depth %0{len(str(max_depth))}d/%d: {encoding}" % (depth+1, max_depth)) + print(f"[*] Depth {depth+1:0{len(str(max_depth))}}/{max_depth}: {encoding}") __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), stop, show, scoring_heuristic, extended, debug) diff --git a/src/codext/hashing/__init__.py b/src/codext/hashing/__init__.py index 2aa13a0..b7e9fcc 100755 --- a/src/codext/hashing/__init__.py +++ b/src/codext/hashing/__init__.py @@ -1,8 +1,9 @@ -# -*- coding: UTF-8 -*- -from .blake 
import * -from .checksums import * -from .crypt import * -from .md import * -from .sha import * -from .shake import * - +# -*- coding: UTF-8 -*- +from .blake import * +from .checksums import * +from .crypt import * +from .md import * +from .mmh3 import * +from .sha import * +from .shake import * + diff --git a/src/codext/hashing/blake.py b/src/codext/hashing/blake.py index 6656c46..e168819 100644 --- a/src/codext/hashing/blake.py +++ b/src/codext/hashing/blake.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -"""Case Codecs - string hashing with blake. +"""Blake2 Codecs - string hashing with blake. These are codecs for hashing strings, for use with other codecs in encoding chains. diff --git a/src/codext/hashing/checksums.py b/src/codext/hashing/checksums.py index f94dd2e..85dbe67 100644 --- a/src/codext/hashing/checksums.py +++ b/src/codext/hashing/checksums.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -"""Case Codecs - string common checksums. +"""Checksum Codecs - string common checksums. These are codecs for hashing strings, for use with other codecs in encoding chains. diff --git a/src/codext/hashing/crypt.py b/src/codext/hashing/crypt.py index 9ef8ed5..2a9ed95 100644 --- a/src/codext/hashing/crypt.py +++ b/src/codext/hashing/crypt.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -"""Case Codecs - string hashing with Unix's Crypt. +"""Crypt Hashing Codec - string hashing with Unix's Crypt. These are codecs for hashing strings, for use with other codecs in encoding chains. diff --git a/src/codext/hashing/md.py b/src/codext/hashing/md.py index 521a01c..0f8a053 100644 --- a/src/codext/hashing/md.py +++ b/src/codext/hashing/md.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -"""Case Codecs - string hashing with Message Digest (MD). +"""MD Hashing Codecs - string hashing with Message Digest (MD). These are codecs for hashing strings, for use with other codecs in encoding chains. 
@@ -56,4 +56,3 @@ def md2(data): add("md5", lambda s, error="strict": (hashlib.new("md5", b(s)).hexdigest(), len(s)), guess=None) if "md4" in hashlib.algorithms_available: add("md4", lambda s, error="strict": (hashlib.new("md4", b(s)).hexdigest(), len(s)), guess=None) - diff --git a/src/codext/hashing/mmh3.py b/src/codext/hashing/mmh3.py new file mode 100644 index 0000000..8c26639 --- /dev/null +++ b/src/codext/hashing/mmh3.py @@ -0,0 +1,18 @@ +# -*- coding: UTF-8 -*- +"""MMH3 Codecs - string hashing with MurmurHash3. + +These are codecs for hashing strings, for use with other codecs in encoding chains. + +These codecs: +- transform strings from str to str +- transform strings from bytes to bytes +- transform file content from str to bytes (write) +""" +from ..__common__ import * + + +if "mmh3_32" in hashlib.algorithms_available: + add("mmh3_32", lambda s, error="strict": (hashlib.mmh3_32(b(s)).hexdigest(), len(s)), guess=None) +if "mmh3_128" in hashlib.algorithms_available: + add("mmh3_128", lambda s, error="strict": (hashlib.mmh3_128(b(s)).hexdigest(), len(s)), guess=None) + diff --git a/src/codext/hashing/sha.py b/src/codext/hashing/sha.py index 1351fe8..044e159 100644 --- a/src/codext/hashing/sha.py +++ b/src/codext/hashing/sha.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -"""Case Codecs - string hashing with Secure Hash Algorithms. +"""SHA Hashing Codecs - string hashing with Secure Hash Algorithms. These are codecs for hashing strings, for use with other codecs in encoding chains. diff --git a/src/codext/hashing/shake.py b/src/codext/hashing/shake.py index 22c7b99..2b04424 100644 --- a/src/codext/hashing/shake.py +++ b/src/codext/hashing/shake.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -"""Case Codecs - string hashing with SHAKE. +"""Shake Hashing Codecs - string hashing with SHAKE. These are codecs for hashing strings, for use with other codecs in encoding chains. 
From fb7e4be6df6db8cd90c727d225acb315cbd4a8cd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 11 Jan 2026 23:07:55 +0000 Subject: [PATCH 37/62] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 9f90515..efa3c52 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.04%coverage99.04% \ No newline at end of file +coverage: 98.83%coverage98.83% \ No newline at end of file From 358f0f21968685b3bd1283f5ffe3d2357ca146ea Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 23:19:11 +0000 Subject: [PATCH 38/62] Initial plan From e25289b679651fb81d68b284a9e04d51324c0efd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 14 Mar 2026 23:30:24 +0000 Subject: [PATCH 39/62] Fix ImportError crash when libcrypt/libxcrypt is missing Co-authored-by: dhondta <9108102+dhondta@users.noreply.github.com> --- src/codext/hashing/crypt.py | 30 +++++++++++++++++------------- tests/test_manual.py | 8 ++++++-- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/codext/hashing/crypt.py b/src/codext/hashing/crypt.py index 2a9ed95..f83806f 100644 --- a/src/codext/hashing/crypt.py +++ b/src/codext/hashing/crypt.py @@ -15,18 +15,22 @@ try: import crypt except ImportError: - import legacycrypt as crypt + try: + import legacycrypt as crypt + except ImportError: + crypt = None - METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] - - def crypt_hash(method): - method = (method or "").lstrip("-_") or "blowfish" - if method not in METHODS: - raise NotImplementedError("method '%s' is not implemented" % method) - def _encode(input, error="strict"): - m = getattr(crypt, "METHOD_" + method.upper()) - return crypt.crypt(ensure_str(input), crypt.mksalt(m)), len(input) - return _encode - - 
add("crypt", crypt_hash, pattern=r"^crypt(|[-_](?:%s))$" % "|".join(METHODS), guess=None) + if crypt is not None: + METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] + + def crypt_hash(method): + method = (method or "").lstrip("-_") or "blowfish" + if method not in METHODS: + raise NotImplementedError("method '%s' is not implemented" % method) + def _encode(input, error="strict"): + m = getattr(crypt, "METHOD_" + method.upper()) + return crypt.crypt(ensure_str(input), crypt.mksalt(m)), len(input) + return _encode + + add("crypt", crypt_hash, pattern=r"^crypt(|[-_](?:%s))$" % "|".join(METHODS), guess=None) diff --git a/tests/test_manual.py b/tests/test_manual.py index c6e3c74..8cfe907 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -128,8 +128,12 @@ def test_codec_hash_functions(self): try: import crypt except ImportError: - import legacycrypt as crypt - METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] + try: + import legacycrypt as crypt + except ImportError: + crypt = None + METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] \ + if crypt is not None else [] for m in METHODS: h = "crypt-" + m self.assertIsNotNone(codecs.encode(STR, h)) From ba7c3c2ab345bcbcf668148af24cc378da0416b0 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 15 Mar 2026 00:55:26 +0100 Subject: [PATCH 40/62] Fixed #13 --- src/codext/VERSION.txt | 2 +- src/codext/binary/baudot.py | 2 +- src/codext/hashing/md.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index e8018a2..4ef4640 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.10 +1.15.11 diff --git a/src/codext/binary/baudot.py b/src/codext/binary/baudot.py index 1cdd111..ba43397 100755 --- a/src/codext/binary/baudot.py +++ b/src/codext/binary/baudot.py @@ -102,7 +102,7 @@ # Russian MTK-2 alphabet (source: https://fr.qwe.wiki/wiki/Baudot_code) 
MTK2 = [ "11111", "11011", - "\x00Е\n\xff СИУ\r\xffРЙНФЦКТЗЛВХЫПЯОБГ\xffМЬЖ\xff", + "\x00Е\nA СИУ\r\xffРЙНФЦКТЗЛВХЫПЯОБГ\xffМЬЖ\xff", "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", ] # Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) diff --git a/src/codext/hashing/md.py b/src/codext/hashing/md.py index 0f8a053..eae8e38 100644 --- a/src/codext/hashing/md.py +++ b/src/codext/hashing/md.py @@ -56,3 +56,4 @@ def md2(data): add("md5", lambda s, error="strict": (hashlib.new("md5", b(s)).hexdigest(), len(s)), guess=None) if "md4" in hashlib.algorithms_available: add("md4", lambda s, error="strict": (hashlib.new("md4", b(s)).hexdigest(), len(s)), guess=None) + From 9703456256e743f64d3e897dc472e6aca6246308 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 23:57:35 +0000 Subject: [PATCH 41/62] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index efa3c52..4d30c44 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 98.83%coverage98.83% \ No newline at end of file +coverage: 98.90%coverage98.90% \ No newline at end of file From 2ee1ccf10b83df8de859f8f3a42ca2d1a538ed83 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 21 Mar 2026 08:14:21 +0100 Subject: [PATCH 42/62] Updated workflow --- .github/workflows/python-package.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 2d88e53..8214cd4 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,6 +12,10 @@ on: pull_request: branches: [ "main" ] +permissions: + id-token: write # for OIDC + contents: read + jobs: build: runs-on: ${{ matrix.os }} @@ -100,9 +104,5 @@ jobs: name: Build ${{ env.package }} package run: python3 -m pip install --upgrade build && python3 -m build - 
if: steps.filter.outputs.version == 'true' - name: Upload ${{ env.package }} to PyPi + name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - verbose: true - verify_metadata: false From 140f754d09b8720430f0f186a054c9172477be10 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 21 Mar 2026 09:14:07 +0100 Subject: [PATCH 43/62] Added AI instructions for new encodings --- .github/copilot-instructions.md | 60 +++++++++++++++++++++++++ .github/prompts/add_codec.prompt.md | 17 +++++++ .github/pull_request_template.md | 11 +++++ docs/ADDING_CODECS.md | 69 +++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+) create mode 100755 .github/copilot-instructions.md create mode 100755 .github/prompts/add_codec.prompt.md create mode 100755 .github/pull_request_template.md create mode 100755 docs/ADDING_CODECS.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100755 index 0000000..1a2fd7f --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,60 @@ +# Copilot Instructions — Enhancements Only + +## Scope +This repository focuses on **adding new encoding/decoding schemes only**. + +Copilot MUST: +- Propose **new codecs only** +- Avoid refactoring unrelated code +- Avoid dependency changes unless strictly required for the codec +- Avoid stylistic or formatting changes + +## Context +This project extends Python's codecs with many encoding/decoding schemes and CLI tools. +It already includes a wide variety of bases, ciphers, compression, and niche encodings. + +## Enhancement Guidelines + +When adding a new encoding: +1. Check if it already exists in the project +2. Follow the existing codec structure and naming conventions +3. Provide: + - `encode()` implementation + - `decode()` implementation + - Registration into the codec registry +4. 
Ensure CLI compatibility (if applicable) + +## Implementation Constraints + +- Pure Python preferred +- No heavy dependencies +- Deterministic transformations only +- Reversible encoding required unless explicitly documented + +## Testing + +Every new codec MUST include: +- Unit tests (encode/decode roundtrip) +- Edge cases (empty input, binary data if applicable) + +## Documentation + +Each codec must include: +- Short description +- Reference (standard, RFC, or algorithm source) +- Example usage + +## Output Format (IMPORTANT) + +When asked to add a codec, Copilot should: +1. Briefly justify the encoding (1–2 lines) +2. Provide full implementation +3. Provide tests +4. Provide documentation snippet + +## Explicit Non-Goals + +- No refactoring +- No performance optimization passes +- No linting-only changes +- No CI/CD changes \ No newline at end of file diff --git a/.github/prompts/add_codec.prompt.md b/.github/prompts/add_codec.prompt.md new file mode 100755 index 0000000..9b470c8 --- /dev/null +++ b/.github/prompts/add_codec.prompt.md @@ -0,0 +1,17 @@ +Add a new encoding scheme to this repository. 
+ +Constraints: +- Follow copilot-instructions.md strictly +- Do not modify unrelated code +- Use existing codec patterns + +Task: +Add encoding: {{ENCODING_NAME}} + +Requirements: +- Implement according to ADDING_CODECS.md guideline +- Add tests if needed (if `__examples__` cannot be consistently defined) +- Add minimal documentation (in the relevant category page under `docs/pages`) + +Reference: +{{LINK_OR_DESCRIPTION}} \ No newline at end of file diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100755 index 0000000..cff8c8d --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,11 @@ +## Type +- [ ] New encoding (required) + +## Checklist +- [ ] No unrelated changes +- [ ] Codec is new (not already implemented) +- [ ] Tests included (if cannot be automated with `tests/test_generated`) +- [ ] Documentation (included in the right page in `docs/pages/enc`) + +## Description +Explain the encoding and its source. \ No newline at end of file diff --git a/docs/ADDING_CODECS.md b/docs/ADDING_CODECS.md new file mode 100755 index 0000000..014e2fa --- /dev/null +++ b/docs/ADDING_CODECS.md @@ -0,0 +1,69 @@ +# Adding a Codec + +1. Categorize accordingly ; categories are the folder names in `src/codext` (further folder references are relative to this). When a category cannot be put in one of these folders, it shall be put by default in `others`. + +2. Add the `.py` file in the relevant category folder, named with the short name of the new codec. + +3. Respect the typical structure of a codec's `.py` file according to the following template (double-bracketed enclosures indicate codec parameters, double-arrowed enclosures indicate instructions that may refer to further steps of this guideline): + + ```python + # -*- coding: UTF-8 -*- + """{{codec_long_name}} Codec - {{codec_short_name}} content encoding. 
+ + {{codec_description}} + + This codec: + - en/decodes strings from str to str + - en/decodes strings from bytes to bytes + - decodes file content to str (read) + - encodes file content from str to bytes (write) + + Reference: {{codec_source_hyperlink}} + """ + from ..__common__ import * + + + __examples__ = {<>} + <>] + + + <> + <> + + + <> + ``` + +4. Choose the right add function + + If the codec is a simple mapping, use the `add_map` function. + + Examples: `languages/braille`, `languages/morse`, `languages/southpark` + + In some cases, an algorithm can even be equivalent to one or a number of mappings and can then be defined as a dynamic generation of `ENCMAP`. + + Examples: `stegano/resistor`, `crypto/barbie` + + When the codec is more complex than a mapping, use the `add` function. + +5. Configure the add function + + Refer to the relevant function signature in `__common__.py`. + +6. Write the codec logic + + If the codec is a mapping, at least `ENC_MAP` should be defined and refered in the parameters of the `add_map` function. + + Examples: `stegano/rick`, `stegano/klopf` + + If the codec is not a mapping, the logic can be written in the following order: the encoding function first, then the decoding function. + + Examples: `stegano/whitespace`, `crypto/railfence` + +7. Write some examples + + Examples are used during the automated test generation. They should then be carefully written to also cover some edge cases. A set of 3-8 examples is generally a must. + +8. Specify the names to be used with the guessing mode + + The `__guess__` list of codec names is used to limit the possibilities in the tree search from the guessing mode. Especially when the codec is dynamic and may have a large (or even infinite) number of dynamic names, it is necessary to set a limited number, generally maximum 16 as a best practice. This list, when relevant, shall be used with due care. 
From 4d2cb0d269fed6aa7e186c38ecc9439a525f592d Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 22 Mar 2026 00:02:36 +0100 Subject: [PATCH 44/62] Updated workflow --- .github/workflows/python-package.yml | 45 ++++++++++++++++++---------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 8214cd4..2cd4bbb 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -3,9 +3,6 @@ name: build -env: - package: codext - on: push: branches: [ "main" ] @@ -13,21 +10,35 @@ on: branches: [ "main" ] permissions: + actions: write id-token: write # for OIDC contents: read jobs: + prepare: + runs-on: ubuntu-latest + outputs: + package: ${{ steps.pkg.outputs.package }} + steps: + - name: Compute package name from the repository's + id: pkg + run: | + name="${GITHUB_REPOSITORY##*/}" + echo "package=${name#python-}" >> $GITHUB_OUTPUT build: + needs: prepare runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: [ubuntu-latest] python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + env: + package: ${{ needs.prepare.outputs.package }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install ${{ env.package }} @@ -41,16 +52,18 @@ jobs: run: | pytest --cov=$package coverage: - needs: build + needs: [prepare, build] runs-on: ubuntu-latest env: cov_badge_path: docs/coverage.svg + package: ${{ needs.prepare.outputs.package }} + python_version: "3.13" steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + - uses: actions/checkout@v5 + - name: Set up Python ${{ env.python_version }} + uses: actions/setup-python@v6 with: - python-version: "3.13" + python-version: ${{ env.python_version }} - name: 
Install ${{ env.package }} run: | python -m pip install --upgrade pip @@ -63,7 +76,7 @@ jobs: pytest --cov=$package --cov-report=xml genbadge coverage -i coverage.xml -o $cov_badge_path - name: Verify Changed files - uses: tj-actions/verify-changed-files@v17 + uses: tj-actions/verify-changed-files@v20 id: changed_files with: files: ${{ env.cov_badge_path }} @@ -78,17 +91,17 @@ jobs: if: steps.changed_files.outputs.files_changed == 'true' uses: ad-m/github-push-action@master with: - github_token: ${{ secrets.github_token }} + github_token: ${{ secrets.GITHUB_TOKEN }} branch: ${{ github.ref }} deploy: runs-on: ubuntu-latest - needs: coverage + needs: [prepare, coverage] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v5 with: fetch-depth: 0 - name: Check for version change - uses: dorny/paths-filter@v2 + uses: dorny/paths-filter@v4 id: filter with: filters: | @@ -101,7 +114,7 @@ jobs: awk '{if (match($0,"## Supporters")) exit; print}' README.md > README mv -f README README.md - if: steps.filter.outputs.version == 'true' - name: Build ${{ env.package }} package + name: Build ${{ needs.prepare.outputs.package }} package run: python3 -m pip install --upgrade build && python3 -m build - if: steps.filter.outputs.version == 'true' name: Publish to PyPI From c57dacd37ee25a79e9c6dec010941131b30da38a Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 22 Mar 2026 09:51:32 +0100 Subject: [PATCH 45/62] Updated documentation --- .github/copilot-instructions.md | 61 ++++++++++++++++-------- .github/pull_request_template.md | 5 +- .github/workflows/python-package.yml | 5 +- docs/ADDING_CODECS.md | 69 ---------------------------- docs/mkdocs.yml | 5 +- docs/pages/howto.md | 62 ++++++++++++++++--------- docs/requirements.txt | 11 ++--- 7 files changed, 93 insertions(+), 125 deletions(-) delete mode 100755 docs/ADDING_CODECS.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 1a2fd7f..a1d90f8 100755 --- 
a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -10,19 +10,12 @@ Copilot MUST: - Avoid stylistic or formatting changes ## Context -This project extends Python's codecs with many encoding/decoding schemes and CLI tools. +This project extends Python's codecs with many encoding/decoding schemes and a CLI tool. It already includes a wide variety of bases, ciphers, compression, and niche encodings. ## Enhancement Guidelines +When adding a new encoding, follow the guideline in the documentation at `docs/pages/howto.md`. -When adding a new encoding: -1. Check if it already exists in the project -2. Follow the existing codec structure and naming conventions -3. Provide: - - `encode()` implementation - - `decode()` implementation - - Registration into the codec registry -4. Ensure CLI compatibility (if applicable) ## Implementation Constraints @@ -33,28 +26,56 @@ When adding a new encoding: ## Testing -Every new codec MUST include: -- Unit tests (encode/decode roundtrip) -- Edge cases (empty input, binary data if applicable) +Every new codec: +- SHOULD include a list of `__examples__` that tells the automated tests what encoding/decoding transformations need to be verified ; if this cannot be done, unit tests (encode/decode roundtrip) SHALL be provided in `tests/test_manual.py` +- Edge cases (empty input, binary data if applicable), either in the `__examples__` list or in the explicit tests in `tests/test_manual.py` ## Documentation -Each codec must include: -- Short description -- Reference (standard, RFC, or algorithm source) -- Example usage +Each codec SHALL comply with the following structure: + + ```python + # -*- coding: UTF-8 -*- + """{{codec_long_name}} Codec - {{codec_short_name}} content encoding. 
+ + {{codec_description}} + + This codec: + - en/decodes strings from str to str + - en/decodes strings from bytes to bytes + - decodes file content to str (read) + - encodes file content from str to bytes (write) + + Reference: {{codec_source_hyperlink}} + """ + from ..__common__ import * + + + __examples__ = {<>} + <>] + + + <> + <> + + + <> + ``` + +In this template, `{{ ... }}` enclosures indicate codec's properties and `<< ... >>` enclosures indicate placeholder actions referring to steps from the documentation about how to make a codec at `docs/pages/howto.md`. ## Output Format (IMPORTANT) When asked to add a codec, Copilot should: 1. Briefly justify the encoding (1–2 lines) -2. Provide full implementation -3. Provide tests -4. Provide documentation snippet +2. Provide full implementation (according to section _Adding a new codec to `codext`_ of the documentation at `docs/pages/howto.md`) +3. Provide tests (according to section _Self-generated tests_) +4. Add it to the `README.md` of the repository +5. Propose the update of the documentation (under the relevant page for the category of codec) ## Explicit Non-Goals - No refactoring - No performance optimization passes - No linting-only changes -- No CI/CD changes \ No newline at end of file +- No CI/CD changes diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index cff8c8d..9d466f2 100755 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,6 +1,3 @@ -## Type -- [ ] New encoding (required) - ## Checklist - [ ] No unrelated changes - [ ] Codec is new (not already implemented) @@ -8,4 +5,4 @@ - [ ] Documentation (included in the right page in `docs/pages/enc`) ## Description -Explain the encoding and its source. \ No newline at end of file +Explain the encoding and its source. 
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 2cd4bbb..cdf1ddd 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -10,8 +10,7 @@ on: branches: [ "main" ] permissions: - actions: write - id-token: write # for OIDC + id-token: write contents: read jobs: @@ -53,6 +52,8 @@ jobs: pytest --cov=$package coverage: needs: [prepare, build] + permissions: + contents: write runs-on: ubuntu-latest env: cov_badge_path: docs/coverage.svg diff --git a/docs/ADDING_CODECS.md b/docs/ADDING_CODECS.md deleted file mode 100755 index 014e2fa..0000000 --- a/docs/ADDING_CODECS.md +++ /dev/null @@ -1,69 +0,0 @@ -# Adding a Codec - -1. Categorize accordingly ; categories are the folder names in `src/codext` (further folder references are relative to this). When a category cannot be put in one of these folders, it shall be put by default in `others`. - -2. Add the `.py` file in the relevant category folder, named with the short name of the new codec. - -3. Respect the typical structure of a codec's `.py` file according to the following template (double-bracketed enclosures indicate codec parameters, double-arrowed enclosures indicate instructions that may refer to further steps of this guideline): - - ```python - # -*- coding: UTF-8 -*- - """{{codec_long_name}} Codec - {{codec_short_name}} content encoding. - - {{codec_description}} - - This codec: - - en/decodes strings from str to str - - en/decodes strings from bytes to bytes - - decodes file content to str (read) - - encodes file content from str to bytes (write) - - Reference: {{codec_source_hyperlink}} - """ - from ..__common__ import * - - - __examples__ = {<>} - <>] - - - <> - <> - - - <> - ``` - -4. Choose the right add function - - If the codec is a simple mapping, use the `add_map` function. 
- - Examples: `languages/braille`, `languages/morse`, `languages/southpark` - - In some cases, an algorithm can even be equivalent to one or a number of mappings and can then be defined as a dynamic generation of `ENCMAP`. - - Examples: `stegano/resistor`, `crypto/barbie` - - When the codec is more complex than a mapping, use the `add` function. - -5. Configure the add function - - Refer to the relevant function signature in `__common__.py`. - -6. Write the codec logic - - If the codec is a mapping, at least `ENC_MAP` should be defined and refered in the parameters of the `add_map` function. - - Examples: `stegano/rick`, `stegano/klopf` - - If the codec is not a mapping, the logic can be written in the following order: the encoding function first, then the decoding function. - - Examples: `stegano/whitespace`, `crypto/railfence` - -7. Write some examples - - Examples are used during the automated test generation. They should then be carefully written to also cover some edge cases. A set of 3-8 examples is generally a must. - -8. Specify the names to be used with the guessing mode - - The `__guess__` list of codec names is used to limit the possibilities in the tree search from the guessing mode. Especially when the codec is dynamic and may have a large (or even infinite) number of dynamic names, it is necessary to set a limited number, generally maximum 16 as a best practice. This list, when relevant, shall be used with due care. 
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 387710b..af5a4b3 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -1,7 +1,7 @@ site_author: dhondta site_name: "Codext - Extension of native codecs for Python" repo_url: https://github.com/dhondta/python-codext -copyright: Copyright © 2021-2023 Alexandre D'Hondt +copyright: Copyright © 2021-2026 Alexandre D'Hondt docs_dir: pages nav: - Introduction: index.md @@ -37,6 +37,8 @@ extra: name: Alex on Twitter extra_css: - css/extra.css +plugins: + - search theme: name: material palette: @@ -50,7 +52,6 @@ theme: name: Switch to light mode logo: img/logo.png favicon: img/icon.png -use_directory_urls: false markdown_extensions: - toc: permalink: true diff --git a/docs/pages/howto.md b/docs/pages/howto.md index 9e59805..4719fde 100644 --- a/docs/pages/howto.md +++ b/docs/pages/howto.md @@ -1,9 +1,9 @@ The purpose of this section is to provide a tutorial for creating new codecs accordingly. -As explained in [this section](./features.html), `codext` provides the possibility to add new codecs in two ways: +As explained in [this section](./features), `codext` provides the possibility to add new codecs in two ways: -1. [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56): using this function, the *encode* and *decode* functions must be given as arguments. -2. [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160): using this function, an *encoding map* must be given but can be formatted in different ways to handle various use cases. +1. [`add`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L56): using this function, the *encode* and *decode* functions must be given as arguments. +2. [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160): using this function, an *encoding map* must be given but can be formatted in different ways to handle various use cases. 
In both cases, a *pattern* is given in argument and aims to define the set of all strings that aim to select this codec. @@ -49,29 +49,33 @@ Whatever solution is chosen, the following arguments shall be considered: ### Which `add` function ? -At this point, it is necessary to determine what kind of codec you want. If it is a simple map of characters, you should definitely use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160). If it is more complex and cannot be handled using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160)'s options, then you should use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) and define the encode/decode functions yourself. +At this point, it is necessary to determine what kind of codec you want. If it is a simple map of characters, you should definitely use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160). If it is more complex and cannot be handled using [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160)'s options, then you should use [`add`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L56) and define the encode/decode functions yourself. 
A few examples: -- `morse` is a simple map that does not handle case ; it then uses [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `ignore_case` set to "`encode`" (not "`both`" for encoding and decoding as it does not matter anyway for decoding) -- `whitespace` has 2 codecs defined ; the simple one is a simple bit encoding map, therefore using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype` set to "`bin`" (for pre-converting characters to bits before applying the encoding map), and the complex one uses [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) with its specific endocde/decode functions +- `morse` is a simple map that does not handle case ; it then uses [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) with `ignore_case` set to "`encode`" (not "`both`" for encoding and decoding as it does not matter anyway for decoding) +- `whitespace` has 2 codecs defined ; the simple one is a simple bit encoding map, therefore using [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) with `intype` set to "`bin`" (for pre-converting characters to bits before applying the encoding map), and the complex one uses [`add`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L56) with its specific endocde/decode functions - `atbash` defines a dynamic map with a "factory" function, that creates the encoding map according to the parameters supplied in the codec name So, before going further, determine the following: -- What does the new codec map from and to ? E.g. if binary input and ordinal output, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype="bin"` and `outype="ord"`. -- Is this codec ignoring case ? 
If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and specify which operation(s) should ignore case, e.g. `ignore_case="both"` or `ignore_case="decode"`. -- Should this codec handle no error ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) do not forget to specify `no_error=True`. -- Does the codec yields variable-length encoded tokens ? If so, you can still use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) but you should define `sep` (separator) as `codext` will not be able to handle ambiguities. +- What does the new codec map from and to ? E.g. if binary input and ordinal output, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) with `intype="bin"` and `outype="ord"`. +- Is this codec ignoring case ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) and specify which operation(s) should ignore case, e.g. `ignore_case="both"` or `ignore_case="decode"`. +- Should this codec handle no error ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) do not forget to specify `no_error=True`. +- Does the codec yields variable-length encoded tokens ? If so, you can still use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) but you should define `sep` (separator) as `codext` will not be able to handle ambiguities. -If you find aspects that are not covered in these questions, you shall use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56), then refering to [Case 1](#case-1-generic-encoding-definition). 
Otherwise, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and refer +If you find aspects that are not covered in these questions, you shall use [`add`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L56), then refering to [Case 1](#case-1-generic-encoding-definition). Otherwise, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) and refer to [Case 2](#case-2-encoding-map). ----- ### Case 1: Generic encoding definition -This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) +This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L56) + +This applies when the codec is more complex than a mapping, as defined in _[Case 2: Encoding map](./#case-2-encoding-map)_. + +**Examples**: [`crypto/barbie`](https://github.com/dhondta/python-codext/blob/main/src/codext/crypto/barbie.py), [`crypto/railfence`](https://github.com/dhondta/python-codext/blob/main/src/codext/crypto/railfence.py), [`stegano/resistor`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/resitor.py), [`stegano/whitespace`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/whitespace.py) The following shall be considered: @@ -92,7 +96,7 @@ Both functions must take 2 arguments and return 2 values (in order to stick to ` This last mode is an addition to the native ones. It can be useful for some encodings that must cause no error while encoding and can therefore have their original characters in the output. -Also, while defining the `encode` and/or `decode` functions, `codext.handle_error` can be used as a shortcut to handle the different modes. It returns a wrapped function that takes `token` and `position` as arguments (see [`excess3`](https://github.com/dhondta/python-codext/blob/master/codext/binary/excess3.py) for an example). 
+Also, while defining the `encode` and/or `decode` functions, `codext.handle_error` can be used as a shortcut to handle the different modes. It returns a wrapped function that takes `token` and `position` as arguments (see [`excess3`](https://github.com/dhondta/python-codext/blob/main/src/codext/binary/excess3.py) for an example). ```python >>> help(codext.handle_error) @@ -125,7 +129,11 @@ _handle_error(token, position) ### Case 2: Encoding map -This uses: [`codext.add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) +This uses: [`codext.add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) + +This applies when the codec can be defined as a simple mapping between source and destination tokens. + +**Examples**: [`languages/braille`](https://github.com/dhondta/python-codext/blob/main/src/codext/languages/braille.py), [`languages/morse`](https://github.com/dhondta/python-codext/blob/main/src/codext/languages/morse.py), [`languages/southpark`](https://github.com/dhondta/python-codext/blob/main/src/codext/languages/southpark.py), [`stegano/klopf`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/klopf.py), [`stegano/rick`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/rick.py) The following options shall be considered: @@ -143,10 +151,10 @@ The following options shall be considered: `encmap` can be defined as follows: -1. **Simple map**: In this case, the encoding map is a dictionary mapping each input character to an output one (see [`radio`](https://github.com/dhondta/python-codext/blob/master/codext/languages/radio.py) for an example). -2. 
**List of maps**: In this case, encoding maps are put in a list and referenced by their order number starting from 1, meaning that the `pattern` shall define a capture group with values from 1 to the length of this list (see [`dna`](https://github.com/dhondta/python-codext/blob/master/codext/others/dna.py) for an example). +1. **Simple map**: In this case, the encoding map is a dictionary mapping each input character to an output one (see [`radio`](https://github.com/dhondta/python-codext/blob/main/src/codext/languages/radio.py) for an example). +2. **List of maps**: In this case, encoding maps are put in a list and referenced by their order number starting from 1, meaning that the `pattern` shall define a capture group with values from 1 to the length of this list (see [`dna`](https://github.com/dhondta/python-codext/blob/main/src/codext/others/dna.py) for an example). 3. **Parametrized map**: This variant defines a dictionary of regex-selected encoding maps, that is, a dictionary of dictionaries with keys matching the captured groups from codec's pattern. -4. **Map factory function**: This one is implemented by a function that returns the composed encoding map. This function takes a single argument according to the capture group from the `pattern` (see [`affine`](https://github.com/dhondta/python-codext/blob/master/codext/crypto/affine.py) for an example). +4. **Map factory function**: This one is implemented by a function that returns the composed encoding map. This function takes a single argument according to the capture group from the `pattern` (see [`affine`](https://github.com/dhondta/python-codext/blob/main/src/codext/crypto/affine.py) for an example). !!! note "Mapping one input character to multiple output characters" @@ -156,7 +164,7 @@ The following options shall be considered: ### Self-generated tests -In order to facilitate testing, a test suite can be automatically generated from a set of *examples*. 
This is defined in the `__examples__` dunder inside codec's source file (see [`sms`](https://github.com/dhondta/python-codext/blob/master/codext/stegano/sms.py) for an example). By default, the `add`/`add_map` function will get `__examples__` from the global scope but this behavior can be overridden by specifying the keyword-argument `examples` (e.g. `add(..., examples=__examples1__)` ; see [`ordinal`](https://github.com/dhondta/python-codext/blob/master/codext/common/ordinal.py) for an example). +In order to facilitate testing, a test suite can be automatically generated from a set of *examples*. This is defined in the `__examples__` dunder inside codec's source file (see [`sms`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/sms.py) for an example). By default, the `add`/`add_map` function will get `__examples__` from the global scope but this behavior can be overridden by specifying the keyword-argument `examples` (e.g. `add(..., examples=__examples1__)` ; see [`ordinal`](https://github.com/dhondta/python-codext/blob/main/src/codext/common/ordinal.py) for an example). A set of examples is a dictionary specifying the test cases to be considered. The keys are the descriptions of the test cases and the values can be either dictionaries of input texts and their output encoded texts or lists of input texts. Each key has the format "`operation(encodings)`". Operations can be: @@ -228,13 +236,23 @@ __examples__ = { ----- +### Codec names for the guessing mode + +The `__guess__` list of codec names is used to limit the possibilities in the tree search from the [guessing mode](./guessing). Especially when the codec is dynamic and may have a large (or even infinite) number of dynamic names, it is necessary to set a limited number in order to avoid exponentially increasing computation time. This list, when relevant, shall be used with due care. + +!!! 
note "Limiting the number of names for the guessing mode" + + As a best practice, static names for the [guessing mode](./guessing) should be limited to 16, in order to avoid exponential computation time in the search tree algorithm. + +----- + ### Adding a new codec to `codext` As a checklist when making a codec for addition in `codext`, please follow these steps: 1. Create your codec file (i.e. starting with a copy of an existing similar one) -2. Place it into the right category folder -3. Add it to the list in [`README.md`](https://github.com/dhondta/python-codext/blob/master/README.md#list-of-codecs) -4. Add its documentation in the [right Markdown file](https://github.com/dhondta/python-codext/tree/master/docs/enc) -5. If self-generated tests are not enough, add manual tests in [the related file](https://github.com/dhondta/python-codext/blob/master/tests/test_manual.py) +2. Place it into the right category folder (when a codec cannot be put in one of the folders under the root of [`codext`](https://github.com/dhondta/python-codext/blob/main/src/codext), it shall be put by default in [`others`](https://github.com/dhondta/python-codext/blob/main/src/codext/others)) +3. Add it to the list in [`README.md`](https://github.com/dhondta/python-codext/blob/main/README.md#list-of-codecs) +4. Add its documentation in the [right Markdown file](https://github.com/dhondta/python-codext/tree/main/docs/pages/enc) +5. 
If self-generated tests are not enough, add manual tests in [the related file](https://github.com/dhondta/python-codext/blob/main/tests/test_manual.py) diff --git a/docs/requirements.txt b/docs/requirements.txt index a4427bc..ebcf1c7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,6 +1,5 @@ -jinja2<3.1.0 -mkdocs>=1.3.0 -mkdocs-bootswatch -mkdocs-material -mkdocs-rtd-dropdown -pymdown-extensions +jinja2>=3.1 +markdown>=3.5 +mkdocs>=1.5 +mkdocs-material>=9.5 +pymdown-extensions>=10.0 From 139fb2dd1e157ef8c92f43314e035a02fe51890a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 22 Mar 2026 09:37:44 +0000 Subject: [PATCH 46/62] Updated documentation --- docs/coverage.svg | 2 +- docs/pages/howto.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 4d30c44..9f90515 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 98.90%coverage98.90% \ No newline at end of file +coverage: 99.04%coverage99.04% \ No newline at end of file diff --git a/docs/pages/howto.md b/docs/pages/howto.md index 4719fde..8cb3fc8 100644 --- a/docs/pages/howto.md +++ b/docs/pages/howto.md @@ -75,7 +75,7 @@ This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/main/src This applies when the codec is more complex than a mapping, as defined in _[Case 2: Encoding map](./#case-2-encoding-map)_. 
-**Examples**: [`crypto/barbie`](https://github.com/dhondta/python-codext/blob/main/src/codext/crypto/barbie.py), [`crypto/railfence`](https://github.com/dhondta/python-codext/blob/main/src/codext/crypto/railfence.py), [`stegano/resistor`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/resitor.py), [`stegano/whitespace`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/whitespace.py) +**Examples**: [`crypto/barbie`](https://github.com/dhondta/python-codext/blob/main/src/codext/crypto/barbie.py), [`crypto/railfence`](https://github.com/dhondta/python-codext/blob/main/src/codext/crypto/railfence.py), [`stegano/resistor`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/resistor.py), [`stegano/whitespace`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/whitespace.py) The following shall be considered: From ba8a59d19fe95caeabeec7a1e687ceecc3689575 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 22 Mar 2026 10:53:32 +0100 Subject: [PATCH 47/62] Added issue template --- .github/ISSUE_TEMPLATE/add-encoding.yml | 21 +++++++++++++++++++++ .github/workflows/python-package.yml | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100755 .github/ISSUE_TEMPLATE/add-encoding.yml diff --git a/.github/ISSUE_TEMPLATE/add-encoding.yml b/.github/ISSUE_TEMPLATE/add-encoding.yml new file mode 100755 index 0000000..798bea7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/add-encoding.yml @@ -0,0 +1,21 @@ +name: Add new encoding +description: Propose a new encoding to be added +title: "Add new encoding: [codec]" +labels: ["enhancement"] +body: + - type: textarea + id: description + attributes: + label: Description + description: Describe the encoding, its purpose, and how it works + placeholder: Provide a clear and concise description + validations: + required: true + - type: input + id: reference + attributes: + label: Reference + description: Provide a reference URL for the encoding + 
placeholder: https:// + validations: + required: false diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index cdf1ddd..f991aae 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -93,7 +93,7 @@ jobs: uses: ad-m/github-push-action@master with: github_token: ${{ secrets.GITHUB_TOKEN }} - branch: ${{ github.ref }} + branch: main deploy: runs-on: ubuntu-latest needs: [prepare, coverage] From c823cf5bb6397b74b510239d34b73a7a995969ba Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 22 Mar 2026 11:53:26 +0100 Subject: [PATCH 48/62] Fixed workflow --- .github/workflows/python-package.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index f991aae..3dd219f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -61,6 +61,13 @@ jobs: python_version: "3.13" steps: - uses: actions/checkout@v5 + with: + fetch-depth: 0 + ref: ${{ github.head_ref || github.ref_name }} + - name: Sync with remote + run: | + git fetch origin + git rebase origin/${{ github.ref_name }} - name: Set up Python ${{ env.python_version }} uses: actions/setup-python@v6 with: @@ -92,8 +99,9 @@ jobs: if: steps.changed_files.outputs.files_changed == 'true' uses: ad-m/github-push-action@master with: + branch: ${{ github.head_ref || github.ref_name }} + force: true github_token: ${{ secrets.GITHUB_TOKEN }} - branch: main deploy: runs-on: ubuntu-latest needs: [prepare, coverage] From 864a4b394884e92b7ae3459c4b09e84674e157ef Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 22 Mar 2026 11:15:30 +0000 Subject: [PATCH 49/62] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 9f90515..efa3c52 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 
99.04%coverage99.04% \ No newline at end of file +coverage: 98.83%coverage98.83% \ No newline at end of file From 283db479a5857492666b72efe528bb7593f5cf61 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 22 Mar 2026 11:53:26 +0100 Subject: [PATCH 50/62] Fixed workflow --- .github/workflows/python-package.yml | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3dd219f..3a03b0b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -88,20 +88,17 @@ jobs: id: changed_files with: files: ${{ env.cov_badge_path }} - - name: Commit files + - name: Push coverage badge if: steps.changed_files.outputs.files_changed == 'true' run: | git config --local user.email "github-actions[bot]@users.noreply.github.com" git config --local user.name "github-actions[bot]" + git fetch origin + git checkout coverage-badge || git checkout -b coverage-badge git add $cov_badge_path - git commit -m "Updated coverage.svg" - - name: Push changes - if: steps.changed_files.outputs.files_changed == 'true' - uses: ad-m/github-push-action@master - with: - branch: ${{ github.head_ref || github.ref_name }} - force: true - github_token: ${{ secrets.GITHUB_TOKEN }} + git diff --cached --quiet && exit 0 + git commit -m "Update coverage badge" + git push origin coverage-badge --force deploy: runs-on: ubuntu-latest needs: [prepare, coverage] From 2c82e2cac476c5d9370f22a9511310742d79adbf Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 22 Mar 2026 16:35:31 +0100 Subject: [PATCH 51/62] Moved checksums to dedicated folder --- .github/workflows/python-package.yml | 4 ---- src/codext/checksums/__init__.py | 4 ++++ src/codext/checksums/adler.py | 17 +++++++++++++++++ .../{hashing/checksums.py => checksums/crc.py} | 9 +++------ src/codext/hashing/__init__.py | 1 - tests/test_manual.py | 2 +- 6 files changed, 25 insertions(+), 12 deletions(-) create mode 100755 
src/codext/checksums/__init__.py create mode 100644 src/codext/checksums/adler.py rename src/codext/{hashing/checksums.py => checksums/crc.py} (97%) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3a03b0b..d505f24 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -64,10 +64,6 @@ jobs: with: fetch-depth: 0 ref: ${{ github.head_ref || github.ref_name }} - - name: Sync with remote - run: | - git fetch origin - git rebase origin/${{ github.ref_name }} - name: Set up Python ${{ env.python_version }} uses: actions/setup-python@v6 with: diff --git a/src/codext/checksums/__init__.py b/src/codext/checksums/__init__.py new file mode 100755 index 0000000..951bb54 --- /dev/null +++ b/src/codext/checksums/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: UTF-8 -*- +from .adler import * +from .crc import * + diff --git a/src/codext/checksums/adler.py b/src/codext/checksums/adler.py new file mode 100644 index 0000000..5aa312c --- /dev/null +++ b/src/codext/checksums/adler.py @@ -0,0 +1,17 @@ +# -*- coding: UTF-8 -*- +"""Adler Codecs - Adler32 checksum algorithm. + +This is a codec for computing checksums, for use with other codecs in encoding chains. + +These codecs: +- transform strings from str to str +- transform strings from bytes to bytes +- transform file content from str to bytes (write) +""" +from zlib import adler32 + +from ..__common__ import add, b + + +add("adler32", lambda data, error="strict": (adler32(b(data)) & 0xffffffff, len(data)), guess=None) + diff --git a/src/codext/hashing/checksums.py b/src/codext/checksums/crc.py similarity index 97% rename from src/codext/hashing/checksums.py rename to src/codext/checksums/crc.py index 85dbe67..a057d7f 100644 --- a/src/codext/hashing/checksums.py +++ b/src/codext/checksums/crc.py @@ -1,16 +1,14 @@ # -*- coding: UTF-8 -*- -"""Checksum Codecs - string common checksums. +"""CRC Codecs - Cyclic Redundancy Check checksum algorithm. 
-These are codecs for hashing strings, for use with other codecs in encoding chains. +This is a codec for computing checksums, for use with other codecs in encoding chains. These codecs: - transform strings from str to str - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -from zlib import adler32 - -from ..__common__ import add, b +from ..__common__ import add CRC = { @@ -261,7 +259,6 @@ def _encode(data, error="strict"): return _crc -add("adler32", lambda data, error="strict": (adler32(b(data)) & 0xffffffff, len(data)), guess=None) add("crca", crc_checksum(), pattern=_pattern(), guess=None) for i in CRC.keys(): if isinstance(i, int): diff --git a/src/codext/hashing/__init__.py b/src/codext/hashing/__init__.py index b7e9fcc..21f0ef5 100755 --- a/src/codext/hashing/__init__.py +++ b/src/codext/hashing/__init__.py @@ -1,6 +1,5 @@ # -*- coding: UTF-8 -*- from .blake import * -from .checksums import * from .crypt import * from .md import * from .mmh3 import * diff --git a/tests/test_manual.py b/tests/test_manual.py index 8cfe907..5a5f164 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -9,7 +9,7 @@ from codext.__common__ import * from codext.binary.baudot import _check_alphabet -from codext.hashing.checksums import CRC +from codext.checksums.crc import CRC class ComplementaryTestCase(TestCase): From 1116607d7ca32580ec965d615f9a949d4bd7e44e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 15:52:25 +0000 Subject: [PATCH 52/62] Initial plan From d3f47c861bebaceda7bc468124ba42c8e5941f0f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 22 Mar 2026 16:04:01 +0000 Subject: [PATCH 53/62] Add Luhn Mod N checksum codec Co-authored-by: dhondta <9108102+dhondta@users.noreply.github.com> Agent-Logs-Url: 
https://github.com/dhondta/python-codext/sessions/2ceb2d9a-a14b-4de4-8712-b44d0df3e8b0 --- src/codext/checksums/__init__.py | 1 + src/codext/checksums/luhn.py | 96 ++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 src/codext/checksums/luhn.py diff --git a/src/codext/checksums/__init__.py b/src/codext/checksums/__init__.py index 951bb54..bb20356 100755 --- a/src/codext/checksums/__init__.py +++ b/src/codext/checksums/__init__.py @@ -1,4 +1,5 @@ # -*- coding: UTF-8 -*- from .adler import * from .crc import * +from .luhn import * diff --git a/src/codext/checksums/luhn.py b/src/codext/checksums/luhn.py new file mode 100644 index 0000000..42905d3 --- /dev/null +++ b/src/codext/checksums/luhn.py @@ -0,0 +1,96 @@ +# -*- coding: UTF-8 -*- +"""Luhn Codec - Luhn Mod N checksum algorithm. + +The Luhn algorithm, also known as the "modulus 10" algorithm, is a simple checksum +formula used to validate identification numbers (e.g. credit card numbers, IMEI +numbers). Encoding appends a check character; decoding verifies the check character +and strips it. + +The Luhn Mod N generalization extends the algorithm to alphabets of arbitrary size N. +When called as 'luhn' or 'luhn-10', the standard decimal alphabet (0-9, N=10) is +used. When called as 'luhn-<N>' for 2 ≤ N ≤ 36, the first N characters of +'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' form the alphabet. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: https://en.wikipedia.org/wiki/Luhn_algorithm + https://bitcoinwiki.org/wiki/luhn-mod-n-algorithm +""" +from ..__common__ import * + + +__examples__ = { + 'enc(luhn|luhn-10|luhn10)': { + '7992739871': '79927398713', + '': '', + '0': '00', + '1': '18', + }, + 'dec(luhn|luhn-10|luhn10)': { + '79927398713': '7992739871', + '': '', + '00': '0', + '18': '1', + }, + 'enc-dec(luhn)': ['123456789', '0' * 10, '9999999999999999'], + 'enc-dec(luhn-16)': ['0123456789ABCDEF', 'DEADBEEF'], + 'enc-dec(luhn-36)': ['HELLO', 'WORLD123'], +} + +_FULL_ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" + + +def _luhn_encode(n=""): + mod = n if isinstance(n, int) else 10 + alphabet = _FULL_ALPHABET[:mod] + + def _encode(text, errors="strict"): + text = ensure_str(text).upper() if mod > 10 else ensure_str(text) + if not text: + return "", 0 + for pos, c in enumerate(text): + if c not in alphabet: + handle_error("luhn", errors, kind="character")(c, pos, text) + total = 0 + for i, c in enumerate(reversed(text)): + code = alphabet.index(c) + if i % 2 == 0: + d = code * 2 + code = d % mod + d // mod + total += code + check = (mod - total % mod) % mod + return text + alphabet[check], len(b(text)) + + return _encode + + +def _luhn_decode(n=""): + mod = n if isinstance(n, int) else 10 + alphabet = _FULL_ALPHABET[:mod] + + def _decode(text, errors="strict"): + text = ensure_str(text).upper() if mod > 10 else ensure_str(text) + if not text: + return "", 0 + for pos, c in enumerate(text): + if c not in alphabet: + handle_error("luhn", errors, decode=True, kind="character")(c, pos, text) + total = 0 + for i, c in enumerate(reversed(text)): + code = alphabet.index(c) + if i % 2 == 1: + d = code * 2 + code = d % mod + d // mod + total += code + if total % mod != 0: + handle_error("luhn", errors, 
decode=True)(text[-1], len(text) - 1, text[:-1]) + return text[:-1], len(b(text)) + + return _decode + + +add("luhn", _luhn_encode, _luhn_decode, pattern=r"^luhn[-_]?(\d{1,2})?$", guess=None) From 4c1c316593bb5e76cdc00f0d772bc4169ae7ec10 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 22 Mar 2026 17:02:51 +0100 Subject: [PATCH 54/62] Fixed incorrect file permissions --- .github/ISSUE_TEMPLATE/add-encoding.yml | 0 .github/copilot-instructions.md | 0 .github/prompts/add_codec.prompt.md | 0 .github/pull_request_template.md | 0 LICENSE | 1348 +++++++++++------------ README.md | 34 +- docs/pages/enc/others.md | 158 +-- src/codext/__info__.py | 32 +- src/codext/base/__init__.py | 128 +-- src/codext/base/_base.py | 0 src/codext/base/_base2n.py | 224 ++-- src/codext/base/base100.py | 0 src/codext/base/base122.py | 0 src/codext/base/base45.py | 168 +-- src/codext/base/base85.py | 0 src/codext/base/base91.py | 226 ++-- src/codext/base/baseN.py | 264 ++--- src/codext/binary/__init__.py | 16 +- src/codext/binary/baudot.py | 0 src/codext/binary/bcd.py | 160 +-- src/codext/binary/excess3.py | 130 +-- src/codext/binary/gray.py | 50 +- src/codext/binary/manchester.py | 100 +- src/codext/binary/rotate.py | 0 src/codext/checksums/__init__.py | 0 src/codext/common/__init__.py | 14 +- src/codext/common/a1z26.py | 120 +- src/codext/common/dummy.py | 114 +- src/codext/common/octal.py | 62 +- src/codext/common/ordinal.py | 56 +- src/codext/compressions/__init__.py | 24 +- src/codext/compressions/gzipp.py | 88 +- src/codext/compressions/pkzip.py | 0 src/codext/crypto/__init__.py | 24 +- src/codext/crypto/affine.py | 64 +- src/codext/crypto/atbash.py | 68 +- src/codext/crypto/bacon.py | 72 +- src/codext/crypto/barbie.py | 108 +- src/codext/crypto/citrix.py | 104 +- src/codext/crypto/rot.py | 204 ++-- src/codext/crypto/scytale.py | 108 +- src/codext/crypto/shift.py | 68 +- src/codext/crypto/xor.py | 70 +- src/codext/hashing/__init__.py | 0 src/codext/languages/__init__.py | 24 +- 
src/codext/languages/braille.py | 0 src/codext/languages/ipsum.py | 194 ++-- src/codext/languages/leetspeak.py | 46 +- src/codext/languages/morse.py | 80 +- src/codext/languages/navajo.py | 70 +- src/codext/languages/radio.py | 58 +- src/codext/languages/southpark.py | 88 +- src/codext/languages/tomtom.py | 70 +- src/codext/others/__init__.py | 14 +- src/codext/others/dna.py | 84 +- src/codext/others/kbshift.py | 132 +-- src/codext/others/letters.py | 182 +-- src/codext/others/markdown.py | 44 +- src/codext/stegano/__init__.py | 16 +- src/codext/stegano/hexagram.py | 0 src/codext/stegano/klopf.py | 50 +- src/codext/stegano/resistor.py | 56 +- src/codext/stegano/rick.py | 62 +- src/codext/stegano/sms.py | 54 +- src/codext/stegano/whitespace.py | 142 +-- src/codext/web/__init__.py | 8 +- src/codext/web/html.py | 0 src/codext/web/url.py | 58 +- 68 files changed, 2971 insertions(+), 2967 deletions(-) mode change 100755 => 100644 .github/ISSUE_TEMPLATE/add-encoding.yml mode change 100755 => 100644 .github/copilot-instructions.md mode change 100755 => 100644 .github/prompts/add_codec.prompt.md mode change 100755 => 100644 .github/pull_request_template.md mode change 100755 => 100644 src/codext/base/__init__.py mode change 100755 => 100644 src/codext/base/_base.py mode change 100755 => 100644 src/codext/base/_base2n.py mode change 100755 => 100644 src/codext/base/base100.py mode change 100755 => 100644 src/codext/base/base122.py mode change 100755 => 100644 src/codext/base/base45.py mode change 100755 => 100644 src/codext/base/base85.py mode change 100755 => 100644 src/codext/base/base91.py mode change 100755 => 100644 src/codext/base/baseN.py mode change 100755 => 100644 src/codext/binary/__init__.py mode change 100755 => 100644 src/codext/binary/baudot.py mode change 100755 => 100644 src/codext/binary/bcd.py mode change 100755 => 100644 src/codext/binary/excess3.py mode change 100755 => 100644 src/codext/binary/gray.py mode change 100755 => 100644 
src/codext/binary/manchester.py mode change 100755 => 100644 src/codext/binary/rotate.py mode change 100755 => 100644 src/codext/checksums/__init__.py mode change 100755 => 100644 src/codext/common/__init__.py mode change 100755 => 100644 src/codext/common/a1z26.py mode change 100755 => 100644 src/codext/common/dummy.py mode change 100755 => 100644 src/codext/common/octal.py mode change 100755 => 100644 src/codext/common/ordinal.py mode change 100755 => 100644 src/codext/compressions/__init__.py mode change 100755 => 100644 src/codext/compressions/gzipp.py mode change 100755 => 100644 src/codext/compressions/pkzip.py mode change 100755 => 100644 src/codext/crypto/__init__.py mode change 100755 => 100644 src/codext/crypto/affine.py mode change 100755 => 100644 src/codext/crypto/atbash.py mode change 100755 => 100644 src/codext/crypto/bacon.py mode change 100755 => 100644 src/codext/crypto/barbie.py mode change 100755 => 100644 src/codext/crypto/rot.py mode change 100755 => 100644 src/codext/crypto/scytale.py mode change 100755 => 100644 src/codext/crypto/shift.py mode change 100755 => 100644 src/codext/crypto/xor.py mode change 100755 => 100644 src/codext/hashing/__init__.py mode change 100755 => 100644 src/codext/languages/__init__.py mode change 100755 => 100644 src/codext/languages/braille.py mode change 100755 => 100644 src/codext/languages/ipsum.py mode change 100755 => 100644 src/codext/languages/leetspeak.py mode change 100755 => 100644 src/codext/languages/morse.py mode change 100755 => 100644 src/codext/languages/navajo.py mode change 100755 => 100644 src/codext/languages/radio.py mode change 100755 => 100644 src/codext/languages/southpark.py mode change 100755 => 100644 src/codext/languages/tomtom.py mode change 100755 => 100644 src/codext/others/__init__.py mode change 100755 => 100644 src/codext/others/dna.py mode change 100755 => 100644 src/codext/others/kbshift.py mode change 100755 => 100644 src/codext/others/letters.py mode change 100755 => 100644 
src/codext/others/markdown.py mode change 100755 => 100644 src/codext/stegano/__init__.py mode change 100755 => 100644 src/codext/stegano/hexagram.py mode change 100755 => 100644 src/codext/stegano/klopf.py mode change 100755 => 100644 src/codext/stegano/resistor.py mode change 100755 => 100644 src/codext/stegano/rick.py mode change 100755 => 100644 src/codext/stegano/sms.py mode change 100755 => 100644 src/codext/stegano/whitespace.py mode change 100755 => 100644 src/codext/web/__init__.py mode change 100755 => 100644 src/codext/web/html.py mode change 100755 => 100644 src/codext/web/url.py diff --git a/.github/ISSUE_TEMPLATE/add-encoding.yml b/.github/ISSUE_TEMPLATE/add-encoding.yml old mode 100755 new mode 100644 diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md old mode 100755 new mode 100644 diff --git a/.github/prompts/add_codec.prompt.md b/.github/prompts/add_codec.prompt.md old mode 100755 new mode 100644 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md old mode 100755 new mode 100644 diff --git a/LICENSE b/LICENSE index f288702..3877ae0 100644 --- a/LICENSE +++ b/LICENSE @@ -1,674 +1,674 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. 
We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. 
The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. 
Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. 
A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. 
You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. 
- - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. 
- - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. 
This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. 
For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. 
Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. 
- - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. 
Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. 
- - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <https://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<https://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<https://www.gnu.org/licenses/why-not-lgpl.html>. + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. 
+ + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. 
+ + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. 
diff --git a/README.md b/README.md index 58c1c9d..8026d48 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ [![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) [![License](https://img.shields.io/pypi/l/codext.svg)](https://pypi.python.org/pypi/codext/) -[**CodExt**](https://github.com/dhondta/python-codext) is a (Python2-3 compatible) library that extends the native [`codecs`](https://docs.python.org/3/library/codecs.html) library (namely for adding new custom encodings and character mappings) and provides **120+ new codecs**, hence its name combining *CODecs EXTension*. It also features a **guess mode** for decoding multiple layers of encoding and **CLI tools** for convenience. +[**CodExt**](https://github.com/dhondta/python-codext) is a (Python2-3 compatible) library that extends the native [`codecs`](https://docs.python.org/3/library/codecs) library (namely for adding new custom encodings and character mappings) and provides **120+ new codecs**, hence its name combining *CODecs EXTension*. It also features a **guess mode** for decoding multiple layers of encoding and **CLI tools** for convenience. ```sh $ pip install codext @@ -19,7 +19,7 @@ $ pip install codext Want to contribute a new codec ? | Want to contribute a new macro ? :----------------------------------:|:------------------------------------: -Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.html) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/main/codext/macros.json) +Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/main/codext/macros.json) ## :mag: Demonstrations @@ -210,7 +210,7 @@ o ## :page_with_curl: List of codecs -#### [BaseXX](https://python-codext.readthedocs.io/en/latest/enc/base.html) +#### [BaseXX](https://python-codext.readthedocs.io/en/latest/enc/base) - [X] `base1`: useless, but for the sake of completeness - [X] `base2`: simple conversion to binary (with a variant with a reversed alphabet) @@ -221,7 +221,7 @@ o - [X] `base11`: conversion to digits with a "*a*" - [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted) - [X] `base26`: conversion to alphabet letters -- [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html)) +- [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32)) - [X] `base36`: [Base36](https://en.wikipedia.org/wiki/Base36) conversion to letters and digits (with a variant inverting both groups) - [X] `base45`: [Base45](https://datatracker.ietf.org/doc/html/draft-faltstrom-base45-04.txt) DRAFT algorithm (with a variant inverting letters and digits) - [X] `base58`: multiple versions of [Base58](https://en.bitcoinwiki.org/wiki/Base58) (bitcoin, flickr, ripple) @@ -233,11 +233,11 @@ o - [X] `base91`: [Base91](http://base91.sourceforge.net) custom conversion - [X] `base100` (or *emoji*): 
[Base100](https://github.com/AdamNiederer/base100) custom conversion - [X] `base122`: [Base100](http://blog.kevinalbs.com/base122) custom conversion -- [X] `base-genericN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) ; supports any possible base +- [X] `base-genericN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base) ; supports any possible base This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `base85` codec. -#### [Binary](https://python-codext.readthedocs.io/en/latest/enc/binary.html) +#### [Binary](https://python-codext.readthedocs.io/en/latest/enc/binary) - [X] `baudot`: supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... - [X] `baudot-spaced`: variant of `baudot` ; groups of 5 bits are whitespace-separated @@ -251,7 +251,12 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `manchester-inverted`: variant of `manchester` ; XORes each bit of the input with `10` - [X] `rotateN`: rotates characters by the specified number of bits (*N* belongs to [1, 7] ; Python 3 only) -#### [Common](https://python-codext.readthedocs.io/en/latest/enc/common.html) +#### [Checksums](https://python-codext.readthedocs.io/en/latest/enc/checksums) + +- [X] `adler`: Adler32 algorithm (relies on `zlib`) +- [X] `crc`: CRC of lengths 8, 10-17, 21, 24, 30-32, 40, 64, 82 with a variety of polynomials + +#### [Common](https://python-codext.readthedocs.io/en/latest/enc/common) - [X] `a1z26`: keeps words whitespace-separated and uses a custom character separator - [X] `cases`: set of case-related encodings (including camel-, kebab-, lower-, pascal-, upper-, snake- and swap-case, slugify, capitalize, title) @@ -261,7 +266,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `ordinal`: dummy character ordinals conversion (converts to 3-digits groups) - [X] `ordinal-spaced`: variant of `ordinal` ; dummy 
character ordinals conversion, handling whitespace separators -#### [Compression](https://python-codext.readthedocs.io/en/latest/enc/compressions.html) +#### [Compression](https://python-codext.readthedocs.io/en/latest/enc/compressions) - [X] `gzip`: standard Gzip compression/decompression - [X] `lz77`: compresses the given data with the algorithm of Lempel and Ziv of 1977 @@ -272,7 +277,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba > :warning: Compression functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. -#### [Cryptography](https://python-codext.readthedocs.io/en/latest/enc/crypto.html) +#### [Cryptography](https://python-codext.readthedocs.io/en/latest/enc/crypto) - [X] `affine`: aka Affine Cipher - [X] `atbash`: aka Atbash Cipher @@ -287,10 +292,9 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba > :warning: Crypto functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. 
-#### [Hashing](https://python-codext.readthedocs.io/en/latest/enc/hashing.html) +#### [Hashing](https://python-codext.readthedocs.io/en/latest/enc/hashing) - [X] `blake`: includes BLAKE2b and BLAKE2s (Python 3 only ; relies on `hashlib`) -- [X] `checksums`: includes Adler32 and CRC32 (relies on `zlib`) - [X] `crypt`: Unix's crypt hash for passwords (Python 3 and Unix only ; relies on `crypt`) - [X] `md`: aka Message Digest ; includes MD4 and MD5 (relies on `hashlib`) - [X] `sha`: aka Secure Hash Algorithms ; includes SHA1, 224, 256, 384, 512 (Python2/3) but also SHA3-224, -256, -384 and -512 (Python 3 only ; relies on `hashlib`) @@ -298,7 +302,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba > :warning: Hash functions are of course definitely **NOT** encoding functions ; they are implemented for convenience with the `.encode(...)` API from `codecs` and useful for chaning codecs. -#### [Languages](https://python-codext.readthedocs.io/en/latest/enc/languages.html) +#### [Languages](https://python-codext.readthedocs.io/en/latest/enc/languages) - [X] `braille`: well-known braille language (Python 3 only) - [X] `ipsum`: aka lorem ipsum @@ -312,13 +316,13 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `tap`: converts text to tap/knock code, commonly used by prisoners - [X] `tomtom`: similar to `morse`, using slashes and backslashes -#### [Others](https://python-codext.readthedocs.io/en/latest/enc/others.html) +#### [Others](https://python-codext.readthedocs.io/en/latest/enc/others) - [X] `dna`: implements the 8 rules of DNA sequences (N belongs to [1,8]) - [X] `letter-indices`: encodes consonants and/or vowels with their corresponding indices - [X] `markdown`: unidirectional encoding from Markdown to HTML -#### [Steganography](https://python-codext.readthedocs.io/en/latest/enc/stegano.html) +#### [Steganography](https://python-codext.readthedocs.io/en/latest/enc/stegano) - [X] `hexagram`: 
uses Base64 and encodes the result to a charset of [I Ching hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) (as implemented [here](https://github.com/qntm/hexagram-encode)) - [X] `klopf`: aka Klopf code ; Polybius square with trivial alphabetical distribution @@ -328,7 +332,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `whitespace`: replaces bits with whitespaces and tabs - [X] `whitespace_after_before`: variant of `whitespace` ; encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`") -#### [Web](https://python-codext.readthedocs.io/en/latest/enc/web.html) +#### [Web](https://python-codext.readthedocs.io/en/latest/enc/web) - [X] `html`: implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) - [X] `url`: aka URL encoding diff --git a/docs/pages/enc/others.md b/docs/pages/enc/others.md index 3470611..408ac07 100644 --- a/docs/pages/enc/others.md +++ b/docs/pages/enc/others.md @@ -1,79 +1,79 @@ -## Others - -All kinds of other codecs are categorized in "*Others*". - ------ - -### DNA - -This implements the 8 methods of ATGC nucleotides following the rule of complementary pairing, according the literature about coding and computing of DNA sequences. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`dna` (rule 1) | text <-> DNA-1 | `dna1`, `dna-1`, `dna_1` | -`dna` (rule X) | text <-> DNA-X | ... 
| -`dna` (rule 8) | text <-> DNA-8 | `dna8`, `dna-8`, `dna_8` | - -```python ->>> for i in range(8): - print(codext.encode("this is a test", "dna-%d" % (i + 1))) -GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA -CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA -ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG -AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC -TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG -TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC -GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT -CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT ->>> codext.decode("GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA", "dna-1") -'this is a test' -``` - ------ - -### Letter indices - -This encodes consonants and/or vowels with their respective indices. This codec is case insensitive, strips white spaces and only applies to letters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`consonant-indices` | text <-> text with consonant indices | `consonants_indices`, `consonants_index` | while decoding, searches from the longest match, possibly not producing the original input -`vowel-indices` | text <-> text with vowel indices | `vowels_indices`, `vowels_index` | -`consonant-vowel-indices` | text <-> text with consonant and vowel indices | `consonants-vowels_index` | prefixes consonants with `C` and vowels with `V` - -```python ->>> codext.encode("This is a test", "consonant-index") -'166I15I15A16E1516' ->>> codext.decode("166I15I15A16E1516", "consonant-index") -'THISISATEST' -``` - -```python ->>> codext.encode("This is a test", "vowel-index") -'TH3S3S1T2ST' ->>> codext.decode("TH3S3S1T2ST", "vowel-index") -'THISISATEST' -``` - -```python ->>> codext.encode("This is a test", "consonant-vowel-index") -'C16C6V3C15V3C15V1C16V2C15C16' ->>> codext.decode("C16C6V3C15V3C15V1C16V2C15C16", "consonant-vowel-index") -'THISISATEST' -``` - ------ - -### Markdown - 
-This is only for "encoding" (converting) Markdown to HTML. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`markdown` | Markdown --> HTML | `markdown`, `Markdown`, `md` | unidirectional ! - -```python ->>> codext.encode("# Test\nparagraph", "markdown") -'

Test

\n\n

paragraph

\n' -``` - +## Others + +All kinds of other codecs are categorized in "*Others*". + +----- + +### DNA + +This implements the 8 methods of ATGC nucleotides following the rule of complementary pairing, according the literature about coding and computing of DNA sequences. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`dna` (rule 1) | text <-> DNA-1 | `dna1`, `dna-1`, `dna_1` | +`dna` (rule X) | text <-> DNA-X | ... | +`dna` (rule 8) | text <-> DNA-8 | `dna8`, `dna-8`, `dna_8` | + +```python +>>> for i in range(8): + print(codext.encode("this is a test", "dna-%d" % (i + 1))) +GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA +CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA +ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG +AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC +TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG +TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC +GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT +CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT +>>> codext.decode("GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA", "dna-1") +'this is a test' +``` + +----- + +### Letter indices + +This encodes consonants and/or vowels with their respective indices. This codec is case insensitive, strips white spaces and only applies to letters. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`consonant-indices` | text <-> text with consonant indices | `consonants_indices`, `consonants_index` | while decoding, searches from the longest match, possibly not producing the original input +`vowel-indices` | text <-> text with vowel indices | `vowels_indices`, `vowels_index` | +`consonant-vowel-indices` | text <-> text with consonant and vowel indices | `consonants-vowels_index` | prefixes consonants with `C` and vowels with `V` + +```python +>>> codext.encode("This is a test", "consonant-index") +'166I15I15A16E1516' +>>> codext.decode("166I15I15A16E1516", "consonant-index") +'THISISATEST' +``` + +```python +>>> codext.encode("This is a test", "vowel-index") +'TH3S3S1T2ST' +>>> codext.decode("TH3S3S1T2ST", "vowel-index") +'THISISATEST' +``` + +```python +>>> codext.encode("This is a test", "consonant-vowel-index") +'C16C6V3C15V3C15V1C16V2C15C16' +>>> codext.decode("C16C6V3C15V3C15V1C16V2C15C16", "consonant-vowel-index") +'THISISATEST' +``` + +----- + +### Markdown + +This is only for "encoding" (converting) Markdown to HTML. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`markdown` | Markdown --> HTML | `markdown`, `Markdown`, `md` | unidirectional ! + +```python +>>> codext.encode("# Test\nparagraph", "markdown") +'

Test

\n\n

paragraph

\n' +``` + diff --git a/src/codext/__info__.py b/src/codext/__info__.py index f299990..85c3966 100644 --- a/src/codext/__info__.py +++ b/src/codext/__info__.py @@ -1,16 +1,16 @@ -# -*- coding: UTF-8 -*- -"""Codext package information. - -""" -import os -from datetime import datetime - -__author__ = "Alexandre D'Hondt" -__copyright__ = "© 2019-{} A. D'Hondt".format(datetime.now().year) -__email__ = "alexandre.dhondt@gmail.com" -__license__ = "GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html)" -__source__ = "https://github.com/dhondta/python-codext" - -with open(os.path.join(os.path.dirname(__file__), "VERSION.txt")) as f: - __version__ = f.read().strip() - +# -*- coding: UTF-8 -*- +"""Codext package information. + +""" +import os +from datetime import datetime + +__author__ = "Alexandre D'Hondt" +__copyright__ = "© 2019-{} A. D'Hondt".format(datetime.now().year) +__email__ = "alexandre.dhondt@gmail.com" +__license__ = "GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html)" +__source__ = "https://github.com/dhondta/python-codext" + +with open(os.path.join(os.path.dirname(__file__), "VERSION.txt")) as f: + __version__ = f.read().strip() + diff --git a/src/codext/base/__init__.py b/src/codext/base/__init__.py old mode 100755 new mode 100644 index 8c0d220..79deab5 --- a/src/codext/base/__init__.py +++ b/src/codext/base/__init__.py @@ -1,64 +1,64 @@ -# -*- coding: UTF-8 -*- -from argparse import ArgumentParser, RawTextHelpFormatter -from types import MethodType - -from .base45 import * -from .base85 import * -from .base91 import * -from .base100 import * -from .base122 import * -from .baseN import * -from ..__common__ import * -from ..__info__ import __version__ - - -def main(): - descr = """Usage: unbase [OPTION]... [FILE] -Decode multi-layer base encoded FILE, or standard input, to standard output. - -With no FILE, or when FILE is -, read standard input. 
- -Optional arguments: - -E, --extended also consider generic base codecs while guess-decoding - -f, --stop-function set the result chceking function (default: text) - format: printables|text|flag|lang_[bigram] - -M, --max-depth maximum codec search depth (default: 5) - -m, --min-depth minimum codec search depth (default: 0) - -p, --pattern pattern to be matched while searching - -s, --show show the decoding chain - - --help display this help and exit - --verbose show guessing information and steps - --version output version information and exit - -Report unbase bugs to -Full documentation at: -""" - parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) - parser.format_help = MethodType(lambda s: s.description, parser) - group = parser.add_mutually_exclusive_group() - parser.add_argument("file", nargs="?") - parser.add_argument("-E", "--extended", action="store_true") - group.add_argument("-f", "--stop-function", default="text") - parser.add_argument("-M", "--max-depth", type=int, default=10) - parser.add_argument("-m", "--min-depth", type=int, default=0) - group.add_argument("-p", "--pattern") - parser.add_argument("-s", "--show", action="store_true") - parser.add_argument("--help", action="help") - parser.add_argument("--version", action="version") - parser.add_argument("--verbose", action="store_true") - parser.version = "CodExt " + __version__ - args = parser.parse_args() - c, e = _input(args.file), [["base%d-generic" % i for i in range(2, 256)], []][args.extended] - c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") - r = codecs.guess(c, stopfunc._validate(args.stop_function), 0, args.max_depth, "base", tuple(e), stop=False, - show=args.verbose, debug=args.verbose) - if len(r) == 0: - print("Could not decode :-(") - return 0 - ans = max(r.items(), key=lambda x: len(x[0])) - if args.show: - print(" - ".join(ans[0])) - print(ensure_str(ans[1])) - return 0 - +# -*- coding: UTF-8 -*- +from argparse import 
ArgumentParser, RawTextHelpFormatter +from types import MethodType + +from .base45 import * +from .base85 import * +from .base91 import * +from .base100 import * +from .base122 import * +from .baseN import * +from ..__common__ import * +from ..__info__ import __version__ + + +def main(): + descr = """Usage: unbase [OPTION]... [FILE] +Decode multi-layer base encoded FILE, or standard input, to standard output. + +With no FILE, or when FILE is -, read standard input. + +Optional arguments: + -E, --extended also consider generic base codecs while guess-decoding + -f, --stop-function set the result chceking function (default: text) + format: printables|text|flag|lang_[bigram] + -M, --max-depth maximum codec search depth (default: 5) + -m, --min-depth minimum codec search depth (default: 0) + -p, --pattern pattern to be matched while searching + -s, --show show the decoding chain + + --help display this help and exit + --verbose show guessing information and steps + --version output version information and exit + +Report unbase bugs to +Full documentation at: +""" + parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) + parser.format_help = MethodType(lambda s: s.description, parser) + group = parser.add_mutually_exclusive_group() + parser.add_argument("file", nargs="?") + parser.add_argument("-E", "--extended", action="store_true") + group.add_argument("-f", "--stop-function", default="text") + parser.add_argument("-M", "--max-depth", type=int, default=10) + parser.add_argument("-m", "--min-depth", type=int, default=0) + group.add_argument("-p", "--pattern") + parser.add_argument("-s", "--show", action="store_true") + parser.add_argument("--help", action="help") + parser.add_argument("--version", action="version") + parser.add_argument("--verbose", action="store_true") + parser.version = "CodExt " + __version__ + args = parser.parse_args() + c, e = _input(args.file), [["base%d-generic" % i for i in range(2, 256)], 
[]][args.extended] + c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") + r = codecs.guess(c, stopfunc._validate(args.stop_function), 0, args.max_depth, "base", tuple(e), stop=False, + show=args.verbose, debug=args.verbose) + if len(r) == 0: + print("Could not decode :-(") + return 0 + ans = max(r.items(), key=lambda x: len(x[0])) + if args.show: + print(" - ".join(ans[0])) + print(ensure_str(ans[1])) + return 0 + diff --git a/src/codext/base/_base.py b/src/codext/base/_base.py old mode 100755 new mode 100644 diff --git a/src/codext/base/_base2n.py b/src/codext/base/_base2n.py old mode 100755 new mode 100644 index d34072d..0e1f2d2 --- a/src/codext/base/_base2n.py +++ b/src/codext/base/_base2n.py @@ -1,112 +1,112 @@ -# -*- coding: UTF-8 -*- -"""BaseN functions with N a power of 2. - -""" -from math import ceil, log - -from ..__common__ import * -from ..__common__ import _set_exc -from ._base import base, _get_charset - - -_bin = lambda x: bin(x if isinstance(x, int) else ord(x)) - - -# base en/decoding functions for N a power of 2 -_set_exc("Base2NDecodeError") -_set_exc("Base2NEncodeError") - - -def base2n(charset, pattern=None, name=None, **kwargs): - """ Base-N codec factory for N a power of 2. - - :param charset: charset selection function - :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting - the charset) - :param name: forced encoding name (useful e.g. for zbase32) - """ - base(charset, pattern, True, base2n_encode, base2n_decode, name, **kwargs) - - -def base2n_encode(string, charset, errors="strict"): - """ 8-bits characters to base-N encoding for N a power of 2. 
- - :param string: string to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - """ - bs, r, n = "", "", len(charset) - # find the number of bits for the given character set and the quantum - nb_out = int(log(n, 2)) - q = nb_out - while q % 8 != 0: - q += nb_out - # iterate over the characters, gathering bits to be mapped to the charset - for i, c in enumerate(b(string)): - bs += "{:0>8}".format(_bin(c)[2:]) - while len(bs) >= nb_out: - r += charset[int(bs[:nb_out], 2)] - bs = bs[nb_out:] - if len(bs) > 0: - for i in range(0, len(bs), nb_out): - c = ("{:0<%d}" % nb_out).format(bs[i:i+nb_out]) - p = len(c) - len(bs[i:i+nb_out]) - r += charset[int(c, 2)] - l = len(r) * nb_out - while l % q != 0: - l += nb_out - return r + int(l / nb_out - len(r)) * "=" - - -def base2n_decode(string, charset, errors="strict"): - """ Base-N to 8-bits characters decoding for N a power of 2. - - :param string: string to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - """ - bs, r, n = "", "", len(charset) - # particular case: for hex, ensure the right case in the charset ; not that this way, if mixed cases are used, it - # will trigger an error (this is the expected behavior) - if n == 16: - if any(c in string for c in "abcdef"): - charset = charset.lower() - elif any(c in string for c in "ABCDEF"): - charset = charset.upper() - string = re.sub(r"\s", "", string) - # find the number of bits for the given character set and the number of padding characters - nb_in = int(log(n, 2)) - n_pad = len(string) - len(string.rstrip("=")) - # iterate over the characters, mapping them to the character set and converting the resulting bits to 8-bits chars - for i, c in enumerate(string): - if c == "=": - bs += "0" * nb_in - else: - try: - bs += ("{:0>%d}" % nb_in).format(_bin(charset.index(c))[2:]) - except ValueError: - if errors == "strict": - e = Base2NDecodeError("'base%d' codec can't decode character '%s' in 
position %d" % (n, c, i)) - e.__cause__ = e # block exceptions chaining - raise e - elif errors == "replace": - bs += "0" * nb_in - elif errors == "ignore": - continue - else: - raise ValueError("Unsupported error handling {}".format(errors)) - if len(bs) > 8: - r += chr(int(bs[:8], 2)) - bs = bs[8:] - # if the number of bits is not multiple of 8 bits, it could mean a bad padding - if len(bs) != 8: - if errors == "strict": - raise Base2NDecodeError("Incorrect padding") - elif errors in ["replace", "ignore"]: - pass - else: - raise ValueError("Unsupported error handling {}".format(errors)) - r += chr(int(bs, 2)) - np = int(ceil(n_pad * nb_in / 8.0)) - return r[:-np] if np > 0 else r - +# -*- coding: UTF-8 -*- +"""BaseN functions with N a power of 2. + +""" +from math import ceil, log + +from ..__common__ import * +from ..__common__ import _set_exc +from ._base import base, _get_charset + + +_bin = lambda x: bin(x if isinstance(x, int) else ord(x)) + + +# base en/decoding functions for N a power of 2 +_set_exc("Base2NDecodeError") +_set_exc("Base2NEncodeError") + + +def base2n(charset, pattern=None, name=None, **kwargs): + """ Base-N codec factory for N a power of 2. + + :param charset: charset selection function + :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting + the charset) + :param name: forced encoding name (useful e.g. for zbase32) + """ + base(charset, pattern, True, base2n_encode, base2n_decode, name, **kwargs) + + +def base2n_encode(string, charset, errors="strict"): + """ 8-bits characters to base-N encoding for N a power of 2. 
+ + :param string: string to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + """ + bs, r, n = "", "", len(charset) + # find the number of bits for the given character set and the quantum + nb_out = int(log(n, 2)) + q = nb_out + while q % 8 != 0: + q += nb_out + # iterate over the characters, gathering bits to be mapped to the charset + for i, c in enumerate(b(string)): + bs += "{:0>8}".format(_bin(c)[2:]) + while len(bs) >= nb_out: + r += charset[int(bs[:nb_out], 2)] + bs = bs[nb_out:] + if len(bs) > 0: + for i in range(0, len(bs), nb_out): + c = ("{:0<%d}" % nb_out).format(bs[i:i+nb_out]) + p = len(c) - len(bs[i:i+nb_out]) + r += charset[int(c, 2)] + l = len(r) * nb_out + while l % q != 0: + l += nb_out + return r + int(l / nb_out - len(r)) * "=" + + +def base2n_decode(string, charset, errors="strict"): + """ Base-N to 8-bits characters decoding for N a power of 2. + + :param string: string to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + """ + bs, r, n = "", "", len(charset) + # particular case: for hex, ensure the right case in the charset ; not that this way, if mixed cases are used, it + # will trigger an error (this is the expected behavior) + if n == 16: + if any(c in string for c in "abcdef"): + charset = charset.lower() + elif any(c in string for c in "ABCDEF"): + charset = charset.upper() + string = re.sub(r"\s", "", string) + # find the number of bits for the given character set and the number of padding characters + nb_in = int(log(n, 2)) + n_pad = len(string) - len(string.rstrip("=")) + # iterate over the characters, mapping them to the character set and converting the resulting bits to 8-bits chars + for i, c in enumerate(string): + if c == "=": + bs += "0" * nb_in + else: + try: + bs += ("{:0>%d}" % nb_in).format(_bin(charset.index(c))[2:]) + except ValueError: + if errors == "strict": + e = Base2NDecodeError("'base%d' codec can't decode character '%s' in 
position %d" % (n, c, i)) + e.__cause__ = e # block exceptions chaining + raise e + elif errors == "replace": + bs += "0" * nb_in + elif errors == "ignore": + continue + else: + raise ValueError("Unsupported error handling {}".format(errors)) + if len(bs) > 8: + r += chr(int(bs[:8], 2)) + bs = bs[8:] + # if the number of bits is not multiple of 8 bits, it could mean a bad padding + if len(bs) != 8: + if errors == "strict": + raise Base2NDecodeError("Incorrect padding") + elif errors in ["replace", "ignore"]: + pass + else: + raise ValueError("Unsupported error handling {}".format(errors)) + r += chr(int(bs, 2)) + np = int(ceil(n_pad * nb_in / 8.0)) + return r[:-np] if np > 0 else r + diff --git a/src/codext/base/base100.py b/src/codext/base/base100.py old mode 100755 new mode 100644 diff --git a/src/codext/base/base122.py b/src/codext/base/base122.py old mode 100755 new mode 100644 diff --git a/src/codext/base/base45.py b/src/codext/base/base45.py old mode 100755 new mode 100644 index 6f15150..272c3e9 --- a/src/codext/base/base45.py +++ b/src/codext/base/base45.py @@ -1,84 +1,84 @@ -# -*- coding: UTF-8 -*- -"""Base45 Codec - base45 content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import _get_charset, digits, lower, main, upper -from ..__common__ import * - - -__examples__ = { - 'enc(base45|base-45|base_45)': {'this is a test!': "AWE+EDH44.OEOCC7WE QEX0"}, - 'enc(base45-inv|base_45_inv)': {'this is a test!': "K6O+ONREE.YOYMMH6O 0O7A"}, - 'dec(base45)': {'BAD STRING\00': None, 'AWE+EDH44.OEOCC7WE QEX000': None}, -} -__guess__ = ["base45", "base45-inv"] - - -B45 = { - '': digits + upper + " $%*+-./:", - '[-_]inv(?:erted)?$': upper + digits + " $%*+-./:", -} - - -__chr = lambda c: chr(c >> 8) + chr(c & 0xff) if isinstance(c, int) and 256 <= c <= 65535 else \ - chr(c) if isinstance(c, int) else c -__ord = lambda c: ord(c) if not isinstance(c, int) else c - - -def base45_encode(mode): - b45 = _get_charset(B45, mode) - def encode(text, errors="strict"): - t, s = b(text), "" - for i in range(0, len(text), 2): - n = 256 * __ord(t[i]) - try: - n += __ord(t[i+1]) - except IndexError: - n = __ord(t[i]) - s += b45[n % 45] + b45[n // 45] - break - m = n // 45**2 - n -= m * 45**2 - s += b45[n % 45] + b45[n // 45] + b45[m] - return s, len(text) - return encode - - -def base45_decode(mode): - b45 = {c: i for i, c in enumerate(_get_charset(B45, mode))} - def decode(text, errors="strict"): - t, s = b(text), "" - ehandler = handle_error("base45", errors, decode=True) - for i in range(0, len(text), 3): - try: - n = b45[__chr(t[i])] - except KeyError: - ehandler(__chr(t[i]), i, s) - try: - j = i + 1 - n += 45 * b45[__chr(t[j])] - except KeyError: - ehandler(__chr(t[j]), j, s) - except IndexError: - ehandler(__chr(t[i]), i, s) - try: - k = i + 2 - n += 45 ** 2 * b45[__chr(t[k])] - except KeyError: - ehandler(__chr(t[k]), k, s) - except IndexError: - s += __chr(n) - continue - s += __chr(n // 256) + __chr(n % 256) - return s, len(text) - return decode - - 
-add("base45", base45_encode, base45_decode, r"^base[-_]?45(|[-_]inv(?:erted)?)$", expansion_factor=1.5) -main = main(45, "") - +# -*- coding: UTF-8 -*- +"""Base45 Codec - base45 content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import _get_charset, digits, lower, main, upper +from ..__common__ import * + + +__examples__ = { + 'enc(base45|base-45|base_45)': {'this is a test!': "AWE+EDH44.OEOCC7WE QEX0"}, + 'enc(base45-inv|base_45_inv)': {'this is a test!': "K6O+ONREE.YOYMMH6O 0O7A"}, + 'dec(base45)': {'BAD STRING\00': None, 'AWE+EDH44.OEOCC7WE QEX000': None}, +} +__guess__ = ["base45", "base45-inv"] + + +B45 = { + '': digits + upper + " $%*+-./:", + '[-_]inv(?:erted)?$': upper + digits + " $%*+-./:", +} + + +__chr = lambda c: chr(c >> 8) + chr(c & 0xff) if isinstance(c, int) and 256 <= c <= 65535 else \ + chr(c) if isinstance(c, int) else c +__ord = lambda c: ord(c) if not isinstance(c, int) else c + + +def base45_encode(mode): + b45 = _get_charset(B45, mode) + def encode(text, errors="strict"): + t, s = b(text), "" + for i in range(0, len(text), 2): + n = 256 * __ord(t[i]) + try: + n += __ord(t[i+1]) + except IndexError: + n = __ord(t[i]) + s += b45[n % 45] + b45[n // 45] + break + m = n // 45**2 + n -= m * 45**2 + s += b45[n % 45] + b45[n // 45] + b45[m] + return s, len(text) + return encode + + +def base45_decode(mode): + b45 = {c: i for i, c in enumerate(_get_charset(B45, mode))} + def decode(text, errors="strict"): + t, s = b(text), "" + ehandler = handle_error("base45", errors, decode=True) + for i in range(0, len(text), 3): + try: + n = b45[__chr(t[i])] + except KeyError: + ehandler(__chr(t[i]), i, s) + try: + j = i + 1 + n += 45 * b45[__chr(t[j])] + except KeyError: + ehandler(__chr(t[j]), j, s) + except IndexError: + ehandler(__chr(t[i]), i, s) + try: + k = i + 2 + n += 45 ** 2 * 
b45[__chr(t[k])] + except KeyError: + ehandler(__chr(t[k]), k, s) + except IndexError: + s += __chr(n) + continue + s += __chr(n // 256) + __chr(n % 256) + return s, len(text) + return decode + + +add("base45", base45_encode, base45_decode, r"^base[-_]?45(|[-_]inv(?:erted)?)$", expansion_factor=1.5) +main = main(45, "") + diff --git a/src/codext/base/base85.py b/src/codext/base/base85.py old mode 100755 new mode 100644 diff --git a/src/codext/base/base91.py b/src/codext/base/base91.py old mode 100755 new mode 100644 index 21a21d5..4082256 --- a/src/codext/base/base91.py +++ b/src/codext/base/base91.py @@ -1,113 +1,113 @@ -# -*- coding: UTF-8 -*- -"""Base91 Codec - base91 content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import _get_charset, digits, lower, main, upper -from ..__common__ import * - -# no __examples__ ; handled manually in tests/test_base.py -__guess__ = ["base91", "base91-inv", "base91-alt", "base91-alt-inv"] - - -B91 = { - r'': upper + lower + digits + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", - r'[-_]inv(erted)?$': digits + upper + lower + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", - r'[-_]alt(ernate)?$': "!#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_" + lower + "{|}", - r'[-_]alt(ernate)?[-_]inv(erted)?$': "!#$%&'()*+,-./" + upper + ":;<=>?@" + lower + "[\\]^_" + digits + "{|}", -} - - -__chr = lambda c: chr(c) if isinstance(c, int) else c -__ord = lambda c: ord(c) if not isinstance(c, int) else c - - -def base91_encode(mode): - b91 = _get_charset(B91, mode) - def encode(text, errors="strict"): - t, s, bits = b(text), "", "" - if re.search(r'[-_]alt(ernate)?$', mode): - while len(bits) < 13 and t: - bits += "{:08b}".format(__ord(t[0])) - t = t[1:] - while len(bits) > 13 or t: - n = int(bits[:13], 2) - s += b91[n // 91] + b91[n % 91] - bits = bits[13:] - while len(bits) < 13 and t: - bits 
+= "{:08b}".format(__ord(t[0])) - t = t[1:] - if len(bits) > 0: - if len(bits) < 7: - bits += "0" * (6 - len(bits)) - s += b91[int(bits, 2)] - else: - bits += "0" * (13 - len(bits)) - n = int(bits, 2) - s += b91[n // 91] + b91[n % 91] - else: - for c in t: - bits = bin(__ord(c))[2:].zfill(8) + bits - if len(bits) > 13: - n = int(bits[-13:], 2) - if n > 88: - bits = bits[:-13] - else: - n = int(bits[-14:], 2) - bits = bits[:-14] - s += b91[n % 91] + b91[n // 91] - if len(bits) > 0: - n = int(bits, 2) - s += b91[n % 91] - if len(bits) > 7 or n > 90: - s += b91[n // 91] - return s, len(t) - return encode - - -def base91_decode(mode): - b91 = {c: i for i, c in enumerate(_get_charset(B91, mode))} - def decode(text, errors="strict"): - t, s, bits, alt = b(_stripl(text, True, True)), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None - ehandler = handle_error("base91", errors, decode=True) - for i in range(0, len(t), 2): - try: - n = b91[__chr(t[i])] * [1, 91][alt] - except KeyError: - ehandler(__chr(t[i]), i, s) - try: - j = i + 1 - n += b91[__chr(t[j])] * [91, 1][alt] - except IndexError: - pass - except KeyError: - ehandler(__chr(t[j]), j, s) - if alt: - bits += "{:013b}".format(n) - while 8 <= len(bits): - s += chr(int(bits[0:8], 2)) - bits = bits[8:] - else: - bits = bin(n)[2:].zfill([14, 13][n & 8191 > 88]) + bits - while len(bits) > 8: - s += chr(int(bits[-8:], 2)) - bits = bits[:-8] - if alt and len(t) % 2 == 1: - bits += "{:06b}".format(b91[__chr(t[-1])]) - while 8 <= len(bits): - s += chr(int(bits[:8], 2)) - bits = bits[8:] - elif not alt and len(bits) > 0 and not set(bits) == {"0"}: - s += chr(int(bits, 2)) - return s.rstrip("\0"), len(t) - return decode - - -add("base91", base91_encode, base91_decode, r"^base[-_]?91((?:|[-_]alt(?:ernate)?)(?:|[-_]inv(?:erted)?)?)$", - entropy=6.5, expansion_factor=1.231) -main91 = main(91, "") - +# -*- coding: UTF-8 -*- +"""Base91 Codec - base91 content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import _get_charset, digits, lower, main, upper +from ..__common__ import * + +# no __examples__ ; handled manually in tests/test_base.py +__guess__ = ["base91", "base91-inv", "base91-alt", "base91-alt-inv"] + + +B91 = { + r'': upper + lower + digits + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", + r'[-_]inv(erted)?$': digits + upper + lower + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", + r'[-_]alt(ernate)?$': "!#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_" + lower + "{|}", + r'[-_]alt(ernate)?[-_]inv(erted)?$': "!#$%&'()*+,-./" + upper + ":;<=>?@" + lower + "[\\]^_" + digits + "{|}", +} + + +__chr = lambda c: chr(c) if isinstance(c, int) else c +__ord = lambda c: ord(c) if not isinstance(c, int) else c + + +def base91_encode(mode): + b91 = _get_charset(B91, mode) + def encode(text, errors="strict"): + t, s, bits = b(text), "", "" + if re.search(r'[-_]alt(ernate)?$', mode): + while len(bits) < 13 and t: + bits += "{:08b}".format(__ord(t[0])) + t = t[1:] + while len(bits) > 13 or t: + n = int(bits[:13], 2) + s += b91[n // 91] + b91[n % 91] + bits = bits[13:] + while len(bits) < 13 and t: + bits += "{:08b}".format(__ord(t[0])) + t = t[1:] + if len(bits) > 0: + if len(bits) < 7: + bits += "0" * (6 - len(bits)) + s += b91[int(bits, 2)] + else: + bits += "0" * (13 - len(bits)) + n = int(bits, 2) + s += b91[n // 91] + b91[n % 91] + else: + for c in t: + bits = bin(__ord(c))[2:].zfill(8) + bits + if len(bits) > 13: + n = int(bits[-13:], 2) + if n > 88: + bits = bits[:-13] + else: + n = int(bits[-14:], 2) + bits = bits[:-14] + s += b91[n % 91] + b91[n // 91] + if len(bits) > 0: + n = int(bits, 2) + s += b91[n % 91] + if len(bits) > 7 or n > 90: + s += b91[n // 91] + return s, len(t) + return encode + + +def base91_decode(mode): + b91 = {c: i for i, c in enumerate(_get_charset(B91, 
mode))} + def decode(text, errors="strict"): + t, s, bits, alt = b(_stripl(text, True, True)), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None + ehandler = handle_error("base91", errors, decode=True) + for i in range(0, len(t), 2): + try: + n = b91[__chr(t[i])] * [1, 91][alt] + except KeyError: + ehandler(__chr(t[i]), i, s) + try: + j = i + 1 + n += b91[__chr(t[j])] * [91, 1][alt] + except IndexError: + pass + except KeyError: + ehandler(__chr(t[j]), j, s) + if alt: + bits += "{:013b}".format(n) + while 8 <= len(bits): + s += chr(int(bits[0:8], 2)) + bits = bits[8:] + else: + bits = bin(n)[2:].zfill([14, 13][n & 8191 > 88]) + bits + while len(bits) > 8: + s += chr(int(bits[-8:], 2)) + bits = bits[:-8] + if alt and len(t) % 2 == 1: + bits += "{:06b}".format(b91[__chr(t[-1])]) + while 8 <= len(bits): + s += chr(int(bits[:8], 2)) + bits = bits[8:] + elif not alt and len(bits) > 0 and not set(bits) == {"0"}: + s += chr(int(bits, 2)) + return s.rstrip("\0"), len(t) + return decode + + +add("base91", base91_encode, base91_decode, r"^base[-_]?91((?:|[-_]alt(?:ernate)?)(?:|[-_]inv(?:erted)?)?)$", + entropy=6.5, expansion_factor=1.231) +main91 = main(91, "") + diff --git a/src/codext/base/baseN.py b/src/codext/base/baseN.py old mode 100755 new mode 100644 index cf4abe4..c3965c7 --- a/src/codext/base/baseN.py +++ b/src/codext/base/baseN.py @@ -1,132 +1,132 @@ -# -*- coding: UTF-8 -*- -"""BaseN Codecs - base content encodings. 
- -These codecs: -- en/decode strings from str to str -- en/decode strings from bytes to bytes -- decode file content to str (read) -- encode file content from str to bytes (write) -""" -from ..__common__ import * -from ._base import base, base_generic, digits, lower, main, upper -from ._base2n import base2n - - -B1 = {chr(i): chr(i) for i in range(2**8)} -B1[''] = "A" -base(B1, r"^(?:base[-_]?1(|[-_].)|unary)$", guess=[]) -main1 = main(1) - - -B2 = {r'': "01", r'[-_]inv(erted)?$': "10"} -base2n(B2, r"^(?:base[-_]?2|bin(?:ary)?)(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{2})$", expansion_factor=8.) -main2 = main(2) - - -B3 = {r'': "123", r'[-_]inv(erted)?$': "321"} -base(B3, r"^base[-_]?3(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{3})$", expansion_factor=5.) -main3 = main(3) - - -B4 = {r'': "1234", r'[-_]inv(erted)?$': "4321"} -base2n(B4, r"^base[-_]?4(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{4})$", expansion_factor=4.) -main4 = main(4) - - -B8 = {r'': "abcdefgh", r'[-_]inv(erted)?$': "hgfedcba"} -base2n(B8, r"^base[-_]?8(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{8})$") -main8 = main(8) - - -B10 = {r'': "0123456789"} -base(B10, r"^(?:base[-_]?10|int(?:eger)?|dec(?:imal)?)$") -main10 = main(10) - - -B11 = {r'': "0123456789a", r'[-_]inv(erted)?$': "a0123456789"} -base(B11, r"^base[-_]?11(|[-_]inv(?:erted)?)$") -main11 = main(11) - - -B16 = {'': digits + "ABCDEF", '[-_]inv(erted)?$': "ABCDEF" + digits} -base2n(B16, r"^(?:base[-_]?16|hex)(|[-_]inv(?:erted)?)$", expansion_factor=2.) 
-main16 = main(16, "RFC 4648") - - -B26 = {'': upper} -base(B26, r"^base[-_]?26$") -main26 = main(26, inv=False) - - -B32 = { - r'': upper + "234567", - r'[-_]?z(?:base32)?$': "ybndrfg8ejkmcpqxot1uwisza345h769", - r'[-_]inv(erted)?$': "234567" + upper, - r'(?:[-_](ext(ended)?)?)?[-_]hex$': digits + upper[:22], - r'[-_]?crockford': digits + "ABCDEFGHJKMNPQRSTVWXYZ", - r'[-_]?geohash': digits + "bcdefghjkmnpqrstuvwxyz", -} -base2n(B32, r"^(?:base[-_]?32(|[-_]inv(?:erted)?|(?:[-_]ext(?:ended)?)?[-_]hex|[-_](?:z|geohash|crockford))|" - r"(zbase32|geohash|crockford))$", padding_char="=", - guess=["base32", "base32-inv", "base32-hex", "base32-geohash", "base32-crockford"]) -main32 = main(32, "RFC 4648") -main32hex = main(32, "RFC 4648", "hex", False) -main32geo = main(32, "", "geohash", False) -main32crk = main(32, "", "crockford", False) -mainz32 = main(32, "", "z", False) - - -B36 = {'': digits + upper, '[-_]inv(erted)?$': upper + digits} -base(B36, r"^base[-_]?36(|[-_]inv(?:erted)?)$") -main36 = main(36, "") - - -B58 = { - r'(|[-_]?(bc|bitcoin))$': "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz", - r'[-_]?(rp|ripple)$': "rpshnaf39wBUDNEGHJKLM4PQRST7VWXYZ2bcdeCg65jkm8oFqi1tuvAxyz", - r'[-_]?(fl|flickr|short[-]?url|url)$': "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ", -} -base(B58, r"^base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))$", - guess=["base58-bitcoin", "base58-ripple", "base58-flickr"]) -main58bc = main(58, "", "bitcoin") -main58rp = main(58, "", "ripple") -main58fl = main(58, "", "flickr") - - -B62 = {'': digits + upper + lower, '[-_]inv(erted)?$': upper + lower + digits} -base(B62, r"^base[-_]?62(|[-_]inv(?:erted)?)$") -main62 = main(62, "") - - -B63 = {'': digits + upper + lower + "_", 'inv': upper + lower + digits + "_"} -base(B63, r"^base[-_]?63(|[-_]inv(?:erted)?)$") -main63 = main(63) - - -B64 = { - r'': upper + lower + digits + "+/", - r'[-_]inv(erted)?$': digits + upper + lower + "+/", - 
r'[-_]?(file|url)(safe)?$': upper + lower + digits + "-_", -} -base2n(B64, r"^base[-_]?64(|[-_]inv(?:erted)?|[-_]?(?:file|url)(?:safe)?)$", padding_char="=", - guess=["base64", "base64-inv", "base64-url"]) -main64 = main(64, "RFC 4648") -main64url = main(64, "RFC 4648 / Base64URL", "url", False) - - -B67 = { - r'': upper + lower + digits + "-_.!~", - r'[-_]inv(erted)?$': lower + upper + digits + "-_.!~", -} -base(B67, r"^base[-_]?67(|[-_]inv(?:erted)?)$") -main67 = main(67) - - -B128 = {r'': "".join(chr(i) for i in range(128))} -base(B128, r"^base[-_]?128$", padding_char="=") -main128 = main(128, None, False, wrap=False) - - -# generic base encodings, to be added after all others as they have the precedence -base_generic() - +# -*- coding: UTF-8 -*- +"""BaseN Codecs - base content encodings. + +These codecs: +- en/decode strings from str to str +- en/decode strings from bytes to bytes +- decode file content to str (read) +- encode file content from str to bytes (write) +""" +from ..__common__ import * +from ._base import base, base_generic, digits, lower, main, upper +from ._base2n import base2n + + +B1 = {chr(i): chr(i) for i in range(2**8)} +B1[''] = "A" +base(B1, r"^(?:base[-_]?1(|[-_].)|unary)$", guess=[]) +main1 = main(1) + + +B2 = {r'': "01", r'[-_]inv(erted)?$': "10"} +base2n(B2, r"^(?:base[-_]?2|bin(?:ary)?)(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{2})$", expansion_factor=8.) +main2 = main(2) + + +B3 = {r'': "123", r'[-_]inv(erted)?$': "321"} +base(B3, r"^base[-_]?3(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{3})$", expansion_factor=5.) +main3 = main(3) + + +B4 = {r'': "1234", r'[-_]inv(erted)?$': "4321"} +base2n(B4, r"^base[-_]?4(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{4})$", expansion_factor=4.) 
+main4 = main(4) + + +B8 = {r'': "abcdefgh", r'[-_]inv(erted)?$': "hgfedcba"} +base2n(B8, r"^base[-_]?8(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{8})$") +main8 = main(8) + + +B10 = {r'': "0123456789"} +base(B10, r"^(?:base[-_]?10|int(?:eger)?|dec(?:imal)?)$") +main10 = main(10) + + +B11 = {r'': "0123456789a", r'[-_]inv(erted)?$': "a0123456789"} +base(B11, r"^base[-_]?11(|[-_]inv(?:erted)?)$") +main11 = main(11) + + +B16 = {'': digits + "ABCDEF", '[-_]inv(erted)?$': "ABCDEF" + digits} +base2n(B16, r"^(?:base[-_]?16|hex)(|[-_]inv(?:erted)?)$", expansion_factor=2.) +main16 = main(16, "RFC 4648") + + +B26 = {'': upper} +base(B26, r"^base[-_]?26$") +main26 = main(26, inv=False) + + +B32 = { + r'': upper + "234567", + r'[-_]?z(?:base32)?$': "ybndrfg8ejkmcpqxot1uwisza345h769", + r'[-_]inv(erted)?$': "234567" + upper, + r'(?:[-_](ext(ended)?)?)?[-_]hex$': digits + upper[:22], + r'[-_]?crockford': digits + "ABCDEFGHJKMNPQRSTVWXYZ", + r'[-_]?geohash': digits + "bcdefghjkmnpqrstuvwxyz", +} +base2n(B32, r"^(?:base[-_]?32(|[-_]inv(?:erted)?|(?:[-_]ext(?:ended)?)?[-_]hex|[-_](?:z|geohash|crockford))|" + r"(zbase32|geohash|crockford))$", padding_char="=", + guess=["base32", "base32-inv", "base32-hex", "base32-geohash", "base32-crockford"]) +main32 = main(32, "RFC 4648") +main32hex = main(32, "RFC 4648", "hex", False) +main32geo = main(32, "", "geohash", False) +main32crk = main(32, "", "crockford", False) +mainz32 = main(32, "", "z", False) + + +B36 = {'': digits + upper, '[-_]inv(erted)?$': upper + digits} +base(B36, r"^base[-_]?36(|[-_]inv(?:erted)?)$") +main36 = main(36, "") + + +B58 = { + r'(|[-_]?(bc|bitcoin))$': "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz", + r'[-_]?(rp|ripple)$': "rpshnaf39wBUDNEGHJKLM4PQRST7VWXYZ2bcdeCg65jkm8oFqi1tuvAxyz", + r'[-_]?(fl|flickr|short[-]?url|url)$': "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ", +} +base(B58, r"^base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))$", + 
guess=["base58-bitcoin", "base58-ripple", "base58-flickr"]) +main58bc = main(58, "", "bitcoin") +main58rp = main(58, "", "ripple") +main58fl = main(58, "", "flickr") + + +B62 = {'': digits + upper + lower, '[-_]inv(erted)?$': upper + lower + digits} +base(B62, r"^base[-_]?62(|[-_]inv(?:erted)?)$") +main62 = main(62, "") + + +B63 = {'': digits + upper + lower + "_", 'inv': upper + lower + digits + "_"} +base(B63, r"^base[-_]?63(|[-_]inv(?:erted)?)$") +main63 = main(63) + + +B64 = { + r'': upper + lower + digits + "+/", + r'[-_]inv(erted)?$': digits + upper + lower + "+/", + r'[-_]?(file|url)(safe)?$': upper + lower + digits + "-_", +} +base2n(B64, r"^base[-_]?64(|[-_]inv(?:erted)?|[-_]?(?:file|url)(?:safe)?)$", padding_char="=", + guess=["base64", "base64-inv", "base64-url"]) +main64 = main(64, "RFC 4648") +main64url = main(64, "RFC 4648 / Base64URL", "url", False) + + +B67 = { + r'': upper + lower + digits + "-_.!~", + r'[-_]inv(erted)?$': lower + upper + digits + "-_.!~", +} +base(B67, r"^base[-_]?67(|[-_]inv(?:erted)?)$") +main67 = main(67) + + +B128 = {r'': "".join(chr(i) for i in range(128))} +base(B128, r"^base[-_]?128$", padding_char="=") +main128 = main(128, None, False, wrap=False) + + +# generic base encodings, to be added after all others as they have the precedence +base_generic() + diff --git a/src/codext/binary/__init__.py b/src/codext/binary/__init__.py old mode 100755 new mode 100644 index ea0005b..2b97568 --- a/src/codext/binary/__init__.py +++ b/src/codext/binary/__init__.py @@ -1,8 +1,8 @@ -# -*- coding: UTF-8 -*- -from .baudot import * -from .bcd import * -from .excess3 import * -from .gray import * -from .manchester import * -from .rotate import * - +# -*- coding: UTF-8 -*- +from .baudot import * +from .bcd import * +from .excess3 import * +from .gray import * +from .manchester import * +from .rotate import * + diff --git a/src/codext/binary/baudot.py b/src/codext/binary/baudot.py old mode 100755 new mode 100644 diff --git 
a/src/codext/binary/bcd.py b/src/codext/binary/bcd.py old mode 100755 new mode 100644 index 9f21147..a692f6b --- a/src/codext/binary/bcd.py +++ b/src/codext/binary/bcd.py @@ -1,80 +1,80 @@ -# -*- coding: UTF-8 -*- -"""BCD Codec - Binary Coded Decimal content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples1__ = { - 'enc(bcd|binary-coded-decimal|binary_coded_decimal)': { - 'This is a test!': "\x08A\x04\x10Q\x15\x03!\x05\x11P2\tp2\x11a\x01\x11Q\x16\x030", - }, - 'dec(binary-coded-decimal)': { - '\xaf': None, - '\xff': None, - '\x08A\x04\x10Q\x15\x03!\x05\x11P2\tp2\x11a\x01\x11Q\x16\x030': "This is a test!", - }, -} -__examples2__ = { - 'enc(bcd-ext0|bcd_extended_zeros)': { - 'This is a test': "\x00\x08\x04\x01\x00\x04\x01\x00\x05\x01\x01\x05\x00\x03\x02\x01\x00\x05\x01\x01\x05\x00" - "\x03\x02\x00\t\x07\x00\x03\x02\x01\x01\x06\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", - }, -} -__examples3__ = { - 'enc(bcd-ext1|bcd_extended_ones)': { - 'This is a test': "\xf0\xf8\xf4\xf1\xf0\xf4\xf1\xf0\xf5\xf1\xf1\xf5\xf0\xf3\xf2\xf1\xf0\xf5\xf1\xf1\xf5\xf0" - "\xf3\xf2\xf0\xf9\xf7\xf0\xf3\xf2\xf1\xf1\xf6\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", - }, -} - - -CODE = {str(i): bin(i)[2:].zfill(4) for i in range(10)} - - -def bcd_encode(prefix=""): - def encode(text, errors="strict"): - r, bits = "", prefix - for c in text: - for i in str(ord(c)).zfill(3): - bits += CODE[i] - if len(bits) == 8: - r += chr(int(bits, 2)) - bits = prefix - if len(bits) > 0: - r += chr(int(bits + "0000", 2)) - return r, len(b(text)) - return encode - - -def bcd_decode(prefix=""): - def decode(text, errors="strict"): - code = {v: k for k, v in CODE.items()} - r, d = "", "" - for i, c in enumerate(text): - bin_c = bin(ord(c))[2:].zfill(8) - for k in range(len(prefix), 8, 4): - hb = bin_c[k:k+4] - try: - d += code[hb] - 
except KeyError: - d += handle_error("bcd", errors, decode=True)(hb, i) - if len(d) == 3: - r += chr(int(d)) - d = "" - return r, len(b(text)) - return decode - - -add("bcd", bcd_encode(), bcd_decode(), pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)$", examples=__examples1__, - entropy=lambda e: .45739*e+2.63519, printables_rate=.2) -add("bcd-extended0", bcd_encode("0000"), bcd_decode("0000"), examples=__examples2__, entropy=lambda e: .13584*e+2.07486, - pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)[-_]ext(?:ended)?(?:[-_]?0|[-_]zeros?)$") -add("bcd-extended1", bcd_encode("1111"), bcd_decode("1111"), examples=__examples3__, entropy=lambda e: .13584*e+2.07486, - pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)[-_]ext(?:ended)?(?:[-_]?1|[-_]ones?)$") - +# -*- coding: UTF-8 -*- +"""BCD Codec - Binary Coded Decimal content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples1__ = { + 'enc(bcd|binary-coded-decimal|binary_coded_decimal)': { + 'This is a test!': "\x08A\x04\x10Q\x15\x03!\x05\x11P2\tp2\x11a\x01\x11Q\x16\x030", + }, + 'dec(binary-coded-decimal)': { + '\xaf': None, + '\xff': None, + '\x08A\x04\x10Q\x15\x03!\x05\x11P2\tp2\x11a\x01\x11Q\x16\x030': "This is a test!", + }, +} +__examples2__ = { + 'enc(bcd-ext0|bcd_extended_zeros)': { + 'This is a test': "\x00\x08\x04\x01\x00\x04\x01\x00\x05\x01\x01\x05\x00\x03\x02\x01\x00\x05\x01\x01\x05\x00" + "\x03\x02\x00\t\x07\x00\x03\x02\x01\x01\x06\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", + }, +} +__examples3__ = { + 'enc(bcd-ext1|bcd_extended_ones)': { + 'This is a test': "\xf0\xf8\xf4\xf1\xf0\xf4\xf1\xf0\xf5\xf1\xf1\xf5\xf0\xf3\xf2\xf1\xf0\xf5\xf1\xf1\xf5\xf0" + "\xf3\xf2\xf0\xf9\xf7\xf0\xf3\xf2\xf1\xf1\xf6\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", + }, +} + + +CODE = {str(i): bin(i)[2:].zfill(4) for i in range(10)} + + +def 
bcd_encode(prefix=""): + def encode(text, errors="strict"): + r, bits = "", prefix + for c in text: + for i in str(ord(c)).zfill(3): + bits += CODE[i] + if len(bits) == 8: + r += chr(int(bits, 2)) + bits = prefix + if len(bits) > 0: + r += chr(int(bits + "0000", 2)) + return r, len(b(text)) + return encode + + +def bcd_decode(prefix=""): + def decode(text, errors="strict"): + code = {v: k for k, v in CODE.items()} + r, d = "", "" + for i, c in enumerate(text): + bin_c = bin(ord(c))[2:].zfill(8) + for k in range(len(prefix), 8, 4): + hb = bin_c[k:k+4] + try: + d += code[hb] + except KeyError: + d += handle_error("bcd", errors, decode=True)(hb, i) + if len(d) == 3: + r += chr(int(d)) + d = "" + return r, len(b(text)) + return decode + + +add("bcd", bcd_encode(), bcd_decode(), pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)$", examples=__examples1__, + entropy=lambda e: .45739*e+2.63519, printables_rate=.2) +add("bcd-extended0", bcd_encode("0000"), bcd_decode("0000"), examples=__examples2__, entropy=lambda e: .13584*e+2.07486, + pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)[-_]ext(?:ended)?(?:[-_]?0|[-_]zeros?)$") +add("bcd-extended1", bcd_encode("1111"), bcd_decode("1111"), examples=__examples3__, entropy=lambda e: .13584*e+2.07486, + pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)[-_]ext(?:ended)?(?:[-_]?1|[-_]ones?)$") + diff --git a/src/codext/binary/excess3.py b/src/codext/binary/excess3.py old mode 100755 new mode 100644 index 858bcb7..831cf41 --- a/src/codext/binary/excess3.py +++ b/src/codext/binary/excess3.py @@ -1,65 +1,65 @@ -# -*- coding: UTF-8 -*- -"""Excess-3 Codec - Excess-3 code (aka Stibitz code) content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(excess3|xs-3|stibitz)': { - 'This is a test!': ";t7C\x84H6T8D\x83e<\xa3eD\x944D\x84I6`", - 'This is another test ': ";t7C\x84H6T8D\x83e<\xa4CDDICt4DseD\x944D\x84I6P", - }, - 'dec(excess-3|xs3)': { - '\x00': None, - '\xff': None, - ';t7C\x84H6T8D\x83e<\xa3eD\x944D\x84I6`': "This is a test!", - ';t7C\x84H6T8D\x83e<\xa4CDDICt4DseD\x944D\x84I6P': "This is another test ", - }, -} - - -CODE = { - '0': "0011", '1': "0100", '2': "0101", '3': "0110", '4': "0111", - '5': "1000", '6': "1001", '7': "1010", '8': "1011", '9': "1100", -} - - -def excess3_encode(text, errors="strict"): - r, bits = "", "" - for c in text: - for i in str(ord(c)).zfill(3): - bits += CODE[i] - if len(bits) == 8: - r += chr(int(bits, 2)) - bits = "" - if len(bits) > 0: - r += chr(int(bits + "0000", 2)) - return r, len(b(text)) - - -def excess3_decode(text, errors="strict"): - code = {v: k for k, v in CODE.items()} - r, d = "", "" - for i, c in enumerate(text): - bin_c = bin(ord(c))[2:].zfill(8) - for k in range(0, 8, 4): - hb = bin_c[k:k+4] - try: - d += code[hb] - except KeyError: # (normal case) occurs when 0000 was used for padding - if i != len(text) - 1 or k != 4 or hb != "0000": - d += handle_error("excess3", errors, decode=True)(hb, i) - if len(d) == 3: - r += chr(int(d)) - d = "" - return r, len(b(text)) - - -add("excess3", excess3_encode, excess3_decode, pattern=r"^(?:excess\-?3|xs\-?3|stibitz)$", printables_rate=.45) - +# -*- coding: UTF-8 -*- +"""Excess-3 Codec - Excess-3 code (aka Stibitz code) content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(excess3|xs-3|stibitz)': { + 'This is a test!': ";t7C\x84H6T8D\x83e<\xa3eD\x944D\x84I6`", + 'This is another test ': ";t7C\x84H6T8D\x83e<\xa4CDDICt4DseD\x944D\x84I6P", + }, + 'dec(excess-3|xs3)': { + '\x00': None, + '\xff': None, + ';t7C\x84H6T8D\x83e<\xa3eD\x944D\x84I6`': "This is a test!", + ';t7C\x84H6T8D\x83e<\xa4CDDICt4DseD\x944D\x84I6P': "This is another test ", + }, +} + + +CODE = { + '0': "0011", '1': "0100", '2': "0101", '3': "0110", '4': "0111", + '5': "1000", '6': "1001", '7': "1010", '8': "1011", '9': "1100", +} + + +def excess3_encode(text, errors="strict"): + r, bits = "", "" + for c in text: + for i in str(ord(c)).zfill(3): + bits += CODE[i] + if len(bits) == 8: + r += chr(int(bits, 2)) + bits = "" + if len(bits) > 0: + r += chr(int(bits + "0000", 2)) + return r, len(b(text)) + + +def excess3_decode(text, errors="strict"): + code = {v: k for k, v in CODE.items()} + r, d = "", "" + for i, c in enumerate(text): + bin_c = bin(ord(c))[2:].zfill(8) + for k in range(0, 8, 4): + hb = bin_c[k:k+4] + try: + d += code[hb] + except KeyError: # (normal case) occurs when 0000 was used for padding + if i != len(text) - 1 or k != 4 or hb != "0000": + d += handle_error("excess3", errors, decode=True)(hb, i) + if len(d) == 3: + r += chr(int(d)) + d = "" + return r, len(b(text)) + + +add("excess3", excess3_encode, excess3_decode, pattern=r"^(?:excess\-?3|xs\-?3|stibitz)$", printables_rate=.45) + diff --git a/src/codext/binary/gray.py b/src/codext/binary/gray.py old mode 100755 new mode 100644 index f1be17a..e32c979 --- a/src/codext/binary/gray.py +++ b/src/codext/binary/gray.py @@ -1,25 +1,25 @@ -# -*- coding: UTF-8 -*- -"""Gray Codec - gray code content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(gray|reflected-bin|reflected_binary)': { - 'this is a test': "N\\]J0]J0Q0NWJN", - 'THIS IS A TEST': "~lmz0mz0a0~gz~", - }, -} - - -ENCMAP = {chr(i): chr(i ^ (i >> 1)) for i in range(256)} - - -add_map("gray", ENCMAP, pattern=r"^(?:gray|reflected[-_]bin(?:ary)?)$", entropy=lambda e: e) - +# -*- coding: UTF-8 -*- +"""Gray Codec - gray code content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(gray|reflected-bin|reflected_binary)': { + 'this is a test': "N\\]J0]J0Q0NWJN", + 'THIS IS A TEST': "~lmz0mz0a0~gz~", + }, +} + + +ENCMAP = {chr(i): chr(i ^ (i >> 1)) for i in range(256)} + + +add_map("gray", ENCMAP, pattern=r"^(?:gray|reflected[-_]bin(?:ary)?)$", entropy=lambda e: e) + diff --git a/src/codext/binary/manchester.py b/src/codext/binary/manchester.py old mode 100755 new mode 100644 index 32f3ac5..a50181d --- a/src/codext/binary/manchester.py +++ b/src/codext/binary/manchester.py @@ -1,50 +1,50 @@ -# -*- coding: UTF-8 -*- -"""Manchester Codec - Manchester content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples1__ = {'enc(manchester)': {'This is a test!': "fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV"}} -__examples2__ = { - 'enc(manchester-inverted|ethernet|ieee802.4)': { - 'This is a test!': "\x99\x9a\x96j\x96i\x95\xa5\xa6\xaa\x96i\x95\xa5\xa6\xaa\x96\xa9\xa6\xaa\x95\x9a\x96\x99" - "\x95\xa5\x95\x9a\xa6\xa9", - }, -} - - -def manchester_encode(clock): - def encode(text, errors="strict"): - r = "" - for c in text: - bin_c = bin(ord(c))[2:].zfill(8) - for i in range(0, 8, 4): - r += chr(int("".join(2*bit for bit in bin_c[i:i+4]), 2) ^ clock) - return r, len(b(text)) - return encode - - -def manchester_decode(clock): - def decode(text, errors="strict"): - r, bits = "", "" - for c in text: - bin_c = bin(ord(c) ^ clock)[2:].zfill(8) - bits += "".join(bin_c[i] for i in range(0, len(bin_c), 2)) - if len(bits) == 8: - r += chr(int(bits, 2)) - bits = "" - return r, len(b(text)) - return decode - - -add("manchester", manchester_encode(0x55), manchester_decode(0x55), examples=__examples1__, printables_rate=.25, - entropy=lambda e: .17616*e+2.56229) -add("manchester-inverted", manchester_encode(0xaa), manchester_decode(0xaa), examples=__examples2__, - pattern=r"^(?:manchester-inverted|ethernet|ieee802\.4)$", entropy=lambda e: .17616*e+2.56229) - +# -*- coding: UTF-8 -*- +"""Manchester Codec - Manchester content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples1__ = {'enc(manchester)': {'This is a test!': "fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV"}} +__examples2__ = { + 'enc(manchester-inverted|ethernet|ieee802.4)': { + 'This is a test!': "\x99\x9a\x96j\x96i\x95\xa5\xa6\xaa\x96i\x95\xa5\xa6\xaa\x96\xa9\xa6\xaa\x95\x9a\x96\x99" + "\x95\xa5\x95\x9a\xa6\xa9", + }, +} + + +def manchester_encode(clock): + def encode(text, errors="strict"): + r = "" + for c in text: + bin_c = bin(ord(c))[2:].zfill(8) + for i in range(0, 8, 4): + r += chr(int("".join(2*bit for bit in bin_c[i:i+4]), 2) ^ clock) + return r, len(b(text)) + return encode + + +def manchester_decode(clock): + def decode(text, errors="strict"): + r, bits = "", "" + for c in text: + bin_c = bin(ord(c) ^ clock)[2:].zfill(8) + bits += "".join(bin_c[i] for i in range(0, len(bin_c), 2)) + if len(bits) == 8: + r += chr(int(bits, 2)) + bits = "" + return r, len(b(text)) + return decode + + +add("manchester", manchester_encode(0x55), manchester_decode(0x55), examples=__examples1__, printables_rate=.25, + entropy=lambda e: .17616*e+2.56229) +add("manchester-inverted", manchester_encode(0xaa), manchester_decode(0xaa), examples=__examples2__, + pattern=r"^(?:manchester-inverted|ethernet|ieee802\.4)$", entropy=lambda e: .17616*e+2.56229) + diff --git a/src/codext/binary/rotate.py b/src/codext/binary/rotate.py old mode 100755 new mode 100644 diff --git a/src/codext/checksums/__init__.py b/src/codext/checksums/__init__.py old mode 100755 new mode 100644 diff --git a/src/codext/common/__init__.py b/src/codext/common/__init__.py old mode 100755 new mode 100644 index 403d991..3ca65e6 --- a/src/codext/common/__init__.py +++ b/src/codext/common/__init__.py @@ -1,7 +1,7 @@ -# -*- coding: UTF-8 -*- -from .a1z26 import * -from .cases import * -from .dummy import * 
-from .octal import * -from .ordinal import * - +# -*- coding: UTF-8 -*- +from .a1z26 import * +from .cases import * +from .dummy import * +from .octal import * +from .ordinal import * + diff --git a/src/codext/common/a1z26.py b/src/codext/common/a1z26.py old mode 100755 new mode 100644 index cc4de7e..b184637 --- a/src/codext/common/a1z26.py +++ b/src/codext/common/a1z26.py @@ -1,60 +1,60 @@ -# -*- coding: UTF-8 -*- -"""A1Z26 Codec - A1Z26 content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from string import ascii_lowercase as lower - -from ..__common__ import * - - -SEP = "-_/|,;:*" - -__examples__ = { - 'enc(a1z26-BAD)': None, - 'dec(a1z26)': {'1-12-123': None}, - 'enc(a1z26)': {'test123': None, 'this is a test': "20-8-9-19 9-19 1 20-5-19-20"}, - 'enc(a1z26-/)': {'this is a test': "20/8/9/19 9/19 1 20/5/19/20"}, -} -__guess__ = ["a1z26", "a1z26_"] + ["a1z26-" + s for s in SEP[2:]] - - -def a1z26_encode(sep): - sep = sep[-1] if len(sep) > 0 else "-" - def encode(text, errors="strict"): - words = [] - for word in text.split(): - w = [] - for k, c in enumerate(word): - try: - w.append(str(lower.index(c.lower()) + 1)) - except ValueError: - w.append(handle_error("a1z26", errors)(c, k)) - words.append(sep.join(w).strip(sep)) - return " ".join(words), len(text) - return encode - - -def a1z26_decode(sep): - sep = sep[-1] if len(sep) > 0 else "-" - def decode(text, errors="strict"): - k, words = 0, [] - for word in text.split(): - w = "" - for i in word.split(sep): - k += 1 - try: - w += lower[int(i)-1] - except IndexError: - w += handle_error("a1z26", errors, decode=True)(str(i), k) - words.append(w) - return " ".join(words), len(text) - return decode - - -add("a1z26", a1z26_encode, a1z26_decode, pattern=r"^a1z26(|[-_]|[-_][/|,;:\*])$", printables_rate=1.) 
- +# -*- coding: UTF-8 -*- +"""A1Z26 Codec - A1Z26 content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from string import ascii_lowercase as lower + +from ..__common__ import * + + +SEP = "-_/|,;:*" + +__examples__ = { + 'enc(a1z26-BAD)': None, + 'dec(a1z26)': {'1-12-123': None}, + 'enc(a1z26)': {'test123': None, 'this is a test': "20-8-9-19 9-19 1 20-5-19-20"}, + 'enc(a1z26-/)': {'this is a test': "20/8/9/19 9/19 1 20/5/19/20"}, +} +__guess__ = ["a1z26", "a1z26_"] + ["a1z26-" + s for s in SEP[2:]] + + +def a1z26_encode(sep): + sep = sep[-1] if len(sep) > 0 else "-" + def encode(text, errors="strict"): + words = [] + for word in text.split(): + w = [] + for k, c in enumerate(word): + try: + w.append(str(lower.index(c.lower()) + 1)) + except ValueError: + w.append(handle_error("a1z26", errors)(c, k)) + words.append(sep.join(w).strip(sep)) + return " ".join(words), len(text) + return encode + + +def a1z26_decode(sep): + sep = sep[-1] if len(sep) > 0 else "-" + def decode(text, errors="strict"): + k, words = 0, [] + for word in text.split(): + w = "" + for i in word.split(sep): + k += 1 + try: + w += lower[int(i)-1] + except IndexError: + w += handle_error("a1z26", errors, decode=True)(str(i), k) + words.append(w) + return " ".join(words), len(text) + return decode + + +add("a1z26", a1z26_encode, a1z26_decode, pattern=r"^a1z26(|[-_]|[-_][/|,;:\*])$", printables_rate=1.) + diff --git a/src/codext/common/dummy.py b/src/codext/common/dummy.py old mode 100755 new mode 100644 index b45c023..e6d73cb --- a/src/codext/common/dummy.py +++ b/src/codext/common/dummy.py @@ -1,57 +1,57 @@ -# -*- coding: UTF-8 -*- -"""Dummy Codecs - simple string manipulations. - -These are dummy codecs for manipulating strings, for use with other codecs in encoding/decoding chains. 
- -These codecs: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import re - -from ..__common__ import * - - -def replace(pair, *args): - def code(input, errors="strict"): - return input.replace(pair[0], pair[1]), len(input) - return code -add("replace", replace, replace, r"^replace[-_]?((?!.*(.).*\2)..)$", guess=None) -# important note: ^ -# using "{2}" here instead will break the codec -# this is due to the fact the codext.__common__.generate_string_from_regex DOES NOT handle ASSERT_NOT (?!) and will -# fail to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo - - -def substitute(token, replacement): - def code(input, errors="strict"): - return input.replace(token, replacement), len(input) - return code -add("substitute", substitute, substitute, r"^substitute[-_]?(.*?)/(.*?)$", guess=None) - - -reverse = lambda i, e="strict": (i[::-1], len(i)) -add("reverse", reverse, reverse) - -_revl = lambda i, wd=False: "".join((" ".join(w[::-1] for w in l.split()) if wd else l[::-1]) \ - if not re.match(r"(\r?\n)", l) else l for l in re.split(r"(\r?\n)", i)) -line_reverse = lambda i, e="strict": (_revl(i), len(i)) -add("reverse-lines", line_reverse, line_reverse, r"^reverse[-_]lines$") -word_reverse = lambda i, e="strict": (_revl(i, True), len(i)) -add("reverse-words", word_reverse, word_reverse, r"^reverse[-_]words$") - -strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i)) -add("strip-spaces", strip_spaces, strip_spaces, guess=None) - -def tokenize(n): - tlen = int(n[8:].lstrip("-_")) - def code(input, errors="strict"): - l = len(input) - if tlen > l: - raise LookupError("unknown encoding: %s" % n) - return " ".join(input[i:i+tlen] for i in range(0, l, tlen)), l - return code -add("tokenize", tokenize, tokenize, r"^(tokenize[-_]?[1-9][0-9]*)$", guess=None) - +# -*- coding: UTF-8 -*- 
+"""Dummy Codecs - simple string manipulations. + +These are dummy codecs for manipulating strings, for use with other codecs in encoding/decoding chains. + +These codecs: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import re + +from ..__common__ import * + + +def replace(pair, *args): + def code(input, errors="strict"): + return input.replace(pair[0], pair[1]), len(input) + return code +add("replace", replace, replace, r"^replace[-_]?((?!.*(.).*\2)..)$", guess=None) +# important note: ^ +# using "{2}" here instead will break the codec +# this is due to the fact the codext.__common__.generate_string_from_regex DOES NOT handle ASSERT_NOT (?!) and will +# fail to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo + + +def substitute(token, replacement): + def code(input, errors="strict"): + return input.replace(token, replacement), len(input) + return code +add("substitute", substitute, substitute, r"^substitute[-_]?(.*?)/(.*?)$", guess=None) + + +reverse = lambda i, e="strict": (i[::-1], len(i)) +add("reverse", reverse, reverse) + +_revl = lambda i, wd=False: "".join((" ".join(w[::-1] for w in l.split()) if wd else l[::-1]) \ + if not re.match(r"(\r?\n)", l) else l for l in re.split(r"(\r?\n)", i)) +line_reverse = lambda i, e="strict": (_revl(i), len(i)) +add("reverse-lines", line_reverse, line_reverse, r"^reverse[-_]lines$") +word_reverse = lambda i, e="strict": (_revl(i, True), len(i)) +add("reverse-words", word_reverse, word_reverse, r"^reverse[-_]words$") + +strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i)) +add("strip-spaces", strip_spaces, strip_spaces, guess=None) + +def tokenize(n): + tlen = int(n[8:].lstrip("-_")) + def code(input, errors="strict"): + l = len(input) + if tlen > l: + raise LookupError("unknown encoding: %s" % n) + return " ".join(input[i:i+tlen] 
for i in range(0, l, tlen)), l + return code +add("tokenize", tokenize, tokenize, r"^(tokenize[-_]?[1-9][0-9]*)$", guess=None) + diff --git a/src/codext/common/octal.py b/src/codext/common/octal.py old mode 100755 new mode 100644 index 6559409..9165065 --- a/src/codext/common/octal.py +++ b/src/codext/common/octal.py @@ -1,31 +1,31 @@ -# -*- coding: UTF-8 -*- -"""Octal Codec - octal content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples1__ = { - 'enc(octal-spaced|octals_spaced)': {'this is a test': "164 150 151 163 40 151 163 40 141 40 164 145 163 164"}, -} -__examples2__ = { - 'enc(octal|octals)': {'this is a test': "164150151163040151163040141040164145163164"}, -} - - -oct2 = lambda i: oct(i).lstrip("0").replace("o", "") - -ENCMAP1 = {chr(i): oct2(i) for i in range(256)} -ENCMAP2 = {chr(i): oct2(i).zfill(3) for i in range(256)} - - -add_map("octal-spaced", ENCMAP1, sep=" ", pattern=r"^octals?[-_]spaced$", examples=__examples1__, - entropy=lambda e: .07258*e+2.3739, printables_rate=1.) -add_map("octal", ENCMAP2, pattern=r"^octals?$", examples=__examples2__, entropy=lambda e: .08803*e+2.19498, - printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""Octal Codec - octal content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples1__ = { + 'enc(octal-spaced|octals_spaced)': {'this is a test': "164 150 151 163 40 151 163 40 141 40 164 145 163 164"}, +} +__examples2__ = { + 'enc(octal|octals)': {'this is a test': "164150151163040151163040141040164145163164"}, +} + + +oct2 = lambda i: oct(i).lstrip("0").replace("o", "") + +ENCMAP1 = {chr(i): oct2(i) for i in range(256)} +ENCMAP2 = {chr(i): oct2(i).zfill(3) for i in range(256)} + + +add_map("octal-spaced", ENCMAP1, sep=" ", pattern=r"^octals?[-_]spaced$", examples=__examples1__, + entropy=lambda e: .07258*e+2.3739, printables_rate=1.) +add_map("octal", ENCMAP2, pattern=r"^octals?$", examples=__examples2__, entropy=lambda e: .08803*e+2.19498, + printables_rate=1.) + diff --git a/src/codext/common/ordinal.py b/src/codext/common/ordinal.py old mode 100755 new mode 100644 index 4bf576b..6d4d227 --- a/src/codext/common/ordinal.py +++ b/src/codext/common/ordinal.py @@ -1,28 +1,28 @@ -# -*- coding: UTF-8 -*- -"""Ordinal Codec - ordinal content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples1__ = { - 'enc(ordinal-spaced|ordinals_spaced)': {'this is a test': "116 104 105 115 32 105 115 32 97 32 116 101 115 116"}, -} -__examples2__ = { - 'enc(ordinal|ordinals)': {'this is a test': "116104105115032105115032097032116101115116"}, -} - - -ENCMAP1 = {chr(i): str(i) for i in range(256)} -ENCMAP2 = {chr(i): str(i).zfill(3) for i in range(256)} - - -add_map("ordinal-spaced", ENCMAP1, sep=" ", pattern=r"^ordinals?[-_]spaced$", examples=__examples1__, entropy=3., - printables_rate=1.) 
-add_map("ordinal", ENCMAP2, pattern=r"^ordinals?$", examples=__examples2__, entropy=3., printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""Ordinal Codec - ordinal content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples1__ = { + 'enc(ordinal-spaced|ordinals_spaced)': {'this is a test': "116 104 105 115 32 105 115 32 97 32 116 101 115 116"}, +} +__examples2__ = { + 'enc(ordinal|ordinals)': {'this is a test': "116104105115032105115032097032116101115116"}, +} + + +ENCMAP1 = {chr(i): str(i) for i in range(256)} +ENCMAP2 = {chr(i): str(i).zfill(3) for i in range(256)} + + +add_map("ordinal-spaced", ENCMAP1, sep=" ", pattern=r"^ordinals?[-_]spaced$", examples=__examples1__, entropy=3., + printables_rate=1.) +add_map("ordinal", ENCMAP2, pattern=r"^ordinals?$", examples=__examples2__, entropy=3., printables_rate=1.) 
+ diff --git a/src/codext/compressions/__init__.py b/src/codext/compressions/__init__.py old mode 100755 new mode 100644 index 606a1dc..eae3eee --- a/src/codext/compressions/__init__.py +++ b/src/codext/compressions/__init__.py @@ -1,12 +1,12 @@ -# -*- coding: UTF-8 -*- -from .gzipp import * -from .lz77 import * -from .lz78 import * -from .pkzip import * - - -for e in list_encodings("compression"): - ci = lookup(e, False) - ci.parameters['scoring']['entropy'] = 7.9 - ci.parameters['scoring']['expansion_factor'] = lambda f: f - +# -*- coding: UTF-8 -*- +from .gzipp import * +from .lz77 import * +from .lz78 import * +from .pkzip import * + + +for e in list_encodings("compression"): + ci = lookup(e, False) + ci.parameters['scoring']['entropy'] = 7.9 + ci.parameters['scoring']['expansion_factor'] = lambda f: f + diff --git a/src/codext/compressions/gzipp.py b/src/codext/compressions/gzipp.py old mode 100755 new mode 100644 index 14e65bc..e162239 --- a/src/codext/compressions/gzipp.py +++ b/src/codext/compressions/gzipp.py @@ -1,44 +1,44 @@ -# -*- coding: UTF-8 -*- -"""Gzip Codec - gzip content compression. - -NB: Not an encoding properly speaking. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import zlib -from gzip import GzipFile - -from ..__common__ import * - - -__examples__ = {'enc-dec(gzip)': ["test", "This is a test", "@random{512,1024,2048}"]} - - -def gzip_compress(text, errors="strict"): - out = BytesIO() - with GzipFile(fileobj=out, mode="wb") as f: - f.write(b(text)) - return out.getvalue(), len(text) - - -def gzip_decompress(data, errors="strict"): - # then try decompressing considering the file signature - try: - with GzipFile(fileobj=BytesIO(b(data)), mode="rb") as f: - r = f.read() - except: - pass - # try decompressing without considering the file signature - try: - r = zlib.decompress(b(data), 16 + zlib.MAX_WBITS) - except: - return handle_error("gzip", errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) - return r, len(r) - - -add("gzip", gzip_compress, gzip_decompress) - +# -*- coding: UTF-8 -*- +"""Gzip Codec - gzip content compression. + +NB: Not an encoding properly speaking. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import zlib +from gzip import GzipFile + +from ..__common__ import * + + +__examples__ = {'enc-dec(gzip)': ["test", "This is a test", "@random{512,1024,2048}"]} + + +def gzip_compress(text, errors="strict"): + out = BytesIO() + with GzipFile(fileobj=out, mode="wb") as f: + f.write(b(text)) + return out.getvalue(), len(text) + + +def gzip_decompress(data, errors="strict"): + # then try decompressing considering the file signature + try: + with GzipFile(fileobj=BytesIO(b(data)), mode="rb") as f: + r = f.read() + except: + pass + # try decompressing without considering the file signature + try: + r = zlib.decompress(b(data), 16 + zlib.MAX_WBITS) + except: + return handle_error("gzip", errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) + return r, len(r) + + +add("gzip", gzip_compress, gzip_decompress) + diff --git a/src/codext/compressions/pkzip.py b/src/codext/compressions/pkzip.py old mode 100755 new mode 100644 diff --git a/src/codext/crypto/__init__.py b/src/codext/crypto/__init__.py old mode 100755 new mode 100644 index 6928637..1244bae --- a/src/codext/crypto/__init__.py +++ b/src/codext/crypto/__init__.py @@ -1,12 +1,12 @@ -# -*- coding: UTF-8 -*- -from .affine import * -from .atbash import * -from .bacon import * -from .barbie import * -from .citrix import * -from .railfence import * -from .rot import * -from .scytale import * -from .shift import * -from .xor import * - +# -*- coding: UTF-8 -*- +from .affine import * +from .atbash import * +from .bacon import * +from .barbie import * +from .citrix import * +from .railfence import * +from .rot import * +from .scytale import * +from .shift import * +from .xor import * + diff --git a/src/codext/crypto/affine.py b/src/codext/crypto/affine.py old mode 100755 new mode 100644 index cc18818..59c9d34 --- 
a/src/codext/crypto/affine.py +++ b/src/codext/crypto/affine.py @@ -1,32 +1,32 @@ -# -*- coding: UTF-8 -*- -"""Affine Cipher Codec - affine content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: https://crypto.interactive-maths.com/affine-cipher.html -""" -from ..__common__ import * - - -__guess__ = [] - - -def encmap_factory(mask=None): - mask = mask or "?l?u?s-1,2" - mask, key = mask.split("-") - a, b = map(int, key.split(",")) - alphabet = get_alphabet_from_mask(mask) - encmap = {c: alphabet[(a * alphabet.index(c) + b) % len(alphabet)] for c in alphabet} - if len(set(encmap.keys())) != len(set(encmap.values())): - raise LookupError("Bad parameter for encoding 'affine': {}, {}".format(a, b)) - if ' ' not in encmap.keys(): - encmap[' '] = " " - return encmap - - -add_map("affine", encmap_factory, pattern=r"^affine(?:[-_]cipher)?(?:[-_](.+?\-\d+\,\d+))?$") - +# -*- coding: UTF-8 -*- +"""Affine Cipher Codec - affine content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: https://crypto.interactive-maths.com/affine-cipher.html +""" +from ..__common__ import * + + +__guess__ = [] + + +def encmap_factory(mask=None): + mask = mask or "?l?u?s-1,2" + mask, key = mask.split("-") + a, b = map(int, key.split(",")) + alphabet = get_alphabet_from_mask(mask) + encmap = {c: alphabet[(a * alphabet.index(c) + b) % len(alphabet)] for c in alphabet} + if len(set(encmap.keys())) != len(set(encmap.values())): + raise LookupError("Bad parameter for encoding 'affine': {}, {}".format(a, b)) + if ' ' not in encmap.keys(): + encmap[' '] = " " + return encmap + + +add_map("affine", encmap_factory, pattern=r"^affine(?:[-_]cipher)?(?:[-_](.+?\-\d+\,\d+))?$") + diff --git a/src/codext/crypto/atbash.py b/src/codext/crypto/atbash.py old mode 100755 new mode 100644 index 5cb9f83..b6dbf16 --- a/src/codext/crypto/atbash.py +++ b/src/codext/crypto/atbash.py @@ -1,34 +1,34 @@ -# -*- coding: UTF-8 -*- -"""Atbash Cipher Codec - atbash-based content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: https://crypto.interactive-maths.com/atbash-cipher.html -""" -from ..__common__ import * - - -__guess__ = ["atbash"] - - -def encmap_factory(mask=None): - mask = mask or "?u?l" - # [...] 
enclosure causes the mask to be handled as a whole - if mask[0] == "[" and mask[-1] == "]": - alphabet = get_alphabet_from_mask(mask[1:-1]) - return {k: v for k, v in zip(alphabet, alphabet[::-1])} - # not enclosing the whole mask means that each group is to be considered separately - else: - m = {} - for group in re.findall(r"(\?.|[^?]+)", mask): - alphabet = get_alphabet_from_mask(group) - m.update({k: v for k, v in zip(alphabet, alphabet[::-1])}) - return m - - -add_map("atbash", encmap_factory, no_error=True, pattern=r"atbash(?:[-_]cipher)?(?:[-_](.+))?$") - +# -*- coding: UTF-8 -*- +"""Atbash Cipher Codec - atbash-based content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: https://crypto.interactive-maths.com/atbash-cipher.html +""" +from ..__common__ import * + + +__guess__ = ["atbash"] + + +def encmap_factory(mask=None): + mask = mask or "?u?l" + # [...] enclosure causes the mask to be handled as a whole + if mask[0] == "[" and mask[-1] == "]": + alphabet = get_alphabet_from_mask(mask[1:-1]) + return {k: v for k, v in zip(alphabet, alphabet[::-1])} + # not enclosing the whole mask means that each group is to be considered separately + else: + m = {} + for group in re.findall(r"(\?.|[^?]+)", mask): + alphabet = get_alphabet_from_mask(group) + m.update({k: v for k, v in zip(alphabet, alphabet[::-1])}) + return m + + +add_map("atbash", encmap_factory, no_error=True, pattern=r"atbash(?:[-_]cipher)?(?:[-_](.+))?$") + diff --git a/src/codext/crypto/bacon.py b/src/codext/crypto/bacon.py old mode 100755 new mode 100644 index e7daf92..a7048ef --- a/src/codext/crypto/bacon.py +++ b/src/codext/crypto/bacon.py @@ -1,36 +1,36 @@ -# -*- coding: UTF-8 -*- -"""Bacon's Cipher Codec - bacon content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: https://en.wikipedia.org/wiki/Bacon%27s_cipher -""" -from ..__common__ import * - - -__examples__ = { - 'enc(bacon|bacon_cipher|baconian-cipher|bacon-ab|bacon_AB)': { - 'this is a test': "baabaaabbbabaaabaaab abaaabaaab aaaaa baabaaabaabaaabbaaba", - }, - 'enc(bacon-01|bacon_01)': { - 'this is a test': "10010001110100010001 0100010001 00000 10010001001000110010", - }, -} -__guess__ = {"bacon", "bacon-ba", "bacon-01", "bacon-10"} - - -ENCMAP = { - 'A': "aaaaa", 'B': "aaaab", 'C': "aaaba", 'D': "aaabb", 'E': "aabaa", 'F': "aabab", 'G': "aabba", 'H': "aabbb", - 'I': "abaaa", 'J': "abaaa", 'K': "abaab", 'L': "ababa", 'M': "ababb", 'N': "abbaa", 'O': "abbab", 'P': "abbba", - 'Q': "abbbb", 'R': "baaaa", 'S': "baaab", 'T': "baaba", 'U': "baabb", 'V': "baabb", 'W': "babaa", 'X': "babab", - 'Y': "babba", 'Z': "babbb", ' ': " ", -} - - -add_map("bacon", ENCMAP, ignore_case="both", pattern=r"bacon(?:(?:ian)?[-_]cipher)?([\-_].{2})?$", expansion_factor=5., - printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""Bacon's Cipher Codec - bacon content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: https://en.wikipedia.org/wiki/Bacon%27s_cipher +""" +from ..__common__ import * + + +__examples__ = { + 'enc(bacon|bacon_cipher|baconian-cipher|bacon-ab|bacon_AB)': { + 'this is a test': "baabaaabbbabaaabaaab abaaabaaab aaaaa baabaaabaabaaabbaaba", + }, + 'enc(bacon-01|bacon_01)': { + 'this is a test': "10010001110100010001 0100010001 00000 10010001001000110010", + }, +} +__guess__ = {"bacon", "bacon-ba", "bacon-01", "bacon-10"} + + +ENCMAP = { + 'A': "aaaaa", 'B': "aaaab", 'C': "aaaba", 'D': "aaabb", 'E': "aabaa", 'F': "aabab", 'G': "aabba", 'H': "aabbb", + 'I': "abaaa", 'J': "abaaa", 'K': "abaab", 'L': "ababa", 'M': "ababb", 'N': "abbaa", 'O': "abbab", 'P': "abbba", + 'Q': "abbbb", 'R': "baaaa", 'S': "baaab", 'T': "baaba", 'U': "baabb", 'V': "baabb", 'W': "babaa", 'X': "babab", + 'Y': "babba", 'Z': "babbb", ' ': " ", +} + + +add_map("bacon", ENCMAP, ignore_case="both", pattern=r"bacon(?:(?:ian)?[-_]cipher)?([\-_].{2})?$", expansion_factor=5., + printables_rate=1.) + diff --git a/src/codext/crypto/barbie.py b/src/codext/crypto/barbie.py old mode 100755 new mode 100644 index 4e593b8..830119a --- a/src/codext/crypto/barbie.py +++ b/src/codext/crypto/barbie.py @@ -1,54 +1,54 @@ -# -*- coding: UTF-8 -*- -"""Barbie typewriter Codec - barbie content encoding. - -While Barbie typewriter is more a cipher, its very limited key size of 2 bits makes it easy to turn into four variants - of the same encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: http://www.cryptomuseum.com/crypto/mehano/barbie/ -""" -from ..__common__ import * - - -__examples__ = { - 'enc(barbie1)': {'\r': None}, - 'enc(barbie1|barbie_1|barbie-1)': {'this is a test': "hstf tf i hafh"}, - 'enc(barbie2|barbie_2|barbie-2)': {'this is a test': "sfhp hp t sips"}, - 'enc(barbie3|barbie_3|barbie-3)': {'this is a test': "fpsu su h ftuf"}, - 'enc(barbie4|barbie_4|barbie-4)': {'this is a test': "pufq fq s phqp"}, -} -__guess__ = ["barbie-%d" % i for i in range(1, 5)] - - -STD = [ - "abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ0123456 \n\t", - "icolapxstvybjeruknfhqg>FAUTCYOLVJDZINQKSEHG<.1PB5234067 \n\t", - "torbiudfhgzcvanqyepskxRC>GHAPNDQIUXSPNRKLG1XYCUDV ¢ £ § €", - "; d z w 8 9 - ¨ _ & m @ : \" * ( # W M § ^ , ¢ / ? ! ) % X \' R + € £ =", - "¢ l w ; 9 - ¨ § ) \" j ? , m # * @ . Z £ ! W + ^ / & ( : 1 _ S % = € \'", - "+ b ; ¢ - ¨ § £ ( m v / W j @ # ? M B € & . % ! ^ \" * , 2 ) E : \' = _", - "% c ¢ + ¨ § £ € * j g ^ . v ? @ / Z F = \" N : & ! m # W 3 ( T , _ \' )", -] -ENCMAP = [] -for i in range(4): - encmap = {} - for j, c in enumerate(STD[0]): - encmap[c] = STD[i+1][j] - spec = SPEC[i+1].split() - for j, c in enumerate(SPEC[0].split()): - encmap[c] = spec[j] - ENCMAP.append(encmap) - - -add_map("barbie", ENCMAP, pattern=r"^barbie[-_]?([1-4])$", printables_rate=lambda pr: .857 * pr) - +# -*- coding: UTF-8 -*- +"""Barbie typewriter Codec - barbie content encoding. + +While Barbie typewriter is more a cipher, its very limited key size of 2 bits makes it easy to turn into four variants + of the same encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: http://www.cryptomuseum.com/crypto/mehano/barbie/ +""" +from ..__common__ import * + + +__examples__ = { + 'enc(barbie1)': {'\r': None}, + 'enc(barbie1|barbie_1|barbie-1)': {'this is a test': "hstf tf i hafh"}, + 'enc(barbie2|barbie_2|barbie-2)': {'this is a test': "sfhp hp t sips"}, + 'enc(barbie3|barbie_3|barbie-3)': {'this is a test': "fpsu su h ftuf"}, + 'enc(barbie4|barbie_4|barbie-4)': {'this is a test': "pufq fq s phqp"}, +} +__guess__ = ["barbie-%d" % i for i in range(1, 5)] + + +STD = [ + "abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ0123456 \n\t", + "icolapxstvybjeruknfhqg>FAUTCYOLVJDZINQKSEHG<.1PB5234067 \n\t", + "torbiudfhgzcvanqyepskxRC>GHAPNDQIUXSPNRKLG1XYCUDV ¢ £ § €", + "; d z w 8 9 - ¨ _ & m @ : \" * ( # W M § ^ , ¢ / ? ! ) % X \' R + € £ =", + "¢ l w ; 9 - ¨ § ) \" j ? , m # * @ . Z £ ! W + ^ / & ( : 1 _ S % = € \'", + "+ b ; ¢ - ¨ § £ ( m v / W j @ # ? M B € & . % ! ^ \" * , 2 ) E : \' = _", + "% c ¢ + ¨ § £ € * j g ^ . v ? @ / Z F = \" N : & ! m # W 3 ( T , _ \' )", +] +ENCMAP = [] +for i in range(4): + encmap = {} + for j, c in enumerate(STD[0]): + encmap[c] = STD[i+1][j] + spec = SPEC[i+1].split() + for j, c in enumerate(SPEC[0].split()): + encmap[c] = spec[j] + ENCMAP.append(encmap) + + +add_map("barbie", ENCMAP, pattern=r"^barbie[-_]?([1-4])$", printables_rate=lambda pr: .857 * pr) + diff --git a/src/codext/crypto/citrix.py b/src/codext/crypto/citrix.py index 43ac77b..c361eab 100644 --- a/src/codext/crypto/citrix.py +++ b/src/codext/crypto/citrix.py @@ -1,52 +1,52 @@ -# -*- coding: UTF-8 -*- -"""Citrix Codec - citrix password encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: https://crypto.interactive-maths.com/atbash-cipher.html -""" -from ..__common__ import * - - -__examples__ = { - 'enc(citrix-ctx0)': None, - 'enc(citrix|citrix-1|citrix_ctx1)': {'this is a test': "NBBMNAAGIDEPJJBMNIFNIMEMJKEL"}, -} -__guess__ = ["citrix-ctx1"] - - -_dec = lambda g: ((ord(g[0]) - 0x41) & 0xf) ^ ((((ord(g[1]) - 0x41) & 0xf) << 4) & 0xf0) -_enc = lambda o: chr(((o >> 4) & 0xf) + 0x41) + chr((o & 0xf) + 0x41) - - -def citrix_encode(t): - def encode(text, errors="strict"): - l = len(text) - r, x = "", 0 - for c in text: - x = ord(c) ^ 0xa5 ^ x - r += _enc(x) - return r, l - return encode - - -def citrix_decode(t): - def decode(text, errors="strict"): - l = len(text) - text = text[::-1] - r = "" - for i in range(0, l, 2): - x = 0 if i + 2 >= l else _dec(text[i+2:i+4]) - x ^= _dec(text[i:i+2]) ^ 0xa5 - r += chr(x) - return r[::-1], l - return decode - - -add("citrix", citrix_encode, citrix_decode, r"citrix(|[-_]?(?:ctx)?1)$", entropy=4., printables_rate=1., - expansion_factor=2.) - +# -*- coding: UTF-8 -*- +"""Citrix Codec - citrix password encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: https://crypto.interactive-maths.com/atbash-cipher.html +""" +from ..__common__ import * + + +__examples__ = { + 'enc(citrix-ctx0)': None, + 'enc(citrix|citrix-1|citrix_ctx1)': {'this is a test': "NBBMNAAGIDEPJJBMNIFNIMEMJKEL"}, +} +__guess__ = ["citrix-ctx1"] + + +_dec = lambda g: ((ord(g[0]) - 0x41) & 0xf) ^ ((((ord(g[1]) - 0x41) & 0xf) << 4) & 0xf0) +_enc = lambda o: chr(((o >> 4) & 0xf) + 0x41) + chr((o & 0xf) + 0x41) + + +def citrix_encode(t): + def encode(text, errors="strict"): + l = len(text) + r, x = "", 0 + for c in text: + x = ord(c) ^ 0xa5 ^ x + r += _enc(x) + return r, l + return encode + + +def citrix_decode(t): + def decode(text, errors="strict"): + l = len(text) + text = text[::-1] + r = "" + for i in range(0, l, 2): + x = 0 if i + 2 >= l else _dec(text[i+2:i+4]) + x ^= _dec(text[i:i+2]) ^ 0xa5 + r += chr(x) + return r[::-1], l + return decode + + +add("citrix", citrix_encode, citrix_decode, r"citrix(|[-_]?(?:ctx)?1)$", entropy=4., printables_rate=1., + expansion_factor=2.) + diff --git a/src/codext/crypto/rot.py b/src/codext/crypto/rot.py old mode 100755 new mode 100644 index 3f696f4..88700bd --- a/src/codext/crypto/rot.py +++ b/src/codext/crypto/rot.py @@ -1,102 +1,102 @@ -# -*- coding: UTF-8 -*- -"""ROT Codec - rot-with-N-offset content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from string import ascii_lowercase as LC, ascii_uppercase as UC, digits as DIG - -from ..__common__ import * - - -__examples1__ = { - 'enc(rot0|rot--10|rot100)': None, - 'enc(rot1|rot-1|caesar_1)': {'this is a test': "uijt jt b uftu"}, - 'enc(rot3|caesar-3)': {'this is a test': "wklv lv d whvw"}, - 'enc(rot47)': {'this is a test': "E9:D :D 2 E6DE"}, -} -__examples2__ = { - 'enc(prot0|prot--10|prot100)': None, - 'enc(prot1|prog-caesar_1)': {'this is a test': "ujlw oz j eqfh"}, - 'enc(prot3|pcaesar-3)': {'this is a test': "wlny qb l gshj"}, -} -__examples3__ = { - 'enc(arot0|arot--10|arot100)': None, - 'enc(arot1|alt-caesar_1)': {'this is a test': "ugjr ht b udts"}, - 'enc(arot3|acaesar-3)': {'this is a test': "welp fv d wbvq"}, -} -__guess1__ = ["rot-%d" % i for i in range(1, 26)] + ["rot-47"] -__guess2__ = ["progressive-rot-%d" % i for i in range(1, 26)] + ["progressive-rot-n%d" % i for i in range(1, 26)] -__guess3__ = ["alternative-rot-%d" % i for i in range(1, 26) if i != 13] - - -ROT47 = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" - - -def _rotn(text, n=13, a=(LC, UC), alt=False, prog=False, neg=False): - r = "" - for i, c in enumerate(ensure_str(text)): - found = False - for l in a: - if c in l: - r += l[(l.index(c) + [1, -1][alt and i % 2 == 1] * n + ([1, -1][neg] * i if prog else 0)) % len(l)] - found = True - break - if not found: - r += c - return r - - -def arot_encode(i): - def encode(text, errors="strict"): - return _rotn(ensure_str(text), i, alt=True), len(text) - return encode - - -def arot_decode(i): - def decode(text, errors="strict"): - return _rotn(ensure_str(text), -i, alt=True), len(text) - return decode - - -def rot_encode(i): - def encode(text, errors="strict"): - t = ensure_str(text) - r = _rotn(t, 47, 
[ROT47]) if i == 47 else _rotn(t, i) - return r, len(r) - return encode - - -def rot_decode(i): - def decode(text, errors="strict"): - t = ensure_str(text) - r = _rotn(t, -47, [ROT47]) if i == 47 else _rotn(t, -i) - return r, len(r) - return decode - - -def prot_encode(n, i): - def encode(text, errors="strict"): - return _rotn(ensure_str(text), i, prog=True, neg=n == "n"), len(text) - return encode - - -def prot_decode(n, i): - def decode(text, errors="strict"): - return _rotn(ensure_str(text), -i, prog=True, neg=n != "n"), len(text) - return decode - - -# note: alternative-rot-13 is equivalent to rot-13, therefore excluded from the regex -add("alternative-rot", arot_encode, arot_decode, r"a(?:lt(?:ernative)?-)?(?:caesar|rot)[-_]?([1-9]|1[0-24-9]|2[0-5])$", - penalty=.2, entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples3__, - guess=__guess3__) -add("rot", rot_encode, rot_decode, r"(?:caesar|rot)[-_]?([1-9]|1[0-9]|2[0-5]|47)$", aliases=["caesar"], penalty=.2, - entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples1__, guess=__guess1__) -add("progressive-rot", prot_encode, prot_decode, r"p(?:rog(?:ressive)?-)?(?:caesar|rot)[-_]?(n?)([1-9]|1[0-9]|2[0-5])$", - penalty=.2, entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples2__, - guess=__guess2__) - +# -*- coding: UTF-8 -*- +"""ROT Codec - rot-with-N-offset content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from string import ascii_lowercase as LC, ascii_uppercase as UC, digits as DIG + +from ..__common__ import * + + +__examples1__ = { + 'enc(rot0|rot--10|rot100)': None, + 'enc(rot1|rot-1|caesar_1)': {'this is a test': "uijt jt b uftu"}, + 'enc(rot3|caesar-3)': {'this is a test': "wklv lv d whvw"}, + 'enc(rot47)': {'this is a test': "E9:D :D 2 E6DE"}, +} +__examples2__ = { + 'enc(prot0|prot--10|prot100)': None, + 'enc(prot1|prog-caesar_1)': {'this is a test': "ujlw oz j eqfh"}, + 'enc(prot3|pcaesar-3)': {'this is a test': "wlny qb l gshj"}, +} +__examples3__ = { + 'enc(arot0|arot--10|arot100)': None, + 'enc(arot1|alt-caesar_1)': {'this is a test': "ugjr ht b udts"}, + 'enc(arot3|acaesar-3)': {'this is a test': "welp fv d wbvq"}, +} +__guess1__ = ["rot-%d" % i for i in range(1, 26)] + ["rot-47"] +__guess2__ = ["progressive-rot-%d" % i for i in range(1, 26)] + ["progressive-rot-n%d" % i for i in range(1, 26)] +__guess3__ = ["alternative-rot-%d" % i for i in range(1, 26) if i != 13] + + +ROT47 = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" + + +def _rotn(text, n=13, a=(LC, UC), alt=False, prog=False, neg=False): + r = "" + for i, c in enumerate(ensure_str(text)): + found = False + for l in a: + if c in l: + r += l[(l.index(c) + [1, -1][alt and i % 2 == 1] * n + ([1, -1][neg] * i if prog else 0)) % len(l)] + found = True + break + if not found: + r += c + return r + + +def arot_encode(i): + def encode(text, errors="strict"): + return _rotn(ensure_str(text), i, alt=True), len(text) + return encode + + +def arot_decode(i): + def decode(text, errors="strict"): + return _rotn(ensure_str(text), -i, alt=True), len(text) + return decode + + +def rot_encode(i): + def encode(text, errors="strict"): + t = ensure_str(text) + r = _rotn(t, 47, 
[ROT47]) if i == 47 else _rotn(t, i) + return r, len(r) + return encode + + +def rot_decode(i): + def decode(text, errors="strict"): + t = ensure_str(text) + r = _rotn(t, -47, [ROT47]) if i == 47 else _rotn(t, -i) + return r, len(r) + return decode + + +def prot_encode(n, i): + def encode(text, errors="strict"): + return _rotn(ensure_str(text), i, prog=True, neg=n == "n"), len(text) + return encode + + +def prot_decode(n, i): + def decode(text, errors="strict"): + return _rotn(ensure_str(text), -i, prog=True, neg=n != "n"), len(text) + return decode + + +# note: alternative-rot-13 is equivalent to rot-13, therefore excluded from the regex +add("alternative-rot", arot_encode, arot_decode, r"a(?:lt(?:ernative)?-)?(?:caesar|rot)[-_]?([1-9]|1[0-24-9]|2[0-5])$", + penalty=.2, entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples3__, + guess=__guess3__) +add("rot", rot_encode, rot_decode, r"(?:caesar|rot)[-_]?([1-9]|1[0-9]|2[0-5]|47)$", aliases=["caesar"], penalty=.2, + entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples1__, guess=__guess1__) +add("progressive-rot", prot_encode, prot_decode, r"p(?:rog(?:ressive)?-)?(?:caesar|rot)[-_]?(n?)([1-9]|1[0-9]|2[0-5])$", + penalty=.2, entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples2__, + guess=__guess2__) + diff --git a/src/codext/crypto/scytale.py b/src/codext/crypto/scytale.py old mode 100755 new mode 100644 index 7490241..286d51e --- a/src/codext/crypto/scytale.py +++ b/src/codext/crypto/scytale.py @@ -1,54 +1,54 @@ -# -*- coding: UTF-8 -*- -"""Scytale-N Codec - scytale content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from math import ceil - -from ..__common__ import * - - -__examples__ = { - 'enc(scytale0|scytale--10|scytale01)': None, - 'enc(scytale2|scytale-2|scytale_2)': {'this is a test': "ti satshsi et"}, - 'enc(scytale5|scytale-5|scytale_5)': {'this is a test': "tithsei ssat "}, -} -__guess__ = ["scytale-%d" % i for i in range(1, 10)] - - -PADDING_CHAR = "" - - -def scytale_encode(l): - def encode(text, errors="strict"): - s, n = "", int(ceil(len(text) / float(l))) - for x in range(l): - for y in range(n): - try: - s += text[y*l+x] - except IndexError: - s += PADDING_CHAR - return s, len(s) - return encode - - -def scytale_decode(l): - def decode(text, errors="strict"): - s, n = "", int(ceil(len(text) / float(l))) - pl = l * n - len(text) - for x in range(n): - for y in range(l): - if y >= l-pl and x == n-1: - continue - s += text[y*n+x-max(0,y-(l-pl))] - s = s.rstrip(PADDING_CHAR) - return s, len(s) - return decode - - -add("scytale", scytale_encode, scytale_decode, r"^scytale[-_]?([1-9]\d*)$") - +# -*- coding: UTF-8 -*- +"""Scytale-N Codec - scytale content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from math import ceil + +from ..__common__ import * + + +__examples__ = { + 'enc(scytale0|scytale--10|scytale01)': None, + 'enc(scytale2|scytale-2|scytale_2)': {'this is a test': "ti satshsi et"}, + 'enc(scytale5|scytale-5|scytale_5)': {'this is a test': "tithsei ssat "}, +} +__guess__ = ["scytale-%d" % i for i in range(1, 10)] + + +PADDING_CHAR = "" + + +def scytale_encode(l): + def encode(text, errors="strict"): + s, n = "", int(ceil(len(text) / float(l))) + for x in range(l): + for y in range(n): + try: + s += text[y*l+x] + except IndexError: + s += PADDING_CHAR + return s, len(s) + return encode + + +def scytale_decode(l): + def decode(text, errors="strict"): + s, n = "", int(ceil(len(text) / float(l))) + pl = l * n - len(text) + for x in range(n): + for y in range(l): + if y >= l-pl and x == n-1: + continue + s += text[y*n+x-max(0,y-(l-pl))] + s = s.rstrip(PADDING_CHAR) + return s, len(s) + return decode + + +add("scytale", scytale_encode, scytale_decode, r"^scytale[-_]?([1-9]\d*)$") + diff --git a/src/codext/crypto/shift.py b/src/codext/crypto/shift.py old mode 100755 new mode 100644 index 599e60d..d1c432e --- a/src/codext/crypto/shift.py +++ b/src/codext/crypto/shift.py @@ -1,34 +1,34 @@ -# -*- coding: UTF-8 -*- -"""Shift Codec - Shift-ordinal-with-N content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(shift0|shift--10|shift256)': None, - 'enc(shift1|shift_1|shift-1)': {'this is a test': "uijt!jt!b!uftu"}, - 'enc(shift9|shift_9|shift-9)': {'this is a test': "}qr|)r|)j)}n|}"}, -} -__guess__ = ["shift-%d" % i for i in range(1, 256)] - - -def ord_shift_decode(i): - return ord_shift_encode(-int(i)) - - -def ord_shift_encode(i): - def encode(text, errors="strict"): - r = "".join(chr((ord(c) + int(i)) % 256) for c in text) - return r, len(r) - return encode - - -add("shift", ord_shift_encode, ord_shift_decode, r"shift[-_]?([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])$", - transitive=True) - +# -*- coding: UTF-8 -*- +"""Shift Codec - Shift-ordinal-with-N content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(shift0|shift--10|shift256)': None, + 'enc(shift1|shift_1|shift-1)': {'this is a test': "uijt!jt!b!uftu"}, + 'enc(shift9|shift_9|shift-9)': {'this is a test': "}qr|)r|)j)}n|}"}, +} +__guess__ = ["shift-%d" % i for i in range(1, 256)] + + +def ord_shift_decode(i): + return ord_shift_encode(-int(i)) + + +def ord_shift_encode(i): + def encode(text, errors="strict"): + r = "".join(chr((ord(c) + int(i)) % 256) for c in text) + return r, len(r) + return encode + + +add("shift", ord_shift_encode, ord_shift_decode, r"shift[-_]?([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])$", + transitive=True) + diff --git a/src/codext/crypto/xor.py b/src/codext/crypto/xor.py old mode 100755 new mode 100644 index 61da6e9..cc77057 --- a/src/codext/crypto/xor.py +++ b/src/codext/crypto/xor.py @@ -1,35 +1,35 @@ -# -*- coding: UTF-8 -*- 
-"""XOR Codec - XOR-with-1-byte content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(xor0|xor--10|xor256|xor300)': None, - 'enc(xor3|xor-3|xor_3)': {'this is a test': "wkjp#jp#b#wfpw"}, - 'enc(xor3|xor-3|xor_3)': {'wkjp#jp#b#wfpw': "this is a test"}, - 'enc(xor6|xor-6|xor_6)': {'this is a test': "rnou&ou&g&rcur"}, -} -__guess__ = ["xor-%d" % i for i in range(1, 256)] - - -def _xorn(text, n=1): - return "".join(chr(ord(c) ^ (n % 256)) for c in text) - - -def xor_byte_encode(i): - def encode(text, errors="strict"): - r = _xorn(ensure_str(text), i) - return r, len(r) - return encode - - -add("xor", xor_byte_encode, xor_byte_encode, r"^xor[-_]?([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])$", - transitive=True) - +# -*- coding: UTF-8 -*- +"""XOR Codec - XOR-with-1-byte content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(xor0|xor--10|xor256|xor300)': None, + 'enc(xor3|xor-3|xor_3)': {'this is a test': "wkjp#jp#b#wfpw"}, + 'enc(xor3|xor-3|xor_3)': {'wkjp#jp#b#wfpw': "this is a test"}, + 'enc(xor6|xor-6|xor_6)': {'this is a test': "rnou&ou&g&rcur"}, +} +__guess__ = ["xor-%d" % i for i in range(1, 256)] + + +def _xorn(text, n=1): + return "".join(chr(ord(c) ^ (n % 256)) for c in text) + + +def xor_byte_encode(i): + def encode(text, errors="strict"): + r = _xorn(ensure_str(text), i) + return r, len(r) + return encode + + +add("xor", xor_byte_encode, xor_byte_encode, r"^xor[-_]?([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])$", + transitive=True) + diff --git a/src/codext/hashing/__init__.py b/src/codext/hashing/__init__.py old mode 100755 new mode 100644 diff --git a/src/codext/languages/__init__.py b/src/codext/languages/__init__.py old mode 100755 new mode 100644 index 196b8d3..8dbe999 --- a/src/codext/languages/__init__.py +++ b/src/codext/languages/__init__.py @@ -1,12 +1,12 @@ -# -*- coding: UTF-8 -*- -from .braille import * -from .galactic import * -from .ipsum import * -from .leetspeak import * -from .morse import * -from .navajo import * -from .radio import * -from .southpark import * -from .tap import * -from .tomtom import * - +# -*- coding: UTF-8 -*- +from .braille import * +from .galactic import * +from .ipsum import * +from .leetspeak import * +from .morse import * +from .navajo import * +from .radio import * +from .southpark import * +from .tap import * +from .tomtom import * + diff --git a/src/codext/languages/braille.py b/src/codext/languages/braille.py old mode 100755 new mode 100644 diff --git a/src/codext/languages/ipsum.py b/src/codext/languages/ipsum.py old mode 100755 new mode 100644 index 5a0fee7..a56c197 --- 
a/src/codext/languages/ipsum.py +++ b/src/codext/languages/ipsum.py @@ -1,97 +1,97 @@ -# -*- coding: UTF-8 -*- -"""Letters Codec - letter indices-related content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import random - -from ..__common__ import * - - -__examples__ = { - 'enc-dec(ipsum|lorem-ipsum)': ["This is a test !"], - 'enc(ipsum)': {'Bad test#': None}, -} - - -DICT = { - 'a': ['a', 'ac', 'accumsan', 'ad', 'adipiscing', 'aenean', 'aliquam', 'aliquet', 'amet', 'ante', 'aptent', 'arcu', - 'at', 'auctor', 'augue'], - 'b': ['babel', 'bibendum', 'blandit', 'bomba', 'botum', 'buxus'], - 'c': ['class', 'commodo', 'condimentum', 'congue', 'consectetur', 'consequat', 'conubia', 'convallis', 'cras', - 'cubilia', 'curabitur', 'curae', 'cursus'], - 'd': ['dapibus', 'diam', 'dictum', 'dictumst', 'dignissim', 'dis', 'dolor', 'donec', 'dui', 'duis'], - 'e': ['efficitur', 'egestas', 'eget', 'eleifend', 'elementum', 'elit', 'enim', 'erat', 'eros', 'est', 'et', 'etiam', - 'eu', 'euismod', 'ex'], - 'f': ['facilisis', 'fames', 'faucibus', 'felis', 'fermentum', 'feugiat', 'finibus', 'fringilla', 'fusce'], - 'g': ['gadus', 'galliarus', 'ganeo', 'garba', 'gemma', 'gener', 'genuine', 'gestus', 'gramma', 'gravida', 'grex', - 'gusto', 'guttur', 'gyro'], - 'h': ['habitant', 'habitasse', 'hac', 'haicu', 'halo', 'helleborum', 'hendrerit', 'hilarius', 'himenaeos', - 'horreum', 'hydrus', 'hystericus'], - 'i': ['iaculis', 'id', 'imperdiet', 'in', 'inceptos', 'integer', 'interdum', 'ipsum'], - 'j': ['jaccae', 'jacio', 'jecur', 'jocundiatas', 'jovis', 'juctim', 'juger', 'juno', 'jussum', 'justo'], - 'k': ['kal', 'kalatorium', 'kalium', 'kaput', 'kardo', 'kenia', 'koppa', 'kum'], - 'l': ['lacinia', 'lacus', 'laoreet', 'lectus', 'leo', 'libero', 'ligula', 'litora', 'lobortis', 'lorem', 'luctus'], - 'm': ['maecenas', 'magna', 'magnis', 
'malesuada', 'massa', 'mattis', 'mauris', 'maximus', 'metus', 'mi', 'molestie', - 'mollis', 'montes', 'morbi', 'mus'], - 'n': ['nam', 'nascetur', 'natoque', 'nec', 'neque', 'netus', 'nibh', 'nisi', 'nisl', 'non', 'nostra', 'nulla', - 'nullam', 'nunc'], - 'o': ['odio', 'orci', 'ornare'], - 'p': ['parturient', 'pellentesque', 'penatibus', 'per', 'pharetra', 'phasellus', 'placerat', 'platea', 'porta', - 'porttitor', 'posuere', 'potenti', 'praesent', 'pretium', 'primis', 'proin', 'pulvinar', 'purus'], - 'q': ['qua', 'quadrum', 'quam', 'quasi', 'quintum', 'quis', 'quisque', 'quo', 'quom', 'quota', 'qur'], - 'r': ['radicitus', 'radius', 'ratio', 'recidivus', 'rectio', 'rhoncus', 'ridiculus', 'risus', 'ros', 'rutrum'], - 's': ['sagittis', 'sapien', 'scelerisque', 'sed', 'sem', 'semper', 'senectus', 'sit', 'sociosqu', 'sodales', - 'sollicitudin', 'suscipit', 'suspendisse'], - 't': ['taciti', 'tellus', 'tempor', 'tempus', 'tincidunt', 'torquent', 'tortor', 'tristique', 'turpis'], - 'u': ['ullamcorper', 'ultrices', 'ultricies', 'urna', 'ut'], - 'v': ['varius', 'vehicula', 'vel', 'velit', 'venenatis', 'vestibulum', 'vitae', 'vivamus', 'volutpat', 'vulputate'], - 'w': ['wadiarus', 'warantus', 'warra', 'werumensium', 'wormicia'], - 'x': ['xandicus', 'xenon', 'xenium', 'xiphias', 'xvir', 'xylon', 'xysticus', 'xystus'], - 'y': ['yata', 'yatum', 'yatus', 'ypra'], - 'z': ['zamia', 'zelosus', 'zerum', 'zonatus', 'zymus'], -} -SCHARS = "0123456789.,:;!?+=-*/\\" - - -def ipsum_encode(text, errors="strict"): - s, strip = "", False - for i, c in enumerate(text): - try: - if c == " " or c in SCHARS: - s += c - strip = False - else: - w = random.choice(DICT[c.lower()]) - s += (w.capitalize() if c.isupper() else w) + " " - strip = True - except KeyError: - s += handle_error("ipsum", errors, " ")(c, i) - return s[:-1] if strip else s, len(text) - - -def ipsum_decode(text, errors="strict"): - s = "" - words = text.split(" ") - for i, w in enumerate(words[:-1] if words[-1] == "" else words): 
- if w.strip() == "": - s += " " - elif w in SCHARS: - s += w - else: - try: - if w.lower().strip(SCHARS) not in DICT[w[0].lower()]: - raise KeyError - s += w[:len(w)-len(w.lstrip(SCHARS))] + w.strip(SCHARS)[0] + w[len(w.rstrip(SCHARS)):len(w)] - except KeyError: - s += handle_error("ipsum", errors, decode=True, item="word")(w, i) - return s, len(text) - - -add("ipsum", ipsum_encode, ipsum_decode, pattern=r"^(?:lorem[-_]?)?ipsum$", printables_rate=1., - expansion_factor=(6., .5)) - +# -*- coding: UTF-8 -*- +"""Letters Codec - letter indices-related content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import random + +from ..__common__ import * + + +__examples__ = { + 'enc-dec(ipsum|lorem-ipsum)': ["This is a test !"], + 'enc(ipsum)': {'Bad test#': None}, +} + + +DICT = { + 'a': ['a', 'ac', 'accumsan', 'ad', 'adipiscing', 'aenean', 'aliquam', 'aliquet', 'amet', 'ante', 'aptent', 'arcu', + 'at', 'auctor', 'augue'], + 'b': ['babel', 'bibendum', 'blandit', 'bomba', 'botum', 'buxus'], + 'c': ['class', 'commodo', 'condimentum', 'congue', 'consectetur', 'consequat', 'conubia', 'convallis', 'cras', + 'cubilia', 'curabitur', 'curae', 'cursus'], + 'd': ['dapibus', 'diam', 'dictum', 'dictumst', 'dignissim', 'dis', 'dolor', 'donec', 'dui', 'duis'], + 'e': ['efficitur', 'egestas', 'eget', 'eleifend', 'elementum', 'elit', 'enim', 'erat', 'eros', 'est', 'et', 'etiam', + 'eu', 'euismod', 'ex'], + 'f': ['facilisis', 'fames', 'faucibus', 'felis', 'fermentum', 'feugiat', 'finibus', 'fringilla', 'fusce'], + 'g': ['gadus', 'galliarus', 'ganeo', 'garba', 'gemma', 'gener', 'genuine', 'gestus', 'gramma', 'gravida', 'grex', + 'gusto', 'guttur', 'gyro'], + 'h': ['habitant', 'habitasse', 'hac', 'haicu', 'halo', 'helleborum', 'hendrerit', 'hilarius', 'himenaeos', + 'horreum', 'hydrus', 'hystericus'], + 'i': ['iaculis', 'id', 'imperdiet', 
'in', 'inceptos', 'integer', 'interdum', 'ipsum'], + 'j': ['jaccae', 'jacio', 'jecur', 'jocundiatas', 'jovis', 'juctim', 'juger', 'juno', 'jussum', 'justo'], + 'k': ['kal', 'kalatorium', 'kalium', 'kaput', 'kardo', 'kenia', 'koppa', 'kum'], + 'l': ['lacinia', 'lacus', 'laoreet', 'lectus', 'leo', 'libero', 'ligula', 'litora', 'lobortis', 'lorem', 'luctus'], + 'm': ['maecenas', 'magna', 'magnis', 'malesuada', 'massa', 'mattis', 'mauris', 'maximus', 'metus', 'mi', 'molestie', + 'mollis', 'montes', 'morbi', 'mus'], + 'n': ['nam', 'nascetur', 'natoque', 'nec', 'neque', 'netus', 'nibh', 'nisi', 'nisl', 'non', 'nostra', 'nulla', + 'nullam', 'nunc'], + 'o': ['odio', 'orci', 'ornare'], + 'p': ['parturient', 'pellentesque', 'penatibus', 'per', 'pharetra', 'phasellus', 'placerat', 'platea', 'porta', + 'porttitor', 'posuere', 'potenti', 'praesent', 'pretium', 'primis', 'proin', 'pulvinar', 'purus'], + 'q': ['qua', 'quadrum', 'quam', 'quasi', 'quintum', 'quis', 'quisque', 'quo', 'quom', 'quota', 'qur'], + 'r': ['radicitus', 'radius', 'ratio', 'recidivus', 'rectio', 'rhoncus', 'ridiculus', 'risus', 'ros', 'rutrum'], + 's': ['sagittis', 'sapien', 'scelerisque', 'sed', 'sem', 'semper', 'senectus', 'sit', 'sociosqu', 'sodales', + 'sollicitudin', 'suscipit', 'suspendisse'], + 't': ['taciti', 'tellus', 'tempor', 'tempus', 'tincidunt', 'torquent', 'tortor', 'tristique', 'turpis'], + 'u': ['ullamcorper', 'ultrices', 'ultricies', 'urna', 'ut'], + 'v': ['varius', 'vehicula', 'vel', 'velit', 'venenatis', 'vestibulum', 'vitae', 'vivamus', 'volutpat', 'vulputate'], + 'w': ['wadiarus', 'warantus', 'warra', 'werumensium', 'wormicia'], + 'x': ['xandicus', 'xenon', 'xenium', 'xiphias', 'xvir', 'xylon', 'xysticus', 'xystus'], + 'y': ['yata', 'yatum', 'yatus', 'ypra'], + 'z': ['zamia', 'zelosus', 'zerum', 'zonatus', 'zymus'], +} +SCHARS = "0123456789.,:;!?+=-*/\\" + + +def ipsum_encode(text, errors="strict"): + s, strip = "", False + for i, c in enumerate(text): + try: + if c == " " or c in 
SCHARS: + s += c + strip = False + else: + w = random.choice(DICT[c.lower()]) + s += (w.capitalize() if c.isupper() else w) + " " + strip = True + except KeyError: + s += handle_error("ipsum", errors, " ")(c, i) + return s[:-1] if strip else s, len(text) + + +def ipsum_decode(text, errors="strict"): + s = "" + words = text.split(" ") + for i, w in enumerate(words[:-1] if words[-1] == "" else words): + if w.strip() == "": + s += " " + elif w in SCHARS: + s += w + else: + try: + if w.lower().strip(SCHARS) not in DICT[w[0].lower()]: + raise KeyError + s += w[:len(w)-len(w.lstrip(SCHARS))] + w.strip(SCHARS)[0] + w[len(w.rstrip(SCHARS)):len(w)] + except KeyError: + s += handle_error("ipsum", errors, decode=True, item="word")(w, i) + return s, len(text) + + +add("ipsum", ipsum_encode, ipsum_decode, pattern=r"^(?:lorem[-_]?)?ipsum$", printables_rate=1., + expansion_factor=(6., .5)) + diff --git a/src/codext/languages/leetspeak.py b/src/codext/languages/leetspeak.py old mode 100755 new mode 100644 index 0628742..f3af876 --- a/src/codext/languages/leetspeak.py +++ b/src/codext/languages/leetspeak.py @@ -1,23 +1,23 @@ -# -*- coding: UTF-8 -*- -"""Leetspeak Codec - leetspeak content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(leet|1337|leetspeak)': {'this is a test': "7h15 15 4 7357"}, - 'dec(leet|1337|leetspeak)': {'7H15 15 4 7357': "THIS IS A TEST"}, -} - - -ENCMAP = {k: v for k, v in zip("aabeliostzg", "@4831105729")} - - -add_map("leet", ENCMAP, ignore_case="encode", no_error=True, pattern=r"(?:leet|1337|leetspeak)$", entropy=lambda e: e) - +# -*- coding: UTF-8 -*- +"""Leetspeak Codec - leetspeak content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(leet|1337|leetspeak)': {'this is a test': "7h15 15 4 7357"}, + 'dec(leet|1337|leetspeak)': {'7H15 15 4 7357': "THIS IS A TEST"}, +} + + +ENCMAP = {k: v for k, v in zip("aabeliostzg", "@4831105729")} + + +add_map("leet", ENCMAP, ignore_case="encode", no_error=True, pattern=r"(?:leet|1337|leetspeak)$", entropy=lambda e: e) + diff --git a/src/codext/languages/morse.py b/src/codext/languages/morse.py old mode 100755 new mode 100644 index 10f9f14..6c21a09 --- a/src/codext/languages/morse.py +++ b/src/codext/languages/morse.py @@ -1,40 +1,40 @@ -# -*- coding: UTF-8 -*- -"""Morse Codec - morse content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(morse|morse/-.)': {'this is a test': "- .... .. ... / .. ... / .- / - . ... 
-"}, - 'enc(morse-/AB)': {'this is a test': "A BBBB BB BBB / BB BBB / BA / A B BBB A"}, - 'enc(morse-01)': {'this is a test': "0 1111 11 111 - 11 111 - 10 - 0 1 111 0"}, -} -__guess__ = ["morse", "morse/_.", "morse-/01", "morse-01", "morse-/ab", "morse-ab", "morse-/AB", "morse-AB"] - - -ENCMAP = { - # letters - 'a': ".-", 'b': "-...", 'c': "-.-.", 'd': "-..", 'e': ".", 'f': "..-.", 'g': "--.", 'h': "....", 'i': "..", - 'j': ".---", 'k': "-.-", 'l': ".-..", 'm': "--", 'n': "-.", 'o': "---", 'p': ".--.", 'q': "--.-", 'r': ".-.", - 's': "...", 't': "-", 'u': "..-", 'v': "...-", 'w': ".--", 'x': "-..-", 'y': "-.--", 'z': "--..", - # digits - '1': ".----", '2': "..---", '3': "...--", '4': "....-", '5': ".....", '6': "-....", '7': "--...", '8': "---..", - '9': "----.", '0': "-----", - # punctuation - ',': "--..--", '.': ".-.-.-", ':' : "---...", '?': "..--..", '/': "-..-.", '-': "-....-", '=' : "-...-", - '(': "-.--.", ')': "-.--.-", '@' : ".--.-.", '\'': ".----.", '_': "..--.-", '!': "-.-.--", '&': ".-...", - '"': ".-..-.", ';': "-.-.-.", '$': "...-..-", - # word separator - ' ' : "/", -} - - -add_map("morse", ENCMAP, "#", " ", ignore_case="encode", pattern=r"^morse([-_]?.{3})?$", printables_rate=1., - expansion_factor=(2.8, .6)) - +# -*- coding: UTF-8 -*- +"""Morse Codec - morse content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(morse|morse/-.)': {'this is a test': "- .... .. ... / .. ... / .- / - . ... 
-"}, + 'enc(morse-/AB)': {'this is a test': "A BBBB BB BBB / BB BBB / BA / A B BBB A"}, + 'enc(morse-01)': {'this is a test': "0 1111 11 111 - 11 111 - 10 - 0 1 111 0"}, +} +__guess__ = ["morse", "morse/_.", "morse-/01", "morse-01", "morse-/ab", "morse-ab", "morse-/AB", "morse-AB"] + + +ENCMAP = { + # letters + 'a': ".-", 'b': "-...", 'c': "-.-.", 'd': "-..", 'e': ".", 'f': "..-.", 'g': "--.", 'h': "....", 'i': "..", + 'j': ".---", 'k': "-.-", 'l': ".-..", 'm': "--", 'n': "-.", 'o': "---", 'p': ".--.", 'q': "--.-", 'r': ".-.", + 's': "...", 't': "-", 'u': "..-", 'v': "...-", 'w': ".--", 'x': "-..-", 'y': "-.--", 'z': "--..", + # digits + '1': ".----", '2': "..---", '3': "...--", '4': "....-", '5': ".....", '6': "-....", '7': "--...", '8': "---..", + '9': "----.", '0': "-----", + # punctuation + ',': "--..--", '.': ".-.-.-", ':' : "---...", '?': "..--..", '/': "-..-.", '-': "-....-", '=' : "-...-", + '(': "-.--.", ')': "-.--.-", '@' : ".--.-.", '\'': ".----.", '_': "..--.-", '!': "-.-.--", '&': ".-...", + '"': ".-..-.", ';': "-.-.-.", '$': "...-..-", + # word separator + ' ' : "/", +} + + +add_map("morse", ENCMAP, "#", " ", ignore_case="encode", pattern=r"^morse([-_]?.{3})?$", printables_rate=1., + expansion_factor=(2.8, .6)) + diff --git a/src/codext/languages/navajo.py b/src/codext/languages/navajo.py old mode 100755 new mode 100644 index b895622..a46b35c --- a/src/codext/languages/navajo.py +++ b/src/codext/languages/navajo.py @@ -1,35 +1,35 @@ -# -*- coding: UTF-8 -*- -"""Navajo Codec - Navajo code content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = {'enc-dec(navajo)': ["this is a test", "THIS\nIS\nA\nTEST"]} - - -# source: https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html -ENCMAP = { - 'A': ["WOL-LA-CHEE", "BE-LA-SANA", "TSE-NILL"], 'B': ["NA-HASH-CHID", "SHUSH", "TOISH-JEH"], - 'C': ["MOASI", "TLA-GIN", "BA-GOSHI"], 'D': ["BE", "CHINDI", "LHA-CHA-EH"], 'E': ["AH-JAH", "DZEH", "AH-NAH"], - 'F': ["CHUO", "TSA-E-DONIN-EE", "MA-E"], 'G': ["AH-TAD", "KLIZZIE", "JEHA"], 'H': ["TSE-GAH", "CHA", "LIN"], - 'I': ["TKIN", "YEH-HES", "A-CHI"], 'J': ["TKELE-CHO-G", "AH-YA-TSINNE", "YIL-DOI"], - 'K': ["JAD-HO-LONI", "BA-AH-NE-DI-TININ", "KLIZZIE-YAZZIE"], 'L': ["DIBEH-YAZZIE", "AH-JAD", "NASH-DOIE-TSO"], - 'M': ["TSIN-TLITI", "BE-TAS-TNI", "NA-AS-TSO-SI"], 'N': ["TSAH", "A-CHIN"], - 'O': ["A-KHA", "TLO-CHIN", "NE-AHS-JAH"], 'P': ["CLA-GI-AIH", "BI-SO-DIH", "NE-ZHONI"], 'Q': "CA-YEILTH", - 'R': ["GAH", "DAH-NES-TSA", "AH-LOSZ"], 'S': ["DIBEH", "KLESH"], 'T': ["D-AH", "A-WOH", "THAN-ZIE"], - 'U': ["SHI-DA", "NO-DA-IH"], 'V': "A-KEH-DI-GLINI", 'W': "GLOE-IH", 'X': "AL-NA-AS-DZOH", 'Y': "TSAH-AS-ZIH", - 'Z': "BESH-DO-TLIZ", - ' ': "-", '\n': "\n", - '0': "0", '1': "1", '2': "2", '3': "3", '4': "4", '5': "5", '6': "6", '7': "7", '8': "8", '9': "9", -} - - -add_map("navajo", ENCMAP, ignore_case="both", sep=" ", pattern=r"^navajo$", printables_rate=1., - expansion_factor=(6.2, .8)) - +# -*- coding: UTF-8 -*- +"""Navajo Codec - Navajo code content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = {'enc-dec(navajo)': ["this is a test", "THIS\nIS\nA\nTEST"]} + + +# source: https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html +ENCMAP = { + 'A': ["WOL-LA-CHEE", "BE-LA-SANA", "TSE-NILL"], 'B': ["NA-HASH-CHID", "SHUSH", "TOISH-JEH"], + 'C': ["MOASI", "TLA-GIN", "BA-GOSHI"], 'D': ["BE", "CHINDI", "LHA-CHA-EH"], 'E': ["AH-JAH", "DZEH", "AH-NAH"], + 'F': ["CHUO", "TSA-E-DONIN-EE", "MA-E"], 'G': ["AH-TAD", "KLIZZIE", "JEHA"], 'H': ["TSE-GAH", "CHA", "LIN"], + 'I': ["TKIN", "YEH-HES", "A-CHI"], 'J': ["TKELE-CHO-G", "AH-YA-TSINNE", "YIL-DOI"], + 'K': ["JAD-HO-LONI", "BA-AH-NE-DI-TININ", "KLIZZIE-YAZZIE"], 'L': ["DIBEH-YAZZIE", "AH-JAD", "NASH-DOIE-TSO"], + 'M': ["TSIN-TLITI", "BE-TAS-TNI", "NA-AS-TSO-SI"], 'N': ["TSAH", "A-CHIN"], + 'O': ["A-KHA", "TLO-CHIN", "NE-AHS-JAH"], 'P': ["CLA-GI-AIH", "BI-SO-DIH", "NE-ZHONI"], 'Q': "CA-YEILTH", + 'R': ["GAH", "DAH-NES-TSA", "AH-LOSZ"], 'S': ["DIBEH", "KLESH"], 'T': ["D-AH", "A-WOH", "THAN-ZIE"], + 'U': ["SHI-DA", "NO-DA-IH"], 'V': "A-KEH-DI-GLINI", 'W': "GLOE-IH", 'X': "AL-NA-AS-DZOH", 'Y': "TSAH-AS-ZIH", + 'Z': "BESH-DO-TLIZ", + ' ': "-", '\n': "\n", + '0': "0", '1': "1", '2': "2", '3': "3", '4': "4", '5': "5", '6': "6", '7': "7", '8': "8", '9': "9", +} + + +add_map("navajo", ENCMAP, ignore_case="both", sep=" ", pattern=r"^navajo$", printables_rate=1., + expansion_factor=(6.2, .8)) + diff --git a/src/codext/languages/radio.py b/src/codext/languages/radio.py old mode 100755 new mode 100644 index 304e44a..03e420e --- a/src/codext/languages/radio.py +++ b/src/codext/languages/radio.py @@ -1,29 +1,29 @@ -# -*- coding: UTF-8 -*- -"""Radio Codec - NATO/Military phonetic alphabet content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(radio|military-alphabet)': {'test': "Tango Echo Sierra Tango"}, - 'enc(nato-alphabet|radio-phonetic)': {'string': "Sierra Tango Romeo India November Golf"}, -} - - -ENCMAP = { - 'A': "Alpha", 'B': "Bravo", 'C': "Charlie", 'D': "Delta", 'E': "Echo", 'F': "Foxtrot", 'G': "Golf", 'H': "Hotel", - 'I': "India", 'J': "Juliett", 'K': "Kilo", 'L': "Lima", 'M': "Mike", 'N': "November", 'O': "Oscar", 'P': "Papa", - 'Q': "Quebec", 'R': "Romeo", 'S': "Sierra", 'T': "Tango", 'U': "Uniform", 'V': "Victor", 'W': "Whiskey", - 'X': "X-ray", 'Y': "Yankee", 'Z': "Zulu", ' ': "/", -} - - -add_map("radio", ENCMAP, sep=" ", ignore_case="both", printables_rate=1., expansion_factor=(5.5, .3), - pattern=r"^(?:military|nato|radio)(?:(?:[-_]phonetic)?(?:[-_]alphabet)?)?$") - +# -*- coding: UTF-8 -*- +"""Radio Codec - NATO/Military phonetic alphabet content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(radio|military-alphabet)': {'test': "Tango Echo Sierra Tango"}, + 'enc(nato-alphabet|radio-phonetic)': {'string': "Sierra Tango Romeo India November Golf"}, +} + + +ENCMAP = { + 'A': "Alpha", 'B': "Bravo", 'C': "Charlie", 'D': "Delta", 'E': "Echo", 'F': "Foxtrot", 'G': "Golf", 'H': "Hotel", + 'I': "India", 'J': "Juliett", 'K': "Kilo", 'L': "Lima", 'M': "Mike", 'N': "November", 'O': "Oscar", 'P': "Papa", + 'Q': "Quebec", 'R': "Romeo", 'S': "Sierra", 'T': "Tango", 'U': "Uniform", 'V': "Victor", 'W': "Whiskey", + 'X': "X-ray", 'Y': "Yankee", 'Z': "Zulu", ' ': "/", +} + + +add_map("radio", ENCMAP, sep=" ", ignore_case="both", printables_rate=1., expansion_factor=(5.5, .3), + pattern=r"^(?:military|nato|radio)(?:(?:[-_]phonetic)?(?:[-_]alphabet)?)?$") + diff --git a/src/codext/languages/southpark.py b/src/codext/languages/southpark.py old mode 100755 new mode 100644 index 8abc18b..6fdbd93 --- a/src/codext/languages/southpark.py +++ b/src/codext/languages/southpark.py @@ -1,44 +1,44 @@ -# -*- coding: UTF-8 -*- -"""Southpark Codec - Kenny's language content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples1__ = { - 'enc_dec(kenny|southpark)': ["This is a Test"], - 'enc_dec(kenny_123456|southpark-ABCDEF)': ["This is a Test"], -} -__guess1__ = ["southpark", "southpark-123456", "southpark-abcdef", "southpark-ABCDEF"] -__examples2__ = { - 'enc(southpark-icase|kenny_icase)': {'this is a test': "FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP"}, - 'enc(southpark_icase-123)': {'this is a test': "123213211122111211122111222111123233122123"}, -} -__guess2__ = ["southpark-icase", "southpark-icase-123", "southpark-icase-abc", "southpark-icase-ABC"] - - -ENCMAP1 = { - 'a': "mmm", 'b': "mmp", 'c': "mmf", 'd': "mpm", 'e': "mpp", 'f': "mpf", 'g': "mfm", 'h': "mfp", 'i': "mff", - 'j': "pmm", 'k': "pmp", 'l': "pmf", 'm': "ppm", 'n': "ppp", 'o': "ppf", 'p': "pfm", 'q': "pfp", 'r': "pff", - 's': "fmm", 't': "fmp", 'u': "fmf", 'v': "fpm", 'w': "fpp", 'x': "fpf", 'y': "ffm", 'z': "ffp", - 'A': "Mmm", 'B': "Mmp", 'C': "Mmf", 'D': "Mpm", 'E': "Mpp", 'F': "Mpf", 'G': "Mfm", 'H': "Mfp", 'I': "Mff", - 'J': "Pmm", 'K': "Pmp", 'L': "Pmf", 'M': "Ppm", 'N': "Ppp", 'O': "Ppf", 'P': "Pfm", 'Q': "Pfp", 'R': "Pff", - 'S': "Fmm", 'T': "Fmp", 'U': "Fmf", 'V': "Fpm", 'W': "Fpp", 'X': "Fpf", 'Y': "Ffm", 'Z': "Ffp", - ' ': ["fff", "Fff"], -} -ENCMAP2 = { - 'a': "MMM", 'b': "MMP", 'c': "MMF", 'd': "MPM", 'e': "MPP", 'f': "MPF", 'g': "MFM", 'h': "MFP", 'i': "MFF", - 'j': "PMM", 'k': "PMP", 'l': "PMF", 'm': "PPM", 'n': "PPP", 'o': "PPF", 'p': "PFM", 'q': "PFP", 'r': "PFF", - 's': "FMM", 't': "FMP", 'u': "FMF", 'v': "FPM", 'w': "FPP", 'x': "FPF", 'y': "FFM", 'z': "FFP", ' ': "FFF", -} - - -add_map("southpark", ENCMAP1, pattern=r"^(?:kenny|southpark)([-_].{6})?$", examples=__examples1__, guess=__guess1__) -add_map("southpark-icase", ENCMAP2, ignore_case="both", 
pattern=r"^(?:kenny|southpark)[-_]icase([-_].{3})?$", - examples=__examples2__, guess=__guess2__, printables_rate=1., expansion_factor=3.) - +# -*- coding: UTF-8 -*- +"""Southpark Codec - Kenny's language content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples1__ = { + 'enc_dec(kenny|southpark)': ["This is a Test"], + 'enc_dec(kenny_123456|southpark-ABCDEF)': ["This is a Test"], +} +__guess1__ = ["southpark", "southpark-123456", "southpark-abcdef", "southpark-ABCDEF"] +__examples2__ = { + 'enc(southpark-icase|kenny_icase)': {'this is a test': "FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP"}, + 'enc(southpark_icase-123)': {'this is a test': "123213211122111211122111222111123233122123"}, +} +__guess2__ = ["southpark-icase", "southpark-icase-123", "southpark-icase-abc", "southpark-icase-ABC"] + + +ENCMAP1 = { + 'a': "mmm", 'b': "mmp", 'c': "mmf", 'd': "mpm", 'e': "mpp", 'f': "mpf", 'g': "mfm", 'h': "mfp", 'i': "mff", + 'j': "pmm", 'k': "pmp", 'l': "pmf", 'm': "ppm", 'n': "ppp", 'o': "ppf", 'p': "pfm", 'q': "pfp", 'r': "pff", + 's': "fmm", 't': "fmp", 'u': "fmf", 'v': "fpm", 'w': "fpp", 'x': "fpf", 'y': "ffm", 'z': "ffp", + 'A': "Mmm", 'B': "Mmp", 'C': "Mmf", 'D': "Mpm", 'E': "Mpp", 'F': "Mpf", 'G': "Mfm", 'H': "Mfp", 'I': "Mff", + 'J': "Pmm", 'K': "Pmp", 'L': "Pmf", 'M': "Ppm", 'N': "Ppp", 'O': "Ppf", 'P': "Pfm", 'Q': "Pfp", 'R': "Pff", + 'S': "Fmm", 'T': "Fmp", 'U': "Fmf", 'V': "Fpm", 'W': "Fpp", 'X': "Fpf", 'Y': "Ffm", 'Z': "Ffp", + ' ': ["fff", "Fff"], +} +ENCMAP2 = { + 'a': "MMM", 'b': "MMP", 'c': "MMF", 'd': "MPM", 'e': "MPP", 'f': "MPF", 'g': "MFM", 'h': "MFP", 'i': "MFF", + 'j': "PMM", 'k': "PMP", 'l': "PMF", 'm': "PPM", 'n': "PPP", 'o': "PPF", 'p': "PFM", 'q': "PFP", 'r': "PFF", + 's': "FMM", 't': "FMP", 'u': "FMF", 'v': "FPM", 'w': "FPP", 'x': "FPF", 'y': "FFM", 'z': 
"FFP", ' ': "FFF", +} + + +add_map("southpark", ENCMAP1, pattern=r"^(?:kenny|southpark)([-_].{6})?$", examples=__examples1__, guess=__guess1__) +add_map("southpark-icase", ENCMAP2, ignore_case="both", pattern=r"^(?:kenny|southpark)[-_]icase([-_].{3})?$", + examples=__examples2__, guess=__guess2__, printables_rate=1., expansion_factor=3.) + diff --git a/src/codext/languages/tomtom.py b/src/codext/languages/tomtom.py old mode 100755 new mode 100644 index 34a3b46..403eda4 --- a/src/codext/languages/tomtom.py +++ b/src/codext/languages/tomtom.py @@ -1,35 +1,35 @@ -# -*- coding: UTF-8 -*- -"""Tom-Tom Codec - tom-tom content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc': { - 'this is a test': "\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\" - } -} -__guess__ = ["tom-tom", "tom-tom/_.", "tom-tom-/01", "tom-tom-01", "tom-tom-/ab", "tom-tom-ab", "tom-tom-/AB", - "tom-tom-AB"] - - -ENCMAP = { - # letters - 'A': "/", 'B': "//", 'C': "///", 'D': "////", 'E': "/\\", 'F': "//\\", 'G': "///\\", 'H': "/\\\\", 'I': "/\\\\\\", - 'J': "\\/", 'K': "\\\\/", 'L': "\\\\\\/", 'M': "\\//", 'N': "\\///", 'O': "/\\/", 'P': "//\\/", 'Q': "/\\\\/", - 'R': "/\\//", 'S': "\\/\\", 'T': "\\\\/\\", 'U': "\\//\\", 'V': "\\/\\\\", 'W': "//\\\\", 'X': "\\\\//", - 'Y': "\\/\\/", 'Z': "/\\/\\", - # word separator - ' ' : "|", -} - - -add_map("tom-tom", ENCMAP, ".", " ", ignore_case="both", pattern=r"^tom-?tom([-_]?.{3})?$", printables_rate=1., - expansion_factor=(3.8, .2)) - +# -*- coding: UTF-8 -*- +"""Tom-Tom Codec - tom-tom content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc': { + 'this is a test': "\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\" + } +} +__guess__ = ["tom-tom", "tom-tom/_.", "tom-tom-/01", "tom-tom-01", "tom-tom-/ab", "tom-tom-ab", "tom-tom-/AB", + "tom-tom-AB"] + + +ENCMAP = { + # letters + 'A': "/", 'B': "//", 'C': "///", 'D': "////", 'E': "/\\", 'F': "//\\", 'G': "///\\", 'H': "/\\\\", 'I': "/\\\\\\", + 'J': "\\/", 'K': "\\\\/", 'L': "\\\\\\/", 'M': "\\//", 'N': "\\///", 'O': "/\\/", 'P': "//\\/", 'Q': "/\\\\/", + 'R': "/\\//", 'S': "\\/\\", 'T': "\\\\/\\", 'U': "\\//\\", 'V': "\\/\\\\", 'W': "//\\\\", 'X': "\\\\//", + 'Y': "\\/\\/", 'Z': "/\\/\\", + # word separator + ' ' : "|", +} + + +add_map("tom-tom", ENCMAP, ".", " ", ignore_case="both", pattern=r"^tom-?tom([-_]?.{3})?$", printables_rate=1., + expansion_factor=(3.8, .2)) + diff --git a/src/codext/others/__init__.py b/src/codext/others/__init__.py old mode 100755 new mode 100644 index 3bbf102..7342b8a --- a/src/codext/others/__init__.py +++ b/src/codext/others/__init__.py @@ -1,7 +1,7 @@ -# -*- coding: UTF-8 -*- -from .dna import * -from .kbshift import * -from .letters import * -from .markdown import * -from .uuencode import * - +# -*- coding: UTF-8 -*- +from .dna import * +from .kbshift import * +from .letters import * +from .markdown import * +from .uuencode import * + diff --git a/src/codext/others/dna.py b/src/codext/others/dna.py old mode 100755 new mode 100644 index 428edee..2757a6a --- a/src/codext/others/dna.py +++ b/src/codext/others/dna.py @@ -1,42 +1,42 @@ -# -*- coding: UTF-8 -*- -"""DNA Codec - dna content encoding. 
- -This implements the 8 methods of ATGC nucleotides following the rule of complementary pairing, according the literature4 - about coding and computing of DNA sequences. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(dna0|dna9)': None, - 'enc(dna1)': {'this is a test': "GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA"}, - 'enc(dna-2)': {'this is a test': "CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA"}, - 'enc(dna_3)': {'this is a test': "ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG"}, - 'enc(dna4)': {'this is a test': "AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC"}, - 'enc(dna-5)': {'this is a test': "TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG"}, - 'enc(dna_6)': {'this is a test': "TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC"}, - 'enc(dna7)': {'this is a test': "GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT"}, - 'enc(dna-8)': {'this is a test': "CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT"}, -} -__guess__ = ["dna%d" % i for i in range(1, 9)] - - -SEQUENCES = { - '00': "AAGCGCTT", - '11': "TTCGCGAA", - '01': "GCAATTGC", - '10': "CGTTAACG", -} -ENCMAP = [] -for i in range(8): - ENCMAP.append({k: v[i] for k, v in SEQUENCES.items()}) - - -add_map("dna", ENCMAP, intype="bin", pattern=r"dna[-_]?([1-8])$", entropy=2., printables_rate=1., expansion_factor=4.) - +# -*- coding: UTF-8 -*- +"""DNA Codec - dna content encoding. + +This implements the 8 methods of ATGC nucleotides following the rule of complementary pairing, according the literature4 + about coding and computing of DNA sequences. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(dna0|dna9)': None, + 'enc(dna1)': {'this is a test': "GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA"}, + 'enc(dna-2)': {'this is a test': "CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA"}, + 'enc(dna_3)': {'this is a test': "ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG"}, + 'enc(dna4)': {'this is a test': "AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC"}, + 'enc(dna-5)': {'this is a test': "TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG"}, + 'enc(dna_6)': {'this is a test': "TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC"}, + 'enc(dna7)': {'this is a test': "GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT"}, + 'enc(dna-8)': {'this is a test': "CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT"}, +} +__guess__ = ["dna%d" % i for i in range(1, 9)] + + +SEQUENCES = { + '00': "AAGCGCTT", + '11': "TTCGCGAA", + '01': "GCAATTGC", + '10': "CGTTAACG", +} +ENCMAP = [] +for i in range(8): + ENCMAP.append({k: v[i] for k, v in SEQUENCES.items()}) + + +add_map("dna", ENCMAP, intype="bin", pattern=r"dna[-_]?([1-8])$", entropy=2., printables_rate=1., expansion_factor=4.) + diff --git a/src/codext/others/kbshift.py b/src/codext/others/kbshift.py old mode 100755 new mode 100644 index 2bd0991..60b3bf0 --- a/src/codext/others/kbshift.py +++ b/src/codext/others/kbshift.py @@ -1,66 +1,66 @@ -# -*- coding: UTF-8 -*- -"""Keyboard-Shift Codec - keyboard line shifting content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -LAYOUTS = { - 'ansi': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;'\nzxcvbnm<>\n,./", - 'azerty': "azertyuiop\nqsdfghjklm\nwxcvbn", - 'azerty-be': "³1234567890°_\n²&é\"'(§è!çà)-\n|@#^{}\nazertyuiop$\n€[]\n¨*\nqsdfghjklm%£\nùµ\n´`\n>wxcvbn?./+\n<,;:=\n\\~", - 'azerty-fr': "1234567890°+\n²&é\"'(-è_çà)=\n~#{[|`\\^@]}\nazertyuiop¨£\nqsdfghjklm%µ\nù*\n>wxcvbn?./§\n<,;:!", - 'dvorak': "~!@#$%^&*(){}\n`1234567890[]\n\"<>pyfgcrl?+|\n',./=\\\naoeuidhtns_\n-\n:qjkxbmwvz\n;", - 'qwerty': "qwertyuiop\nasdfghjkl\nzxcvbnm", - 'qwerty-us': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;,\nzxcvbnm<>?\n./", -} -__per_len = {} -for k, s in LAYOUTS.items(): - i = max(map(len, s.split("\n"))) - __per_len.setdefault(i, []) - __per_len[i].append(k) - - -__examples__ = {"enc-dec(kbshift_%s_%d)" % (kb, n): ["@irandom{256,512}"] for n in range(10) for kb in LAYOUTS.keys()} -__guess__ = [] -for mlen, kbs in __per_len.items(): - for k in kbs: - __guess__.extend(["kbshift-%s-%d" % (k, i+1) for i in range(mlen)]) - - -def _kbshift(text, keyboard="azerty", n=1, decode=False): - r = "" - for c in text: - nc = None - for l in LAYOUTS[keyboard].splitlines(): - if c.lower() in l: - nc = l[(l.index(c.lower()) + [-1, 1][decode] * n) % len(l)] - break - r += c if nc is None else nc - return r - - -def kbshift_encode(scheme): - kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() - def encode(text, errors="strict"): - r = _kbshift(ensure_str(text), kb, int(shift)) - return r, len(r) - return encode - - -def kbshift_decode(scheme): - kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() - def decode(text, errors="strict"): - r = _kbshift(ensure_str(text), kb, int(shift), True) - return r, len(r) - return 
decode - - -add("kbshift", kbshift_encode, kbshift_decode, entropy=lambda e: e,printables_rate=lambda pr: pr, transitive=True, - pattern=r"^kbshift(?:|[-_]((?:az|qw)erty[-_]?[1-9]|(?:ansi|azerty-(?:be|fr)|dvorak|qwerty-us)[-_]?(?:[1-9]|1[0-2])))$") - +# -*- coding: UTF-8 -*- +"""Keyboard-Shift Codec - keyboard line shifting content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +LAYOUTS = { + 'ansi': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;'\nzxcvbnm<>\n,./", + 'azerty': "azertyuiop\nqsdfghjklm\nwxcvbn", + 'azerty-be': "³1234567890°_\n²&é\"'(§è!çà)-\n|@#^{}\nazertyuiop$\n€[]\n¨*\nqsdfghjklm%£\nùµ\n´`\n>wxcvbn?./+\n<,;:=\n\\~", + 'azerty-fr': "1234567890°+\n²&é\"'(-è_çà)=\n~#{[|`\\^@]}\nazertyuiop¨£\nqsdfghjklm%µ\nù*\n>wxcvbn?./§\n<,;:!", + 'dvorak': "~!@#$%^&*(){}\n`1234567890[]\n\"<>pyfgcrl?+|\n',./=\\\naoeuidhtns_\n-\n:qjkxbmwvz\n;", + 'qwerty': "qwertyuiop\nasdfghjkl\nzxcvbnm", + 'qwerty-us': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;,\nzxcvbnm<>?\n./", +} +__per_len = {} +for k, s in LAYOUTS.items(): + i = max(map(len, s.split("\n"))) + __per_len.setdefault(i, []) + __per_len[i].append(k) + + +__examples__ = {"enc-dec(kbshift_%s_%d)" % (kb, n): ["@irandom{256,512}"] for n in range(10) for kb in LAYOUTS.keys()} +__guess__ = [] +for mlen, kbs in __per_len.items(): + for k in kbs: + __guess__.extend(["kbshift-%s-%d" % (k, i+1) for i in range(mlen)]) + + +def _kbshift(text, keyboard="azerty", n=1, decode=False): + r = "" + for c in text: + nc = None + for l in LAYOUTS[keyboard].splitlines(): + if c.lower() in l: + nc = l[(l.index(c.lower()) + [-1, 1][decode] * n) % len(l)] + break + r += c if nc is None else nc + return r + + +def kbshift_encode(scheme): + kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() + def 
encode(text, errors="strict"): + r = _kbshift(ensure_str(text), kb, int(shift)) + return r, len(r) + return encode + + +def kbshift_decode(scheme): + kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() + def decode(text, errors="strict"): + r = _kbshift(ensure_str(text), kb, int(shift), True) + return r, len(r) + return decode + + +add("kbshift", kbshift_encode, kbshift_decode, entropy=lambda e: e,printables_rate=lambda pr: pr, transitive=True, + pattern=r"^kbshift(?:|[-_]((?:az|qw)erty[-_]?[1-9]|(?:ansi|azerty-(?:be|fr)|dvorak|qwerty-us)[-_]?(?:[1-9]|1[0-2])))$") + diff --git a/src/codext/others/letters.py b/src/codext/others/letters.py old mode 100755 new mode 100644 index e27ae96..57fa26a --- a/src/codext/others/letters.py +++ b/src/codext/others/letters.py @@ -1,91 +1,91 @@ -# -*- coding: UTF-8 -*- -"""Letters Codec - letter indices-related content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from string import ascii_uppercase - -from ..__common__ import * - - -__examples__ = { - 'enc(consonant-index|consonants_indices)': { - 'This is a test': "166I15I15A16E1516", - '\x00': None, - '\xff': None, - }, - 'dec(consonant-index|consonants_indices)': { - '166I15I15A16E1516': "THISISATEST", - '\x00': None, - '\xff': None, - }, - 'enc(vowel-index|vowels_indices)': {'This is a test': "TH3S3S1T2ST"}, - 'dec(vowel-index|vowels_indices)': {'TH3S3S1T2ST': "THISISATEST"}, - 'enc(consonant-vowel_indices)': {'This is a test': "C16C6V3C15V3C15V1C16V2C15C16"}, - 'dec(consonants_vowels-index)': {'C16C6V3C15V3C15V1C16V2C15C16': "THISISATEST"}, -} -__guess__ = ["consonant-index", "vowel-index", "consonants_vowels-index"] - - -VOWELS = "AEIOUY" - - -def __get_encmap(letters): - if re.match(r"^consonants?$", letters): - encmap = {c: str(i+1) for i, c in enumerate(sorted(set(ascii_uppercase) - set(VOWELS)))} - 
for c in VOWELS: - encmap[c] = c - elif re.match(r"^vowels?$", letters): - encmap = {c: c for c in ascii_uppercase} - for i, c in enumerate(VOWELS): - encmap[c] = str(i+1) - elif re.match(r"^consonants?[-_]vowels?$", letters): - encmap = {c: "C" + str(i+1) for i, c in enumerate(sorted(set(ascii_uppercase) - set(VOWELS)))} - for i, c in enumerate(VOWELS): - encmap[c] = "V" + str(i+1) - for c in " ": - encmap[c] = "" - return encmap - - -def letters_encode(letters): - encmap = __get_encmap(letters) - def encode(text, errors="strict"): - s = "" - for i, c in enumerate(text.upper()): - try: - s += encmap[c] - except KeyError: - s += handle_error("letter-indices", errors)(c, i) - return "".join(encmap.get(c.upper(), c) for c in text), len(text) - return encode - - -def letters_decode(letters): - decmap = {v: k for k, v in __get_encmap(letters).items()} - maxlen = max(len(x) for x in decmap.keys()) - def decode(text, errors="strict"): - s, i = "", 0 - while i < len(text): - err = True - for j in range(maxlen, 0, -1): - try: - s += decmap[text[i:i+j]] - i += j - err = False - break - except (IndexError, KeyError): - pass - if err: - s += handle_error("letter-indices", errors, decode=True)(text[i], i) - return s, len(text) - return decode - - -add("letter-indices", letters_encode, letters_decode, printables_rate=1., expansion_factor=None, - pattern=r"^(consonants?|vowels?|consonants?[-_]vowels?)[-_]ind(?:ex|ices)$") - +# -*- coding: UTF-8 -*- +"""Letters Codec - letter indices-related content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from string import ascii_uppercase + +from ..__common__ import * + + +__examples__ = { + 'enc(consonant-index|consonants_indices)': { + 'This is a test': "166I15I15A16E1516", + '\x00': None, + '\xff': None, + }, + 'dec(consonant-index|consonants_indices)': { + '166I15I15A16E1516': "THISISATEST", + '\x00': None, + '\xff': None, + }, + 'enc(vowel-index|vowels_indices)': {'This is a test': "TH3S3S1T2ST"}, + 'dec(vowel-index|vowels_indices)': {'TH3S3S1T2ST': "THISISATEST"}, + 'enc(consonant-vowel_indices)': {'This is a test': "C16C6V3C15V3C15V1C16V2C15C16"}, + 'dec(consonants_vowels-index)': {'C16C6V3C15V3C15V1C16V2C15C16': "THISISATEST"}, +} +__guess__ = ["consonant-index", "vowel-index", "consonants_vowels-index"] + + +VOWELS = "AEIOUY" + + +def __get_encmap(letters): + if re.match(r"^consonants?$", letters): + encmap = {c: str(i+1) for i, c in enumerate(sorted(set(ascii_uppercase) - set(VOWELS)))} + for c in VOWELS: + encmap[c] = c + elif re.match(r"^vowels?$", letters): + encmap = {c: c for c in ascii_uppercase} + for i, c in enumerate(VOWELS): + encmap[c] = str(i+1) + elif re.match(r"^consonants?[-_]vowels?$", letters): + encmap = {c: "C" + str(i+1) for i, c in enumerate(sorted(set(ascii_uppercase) - set(VOWELS)))} + for i, c in enumerate(VOWELS): + encmap[c] = "V" + str(i+1) + for c in " ": + encmap[c] = "" + return encmap + + +def letters_encode(letters): + encmap = __get_encmap(letters) + def encode(text, errors="strict"): + s = "" + for i, c in enumerate(text.upper()): + try: + s += encmap[c] + except KeyError: + s += handle_error("letter-indices", errors)(c, i) + return "".join(encmap.get(c.upper(), c) for c in text), len(text) + return encode + + +def letters_decode(letters): + decmap = {v: k for k, v in __get_encmap(letters).items()} + maxlen = max(len(x) for x in 
decmap.keys()) + def decode(text, errors="strict"): + s, i = "", 0 + while i < len(text): + err = True + for j in range(maxlen, 0, -1): + try: + s += decmap[text[i:i+j]] + i += j + err = False + break + except (IndexError, KeyError): + pass + if err: + s += handle_error("letter-indices", errors, decode=True)(text[i], i) + return s, len(text) + return decode + + +add("letter-indices", letters_encode, letters_decode, printables_rate=1., expansion_factor=None, + pattern=r"^(consonants?|vowels?|consonants?[-_]vowels?)[-_]ind(?:ex|ices)$") + diff --git a/src/codext/others/markdown.py b/src/codext/others/markdown.py old mode 100755 new mode 100644 index b3d300f..548ee2b --- a/src/codext/others/markdown.py +++ b/src/codext/others/markdown.py @@ -1,22 +1,22 @@ -# -*- coding: UTF-8 -*- -"""Markdown Codec - markdown content conversion to HTML. - -This codec: -- encodes strings from str to str -- encodes strings from bytes to bytes -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__guess__ = [] - - -try: - from markdown2 import markdown as md2html - # note: the group is NOT captured so that the pattern is only used to match the name of the codec and not to - # dynamically bind to a parametrizable encode function - add("markdown", lambda md, error="strict": (md2html(md), len(md)), pattern=r"^(?:markdown|Markdown|md)$") -except ImportError: - pass - +# -*- coding: UTF-8 -*- +"""Markdown Codec - markdown content conversion to HTML. 
+ +This codec: +- encodes strings from str to str +- encodes strings from bytes to bytes +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__guess__ = [] + + +try: + from markdown2 import markdown as md2html + # note: the group is NOT captured so that the pattern is only used to match the name of the codec and not to + # dynamically bind to a parametrizable encode function + add("markdown", lambda md, error="strict": (md2html(md), len(md)), pattern=r"^(?:markdown|Markdown|md)$") +except ImportError: + pass + diff --git a/src/codext/stegano/__init__.py b/src/codext/stegano/__init__.py old mode 100755 new mode 100644 index 0f5d06b..22b8ca5 --- a/src/codext/stegano/__init__.py +++ b/src/codext/stegano/__init__.py @@ -1,8 +1,8 @@ -# -*- coding: UTF-8 -*- -from .hexagram import * -from .klopf import * -from .resistor import * -from .rick import * -from .sms import * -from .whitespace import * - +# -*- coding: UTF-8 -*- +from .hexagram import * +from .klopf import * +from .resistor import * +from .rick import * +from .sms import * +from .whitespace import * + diff --git a/src/codext/stegano/hexagram.py b/src/codext/stegano/hexagram.py old mode 100755 new mode 100644 diff --git a/src/codext/stegano/klopf.py b/src/codext/stegano/klopf.py old mode 100755 new mode 100644 index 0a0fd24..24c1f19 --- a/src/codext/stegano/klopf.py +++ b/src/codext/stegano/klopf.py @@ -1,25 +1,25 @@ -# -*- coding: UTF-8 -*- -"""Klopf Codec - Polybius-based content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(klopf|klopfcode)': {'this is a test': "44324234 4234 11 44513444"}, -} - - -ENCMAP = {"ABCDEFGHIKLMNOPQRSTUVWXYZ"[y*5+x]: "".join([str(x+1), str(y+1)]) for x in range(5) for y in range(5)} -ENCMAP['J'] = "43" -ENCMAP[' '] = " " - - -add_map("klopf", ENCMAP, ignore_case="both", pattern=r"^(?:klopf(?:code)?)$", printables_rate=1., - expansion_factor=(1.85, .15)) - +# -*- coding: UTF-8 -*- +"""Klopf Codec - Polybius-based content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(klopf|klopfcode)': {'this is a test': "44324234 4234 11 44513444"}, +} + + +ENCMAP = {"ABCDEFGHIKLMNOPQRSTUVWXYZ"[y*5+x]: "".join([str(x+1), str(y+1)]) for x in range(5) for y in range(5)} +ENCMAP['J'] = "43" +ENCMAP[' '] = " " + + +add_map("klopf", ENCMAP, ignore_case="both", pattern=r"^(?:klopf(?:code)?)$", printables_rate=1., + expansion_factor=(1.85, .15)) + diff --git a/src/codext/stegano/resistor.py b/src/codext/stegano/resistor.py old mode 100755 new mode 100644 index 2caaf1d..badb033 --- a/src/codext/stegano/resistor.py +++ b/src/codext/stegano/resistor.py @@ -1,28 +1,28 @@ -# -*- coding: UTF-8 -*- -"""Resistor Codec - resistor color codes content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(resistor|resistor_color|condensator_color_code|condensators-color-code)': { - 'Test': "\x1b[48;5;232m \x1b[0;00m\x1b[48;5;245m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m\x1b[48;5;130m " - "\x1b[0;00m\x1b[48;5;232m \x1b[0;00m\x1b[48;5;130m \x1b[0;00m\x1b[48;5;130m \x1b[0;00m\x1b[48;5;130m " - "\x1b[0;00m\x1b[48;5;2m \x1b[0;00m\x1b[48;5;130m \x1b[0;00m\x1b[48;5;130m \x1b[0;00m\x1b[48;5;4m " - "\x1b[0;00m" - }, -} - - -ENCMAP = {i: "\033[48;5;%dm \033[0;00m" % c for i, c in zip("0123456789", [232, 130, 1, 214, 11, 2, 4, 129, 245, 231])} - - -add_map("resistor", ENCMAP, intype="ord", pattern=r"^(?:condensator|resistor)s?(?:[-_]color(?:[-_]code)?)?$", - entropy=3.4, printables_rate=.3333333333333333, expansion_factor=(56., 2.)) - +# -*- coding: UTF-8 -*- +"""Resistor Codec - resistor color codes content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(resistor|resistor_color|condensator_color_code|condensators-color-code)': { + 'Test': "\x1b[48;5;232m \x1b[0;00m\x1b[48;5;245m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m\x1b[48;5;130m " + "\x1b[0;00m\x1b[48;5;232m \x1b[0;00m\x1b[48;5;130m \x1b[0;00m\x1b[48;5;130m \x1b[0;00m\x1b[48;5;130m " + "\x1b[0;00m\x1b[48;5;2m \x1b[0;00m\x1b[48;5;130m \x1b[0;00m\x1b[48;5;130m \x1b[0;00m\x1b[48;5;4m " + "\x1b[0;00m" + }, +} + + +ENCMAP = {i: "\033[48;5;%dm \033[0;00m" % c for i, c in zip("0123456789", [232, 130, 1, 214, 11, 2, 4, 129, 245, 231])} + + +add_map("resistor", ENCMAP, intype="ord", pattern=r"^(?:condensator|resistor)s?(?:[-_]color(?:[-_]code)?)?$", + entropy=3.4, printables_rate=.3333333333333333, expansion_factor=(56., 2.)) + diff --git a/src/codext/stegano/rick.py b/src/codext/stegano/rick.py old mode 100755 new mode 100644 index 30986e4..855af95 --- a/src/codext/stegano/rick.py +++ b/src/codext/stegano/rick.py @@ -1,31 +1,31 @@ -# -*- coding: UTF-8 -*- -"""Rick Astley Codec - Rick Astley's song content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(rick|rick-astley)': {'this is a test': "TELL LET You gonna + You gonna + NEVER + TELL UP gonna TELL"}, -} - - -# inspired from: https://github.com/moongazer07/rick-cipher -ENCMAP = { - 'A': "NEVER", 'B': "GONNA", 'C': "GIVE", 'D': "YOU", 'E': "UP", 'F': "Never", 'G': "Gonna", 'H': "LET", 'I': "You", - 'J': "DOWN", 'K': "NEver", 'L': "GOnna", 'M': "TURN", 'N': "AROUND", 'O': "AND", 'P': ["DESERT", "DESSERT"], - 'Q': "YOu", 'R': "NEVer", 'S': "gonna", 'T': "TELL", 'U': "A", 'V': "LIE", 'W': "and", 'X': "HURT", 'Y': "you", - 'Z': "rick", ' ': "+", '.': ".", '\n': "\n", - '0': "0", '1': "1", '2': "2", '3': "3", '4': "4", '5': "5", '6': "6", '7': "7", '8': "8", '9': "9", - '': "astley", # silent this token for decoding ("rick astley" causes an issue with the separator " ") -} - - -add_map("rick", ENCMAP, "?", " ", ignore_case="encode", pattern=r"^rick(?:[-_]astley)?(?:[-_]cipher)?$", - printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""Rick Astley Codec - Rick Astley's song content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(rick|rick-astley)': {'this is a test': "TELL LET You gonna + You gonna + NEVER + TELL UP gonna TELL"}, +} + + +# inspired from: https://github.com/moongazer07/rick-cipher +ENCMAP = { + 'A': "NEVER", 'B': "GONNA", 'C': "GIVE", 'D': "YOU", 'E': "UP", 'F': "Never", 'G': "Gonna", 'H': "LET", 'I': "You", + 'J': "DOWN", 'K': "NEver", 'L': "GOnna", 'M': "TURN", 'N': "AROUND", 'O': "AND", 'P': ["DESERT", "DESSERT"], + 'Q': "YOu", 'R': "NEVer", 'S': "gonna", 'T': "TELL", 'U': "A", 'V': "LIE", 'W': "and", 'X': "HURT", 'Y': "you", + 'Z': "rick", ' ': "+", '.': ".", '\n': "\n", + '0': "0", '1': "1", '2': "2", '3': "3", '4': "4", '5': "5", '6': "6", '7': "7", '8': "8", '9': "9", + '': "astley", # silent this token for decoding ("rick astley" causes an issue with the separator " ") +} + + +add_map("rick", ENCMAP, "?", " ", ignore_case="encode", pattern=r"^rick(?:[-_]astley)?(?:[-_]cipher)?$", + printables_rate=1.) + diff --git a/src/codext/stegano/sms.py b/src/codext/stegano/sms.py old mode 100755 new mode 100644 index f8a5b2c..01e2be7 --- a/src/codext/stegano/sms.py +++ b/src/codext/stegano/sms.py @@ -1,27 +1,27 @@ -# -*- coding: UTF-8 -*- -"""SMS Codec - phone keystrokes content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(sms|nokia3310|nokia-3310|nokia_3310|t9)': {'this is a test': "8-44-444-7777-0-444-7777-0-2-0-8-33-7777-8"}, -} - - -ENCMAP = { - ' ': "0", 'a': "2", 'b': "22", 'c': "222", 'd': "3", 'e': "33", 'f': "333", 'g': "4", 'h': "44", 'i': "444", - 'j': "5", 'k': "55", 'l': "555", 'm': "6", 'n': "66", 'o': "666", 'p': "7", 'q': "77", 'r': "777", 's': "7777", - 't': "8", 'u': "88", 'v': "888", 'w': "9", 'x': "99", 'y': "999", 'z': "9999", '*': "*", '#': "#", -} - - -add_map("sms", ENCMAP, "?", "-_", ignore_case="encode", pattern=r"^(?:nokia(?:[-_]?3310)?|sms|t9)$", printables_rate=1., - expansion_factor=(2.9, .2)) - +# -*- coding: UTF-8 -*- +"""SMS Codec - phone keystrokes content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(sms|nokia3310|nokia-3310|nokia_3310|t9)': {'this is a test': "8-44-444-7777-0-444-7777-0-2-0-8-33-7777-8"}, +} + + +ENCMAP = { + ' ': "0", 'a': "2", 'b': "22", 'c': "222", 'd': "3", 'e': "33", 'f': "333", 'g': "4", 'h': "44", 'i': "444", + 'j': "5", 'k': "55", 'l': "555", 'm': "6", 'n': "66", 'o': "666", 'p': "7", 'q': "77", 'r': "777", 's': "7777", + 't': "8", 'u': "88", 'v': "888", 'w': "9", 'x': "99", 'y': "999", 'z': "9999", '*': "*", '#': "#", +} + + +add_map("sms", ENCMAP, "?", "-_", ignore_case="encode", pattern=r"^(?:nokia(?:[-_]?3310)?|sms|t9)$", printables_rate=1., + expansion_factor=(2.9, .2)) + diff --git a/src/codext/stegano/whitespace.py b/src/codext/stegano/whitespace.py old mode 100755 new mode 100644 index 07eaef0..f4343e4 --- a/src/codext/stegano/whitespace.py +++ 
b/src/codext/stegano/whitespace.py @@ -1,71 +1,71 @@ -# -*- coding: UTF-8 -*- -"""Whitespace Codec - whitespace/tabs content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import random -import re -from string import printable - -from ..__common__ import * - - -__examples1__ = { - 'enc(whitespace|whitespaces)': {'test': "\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t"}, - 'enc(whitespace-inv|whitespace_inverted)': {'test': " \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t "}, -} -__guess1__ = ["whitespace", "whitespace-inv"] -__guess2__ = ["whitespace+after-before", "whitespace-after+before"] - - -ENCMAP = {r'': {'0': "\t", '1': " "}, r'[-_]inv(erted)?': {'0': " ", '1': "\t"}} -add_map("whitespace", ENCMAP, intype="bin", pattern=r"^whitespaces?([-_]inv(?:erted)?)?$", examples=__examples1__, - guess=__guess1__, entropy=1., printables_rate=1., expansion_factor=8.) 
- - -def wsba_encode(p): - eq = "ord(c)" + p - def encode(text, errors="strict"): - r = [] - for i, c in enumerate(text): - if ord(c) < min(ord(c) for c in printable[:-6]): - r.append(handle_error("whitespace" + p, errors, repl_char="\x00")(c, i)) - continue - enc = "\x00" - offset = random.randint(-10,10) - while enc not in printable[:-6]: - after = random.randint(0, 20) - before = random.randint(0, 20) - enc = chr(eval(eq) % 256) - r.append(" " * before + enc + " " * after) - s = "\n".join(r) - return s, len(s) - return encode - - -def wsba_decode(p): - eq = "ord(c)" + "".join({'-':"+",'+':"-"}.get(c, c) for c in p) - def decode(text, errors="strict"): - s = "" - for i, l in enumerate(text.split("\n")): - ll = len(l.strip()) - if ll == 0: - continue - if ll > 1: - s += handle_error("whitespace_after_before", errors, decode=True, item="line")(l, i) - after = len(l) - len(l.rstrip(" ")) - before = len(l) - len(l.lstrip(" ")) - c = l[before] - s += chr(eval(eq)) - return s, len(text) - return decode - - -op = r"[+-](?:\d+(?:\.\d+)?[*/])?" -add("whitespace_after_before", wsba_encode, wsba_decode, guess=__guess2__, entropy=1., printables_rate=1., penalty=.1, - expansion_factor=(22., 3.), pattern=r"whitespace("+op+r"before"+op+r"after|"+op+r"after"+op+r"before)$") - +# -*- coding: UTF-8 -*- +"""Whitespace Codec - whitespace/tabs content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import random +import re +from string import printable + +from ..__common__ import * + + +__examples1__ = { + 'enc(whitespace|whitespaces)': {'test': "\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t"}, + 'enc(whitespace-inv|whitespace_inverted)': {'test': " \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t "}, +} +__guess1__ = ["whitespace", "whitespace-inv"] +__guess2__ = ["whitespace+after-before", "whitespace-after+before"] + + +ENCMAP = {r'': {'0': "\t", '1': " "}, r'[-_]inv(erted)?': {'0': " ", '1': "\t"}} +add_map("whitespace", ENCMAP, intype="bin", pattern=r"^whitespaces?([-_]inv(?:erted)?)?$", examples=__examples1__, + guess=__guess1__, entropy=1., printables_rate=1., expansion_factor=8.) + + +def wsba_encode(p): + eq = "ord(c)" + p + def encode(text, errors="strict"): + r = [] + for i, c in enumerate(text): + if ord(c) < min(ord(c) for c in printable[:-6]): + r.append(handle_error("whitespace" + p, errors, repl_char="\x00")(c, i)) + continue + enc = "\x00" + offset = random.randint(-10,10) + while enc not in printable[:-6]: + after = random.randint(0, 20) + before = random.randint(0, 20) + enc = chr(eval(eq) % 256) + r.append(" " * before + enc + " " * after) + s = "\n".join(r) + return s, len(s) + return encode + + +def wsba_decode(p): + eq = "ord(c)" + "".join({'-':"+",'+':"-"}.get(c, c) for c in p) + def decode(text, errors="strict"): + s = "" + for i, l in enumerate(text.split("\n")): + ll = len(l.strip()) + if ll == 0: + continue + if ll > 1: + s += handle_error("whitespace_after_before", errors, decode=True, item="line")(l, i) + after = len(l) - len(l.rstrip(" ")) + before = len(l) - len(l.lstrip(" ")) + c = l[before] + s += chr(eval(eq)) + return s, len(text) + return decode + + +op = r"[+-](?:\d+(?:\.\d+)?[*/])?" 
+add("whitespace_after_before", wsba_encode, wsba_decode, guess=__guess2__, entropy=1., printables_rate=1., penalty=.1, + expansion_factor=(22., 3.), pattern=r"whitespace("+op+r"before"+op+r"after|"+op+r"after"+op+r"before)$") + diff --git a/src/codext/web/__init__.py b/src/codext/web/__init__.py old mode 100755 new mode 100644 index b29367a..566b441 --- a/src/codext/web/__init__.py +++ b/src/codext/web/__init__.py @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- -from .html import * -from .url import * - +# -*- coding: UTF-8 -*- +from .html import * +from .url import * + diff --git a/src/codext/web/html.py b/src/codext/web/html.py old mode 100755 new mode 100644 diff --git a/src/codext/web/url.py b/src/codext/web/url.py old mode 100755 new mode 100644 index 24035a2..3abff09 --- a/src/codext/web/url.py +++ b/src/codext/web/url.py @@ -1,29 +1,29 @@ -# -*- coding: UTF-8 -*- -"""URL Codec - urlencode content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(url|urlencode)': {'?=this/is-a_test/../': "%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F"}, - 'dec(url|urlencode)': {'test/test%2etxt': "test/test.txt", 'test%2ftest.txt': "test/test.txt"} -} - - -SAFE = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-" -ENCMAP = {} -for i in range(256): - c = chr(i) - if c not in SAFE: - ENCMAP[c] = "%{:02X}".format(i) - - -add_map("url", ENCMAP, ignore_case="decode", no_error=True, pattern=r"^url(?:encode)?$", printables_rate=1., - expansion_factor=(1.2, .2)) - +# -*- coding: UTF-8 -*- +"""URL Codec - urlencode content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(url|urlencode)': {'?=this/is-a_test/../': "%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F"}, + 'dec(url|urlencode)': {'test/test%2etxt': "test/test.txt", 'test%2ftest.txt': "test/test.txt"} +} + + +SAFE = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-" +ENCMAP = {} +for i in range(256): + c = chr(i) + if c not in SAFE: + ENCMAP[c] = "%{:02X}".format(i) + + +add_map("url", ENCMAP, ignore_case="decode", no_error=True, pattern=r"^url(?:encode)?$", printables_rate=1., + expansion_factor=(1.2, .2)) + From 10ccc82a9442cbe955785ea2b5f60d51e1a6be43 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 22 Mar 2026 17:28:49 +0100 Subject: [PATCH 55/62] Updated README --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8026d48..b62cec3 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![PyPi](https://img.shields.io/pypi/v/codext.svg)](https://pypi.python.org/pypi/codext/) [![Read The Docs](https://readthedocs.org/projects/python-codext/badge/?version=latest)](https://python-codext.readthedocs.io/en/latest/?badge=latest) [![Build Status](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml/badge.svg)](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml) -[![Coverage Status](https://raw.githubusercontent.com/dhondta/python-codext/main/docs/coverage.svg)](#) +[![Coverage Status](https://raw.githubusercontent.com/dhondta/python-codext/coverage-badge/docs/coverage.svg)](#) [![Python Versions](https://img.shields.io/pypi/pyversions/codext.svg)](https://pypi.python.org/pypi/codext/) [![Known 
Vulnerabilities](https://snyk.io/test/github/dhondta/python-codext/badge.svg?targetFile=requirements.txt)](https://snyk.io/test/github/dhondta/python-codext?targetFile=requirements.txt) [![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) @@ -255,6 +255,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `adler`: Adler32 algorithm (relies on `zlib`) - [X] `crc`: CRC of lengths 8, 10-17, 21, 24, 30-32, 40, 64, 82 with a variety of polynoms +- [X] `luhn`: Luhn mod N algorithm #### [Common](https://python-codext.readthedocs.io/en/latest/enc/common) From 12400045277c5dabca8683b920fbb76fc23154aa Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 22 Mar 2026 21:17:45 +0100 Subject: [PATCH 56/62] Added new codec: polybius --- README.md | 3 +- docs/pages/enc/crypto.md | 19 +++++++++ src/codext/crypto/__init__.py | 1 + src/codext/crypto/polybius.py | 77 +++++++++++++++++++++++++++++++++++ tests/test_manual.py | 11 ++++- 5 files changed, 108 insertions(+), 3 deletions(-) create mode 100755 src/codext/crypto/polybius.py diff --git a/README.md b/README.md index b62cec3..45f488c 100644 --- a/README.md +++ b/README.md @@ -285,7 +285,8 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `bacon`: aka Baconian Cipher - [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) - [X] `citrix`: aka Citrix CTX1 password encoding -- [X] `railfence`: aka Rail Fence Cipher +- [X] `polybius`: aka Polybius Square Cipher +- [X] `railfence`: aka Rail Fence Cipher - [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) - [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) - [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) diff --git a/docs/pages/enc/crypto.md b/docs/pages/enc/crypto.md index b189c0e..71d89e9 100644 --- a/docs/pages/enc/crypto.md +++ b/docs/pages/enc/crypto.md @@ -126,6 +126,25 @@ This implements the Citrix CTX1 
password encoding algorithm. ----- +### Polybius Square Cipher + +This implements the well-known Polybius Square cipher, using the square with the alphabet in normal order as the default. It can be used dynamically with a custom alphabet. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`polybius` | text <-> polybius square ciphertext | `polybius-square`, `polybius_BACDEFGHIKLMNOPQRSTUVWXYZ`, ... | + +```python +>>> codext.encode("this is a test", "polybius") +'44232443 2443 11 44154344' +>>> codext.encode("this is a test", "polybius_BACDEFGHIKLMNOPQRSTUVWXYZ") +'44232443 2443 12 44154344' +>>> codext.decode("44232443 2443 11 441543445", "polybius-square", errors="replace") +'THIS IS A TEST?' +``` + +----- + ### Rail Fence Cipher This implements the Rail Fence encoding algorithm, using 3 rails and offset 0 as the default parameters. The encoding fence is built from the top ; the `up` flag can be used to build the fence from the bottom. Note that trying parameters that do not fit the input length will trigger a `ValueError` mentioning the bad value. diff --git a/src/codext/crypto/__init__.py b/src/codext/crypto/__init__.py index 1244bae..0854db2 100644 --- a/src/codext/crypto/__init__.py +++ b/src/codext/crypto/__init__.py @@ -4,6 +4,7 @@ from .bacon import * from .barbie import * from .citrix import * +from .polybius import * from .railfence import * from .rot import * from .scytale import * diff --git a/src/codext/crypto/polybius.py b/src/codext/crypto/polybius.py new file mode 100755 index 0000000..73fae76 --- /dev/null +++ b/src/codext/crypto/polybius.py @@ -0,0 +1,77 @@ +# -*- coding: UTF-8 -*- +"""Polybius Square Codec - polybius-square content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(polybius|polybius-square|polybius_square)': {'this is a test': "44232443 2443 11 44154344"}, + 'enc(polybius-ABCDEFGHIKLMNOPQRSTUVWXYZ)': {'this is a test': "44232443 2443 11 44154344"}, + 'dec(polybius)': {'44232443 2443 11 44154344': "THIS IS A TEST"}, +} +__guess__ = ["polybius"] + + +# Standard 5×5 Polybius square (I and J share the same cell): +# 1 2 3 4 5 +# 1 A B C D E +# 2 F G H I K +# 3 L M N O P +# 4 Q R S T U +# 5 V W X Y Z +_DEFAULT_ALPHABET = "ABCDEFGHIKLMNOPQRSTUVWXYZ" + + +def __make_maps(alphabet): + """ Build the encoding and decoding maps for the given 25-character alphabet. """ + alph = alphabet.upper() if alphabet else _DEFAULT_ALPHABET + if len(alph) != 25 or len(set(alph)) != 25: + raise LookupError("Polybius square requires exactly 25 distinct characters; " + f"got {len(alph)} character(s) with {len(set(alph))} unique: {alph}") + encmap = {alph[i]: str(i // 5 + 1) + str(i % 5 + 1) for i in range(25)} + decmap = {v: k for k, v in encmap.items()} + if 'J' not in encmap and 'I' in encmap: + encmap['J'] = encmap['I'] + encmap[' '] = ' ' + return encmap, decmap + + +def polybius_encode(alphabet=_DEFAULT_ALPHABET): + encmap, _ = __make_maps(alphabet) + def encode(text, errors="strict"): + _h = handle_error("polybius", errors) + r = "" + for pos, c in enumerate(ensure_str(text).upper()): + r += encmap[c] if c in encmap else _h(c, pos, r) + return r, len(text) + return encode + + +def polybius_decode(alphabet=_DEFAULT_ALPHABET): + _, decmap = __make_maps(alphabet) + def decode(text, errors="strict"): + _h = handle_error("polybius", errors, decode=True) + r, t, i = "", ensure_str(text), 0 + while i < len(t): + if t[i] == " ": + r += " " + i += 1 + elif i + 1 < len(t): + r += decmap.get(t[i:i+2]) or _h(t[i:i+2], 
i, r) + i += 2 + else: + r += _h(t[i], i, r) + i += 1 + return r, len(t) + return decode + + +add("polybius", polybius_encode, polybius_decode, r"^polybius(?:[-_]square)?(?:[-_]([A-Za-z]{25}))?$", + printables_rate=1., expansion_factor=(1.7, .3)) + diff --git a/tests/test_manual.py b/tests/test_manual.py index 5a5f164..25bbe18 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -8,12 +8,11 @@ from unittest import TestCase from codext.__common__ import * -from codext.binary.baudot import _check_alphabet -from codext.checksums.crc import CRC class ComplementaryTestCase(TestCase): def test_codec_baudot(self): + from codext.binary.baudot import _check_alphabet self.assertRaises(ValueError, _check_alphabet, ["BAD_ALPHABET"]) def test_codec_dna(self): @@ -23,6 +22,13 @@ def test_codec_dna(self): def test_codec_morse(self): self.assertRaises(LookupError, codecs.encode, "test", "morse-AAB") + def test_codec_polybius(self): + from codext.crypto.polybius import polybius_encode, polybius_decode + self.assertRaises(LookupError, polybius_encode, "ABC") + self.assertRaises(ValueError, polybius_decode(), "BAD_") + self.assertRaises(ValueError, polybius_decode(), "441543441") + self.assertEqual(codecs.decode("441543445", "polybius", "ignore"), "TEST") + def test_codec_sms(self): self.assertEqual(codecs.decode("A-B-222-3-4-5", "sms", "leave"), "ABcdgj") @@ -103,6 +109,7 @@ def test_codec_dummy_str_manips(self): self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") def test_codec_hash_functions(self): + from codext.checksums.crc import CRC STR = b"This is a test string!" 
for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: self.assertIsNotNone(codecs.encode(STR, h)) From 8903aea7fdb7eac166ccf8e2f778e9976e2ada1d Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 22 Mar 2026 23:18:31 +0100 Subject: [PATCH 57/62] Added new codec: vigenere --- README.md | 2 ++ docs/pages/enc/crypto.md | 18 +++++++++++ src/codext/crypto/__init__.py | 1 + src/codext/crypto/vigenere.py | 59 +++++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+) create mode 100755 src/codext/crypto/vigenere.py diff --git a/README.md b/README.md index 45f488c..098a202 100644 --- a/README.md +++ b/README.md @@ -284,12 +284,14 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `atbash`: aka Atbash Cipher - [X] `bacon`: aka Baconian Cipher - [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) +- [X] `beaufort`: aka Beaufort Cipher (variant of Vigenere Cipher) - [X] `citrix`: aka Citrix CTX1 password encoding - [X] `polybius`: aka Polybius Square Cipher - [X] `railfence`: aka Rail Fence Cipher - [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) - [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) - [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) +- [X] `vigenere`: aka Vigenere Cipher - [X] `xorN`: XOR with a single byte (*N* belongs to [1,255]) > :warning: Crypto functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. diff --git a/docs/pages/enc/crypto.md b/docs/pages/enc/crypto.md index 71d89e9..432ac3d 100644 --- a/docs/pages/enc/crypto.md +++ b/docs/pages/enc/crypto.md @@ -202,6 +202,24 @@ This is a dynamic encoding, that is, it can be called with an integer to define ----- +### Vigenere Cipher + +This is a dynamic encoding, that is, it holds the key. 
There is no default key, meaning that `vigenere` as the encoding scheme throws a `LookupError` indicating that the _key must be a non-empty alphabetic string_. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`vigenere` | text <-> Vigenere ciphertext | `vigenere-abcdef`, `vigenere_MySuperSecret` | key only consists of characters, not digits + +```python +>>> codext.encode("This is a test !", "vigenere-abababa") +'Tiit it a tfsu !' +>>> codext.encode("This is a test !", "vigenere_MySuperSecret") +'Ffam xw r liuk !' +>>> codext.decode("Tiit it a tfsu !", "vigenere-abababa") +``` + +----- + ### XOR with 1 byte This is a dynamic encoding, that is, it can be called with an integer to define the ordinal of the byte to XOR with the input text. diff --git a/src/codext/crypto/__init__.py b/src/codext/crypto/__init__.py index 0854db2..21da6d9 100644 --- a/src/codext/crypto/__init__.py +++ b/src/codext/crypto/__init__.py @@ -9,5 +9,6 @@ from .rot import * from .scytale import * from .shift import * +from .vigenere import * from .xor import * diff --git a/src/codext/crypto/vigenere.py b/src/codext/crypto/vigenere.py new file mode 100755 index 0000000..4276104 --- /dev/null +++ b/src/codext/crypto/vigenere.py @@ -0,0 +1,59 @@ +# -*- coding: UTF-8 -*- +"""Vigenere Cipher Codec - vigenere content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from string import ascii_lowercase as LC, ascii_uppercase as UC + +from ..__common__ import * + + +__examples__ = { + 'enc(beaufort)': None, + 'enc(beaufort-lemon)': {'ATTACKATDAWN': 'LLTOLBETLNPR'}, + 'enc(beaufort-key)': {'hello': 'danzq'}, + 'enc(beaufort_key)': {'Hello World': 'Danzq Cwnnh'}, + 'enc-dec(beaufort-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], + 'enc(vigenere)': None, + 'enc(vigenere-lemon)': {'ATTACKATDAWN': 'LXFOPVEFRNHR'}, + 'enc(vigenere-key)': {'hello': 'rijvs'}, + 'enc(vigenère_key)': {'Hello World': 'Rijvs Uyvjn'}, + 'enc-dec(vigenere-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], +} +__guess__ = ["beaufort-key", "beaufort-secret", "beaufort-password", + "vigenere-key", "vigenere-secret", "vigenere-password"] + + +def __make(encoding, char_func): + def code(decode=False): + def _code(key): + def _wrapper(text, errors="strict"): + k = key.lower() + if not k or not k.isalpha(): + raise LookupError(f"Bad parameter for encoding '{encoding}': key must be a non-empty alphabetic string") + result, i = [], 0 + for c in ensure_str(text): + if c in LC or c in UC: + result.append(char_func(c, k, i, decode)) + i += 1 + else: + result.append(c) + r = "".join(result) + return r, len(r) + return _wrapper + return _code + return code(), code(True) + + +bchar = lambda c, k, i, d=False: (LC if (b := c in LC) else UC)[(ord(k[i % len(k)]) - ord('a') - \ + (ord(c) - ord("Aa"[b]))) % 26] +add("beaufort", *__make("beaufort", bchar), r"beaufort(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", penalty=.1) + +vchar = lambda c, k, i, d=False: (LC if (b := c in LC) else UC)[(ord(c) - ord("Aa"[b]) + \ + [1, -1][d] * (ord(k[i % len(k)]) - ord('a'))) % 26] +add("vigenere", *__make("vigenere", vchar), r"vigen[eè]re(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", penalty=.1) + From 
16293996c86b6153ef4bc281079a68f67f4f7fc5 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 24 Mar 2026 22:40:52 +0100 Subject: [PATCH 58/62] Fine-tuned checksum codecs --- src/codext/__common__.py | 26 +++------- src/codext/checksums/adler.py | 8 +-- src/codext/checksums/crc.py | 10 ++-- src/codext/checksums/luhn.py | 91 ++++++----------------------------- tests/test_manual.py | 19 +++++--- 5 files changed, 41 insertions(+), 113 deletions(-) diff --git a/src/codext/__common__.py b/src/codext/__common__.py index 861d342..2b6d205 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -1,5 +1,6 @@ # -*- coding: UTF-8 -*- import _codecs +import builtins import codecs import hashlib import json @@ -20,22 +21,6 @@ from random import randint from string import * from types import FunctionType, ModuleType -try: # Python2 - import __builtin__ as builtins -except ImportError: - import builtins -try: # Python2 - from inspect import getfullargspec -except ImportError: - from inspect import getargspec as getfullargspec -try: # Python2 - from string import maketrans -except ImportError: - maketrans = str.maketrans -try: # Python3 - from importlib import reload -except ImportError: - pass try: import re._parser as sre_parse except ImportError: @@ -44,6 +29,8 @@ # from Python 3.11, 'sre_parse' is bound as '_parser' ; monkey-patch it for backward-compatibility re.sre_parse = sre_parse +maketrans = str.maketrans + __all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "hashlib", "i2s", @@ -277,6 +264,7 @@ def getregentry(encoding): # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; # in this case, if fenc/fdec is a decorated function, execute it with no arg if len(args) == 0: + from inspect import getfullargspec if fenc and len(getfullargspec(fenc).args) == 1: fenc = fenc() 
if fdec and len(getfullargspec(fdec).args) == 1: @@ -767,6 +755,7 @@ def remove(name): def reset(): """ Reset codext's local registry of search functions and macros. """ + from importlib import reload global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS # noqa: F824 clear() d = os.path.dirname(__file__) @@ -1142,9 +1131,8 @@ def generate_string_from_regex(regex): def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=REPEAT_MAX, yield_max=YIELD_MAX): """ Utility function to generate strings from a regex pattern. """ - i = 0 - for result in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max): - yield result + for r in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max): + yield r # guess feature objects diff --git a/src/codext/checksums/adler.py b/src/codext/checksums/adler.py index 5aa312c..16163d8 100644 --- a/src/codext/checksums/adler.py +++ b/src/codext/checksums/adler.py @@ -3,10 +3,10 @@ This is a codec for computing checksums, for use with other codecs in encoding chains. -These codecs: -- transform strings from str to str -- transform strings from bytes to bytes -- transform file content from str to bytes (write) +This codec: +- transforms strings from str to str +- transforms strings from bytes to bytes +- transforms file content from str to bytes (write) """ from zlib import adler32 diff --git a/src/codext/checksums/crc.py b/src/codext/checksums/crc.py index a057d7f..dfea7ee 100644 --- a/src/codext/checksums/crc.py +++ b/src/codext/checksums/crc.py @@ -3,10 +3,10 @@ This is a codec for computing checksums, for use with other codecs in encoding chains. 
-These codecs: -- transform strings from str to str -- transform strings from bytes to bytes -- transform file content from str to bytes (write) +This codec: +- transforms strings from str to str +- transforms strings from bytes to bytes +- transforms file content from str to bytes (write) """ from ..__common__ import add @@ -212,7 +212,7 @@ }, } -_pattern = lambda n="": r"^crc" + str(n) + r"(|[-_]?(?:%s))$" % "|".join(x for x in CRC[n].keys() if len(x) > 0) +_pattern = lambda n="": rf"^crc(?:[-_]?){n}(|[-_]?(?:{'|'.join(x for x in CRC[n].keys() if len(x) > 0)}))$" _rev_int = lambda i, l=None: int(bin(i)[2:].zfill(l or len(bin(i)[2:]))[::-1], 2) diff --git a/src/codext/checksums/luhn.py b/src/codext/checksums/luhn.py index 42905d3..be19af6 100644 --- a/src/codext/checksums/luhn.py +++ b/src/codext/checksums/luhn.py @@ -1,96 +1,33 @@ # -*- coding: UTF-8 -*- """Luhn Codec - Luhn Mod N checksum algorithm. -The Luhn algorithm, also known as the "modulus 10" algorithm, is a simple checksum -formula used to validate identification numbers (e.g. credit card numbers, IMEI -numbers). Encoding appends a check character; decoding verifies the check character -and strips it. - -The Luhn Mod N generalization extends the algorithm to alphabets of arbitrary size N. -When called as 'luhn' or 'luhn-10', the standard decimal alphabet (0-9, N=10) is -used. When called as 'luhn-' for 2 ≤ N ≤ 36, the first N characters of -'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' form the alphabet. +This is a codec for computing checksums, for use with other codecs in encoding chains. 
This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: https://en.wikipedia.org/wiki/Luhn_algorithm - https://bitcoinwiki.org/wiki/luhn-mod-n-algorithm +- transforms strings from str to str +- transforms strings from bytes to bytes +- transforms file content from str to bytes (write) """ from ..__common__ import * -__examples__ = { - 'enc(luhn|luhn-10|luhn10)': { - '7992739871': '79927398713', - '': '', - '0': '00', - '1': '18', - }, - 'dec(luhn|luhn-10|luhn10)': { - '79927398713': '7992739871', - '': '', - '00': '0', - '18': '1', - }, - 'enc-dec(luhn)': ['123456789', '0' * 10, '9999999999999999'], - 'enc-dec(luhn-16)': ['0123456789ABCDEF', 'DEADBEEF'], - 'enc-dec(luhn-36)': ['HELLO', 'WORLD123'], -} - -_FULL_ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" - - -def _luhn_encode(n=""): - mod = n if isinstance(n, int) else 10 - alphabet = _FULL_ALPHABET[:mod] - - def _encode(text, errors="strict"): - text = ensure_str(text).upper() if mod > 10 else ensure_str(text) - if not text: +def luhn(n=""): + alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"[:(mod := n if isinstance(n, int) else 10)] + def encode(data, errors="strict"): + total, data = 0, "".join(c if c in alphabet else handle_error("luhn", errors, kind="character")(c, i, data) \ + for i, c in enumerate(data)) + if not (data := ensure_str(data).upper() if mod > 10 else ensure_str(data)): return "", 0 - for pos, c in enumerate(text): - if c not in alphabet: - handle_error("luhn", errors, kind="character")(c, pos, text) - total = 0 - for i, c in enumerate(reversed(text)): + for i, c in enumerate(reversed(data)): code = alphabet.index(c) if i % 2 == 0: d = code * 2 code = d % mod + d // mod total += code check = (mod - total % mod) % mod - return text + alphabet[check], len(b(text)) - - return _encode - - -def _luhn_decode(n=""): - mod = n if isinstance(n, int) else 10 - 
alphabet = _FULL_ALPHABET[:mod] - - def _decode(text, errors="strict"): - text = ensure_str(text).upper() if mod > 10 else ensure_str(text) - if not text: - return "", 0 - for pos, c in enumerate(text): - if c not in alphabet: - handle_error("luhn", errors, decode=True, kind="character")(c, pos, text) - total = 0 - for i, c in enumerate(reversed(text)): - code = alphabet.index(c) - if i % 2 == 1: - d = code * 2 - code = d % mod + d // mod - total += code - if total % mod != 0: - handle_error("luhn", errors, decode=True)(text[-1], len(text) - 1, text[:-1]) - return text[:-1], len(b(text)) + return alphabet[check], len(b(data)) + return encode - return _decode +add("luhn", luhn, pattern=r"^luhn[-_]?(\d{1,2})?$", guess=None) -add("luhn", _luhn_encode, _luhn_decode, pattern=r"^luhn[-_]?(\d{1,2})?$", guess=None) diff --git a/tests/test_manual.py b/tests/test_manual.py index 25bbe18..e443f75 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -96,6 +96,17 @@ def test_codec_case_related_manips(self): self.assertRaises(NotImplementedError, codecs.decode, STR, "slug") self.assertRaises(NotImplementedError, codecs.decode, STR, "snake") + def test_codec_checksum_functions(self): + from codext.checksums.crc import CRC + for n, variants in CRC.items(): + for name, params in variants.items(): + enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-") + self.assertEqual(codecs.encode("123456789", enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5]) + from codext.checksums.luhn import luhn + for s, r in [("", ""), ("0", "0"), ("1", "8"), ("7992739871", "3")]: + self.assertEqual(codecs.encode(s, "luhn"), r) + self.assertEqual(codecs.encode("-", "luhn", errors="ignore"), "") + def test_codec_dummy_str_manips(self): STR = "this is a test" self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht") @@ -109,7 +120,6 @@ def test_codec_dummy_str_manips(self): self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") def 
test_codec_hash_functions(self): - from codext.checksums.crc import CRC STR = b"This is a test string!" for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: self.assertIsNotNone(codecs.encode(STR, h)) @@ -145,13 +155,6 @@ def test_codec_hash_functions(self): h = "crypt-" + m self.assertIsNotNone(codecs.encode(STR, h)) self.assertRaises(NotImplementedError, codecs.decode, STR, h) - # CRC checks - STR = "123456789" - for n, variants in CRC.items(): - for name, params in variants.items(): - enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-") - print(enc) - self.assertEqual(codecs.encode(STR, enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5]) def test_codec_markdown(self): HTM = "

Test title

\n\n

Test paragraph

\n" From 9b8d57450f931a5c10f5eab3d2e2f0aca2acdae5 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 24 Mar 2026 23:03:16 +0100 Subject: [PATCH 59/62] Refined codec: vigenere --- README.md | 1 + docs/pages/enc/crypto.md | 37 +++++++++++++++++++++++++++++++++++ src/codext/crypto/vigenere.py | 25 +++++++++++++---------- 3 files changed, 53 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 098a202..b8b5cd5 100644 --- a/README.md +++ b/README.md @@ -291,6 +291,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) - [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) - [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) +- [X] `trithemius`: aka Trithemius Cipher (variant of Vigenere Cipher) - [X] `vigenere`: aka Vigenere Cipher - [X] `xorN`: XOR with a single byte (*N* belongs to [1,255]) diff --git a/docs/pages/enc/crypto.md b/docs/pages/enc/crypto.md index 432ac3d..a08d2f0 100644 --- a/docs/pages/enc/crypto.md +++ b/docs/pages/enc/crypto.md @@ -109,6 +109,25 @@ It implements the cipher for its 4 different keys. ----- +### Beaufort Cipher + +This is a variant of the [Vigenere Cipher](#vigenere-cipher). There is no default key, meaning that `beaufort` as the encoding scheme throws a `LookupError` indicating that the _key must be a non-empty alphabetic string_. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`beaufort` | text <-> Beaufort ciphertext | `beaufort-abcdef`, `beaufort_MySuperSecret` | key only consists of characters, not digits + +```python +>>> codext.encode("This is a test !", "beaufort-abababa") +'Husj sj a hxii !' +>>> codext.encode("This is a test !", "beaufort_MySuperSecret") +'Trkc hm r zaky !' +>>> codext.decode("Husj sj a hxii !", "beaufort-abababa") +'This is a test !' +``` + +----- + ### Citrix CTX1 This implements the Citrix CTX1 password encoding algorithm.
@@ -202,6 +221,23 @@ This is a dynamic encoding, that is, it can be called with an integer to define ----- +### Trithemius Cipher + +This is a variant of the [Vigenere Cipher](#vigenere-cipher) with key `"ABCDEFGHIJKLMNOPQRSTUVWXYZ"`. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`trithemius` | text <-> Trithemius ciphertext | `trithemius`, `trithemius_cipher` | + +```python +>>> codext.encode("This is a test !", "trithemius") +'Tikv mx g ambd !' +>>> codext.decode("Tikv mx g ambd !", "trithemius") +'This is a test !' +``` + +----- + ### Vigenere Cipher This is a dynamic encoding, that is, it holds the key. There is no default key, meaning that `vigenere` as the encoding scheme throws a `LookupError` indicating that the _key must be a non-empty alphabetic string_. @@ -216,6 +252,7 @@ This is a dynamic encoding, that is, it holds the key. There is no default key, >>> codext.encode("This is a test !", "vigenere_MySuperSecret") 'Ffam xw r liuk !' >>> codext.decode("Tiit it a tfsu !", "vigenere-abababa") +'This is a test !' 
``` ----- diff --git a/src/codext/crypto/vigenere.py b/src/codext/crypto/vigenere.py index 4276104..7ff9ffc 100755 --- a/src/codext/crypto/vigenere.py +++ b/src/codext/crypto/vigenere.py @@ -11,41 +11,43 @@ from ..__common__ import * - __examples__ = { 'enc(beaufort)': None, 'enc(beaufort-lemon)': {'ATTACKATDAWN': 'LLTOLBETLNPR'}, 'enc(beaufort-key)': {'hello': 'danzq'}, 'enc(beaufort_key)': {'Hello World': 'Danzq Cwnnh'}, - 'enc-dec(beaufort-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], + 'enc(trithemius-cipher)': {'this is a test': "tikv mx g ambd"}, + 'enc(trithemius)': {'HELLO': "HFNOS", '12345!@#$': "12345!@#$"}, 'enc(vigenere)': None, 'enc(vigenere-lemon)': {'ATTACKATDAWN': 'LXFOPVEFRNHR'}, 'enc(vigenere-key)': {'hello': 'rijvs'}, 'enc(vigenère_key)': {'Hello World': 'Rijvs Uyvjn'}, + 'enc-dec(beaufort-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], + 'enc-dec(trithemius)': ["Hello, World!", "@random"], 'enc-dec(vigenere-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], } -__guess__ = ["beaufort-key", "beaufort-secret", "beaufort-password", +__guess__ = ["beaufort-key", "beaufort-secret", "beaufort-password", "trithemius", "vigenere-key", "vigenere-secret", "vigenere-password"] -def __make(encoding, char_func): +def __make(enc, cfunc): def code(decode=False): - def _code(key): - def _wrapper(text, errors="strict"): + def _wrapper(key): + def _subwrapper(text, errors="strict"): k = key.lower() if not k or not k.isalpha(): - raise LookupError(f"Bad parameter for encoding '{encoding}': key must be a non-empty alphabetic string") + raise LookupError(f"Bad parameter for encoding '{enc}': key must be a non-empty alphabetic string") result, i = [], 0 for c in ensure_str(text): if c in LC or c in UC: - result.append(char_func(c, k, i, decode)) + result.append(cfunc(c, k, i, decode)) i += 1 else: result.append(c) r = "".join(result) return r, len(r) - return _wrapper - return _code + return _subwrapper + return _wrapper return code(), 
code(True) @@ -55,5 +57,8 @@ def _wrapper(text, errors="strict"): vchar = lambda c, k, i, d=False: (LC if (b := c in LC) else UC)[(ord(c) - ord("Aa"[b]) + \ [1, -1][d] * (ord(k[i % len(k)]) - ord('a'))) % 26] +enc, dec = __make("trithemius", vchar) +add("trithemius", enc(k := "ABCDEFGHIJKLMNOPQRSTUVWXYZ"), dec(k), r"trithemius(?:[-_]cipher)?$", penalty=.1) + add("vigenere", *__make("vigenere", vchar), r"vigen[eè]re(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", penalty=.1) From eb0b6d7fa000627ab1d77f80a71c8e6a31153e4e Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 29 Mar 2026 00:53:57 +0100 Subject: [PATCH 60/62] Refined codec: vigenere (2) --- README.md | 1 + docs/pages/enc/crypto.md | 17 +++++++ src/codext/crypto/vigenere.py | 85 ++++++++++++++++++++++------------- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index b8b5cd5..f3a35fb 100644 --- a/README.md +++ b/README.md @@ -282,6 +282,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `affine`: aka Affine Cipher - [X] `atbash`: aka Atbash Cipher +- [X] `autoclave`: aka Autoclave/Autokey Cipher (variant of Vigenere Cipher) - [X] `bacon`: aka Baconian Cipher - [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) - [X] `beaufort`: aka Beaufort Cipher (variant of Vigenere Cipher) diff --git a/docs/pages/enc/crypto.md b/docs/pages/enc/crypto.md index a08d2f0..c065569 100644 --- a/docs/pages/enc/crypto.md +++ b/docs/pages/enc/crypto.md @@ -71,6 +71,23 @@ It implements the monoalphabetic substitution cipher used for the Hebrew alphabe ----- +### Autoclave/Autokey Cipher + +This is a variant of the [Vigenere Cipher](#vigenere-cipher) using a key stream generated from the primer key and the message appended. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`autoclave` | text <-> Autoclave ciphertext | `autoclave-cipher`, `autokey` | + +```python +>>> codext.encode("This is a test !", "autoclave-test") +'Mlal bz i lmkt !' +>>> codext.decode("Mlal bz i lmkt !", "autokey_cipher-test") +'This is a test !' +``` + +----- + ### Baconian Cipher It support only letters. diff --git a/src/codext/crypto/vigenere.py b/src/codext/crypto/vigenere.py index 7ff9ffc..0143046 100755 --- a/src/codext/crypto/vigenere.py +++ b/src/codext/crypto/vigenere.py @@ -11,54 +11,77 @@ from ..__common__ import * -__examples__ = { - 'enc(beaufort)': None, - 'enc(beaufort-lemon)': {'ATTACKATDAWN': 'LLTOLBETLNPR'}, - 'enc(beaufort-key)': {'hello': 'danzq'}, - 'enc(beaufort_key)': {'Hello World': 'Danzq Cwnnh'}, - 'enc(trithemius-cipher)': {'this is a test': "tikv mx g ambd"}, - 'enc(trithemius)': {'HELLO': "HFNOS", '12345!@#$': "12345!@#$"}, - 'enc(vigenere)': None, - 'enc(vigenere-lemon)': {'ATTACKATDAWN': 'LXFOPVEFRNHR'}, - 'enc(vigenere-key)': {'hello': 'rijvs'}, - 'enc(vigenère_key)': {'Hello World': 'Rijvs Uyvjn'}, - 'enc-dec(beaufort-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], - 'enc-dec(trithemius)': ["Hello, World!", "@random"], - 'enc-dec(vigenere-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], +__examples1__ = { + 'enc(autoclave)': None, + 'enc(autokey-queenly)': {'ATTACKATDAWN': 'QNXEPVYTWTWP'}, + 'enc-dec(autoclave-key)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!', 'Mixed Case 123'], } -__guess__ = ["beaufort-key", "beaufort-secret", "beaufort-password", "trithemius", - "vigenere-key", "vigenere-secret", "vigenere-password"] +__examples2__ = { + 'enc(beaufort)': None, + 'enc(beaufort-lemon)': {'ATTACKATDAWN': 'LLTOLBETLNPR'}, + 'enc(beaufort-key)': {'hello': 'danzq'}, + 'enc(beaufort_key)': {'Hello World': 'Danzq Cwnnh'}, + 'enc-dec(beaufort-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], +} +__examples3__ = { + 
'enc(trithemius-cipher)': {'this is a test': "tikv mx g ambd"}, + 'enc(trithemius)': {'HELLO': "HFNOS", '12345!@#$': "12345!@#$"}, + 'enc-dec(trithemius)': ["Hello, World!", "@random"], +} +__examples4__ = { + 'enc(vigenere)': None, + 'enc(vigenere-lemon)': {'ATTACKATDAWN': 'LXFOPVEFRNHR'}, + 'enc(vigenere-key)': {'hello': 'rijvs'}, + 'enc(vigenère_key)': {'Hello World': 'Rijvs Uyvjn'}, + 'enc-dec(vigenere-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], +} +__guess1__ = ["autoclave-key", "autoclave-password", "autoclave-secret"] +__guess2__ = ["beaufort-key", "beaufort-password", "beaufort-secret"] +__guess3__ = ["trithemius"] +__guess4__ = ["vigenere-key", "vigenere-password", "vigenere-secret"] + + +bchar = lambda c, k, i, d=False: (LC if (b := c in LC) else UC)[(ord(k[i % len(k)]) - ord('a') - \ + (ord(c) - ord("Aa"[b]))) % 26] +vchar = lambda c, k, i, d=False: (LC if (b := c in LC) else UC)[(ord(c) - ord("Aa"[b]) + \ + [1, -1][d] * (ord(k[i % len(k)]) - ord('a'))) % 26] -def __make(enc, cfunc): - def code(decode=False): +def __make(enc, char_func, key_stream=False): + def _code(decode=False): def _wrapper(key): def _subwrapper(text, errors="strict"): - k = key.lower() - if not k or not k.isalpha(): + if not (k := key.lower()) or not k.isalpha(): raise LookupError(f"Bad parameter for encoding '{enc}': key must be a non-empty alphabetic string") + if key_stream and not decode: + k += "".join(c.lower() for c in ensure_str(text) if c in LC or c in UC) result, i = [], 0 + if key_stream and decode: + k = list(k) for c in ensure_str(text): if c in LC or c in UC: - result.append(cfunc(c, k, i, decode)) + result.append(dc := char_func(c, k, i, decode)) + if key_stream and decode: + k.append(dc.lower()) i += 1 else: result.append(c) - r = "".join(result) - return r, len(r) + return (r := "".join(result)), len(r) return _subwrapper return _wrapper - return code(), code(True) + return _code(), _code(True) -bchar = lambda c, k, i, d=False: (LC if (b := c in LC) 
else UC)[(ord(k[i % len(k)]) - ord('a') - \ - (ord(c) - ord("Aa"[b]))) % 26] -add("beaufort", *__make("beaufort", bchar), r"beaufort(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", penalty=.1) +add("autoclave", *__make("autoclave", vchar, True), r"auto(?:clave|key)(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", + examples=__examples1__, guess=__guess1__, penalty=.1) + +add("beaufort", *__make("beaufort", bchar), r"beaufort(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", + examples=__examples2__, guess=__guess2__, penalty=.1) -vchar = lambda c, k, i, d=False: (LC if (b := c in LC) else UC)[(ord(c) - ord("Aa"[b]) + \ - [1, -1][d] * (ord(k[i % len(k)]) - ord('a'))) % 26] enc, dec = __make("trithemius", vchar) -add("trithemius", enc(k := "ABCDEFGHIJKLMNOPQRSTUVWXYZ"), dec(k), r"trithemius(?:[-_]cipher)?$", penalty=.1) +add("trithemius", enc(k := "ABCDEFGHIJKLMNOPQRSTUVWXYZ"), dec(k), r"trithemius(?:[-_]cipher)?$", + examples=__examples3__, guess=__guess3__, penalty=.1) -add("vigenere", *__make("vigenere", vchar), r"vigen[eè]re(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", penalty=.1) +add("vigenere", *__make("vigenere", vchar), r"vigen[eè]re(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", + examples=__examples4__, guess=__guess4__, penalty=.1) From b5414ecdfd278fb1ae598ac94c1792c2ee469087 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 29 Mar 2026 22:02:42 +0200 Subject: [PATCH 61/62] New release --- .github/workflows/python-package.yml | 5 +++++ src/codext/VERSION.txt | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index d505f24..91f67c9 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -18,12 +18,14 @@ jobs: runs-on: ubuntu-latest outputs: package: ${{ steps.pkg.outputs.package }} + pypi_url: ${{ steps.pkg.outputs.pypi_url }} steps: - name: Compute package name from the repository's id: pkg run: | name="${GITHUB_REPOSITORY##*/}" echo "package=${name#python-}" >> 
$GITHUB_OUTPUT + echo "pypi_url=https://pypi.org/p/${name#python-}" >> $GITHUB_OUTPUT build: needs: prepare runs-on: ${{ matrix.os }} @@ -96,6 +98,9 @@ jobs: git commit -m "Update coverage badge" git push origin coverage-badge --force deploy: + environment: + name: pypi + url: ${{ needs.prepare.outputs.pypi_url }} runs-on: ubuntu-latest needs: [prepare, coverage] steps: diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 4ef4640..8917394 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.11 +1.16.0 From 578f57d2a2265d45dd10915e87e4f9f99076ab6d Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 11 Apr 2026 09:53:35 +0200 Subject: [PATCH 62/62] Clarified on listing codecs from the CLI (#41) --- README.md | 107 ++++++++++++++++++++++++++++----------- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 2 +- src/codext/__init__.py | 14 ++--- 4 files changed, 87 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index f3a35fb..8100ff9 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ $ echo -en "test" | codext encode base100 👫👜👪👫 ``` -### Chaining codecs +### :chains: Chaining codecs ```sh $ echo -en "Test string" | codext encode reverse @@ -62,7 +62,7 @@ $ echo -en "AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTG test string ``` -### Using macros +### :twisted_rightwards_arrows: Using macros ```sh $ codext add-macro my-encoding-chain gzip base63 lzma base64 @@ -79,7 +79,9 @@ $ codext list macros example-macro ``` -## :computer: Usage (base CLI tool) Tweet on unbase +## :desktop_computer: Usage (`baseXX` CLI tools) Tweet on unbase + +Playing with base encodings. ```session $ echo "Test string !" | base122 @@ -106,9 +108,63 @@ $ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -f Te Test string ! ``` -## :computer: Usage (Python) +## :computer: Usage (CLI) + +Listing codecs. 
+ +```session +$ codext list encodings +a1z26 adler32 affine alternative-rot ascii +atbash autoclave bacon barbie base +base1 base2 base3 base4 base8 +<> +``` + +Finding a codec based on a name. + +```session +$ codext search bitcoin +base58 +``` + +Encoding a string. -Getting the list of available codecs: +```session +$ echo -en "This is a test" | codext encode polybius +44232443 2443 11 44154344 +``` + +Encoding a file. + +```session +$ echo -en "this is a test" > to_be_encoded.txt +$ codext encode base64 < to_be_encoded.txt > text.b64 +$ cat text.b64 +dGhpcyBpcyBhIHRlc3Q= +``` + +Chaining codecs. + +```session +$ echo -en "mrdvm6teie6t2cq=" | codext encode upper | codext decode base32 | codext decode base64 +test +``` + +Iteratively guessing decodings. + +```session +$ echo -en "test" | codext encode base64 gzip | codext guess +Codecs: gzip +dGVzdA== +$ echo -en "test" | codext encode base64 gzip | codext guess gzip -i base +Codecs: gzip, base64 +test +``` + + +## :snake: Usage (Python) + +Getting the list of available codecs. ```python >>> import codext @@ -116,6 +172,9 @@ Getting the list of available codecs: >>> codext.list() ['ascii85', 'base85', 'base100', 'base122', ..., 'tomtom', 'dna', 'html', 'markdown', 'url', 'resistor', 'sms', 'whitespace', 'whitespace-after-before'] +Playing with some base encodings. + +```python >>> codext.encode("this is a test", "base58-bitcoin") 'jo91waLQA1NNeBmZKUF' @@ -130,7 +189,21 @@ Getting the list of available codecs: >>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") 'this is a test' +``` + +Playing with some cryptography-based codecs. ```python +>>> codext.encode("This is a test !", "vigenere-MYSECRETKET") +'Ffaw kj e mowm !' + +>>> codext.encode("This is a test !", "autoclave-SECRET") +'Llkj ml t amkb !' +``` + +Encoding/decoding with various other codecs.
+ +```python >>> for i in range(8): print(codext.encode("this is a test", "dna-%d" % (i + 1))) GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA @@ -158,30 +231,6 @@ CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT f.read() 'this is a test' ->>> codext.decode(""" - = - X - : - x - n - r - y - Y - y - p - a - ` - n - | - a -o - h - ` - g - o - z """, "whitespace-after+before") -'CSC{not_so_invisible}' - >>> print(codext.encode("An example test string", "baudot-tape")) ***.** . * diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 8917394..d00a804 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.16.0 +1.16.1 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index 2b6d205..7c3a001 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -671,7 +671,7 @@ def list_categories(): c.append(d.rstrip("s")) # particular category, hardcoded from base/_base.py c += ["base-generic"] - return c + return list(set(c)) list_categories() diff --git a/src/codext/__init__.py b/src/codext/__init__.py index 2a37ebe..c503d03 100644 --- a/src/codext/__init__.py +++ b/src/codext/__init__.py @@ -170,7 +170,7 @@ def _format_action_invocation(self, action): listi = sparsers.add_parser("list", help="list items") lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", required=True) liste = lsparsers.add_parser("encodings", help="list encodings") - liste.add_argument("category", nargs="+", help="selected categories") + liste.add_argument("category", nargs="*", help="selected categories") listm = lsparsers.add_parser("macros", help="list macros") addm = sparsers.add_parser("add-macro", help="add a macro to the registry") addm.add_argument("name", help="macro's name") @@ -198,13 +198,13 @@ def _format_action_invocation(self, action): # list encodings or macros elif args.command == "list": if args.type == "encodings": - cats = args.category or list_categories() - for c in sorted(cats): 
- l = list_encodings(c) - if len(l) > 0: - if len(cats) > 0: + if args.category: + for c in sorted(args.category): + if len(l := list_encodings(c)) > 0: print(c.upper() + ":") - __print_tabular(l) + __print_tabular(l) + else: + __print_tabular(list_encodings()) elif args.type == "macros": l = list_macros() if len(l) > 0: