diff --git a/.coveragerc b/.coveragerc index 4ccc970..b677975 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,27 +1,24 @@ -[run] -source = codext -omit = - codext/__info__.py - codext/**/__init__.py - -[report] -exclude_lines = - pragma: no cover - if.*?__name__.*?==.*?.__main__.: - def main\(\)\: - def __stdin_pipe\(\)\: - for line in __stdin_pipe\(\)\: - def __format_list\(items, include\=True\)\: - def __print_tabular\(lst, space\=4\)\: - except ImportError: - except NameError: - raise NotImplementedError - if not PY3 - if PY3 - def encode\(self, input, final\=False\)\: - def decode\(self, input, final\=False\)\: - def _detect\(text\)\: - def _lang\(lang\)\: - if stopfunc\.LANG_BACKEND\: - def _validate\(stop_function, lang_backend\=\"none\"\)\: - except KeyboardInterrupt\: +[run] +source = codext +omit = + src/codext/__info__.py + src/codext/**/__init__.py + +[report] +exclude_lines = + pragma: no cover + if.*?__name__.*?==.*?.__main__.: + def main\(\)\: + def __stdin_pipe\(\)\: + for line in __stdin_pipe\(\)\: + def __format_list\(items, include\=True\)\: + def __print_tabular\(lst, space\=4\)\: + except ImportError: + except NameError: + raise NotImplementedError + def _detect\(text\)\: + def _lang\(lang\)\: + if stopfunc\.LANG_BACKEND\: + def _validate\(stop_function, lang_backend\=\"none\"\)\: + except KeyboardInterrupt\: + if alt and len\(t\) \% 2 \=\= 1\: diff --git a/.github/ISSUE_TEMPLATE/add-encoding.yml b/.github/ISSUE_TEMPLATE/add-encoding.yml new file mode 100644 index 0000000..798bea7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/add-encoding.yml @@ -0,0 +1,21 @@ +name: Add new encoding +description: Propose a new encoding to be added +title: "Add new encoding: [codec]" +labels: ["enhancement"] +body: + - type: textarea + id: description + attributes: + label: Description + description: Describe the encoding, its purpose, and how it works + placeholder: Provide a clear and concise description + validations: + required: true + - type: input + id: 
reference + attributes: + label: Reference + description: Provide a reference URL for the encoding + placeholder: https:// + validations: + required: false diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..a1d90f8 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,81 @@ +# Copilot Instructions — Enhancements Only + +## Scope +This repository focuses on **adding new encoding/decoding schemes only**. + +Copilot MUST: +- Propose **new codecs only** +- Avoid refactoring unrelated code +- Avoid dependency changes unless strictly required for the codec +- Avoid stylistic or formatting changes + +## Context +This project extends Python's codecs with many encoding/decoding schemes and a CLI tool. +It already includes a wide variety of bases, ciphers, compression, and niche encodings. + +## Enhancement Guidelines +When adding a new encoding, follow the guideline in the documentation at `docs/pages/howto.md`. + + +## Implementation Constraints + +- Pure Python preferred +- No heavy dependencies +- Deterministic transformations only +- Reversible encoding required unless explicitly documented + +## Testing + +Every new codec: +- SHOULD include a list of `__examples__` that tells the automated tests what encoding/decoding transformations need to be verified; if this cannot be done, unit tests (encode/decode roundtrip) SHALL be provided in `tests/test_manual.py` +- SHOULD cover edge cases (empty input, binary data if applicable), either in the `__examples__` list or in the explicit tests in `tests/test_manual.py` + +## Documentation + +Each codec SHALL comply with the following structure: + + ```python + # -*- coding: UTF-8 -*- + """{{codec_long_name}} Codec - {{codec_short_name}} content encoding. 
+ + {{codec_description}} + + This codec: + - en/decodes strings from str to str + - en/decodes strings from bytes to bytes + - decodes file content to str (read) + - encodes file content from str to bytes (write) + + Reference: {{codec_source_hyperlink}} + """ + from ..__common__ import * + + + __examples__ = {<>} + <>] + + + <> + <> + + + <> + ``` + +In this template, `{{ ... }}` enclosures indicate the codec's properties and `<< ... >>` enclosures indicate placeholder actions referring to steps from the documentation about how to make a codec at `docs/pages/howto.md`. + +## Output Format (IMPORTANT) + +When asked to add a codec, Copilot should: +1. Briefly justify the encoding (1–2 lines) +2. Provide full implementation (according to section _Adding a new codec to `codext`_ of the documentation at `docs/pages/howto.md`) +3. Provide tests (according to section _Self-generated tests_) +4. Add it to the `README.md` of the repository +5. Propose the update of the documentation (under the relevant page for the category of codec) + +## Explicit Non-Goals + +- No refactoring +- No performance optimization passes +- No linting-only changes +- No CI/CD changes diff --git a/.github/prompts/add_codec.prompt.md b/.github/prompts/add_codec.prompt.md new file mode 100644 index 0000000..9b470c8 --- /dev/null +++ b/.github/prompts/add_codec.prompt.md @@ -0,0 +1,17 @@ +Add a new encoding scheme to this repository. 
+ +Constraints: +- Follow copilot-instructions.md strictly +- Do not modify unrelated code +- Use existing codec patterns + +Task: +Add encoding: {{ENCODING_NAME}} + +Requirements: +- Implement according to ADDING_CODECS.md guideline +- Add tests if needed (if `__examples__` cannot be consistently defined) +- Add minimal documentation (in the relevant category page under `docs/pages`) + +Reference: +{{LINK_OR_DESCRIPTION}} \ No newline at end of file diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..9d466f2 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,8 @@ +## Checklist +- [ ] No unrelated changes +- [ ] Codec is new (not already implemented) +- [ ] Tests included (if they cannot be automated with `tests/test_generated`) +- [ ] Documentation (included in the right page in `docs/pages/enc`) + +## Description +Explain the encoding and its source. diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..91f67c9 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,128 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: build + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +permissions: + id-token: write + contents: read + +jobs: + prepare: + runs-on: ubuntu-latest + outputs: + package: ${{ steps.pkg.outputs.package }} + pypi_url: ${{ steps.pkg.outputs.pypi_url }} + steps: + - name: Compute package name from the repository's name + id: pkg + run: | + name="${GITHUB_REPOSITORY##*/}" + echo "package=${name#python-}" >> $GITHUB_OUTPUT + echo "pypi_url=https://pypi.org/p/${name#python-}" >> $GITHUB_OUTPUT + build: + needs: prepare + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: 
[ubuntu-latest] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + env: + package: ${{ needs.prepare.outputs.package }} + steps: + - uses: actions/checkout@v5 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install pytest pytest-cov coverage + pip install -r requirements.txt + pip install tinyscript>=1.31 + pip install . + - name: Test ${{ env.package }} with pytest + run: | + pytest --cov=$package + coverage: + needs: [prepare, build] + permissions: + contents: write + runs-on: ubuntu-latest + env: + cov_badge_path: docs/coverage.svg + package: ${{ needs.prepare.outputs.package }} + python_version: "3.13" + steps: + - uses: actions/checkout@v5 + with: + fetch-depth: 0 + ref: ${{ github.head_ref || github.ref_name }} + - name: Set up Python ${{ env.python_version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ env.python_version }} + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install pytest pytest-cov + pip install -r requirements.txt + pip install . 
+ - name: Make coverage badge for ${{ env.package }} + run: | + pip install genbadge[coverage] + pytest --cov=$package --cov-report=xml + genbadge coverage -i coverage.xml -o $cov_badge_path + - name: Verify Changed files + uses: tj-actions/verify-changed-files@v20 + id: changed_files + with: + files: ${{ env.cov_badge_path }} + - name: Push coverage badge + if: steps.changed_files.outputs.files_changed == 'true' + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git fetch origin + git checkout coverage-badge || git checkout -b coverage-badge + git add $cov_badge_path + git diff --cached --quiet && exit 0 + git commit -m "Update coverage badge" + git push origin coverage-badge --force + deploy: + environment: + name: pypi + url: ${{ needs.prepare.outputs.pypi_url }} + runs-on: ubuntu-latest + needs: [prepare, coverage] + steps: + - uses: actions/checkout@v5 + with: + fetch-depth: 0 + - name: Check for version change + uses: dorny/paths-filter@v4 + id: filter + with: + filters: | + version: + - '**/VERSION.txt' + - if: steps.filter.outputs.version == 'true' + name: Cleanup README + run: | + sed -ri 's/^(##*)\s*:.*:\s*/\1 /g' README.md + awk '{if (match($0,"## Supporters")) exit; print}' README.md > README + mv -f README README.md + - if: steps.filter.outputs.version == 'true' + name: Build ${{ needs.prepare.outputs.package }} package + run: python3 -m pip install --upgrade build && python3 -m build + - if: steps.filter.outputs.version == 'true' + name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.readthedocs.yml b/.readthedocs.yml index e8f4e71..aca74b8 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,6 +1,13 @@ version: 2 + +build: + os: "ubuntu-22.04" + tools: + python: "3.11" + mkdocs: - configuration: mkdocs.yml -formats: all + configuration: docs/mkdocs.yml + python: - version: 3.6 + install: + - requirements: 
docs/requirements.txt diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 39ff698..0000000 --- a/.travis.yml +++ /dev/null @@ -1,86 +0,0 @@ -language: python -jobs: - allow_failures: - - arch: arm64 - - os: osx - - python: nightly - fast_finish: true - include: - - python: 2.7 - - python: 3.6 - - python: 3.7 - - python: 3.8 - - python: 3.9 - - python: nightly - - os: windows - language: shell - before_install: choco install python2 --version 2.7.18 - env: PATH=/c/Python27:/c/Python27/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.6.8 - env: PATH=/c/Python36:/c/Python36/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.7.6 - env: PATH=/c/Python37:/c/Python37/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.8.1 - env: PATH=/c/Python38:/c/Python38/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.9.0 - env: PATH=/c/Python39:/c/Python39/Scripts:$PATH - - python: 2.7 - arch: arm64 - - python: 3.6 - arch: arm64 - - python: 3.7 - arch: arm64 - dist: focal - - python: 3.8 - arch: arm64 - - python: 3.9 - arch: arm64 - - python: nightly - arch: arm64 - - os: osx - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=2.7.18 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.6.8 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - osx_image: xcode11.3 - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.7.6 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - osx_image: xcode11.3 - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.8.1 - before_install: 
travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - osx_image: xcode11.3 - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.9.0 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION -cache: pip -install: - - python -m pip install --upgrade pip - - pip install pytest pytest-cov coveralls markdown2 six . -script: pytest --cov=codext --cov-report=term-missing tests -after_success: coveralls diff --git a/LICENSE b/LICENSE index f288702..3877ae0 100644 --- a/LICENSE +++ b/LICENSE @@ -1,674 +1,674 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. 
Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. 
To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. 
If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. 
For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. 
Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. 
This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. 
- - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. 
- - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. 
- - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. 
If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. 
If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. 
- - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. 
For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <https://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<https://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<https://www.gnu.org/licenses/why-not-lgpl.html>. + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. 
+ + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. 
+ + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. diff --git a/README.md b/README.md index 2ce70be..8100ff9 100644 --- a/README.md +++ b/README.md @@ -1,344 +1,402 @@ -

-

CodExt Tweet

-

Encode/decode anything.

- -[![PyPi](https://img.shields.io/pypi/v/codext.svg)](https://pypi.python.org/pypi/codext/) -[![Read The Docs](https://readthedocs.org/projects/python-codext/badge/?version=latest)](https://python-codext.readthedocs.io/en/latest/?badge=latest) -[![Build Status](https://travis-ci.com/dhondta/python-codext.svg?branch=master)](https://travis-ci.com/dhondta/python-codext) -[![Coverage Status](https://coveralls.io/repos/github/dhondta/python-codext/badge.svg?branch=master)](https://coveralls.io/github/dhondta/python-codext?branch=master) -[![Python Versions](https://img.shields.io/pypi/pyversions/codext.svg)](https://pypi.python.org/pypi/codext/) -[![Requirements Status](https://requires.io/github/dhondta/python-codext/requirements.svg?branch=master)](https://requires.io/github/dhondta/python-codext/requirements/?branch=master) -[![Known Vulnerabilities](https://snyk.io/test/github/dhondta/python-codext/badge.svg?targetFile=requirements.txt)](https://snyk.io/test/github/dhondta/python-codext?targetFile=requirements.txt) -[![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) -[![License](https://img.shields.io/pypi/l/codext.svg)](https://pypi.python.org/pypi/codext/) - -[**CodExt**](https://github.com/dhondta/python-codext) is a (Python2-3 compatible) library that extends the native [`codecs`](https://docs.python.org/3/library/codecs.html) library (namely for adding new custom encodings and character mappings) and provides **120+ new codecs**, hence its name combining *CODecs EXTension*. It also features a **guess mode** for decoding multiple layers of encoding and **CLI tools** for convenience. - -```sh -$ pip install codext -``` - -Want to contribute a new codec ? | Want to contribute a new macro ? -:----------------------------------:|:------------------------------------: -Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.html) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/master/codext/macros.json) - -## :mag: Demonstrations - -

Using CodExt from the command line

-

Using base tools from the command line

-

Using the unbase command line tool

- -## :computer: Usage (main CLI tool) Tweet on codext - -```session -$ codext -i test.txt encode dna-1 -GTGAGCGGGTATGTGA - -$ echo -en "test" | codext encode morse -- . ... - - -$ echo -en "test" | codext encode braille -⠞⠑⠎⠞ - -$ echo -en "test" | codext encode base100 -👫👜👪👫 -``` - -### Chaining codecs - -```sh -$ echo -en "Test string" | codext encode reverse -gnirts tseT - -$ echo -en "Test string" | codext encode reverse morse ---. -. .. .-. - ... / - ... . - - -$ echo -en "Test string" | codext encode reverse morse dna-2 -AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC - -$ echo -en "Test string" | codext encode reverse morse dna-2 octal -101107124103101107124103101107124107101107101101101107124103101107124107101107101101101107124107101107124107101107101101101107124107101107124103101107124107101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124124101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124107101107101101101107124103 - -$ echo -en "AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC" | codext -d dna-2 morse reverse -test string -``` - -### Using macros - -```sh -$ codext add-macro my-encoding-chain gzip base63 lzma base64 - -$ codext list macros -example-macro, my-encoding-chain - -$ echo -en "Test string" | codext encode my-encoding-chain -CQQFAF0AAIAAABuTgySPa7WaZC5Sunt6FS0ko71BdrYE8zHqg91qaqadZIR2LafUzpeYDBalvE///ug4AA== - -$ codext remove-macro my-encoding-chain - -$ codext list macros -example-macro -``` - -## :computer: Usage (base CLI tool) Tweet on unbase - -```session -$ echo "Test string !" | base122 -*.7!ft9�-f9Â - -$ echo "Test string !" | base91 -"ONK;WDZM%Z%xE7L - -$ echo "Test string !" | base91 | base85 -B2P|BJ6A+nO(j|-cttl% - -$ echo "Test string !" 
| base91 | base85 | base36 | base58-flickr -QVx5tvgjvCAkXaMSuKoQmCnjeCV1YyyR3WErUUErFf - -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | base58-flickr -d | base36 -d | base85 -d | base91 -d -Test string ! -``` - -```session -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -m 3 -Test string ! - -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -f Test -Test string ! -``` - -## :computer: Usage (Python) - -Getting the list of available codecs: - -```python ->>> import codext - ->>> codext.list() -['ascii85', 'base85', 'base100', 'base122', ..., 'tomtom', 'dna', 'html', 'markdown', 'url', 'resistor', 'sms', 'whitespace', 'whitespace-after-before'] - ->>> codext.encode("this is a test", "base58-bitcoin") -'jo91waLQA1NNeBmZKUF' - ->>> codext.encode("this is a test", "base58-ripple") -'jo9rA2LQwr44eBmZK7E' - ->>> codext.encode("this is a test", "base58-url") -'JN91Wzkpa1nnDbLyjtf' - ->>> codecs.encode("this is a test", "base100") -'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' - ->>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") -'this is a test' - ->>> for i in range(8): - print(codext.encode("this is a test", "dna-%d" % (i + 1))) -GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA -CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA -ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG -AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC -TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG -TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC -GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT -CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT ->>> codext.decode("GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA", "dna-1") -'this is a test' - ->>> codecs.encode("this is a test", "morse") -'- .... .. ... / .. ... / .- / - . ... -' - ->>> codecs.decode("- .... .. ... / .. ... / .- / - . ... 
-", "morse") -'this is a test' - ->>> with open("morse.txt", 'w', encoding="morse") as f: - f.write("this is a test") -14 - ->>> with open("morse.txt",encoding="morse") as f: - f.read() -'this is a test' - ->>> codext.decode(""" - = - X - : - x - n - r - y - Y - y - p - a - ` - n - | - a -o - h - ` - g - o - z """, "whitespace-after+before") -'CSC{not_so_invisible}' - ->>> print(codext.encode("An example test string", "baudot-tape")) -***.** - . * -***.* -* . - .* -* .* - . * -** .* -***.** -** .** - .* -* . -* *. * - .* -* *. -* *. * -* . -* *. -* *. * -***. - *.* -***.* - * .* -``` - -## :page_with_curl: List of codecs - -#### [BaseXX](https://python-codext.readthedocs.io/en/latest/enc/base.html) - -- [X] `base1`: useless, but for the sake of completeness -- [X] `base2`: simple conversion to binary (with a variant with a reversed alphabet) -- [X] `base3`: conversion to ternary (with a variant with a reversed alphabet) -- [X] `base4`: conversion to quarternary (with a variant with a reversed alphabet) -- [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet) -- [X] `base10`: simple conversion to decimal -- [X] `base11`: conversion to digits with a "*a*" -- [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted) -- [X] `base26`: conversion to alphabet letters -- [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html)) -- [X] `base36`: [Base36](https://en.wikipedia.org/wiki/Base36) conversion to letters and digits (with a variant inverting both groups) -- [X] `base45`: [Base45](https://datatracker.ietf.org/doc/html/draft-faltstrom-base45-04.txt) DRAFT algorithm (with a variant inverting letters and digits) -- [X] `base58`: multiple versions of 
[Base58](https://en.bitcoinwiki.org/wiki/Base58) (bitcoin, flickr, ripple) -- [X] `base62`: [Base62](https://en.wikipedia.org/wiki/Base62) conversion to lower- and uppercase letters and digits (with a variant with letters and digits inverted) -- [X] `base63`: similar to `base62` with the "`_`" added -- [X] `base64`: classical conversion according to RFC4648 with its variant URL (or *file*) (it also holds a variant with letters and digits inverted) -- [X] `base67`: custom conversion using some more special characters (also with a variant with letters and digits inverted) -- [X] `base85`: all variants of Base85 ([Ascii85](https://fr.wikipedia.org/wiki/Ascii85), [z85](https://rfc.zeromq.org/spec/32), [Adobe](https://dencode.com/string/ascii85), [(x)btoa](https://dencode.com/string/ascii85), [RFC1924](https://datatracker.ietf.org/doc/html/rfc1924), [XML](https://datatracker.ietf.org/doc/html/draft-kwiatkowski-base85-for-xml-00)) -- [X] `base91`: [Base91](http://base91.sourceforge.net) custom conversion -- [X] `base100` (or *emoji*): [Base100](https://github.com/AdamNiederer/base100) custom conversion -- [X] `base122`: [Base100](http://blog.kevinalbs.com/base122) custom conversion -- [X] `base-genericN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) ; supports any possible base - -This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `base85` codec. - -#### [Binary](https://python-codext.readthedocs.io/en/latest/enc/binary.html) - -- [X] `baudot`: supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... 
-- [X] `baudot-spaced`: variant of `baudot` ; groups of 5 bits are whitespace-separated -- [X] `baudot-tape`: variant of `baudot` ; outputs a string that looks like a perforated tape -- [X] `bcd`: _Binary Coded Decimal_, encodes characters from their (zero-left-padded) ordinals -- [X] `bcd-extended0`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `0000` -- [X] `bcd-extended1`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `1111` -- [X] `excess3`: uses Excess-3 (aka Stibitz code) binary encoding to convert characters from their ordinals -- [X] `gray`: aka reflected binary code -- [X] `manchester`: XORes each bit of the input with `01` -- [X] `manchester-inverted`: variant of `manchester` ; XORes each bit of the input with `10` -- [X] `rotateN`: rotates characters by the specified number of bits (*N* belongs to [1, 7] ; Python 3 only) - -#### [Common](https://python-codext.readthedocs.io/en/latest/enc/common.html) - -- [X] `a1z26`: keeps words whitespace-separated and uses a custom character separator -- [X] `cases`: set of case-related encodings (including camel-, kebab-, lower-, pascal-, upper-, snake- and swap-case, slugify, capitalize, title) -- [X] `dummy`: set of simple encodings (including integer, replace, reverse, word-reverse, substite and strip-spaces) -- [X] `octal`: dummy octal conversion (converts to 3-digits groups) -- [X] `octal-spaced`: variant of `octal` ; dummy octal conversion, handling whitespace separators -- [X] `ordinal`: dummy character ordinals conversion (converts to 3-digits groups) -- [X] `ordinal-spaced`: variant of `ordinal` ; dummy character ordinals conversion, handling whitespace separators - -#### [Compression](https://python-codext.readthedocs.io/en/latest/enc/compressions.html) - -- [X] `gzip`: standard Gzip compression/decompression -- [X] `lz77`: compresses the given data with the algorithm of Lempel and Ziv of 1977 -- [X] `lz78`: 
compresses the given data with the algorithm of Lempel and Ziv of 1978 -- [X] `pkzip_deflate`: standard Zip-deflate compression/decompression -- [X] `pkzip_bzip2`: standard BZip2 compression/decompression -- [X] `pkzip_lzma`: standard LZMA compression/decompression - -> :warning: Compression functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. - -#### [Cryptography](https://python-codext.readthedocs.io/en/latest/enc/crypto.html) - -- [X] `affine`: aka Affine Cipher -- [X] `atbash`: aka Atbash Cipher -- [X] `bacon`: aka Baconian Cipher -- [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) -- [X] `citrix`: aka Citrix CTX1 password encoding -- [X] `railfence`: aka Rail Fence Cipher -- [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) -- [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) -- [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) -- [X] `xorN`: XOR with a single byte (*N* belongs to [1,255]) - -> :warning: Crypto functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. 
- -#### [Hashing](https://python-codext.readthedocs.io/en/latest/enc/hashing.html) - -- [X] `blake`: includes BLAKE2b and BLAKE2s (Python 3 only ; relies on `hashlib`) -- [X] `checksums`: includes Adler32 and CRC32 (relies on `zlib`) -- [X] `crypt`: Unix's crypt hash for passwords (Python 3 and Unix only ; relies on `crypt`) -- [X] `md`: aka Message Digest ; includes MD4 and MD5 (relies on `hashlib`) -- [X] `sha`: aka Secure Hash Algorithms ; includes SHA1, 224, 256, 384, 512 (Python2/3) but also SHA3-224, -256, -384 and -512 (Python 3 only ; relies on `hashlib`) -- [X] `shake`: aka SHAKE hashing (Python 3 only ; relies on `hashlib`) - -> :warning: Hash functions are of course definitely **NOT** encoding functions ; they are implemented for convenience with the `.encode(...)` API from `codecs` and useful for chaning codecs. - -#### [Languages](https://python-codext.readthedocs.io/en/latest/enc/languages.html) - -- [X] `braille`: well-known braille language (Python 3 only) -- [X] `ipsum`: aka lorem ipsum -- [X] `galactic`: aka galactic alphabet or Minecraft enchantment language (Python 3 only) -- [X] `leetspeak`: based on minimalistic elite speaking rules -- [X] `morse`: uses whitespace as a separator -- [X] `navajo`: only handles letters (not full words from the Navajo dictionary) -- [X] `radio`: aka NATO or radio phonetic alphabet -- [X] `southpark`: converts letters to Kenny's language from Southpark (whitespace is also handled) -- [X] `southpark-icase`: case insensitive variant of `southpark` -- [X] `tap`: converts text to tap/knock code, commonly used by prisoners -- [X] `tomtom`: similar to `morse`, using slashes and backslashes - -#### [Others](https://python-codext.readthedocs.io/en/latest/enc/others.html) - -- [X] `dna`: implements the 8 rules of DNA sequences (N belongs to [1,8]) -- [X] `letter-indices`: encodes consonants and/or vowels with their corresponding indices -- [X] `markdown`: unidirectional encoding from Markdown to HTML - -#### 
[Steganography](https://python-codext.readthedocs.io/en/latest/enc/stegano.html) - -- [X] `hexagram`: uses Base64 and encodes the result to a charset of [I Ching hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) (as implemented [here](https://github.com/qntm/hexagram-encode)) -- [X] `klopf`: aka Klopf code ; Polybius square with trivial alphabetical distribution -- [X] `resistor`: aka resistor color codes -- [X] `rick`: aka Rick cipher (in reference to Rick Astley's song "*Never gonna give you up*") -- [X] `sms`: also called _T9 code_ ; uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding -- [X] `whitespace`: replaces bits with whitespaces and tabs -- [X] `whitespace_after_before`: variant of `whitespace` ; encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`") - -#### [Web](https://python-codext.readthedocs.io/en/latest/enc/web.html) - -- [X] `html`: implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) -- [X] `url`: aka URL encoding - - -## :clap: Supporters - -[![Stargazers repo roster for @dhondta/python-codext](https://reporoster.com/stars/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/stargazers) - -[![Forkers repo roster for @dhondta/python-codext](https://reporoster.com/forks/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/network/members) - -

Back to top

+

+

CodExt Tweet

+

Encode/decode anything.

+ +[![PyPi](https://img.shields.io/pypi/v/codext.svg)](https://pypi.python.org/pypi/codext/) +[![Read The Docs](https://readthedocs.org/projects/python-codext/badge/?version=latest)](https://python-codext.readthedocs.io/en/latest/?badge=latest) +[![Build Status](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml/badge.svg)](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml) +[![Coverage Status](https://raw.githubusercontent.com/dhondta/python-codext/coverage-badge/docs/coverage.svg)](#) +[![Python Versions](https://img.shields.io/pypi/pyversions/codext.svg)](https://pypi.python.org/pypi/codext/) +[![Known Vulnerabilities](https://snyk.io/test/github/dhondta/python-codext/badge.svg?targetFile=requirements.txt)](https://snyk.io/test/github/dhondta/python-codext?targetFile=requirements.txt) +[![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) +[![License](https://img.shields.io/pypi/l/codext.svg)](https://pypi.python.org/pypi/codext/) + +[**CodExt**](https://github.com/dhondta/python-codext) is a (Python2-3 compatible) library that extends the native [`codecs`](https://docs.python.org/3/library/codecs) library (namely for adding new custom encodings and character mappings) and provides **120+ new codecs**, hence its name combining *CODecs EXTension*. It also features a **guess mode** for decoding multiple layers of encoding and **CLI tools** for convenience. + +```sh +$ pip install codext +``` + +Want to contribute a new codec ? | Want to contribute a new macro ? +:----------------------------------:|:------------------------------------: +Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/main/codext/macros.json) + +## :mag: Demonstrations + +

Using CodExt from the command line

+

Using base tools from the command line

+

Using the unbase command line tool

+ +## :computer: Usage (main CLI tool) Tweet on codext + +```session +$ codext -i test.txt encode dna-1 +GTGAGCGGGTATGTGA + +$ echo -en "test" | codext encode morse +- . ... - + +$ echo -en "test" | codext encode braille +⠞⠑⠎⠞ + +$ echo -en "test" | codext encode base100 +👫👜👪👫 +``` + +### :chains: Chaining codecs + +```sh +$ echo -en "Test string" | codext encode reverse +gnirts tseT + +$ echo -en "Test string" | codext encode reverse morse +--. -. .. .-. - ... / - ... . - + +$ echo -en "Test string" | codext encode reverse morse dna-2 +AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC + +$ echo -en "Test string" | codext encode reverse morse dna-2 octal +101107124103101107124103101107124107101107101101101107124103101107124107101107101101101107124107101107124107101107101101101107124107101107124103101107124107101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124124101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124107101107101101101107124103 + +$ echo -en "AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC" | codext -d dna-2 morse reverse +test string +``` + +### :twisted_rightwards_arrows: Using macros + +```sh +$ codext add-macro my-encoding-chain gzip base63 lzma base64 + +$ codext list macros +example-macro, my-encoding-chain + +$ echo -en "Test string" | codext encode my-encoding-chain +CQQFAF0AAIAAABuTgySPa7WaZC5Sunt6FS0ko71BdrYE8zHqg91qaqadZIR2LafUzpeYDBalvE///ug4AA== + +$ codext remove-macro my-encoding-chain + +$ codext list macros +example-macro +``` + +## :desktop_computer: Usage (`baseXX` CLI tools) Tweet on unbase + +Playing with base encodings. + +```session +$ echo "Test string !" | base122 +*.7!ft9�-f9Â + +$ echo "Test string !" | base91 +"ONK;WDZM%Z%xE7L + +$ echo "Test string !" 
| base91 | base85 +B2P|BJ6A+nO(j|-cttl% + +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr +QVx5tvgjvCAkXaMSuKoQmCnjeCV1YyyR3WErUUErFf + +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | base58-flickr -d | base36 -d | base85 -d | base91 -d +Test string ! +``` + +```session +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -m 3 +Test string ! + +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -f Test +Test string ! +``` + +## :computer: Usage (CLI) + +Listing codecs. + +```session +$ codext list encodings +a1z26 adler32 affine alternative-rot ascii +atbash autoclave bacon barbie base +base1 base2 base3 base4 base8 +<> +``` + +Finding a codec based on a name. + +```session +$ codext search bitcoin +base58 +``` + +Encoding a string. + +```session +$ echo -en "This is a test" | codext encode polybius +44232443 2443 11 44154344 +``` + +Encoding a file. + +```session +$ echo -en "this is a test" > to_be_encoded.txt +$ codext encode base64 < to_be_encoded.txt > text.b64 +$ cat text.b64 +dGhpcyBpcyBhIHRlc3Q= +``` + +Chaining codecs. + +```session +$ echo -en "mrdvm6teie6t2cq=" | codext encode upper | codext decode base32 | codext decode base64 +test +``` + +Iteratively guessing decodings. + +```session +$ echo -en "test" | codext encode base64 gzip | codext guess +Codecs: gzip +dGVzdA== +$ echo -en "test" | codext encode base64 gzip | codext guess gzip -i base +Codecs: gzip, base64 +test +``` + + +## :snake: Usage (Python) + +Getting the list of available codecs. + +```python +>>> import codext + +>>> codext.list() +['ascii85', 'base85', 'base100', 'base122', ..., 'tomtom', 'dna', 'html', 'markdown', 'url', 'resistor', 'sms', 'whitespace', 'whitespace-after-before'] + +Playing with some base encodings. 
+ +```python +>>> codext.encode("this is a test", "base58-bitcoin") +'jo91waLQA1NNeBmZKUF' + +>>> codext.encode("this is a test", "base58-ripple") +'jo9rA2LQwr44eBmZK7E' + +>>> codext.encode("this is a test", "base58-url") +'JN91Wzkpa1nnDbLyjtf' + +>>> codecs.encode("this is a test", "base100") +'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' + +>>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") +'this is a test' +``` + +Playing with some cryptography-based codecs. + +```python +>>> codext.encode("This is a test !", "vigenere-MYSECRETKET") +'Ffaw kj e mowm !' + +>>> codext.encode("This is a test !", "autoclave-SECRET") +'Llkj ml t amkb !' +``` + +Encoding/decoding with various other codecs. + +```python +>>> for i in range(8): + print(codext.encode("this is a test", "dna-%d" % (i + 1))) +GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA +CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA +ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG +AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC +TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG +TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC +GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT +CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT +>>> codext.decode("GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA", "dna-1") +'this is a test' + +>>> codecs.encode("this is a test", "morse") +'- .... .. ... / .. ... / .- / - . ... -' + +>>> codecs.decode("- .... .. ... / .. ... / .- / - . ... -", "morse") +'this is a test' + +>>> with open("morse.txt", 'w', encoding="morse") as f: + f.write("this is a test") +14 + +>>> with open("morse.txt",encoding="morse") as f: + f.read() +'this is a test' + +>>> print(codext.encode("An example test string", "baudot-tape")) +***.** + . * +***.* +* . + .* +* .* + . * +** .* +***.** +** .** + .* +* . +* *. * + .* +* *. +* *. * +* . +* *. +* *. * +***. 
+ *.* +***.* + * .* +``` + +## :page_with_curl: List of codecs + +#### [BaseXX](https://python-codext.readthedocs.io/en/latest/enc/base) + +- [X] `base1`: useless, but for the sake of completeness +- [X] `base2`: simple conversion to binary (with a variant with a reversed alphabet) +- [X] `base3`: conversion to ternary (with a variant with a reversed alphabet) +- [X] `base4`: conversion to quaternary (with a variant with a reversed alphabet) +- [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet) +- [X] `base10`: simple conversion to decimal +- [X] `base11`: conversion to digits with a "*a*" +- [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted) +- [X] `base26`: conversion to alphabet letters +- [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32)) +- [X] `base36`: [Base36](https://en.wikipedia.org/wiki/Base36) conversion to letters and digits (with a variant inverting both groups) +- [X] `base45`: [Base45](https://datatracker.ietf.org/doc/html/draft-faltstrom-base45-04.txt) DRAFT algorithm (with a variant inverting letters and digits) +- [X] `base58`: multiple versions of [Base58](https://en.bitcoinwiki.org/wiki/Base58) (bitcoin, flickr, ripple) +- [X] `base62`: [Base62](https://en.wikipedia.org/wiki/Base62) conversion to lower- and uppercase letters and digits (with a variant with letters and digits inverted) +- [X] `base63`: similar to `base62` with the "`_`" added +- [X] `base64`: classical conversion according to RFC4648 with its variant URL (or *file*) (it also holds a variant with letters and digits inverted) +- [X] `base67`: custom conversion using some more special characters (also with a variant with letters and digits inverted) +- [X] 
`base85`: all variants of Base85 ([Ascii85](https://fr.wikipedia.org/wiki/Ascii85), [z85](https://rfc.zeromq.org/spec/32), [Adobe](https://dencode.com/string/ascii85), [(x)btoa](https://dencode.com/string/ascii85), [RFC1924](https://datatracker.ietf.org/doc/html/rfc1924), [XML](https://datatracker.ietf.org/doc/html/draft-kwiatkowski-base85-for-xml-00)) +- [X] `base91`: [Base91](http://base91.sourceforge.net) custom conversion +- [X] `base100` (or *emoji*): [Base100](https://github.com/AdamNiederer/base100) custom conversion +- [X] `base122`: [Base122](http://blog.kevinalbs.com/base122) custom conversion +- [X] `base-genericN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base) ; supports any possible base + +This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `base85` codec. + +#### [Binary](https://python-codext.readthedocs.io/en/latest/enc/binary) + +- [X] `baudot`: supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... 
+- [X] `baudot-spaced`: variant of `baudot` ; groups of 5 bits are whitespace-separated +- [X] `baudot-tape`: variant of `baudot` ; outputs a string that looks like a perforated tape +- [X] `bcd`: _Binary Coded Decimal_, encodes characters from their (zero-left-padded) ordinals +- [X] `bcd-extended0`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `0000` +- [X] `bcd-extended1`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `1111` +- [X] `excess3`: uses Excess-3 (aka Stibitz code) binary encoding to convert characters from their ordinals +- [X] `gray`: aka reflected binary code +- [X] `manchester`: XORes each bit of the input with `01` +- [X] `manchester-inverted`: variant of `manchester` ; XORes each bit of the input with `10` +- [X] `rotateN`: rotates characters by the specified number of bits (*N* belongs to [1, 7] ; Python 3 only) + +#### [Checksums](https://python-codext.readthedocs.io/en/latest/enc/checksums) + +- [X] `adler`: Adler32 algorithm (relies on `zlib`) +- [X] `crc`: CRC of lengths 8, 10-17, 21, 24, 30-32, 40, 64, 82 with a variety of polynomials +- [X] `luhn`: Luhn mod N algorithm + +#### [Common](https://python-codext.readthedocs.io/en/latest/enc/common) + +- [X] `a1z26`: keeps words whitespace-separated and uses a custom character separator +- [X] `cases`: set of case-related encodings (including camel-, kebab-, lower-, pascal-, upper-, snake- and swap-case, slugify, capitalize, title) +- [X] `dummy`: set of simple encodings (including integer, replace, reverse, word-reverse, substitute and strip-spaces) +- [X] `octal`: dummy octal conversion (converts to 3-digits groups) +- [X] `octal-spaced`: variant of `octal` ; dummy octal conversion, handling whitespace separators +- [X] `ordinal`: dummy character ordinals conversion (converts to 3-digits groups) +- [X] `ordinal-spaced`: variant of `ordinal` ; dummy character ordinals conversion, handling whitespace 
separators + +#### [Compression](https://python-codext.readthedocs.io/en/latest/enc/compressions) + +- [X] `gzip`: standard Gzip compression/decompression +- [X] `lz77`: compresses the given data with the algorithm of Lempel and Ziv of 1977 +- [X] `lz78`: compresses the given data with the algorithm of Lempel and Ziv of 1978 +- [X] `pkzip_deflate`: standard Zip-deflate compression/decompression +- [X] `pkzip_bzip2`: standard BZip2 compression/decompression +- [X] `pkzip_lzma`: standard LZMA compression/decompression + +> :warning: Compression functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. + +#### [Cryptography](https://python-codext.readthedocs.io/en/latest/enc/crypto) + +- [X] `affine`: aka Affine Cipher +- [X] `atbash`: aka Atbash Cipher +- [X] `autoclave`: aka Autoclave/Autokey Cipher (variant of Vigenere Cipher) +- [X] `bacon`: aka Baconian Cipher +- [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) +- [X] `beaufort`: aka Beaufort Cipher (variant of Vigenere Cipher) +- [X] `citrix`: aka Citrix CTX1 password encoding +- [X] `polybius`: aka Polybius Square Cipher +- [X] `railfence`: aka Rail Fence Cipher +- [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) +- [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) +- [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) +- [X] `trithemius`: aka Trithemius Cipher (variant of Vigenere Cipher) +- [X] `vigenere`: aka Vigenere Cipher +- [X] `xorN`: XOR with a single byte (*N* belongs to [1,255]) + +> :warning: Crypto functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. 
+ +#### [Hashing](https://python-codext.readthedocs.io/en/latest/enc/hashing) + +- [X] `blake`: includes BLAKE2b and BLAKE2s (Python 3 only ; relies on `hashlib`) +- [X] `crypt`: Unix's crypt hash for passwords (Python 3 and Unix only ; relies on `crypt`) +- [X] `md`: aka Message Digest ; includes MD4 and MD5 (relies on `hashlib`) +- [X] `sha`: aka Secure Hash Algorithms ; includes SHA1, 224, 256, 384, 512 (Python2/3) but also SHA3-224, -256, -384 and -512 (Python 3 only ; relies on `hashlib`) +- [X] `shake`: aka SHAKE hashing (Python 3 only ; relies on `hashlib`) + +> :warning: Hash functions are of course definitely **NOT** encoding functions ; they are implemented for convenience with the `.encode(...)` API from `codecs` and useful for chaining codecs. + +#### [Languages](https://python-codext.readthedocs.io/en/latest/enc/languages) + +- [X] `braille`: well-known braille language (Python 3 only) +- [X] `ipsum`: aka lorem ipsum +- [X] `galactic`: aka galactic alphabet or Minecraft enchantment language (Python 3 only) +- [X] `leetspeak`: based on minimalistic elite speaking rules +- [X] `morse`: uses whitespace as a separator +- [X] `navajo`: only handles letters (not full words from the Navajo dictionary) +- [X] `radio`: aka NATO or radio phonetic alphabet +- [X] `southpark`: converts letters to Kenny's language from Southpark (whitespace is also handled) +- [X] `southpark-icase`: case insensitive variant of `southpark` +- [X] `tap`: converts text to tap/knock code, commonly used by prisoners +- [X] `tomtom`: similar to `morse`, using slashes and backslashes + +#### [Others](https://python-codext.readthedocs.io/en/latest/enc/others) + +- [X] `dna`: implements the 8 rules of DNA sequences (N belongs to [1,8]) +- [X] `letter-indices`: encodes consonants and/or vowels with their corresponding indices +- [X] `markdown`: unidirectional encoding from Markdown to HTML + +#### [Steganography](https://python-codext.readthedocs.io/en/latest/enc/stegano) + +- [X] `hexagram`: 
uses Base64 and encodes the result to a charset of [I Ching hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) (as implemented [here](https://github.com/qntm/hexagram-encode)) +- [X] `klopf`: aka Klopf code ; Polybius square with trivial alphabetical distribution +- [X] `resistor`: aka resistor color codes +- [X] `rick`: aka Rick cipher (in reference to Rick Astley's song "*Never gonna give you up*") +- [X] `sms`: also called _T9 code_ ; uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding +- [X] `whitespace`: replaces bits with whitespaces and tabs +- [X] `whitespace_after_before`: variant of `whitespace` ; encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`") + +#### [Web](https://python-codext.readthedocs.io/en/latest/enc/web) + +- [X] `html`: implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) +- [X] `url`: aka URL encoding + + +## :clap: Supporters + +[![Stargazers repo roster for @dhondta/python-codext](https://reporoster.com/stars/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/stargazers) + +[![Forkers repo roster for @dhondta/python-codext](https://reporoster.com/forks/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/network/members) + +

Back to top

diff --git a/codext/VERSION.txt b/codext/VERSION.txt deleted file mode 100644 index 80138e7..0000000 --- a/codext/VERSION.txt +++ /dev/null @@ -1 +0,0 @@ -1.13.4 diff --git a/codext/base/base100.py b/codext/base/base100.py deleted file mode 100755 index f5faa1d..0000000 --- a/codext/base/base100.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: UTF-8 -*- -"""Base100 Codec - base100 content encoding. - -Note: only works in Python3 ; strongly inspired from https://github.com/MasterGroosha/pybase100 - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import main -from ..__common__ import * - - -# no __examples__ ; handled manually in tests/test_base.py - - -def base100_encode(input, errors="strict"): - raise NotImplementedError - - -def base100_decode(input, errors="strict"): - raise NotImplementedError - - -if PY3: - class Base100DecodeError(ValueError): - __module__ = "builtins" - - def base100_encode(input, errors="strict"): - input = b(input) - r = [240, 159, 0, 0] * len(input) - for i, c in enumerate(input): - r[4*i+2] = (c + 55) // 64 + 143 - r[4*i+3] = (c + 55) % 64 + 128 - return bytes(r), len(input) - - def base100_decode(input, errors="strict"): - input = b(_stripl(input, True, True)) - if errors == "ignore": - input = input.replace(b"\n", "") - if len(input) % 4 != 0: - raise Base100DecodeError("Bad input (length should be multiple of 4)") - r = [None] * (len(input) // 4) - for i, c in enumerate(input): - if i % 4 == 2: - tmp = ((c - 143) * 64) % 256 - elif i % 4 == 3: - r[i//4] = (c - 128 + tmp - 55) & 0xff - return bytes(r), len(input) - - -add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$", expansion_factor=1.) 
-main100 = main(100, "") - diff --git a/codext/base/base122.py b/codext/base/base122.py deleted file mode 100755 index f580ff8..0000000 --- a/codext/base/base122.py +++ /dev/null @@ -1,106 +0,0 @@ -# -*- coding: UTF-8 -*- -"""Base122 Codec - base122 content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import main -from ..__common__ import * - - -__examples__ = { - 'enc(base122|base-122)': { - 'this is a test': ":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", - b'This is another longer test string with d1g1t5 and sp3c141 characters !\n': \ - b"*\x1a\xca\x97\x19\x01Rs\x10\x18-f{QPe9\x08\xcb\x86{9Ne9\x08\x0eF+Mh 9]\x0e\xd3\x8b" - b"9N ;Z.FA\x01H13L.C)\x01Bn2\x08\x0e7\x01MF1\x1a\x0c$\x06\x1b!Br0XnF+If \x10B@" - }, - 'enc-dec(base_122)': ["@random"], -} if PY3 else {'enc(base122': None} - - -_BAD = [0, 10, 13, 34, 38, 92] -_i = lambda c: c if isinstance(c, int) else ord(c) - - -def base122_encode(input, errors='strict'): - raise NotImplementedError - - -def base122_decode(input, errors='strict'): - raise NotImplementedError - - -if PY3: - # inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js - def base122_encode(input, errors="strict"): - idx, bit, r, l = 0, 0, [], len(input) - - def _get_7bits(idx, bit): - if idx >= l: - return idx, bit, False - B1 = _i(input[idx]) - p1 = (((254 >> bit) & B1) << bit) >> 1 - bit += 7 - if bit < 8: - return idx, bit, p1 - bit -= 8 - idx += 1 - if idx >= l: - return idx, bit, p1 - B2 = _i(input[idx]) - p2 = (((65280 >> bit) & B2) & 255) >> (8 - bit) - return idx, bit, (p1 | p2) - - while True: - if idx >= l: - break - # get seven bits of input data - idx, bit, B = _get_7bits(idx, bit) - # check for illegal chars - try: - bad_idx = _BAD.index(B) - except ValueError: - r.append(B) - continue - idx, bit, nB = _get_7bits(idx, bit) - if nB is False: - nB, bad_idx = 
B, 7 - B1, B2 = 194, 128 - B1 |= (7 & bad_idx) << 2 - B1 |= int((nB & 64) > 0) - B2 |= nB & 63 - r.extend([B1, B2]) - return "".join(map(chr, r)).encode("latin-1"), len(input) - - # inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js - def base122_decode(input, errors="strict"): - currB, bob, r, input = 0, 0, [], list(map(ord, input)) - - def _get_7bits(currB, bob, B, decoded): - B <<= 1 - currB |= (B % 0x100000000) >> bob - bob += 7 - if bob >= 8: - decoded += [currB] - bob -= 8 - return (B << (7 - bob)) & 255, bob - - for i in range(len(input)): - if input[i] >= 128: - try: - currB, bob = _get_7bits(currB, bob, _BAD[(input[i] >> 8) & 7], r) - except IndexError: - pass - currB, bob = _get_7bits(currB, bob, input[i] & 127, r) - else: - currB, bob = _get_7bits(currB, bob, input[i], r) - return "".join(map(chr, r)).rstrip("\0"), len(input) - - -add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) -main122 = main(122, "", wrap=False) - diff --git a/codext/binary/rotate.py b/codext/binary/rotate.py deleted file mode 100755 index 944e2b2..0000000 --- a/codext/binary/rotate.py +++ /dev/null @@ -1,52 +0,0 @@ -# -*- coding: UTF-8 -*- -"""Rotate-Bits Codec - rotate-N-bits content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(rotate-0|rotate-8|rotate-left-8)': None, - 'enc(rotate1|rotate-right-1|rotate_1)': {'This is a test': "*4\xb4\xb9\x10\xb4\xb9\x10\xb0\x10:\xb2\xb9:"}, - 'enc(rotate-left-1|rotate_left_1)': {'This is a test': "¨ÐÒæ@Òæ@Â@èÊæè"}, -} -__guess__ = ["rotate-%d" % i for i in range(1, 8)] + ["rotate-left-%d" % i for i in range(1, 8)] - - -if PY3: - def _getn(i): - m = 1 - if str(i).startswith("left"): - i = i[4:].lstrip("-_") - m = -1 - return m * int(i) - - - def _rotaten(text, n=1): - r = "" - for c in ensure_str(text): - b = bin(ord(c))[2:].zfill(8) - r += chr(int(b[-n:] + b[:-n], 2)) - return r - - - def rotate_encode(i): - def encode(text, errors="strict"): - return _rotaten(text, _getn(i)), len(text) - return encode - - - def rotate_decode(i): - def decode(text, errors="strict"): - return _rotaten(text, -_getn(i)), len(text) - return decode - - - add("rotate", rotate_encode, rotate_decode, r"rotate(?:[-_]?bits)?[-_]?((?:(?:left|right)[-_]?)?[1-7])$", - transitive=True) - diff --git a/codext/compressions/pkzip.py b/codext/compressions/pkzip.py deleted file mode 100755 index 47d9cd5..0000000 --- a/codext/compressions/pkzip.py +++ /dev/null @@ -1,56 +0,0 @@ -# -*- coding: UTF-8 -*- -"""Pkzip Codec - pkzip content compression. - -NB: Not an encoding properly speaking. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import zipfile - -from ..__common__ import * - - -_str = ["test", "This is a test", "@random{512,1024,2048}"] -__examples1__ = {'enc-dec(pkzip-deflate|deflate)': _str} -__examples2__ = {'enc-dec(pkzip_bz2|bzip2)': _str} -__examples3__ = {'enc-dec(pkzip-lzma|lzma)': _str} - - -if PY3: - NULL = { - 8: b"\x03\x00", - 12: b"BZh9\x17rE8P\x90\x00\x00\x00\x00", - 14: b"\t\x04\x05\x00]\x00\x00\x80\x00\x00\x83\xff\xfb\xff\xff\xc0\x00\x00\x00", - } - - - def pkzip_encode(compression_type): - def _encode(text, errors="strict"): - c = zipfile._get_compressor(compression_type) - return c.compress(b(text)) + c.flush(), len(text) - return _encode - - - def pkzip_decode(compression_type, name): - def _decode(data, errors="strict"): - d = zipfile._get_decompressor(compression_type) - r = d.decompress(b(data)) - if len(r) == 0 and b(data) != NULL[compression_type]: - return handle_error(name, errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) - return r, len(r) - return _decode - - - add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate", - examples=__examples1__, guess=["deflate"]) - - add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2", - examples=__examples2__, guess=["bz2"]) - - add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma", - examples=__examples3__, guess=["lzma"]) - diff --git a/codext/hashing/blake.py b/codext/hashing/blake.py deleted file mode 100644 index 2fad090..0000000 --- a/codext/hashing/blake.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: UTF-8 -*- -"""Case Codecs - string hashing with blake. - -These are codecs for hashing strings, for use with other codecs in encoding chains. 
- -These codecs: -- transform strings from str to str -- transform strings from bytes to bytes -- transform file content from str to bytes (write) -""" -import hashlib - -from ..__common__ import add, b, PY3 - - -if PY3: - def blake_hash(c): - def _hash_transform(l): - l = (l or "64" if c == "b" else "32").lstrip("_-") - def _encode(data, error="strict"): - return getattr(hashlib, "blake2%s" % c)(b(data), digest_size=int(l)).hexdigest(), len(data) - return _encode - return _hash_transform - - add("blake2b", blake_hash("b"), pattern=r"^blake2b(|[-_](?:[1-9]|[1-5]\d|6[0-4]))$", guess=None) - add("blake2s", blake_hash("s"), pattern=r"^blake2s(|[-_](?:[1-9]|[1-2]\d|3[0-2]))$", guess=None) - diff --git a/codext/hashing/crypt.py b/codext/hashing/crypt.py deleted file mode 100644 index caf8290..0000000 --- a/codext/hashing/crypt.py +++ /dev/null @@ -1,29 +0,0 @@ -# -*- coding: UTF-8 -*- -"""Case Codecs - string hashing with Unix's Crypt. - -These are codecs for hashing strings, for use with other codecs in encoding chains. 
- -These codecs: -- transform strings from str to str -- transform strings from bytes to bytes -- transform file content from str to bytes (write) -""" -from ..__common__ import add, ensure_str, PY3, UNIX - - -if PY3 and UNIX: - import crypt - - METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] - - def crypt_hash(method): - method = (method or "").lstrip("-_") or "blowfish" - if method not in METHODS: - raise NotImplementedError("method '%s' is not implemented" % method) - def _encode(input, error="strict"): - m = getattr(crypt, "METHOD_" + method.upper()) - return crypt.crypt(ensure_str(input), crypt.mksalt(m)), len(input) - return _encode - - add("crypt", crypt_hash, pattern=r"^crypt(|[-_](?:%s))$" % "|".join(METHODS), guess=None) - diff --git a/codext/hashing/shake.py b/codext/hashing/shake.py deleted file mode 100644 index af79dce..0000000 --- a/codext/hashing/shake.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: UTF-8 -*- -"""Case Codecs - string hashing with SHAKE. - -These are codecs for hashing strings, for use with other codecs in encoding chains. 
- -These codecs: -- transform strings from str to str -- transform strings from bytes to bytes -- transform file content from str to bytes (write) -""" -import hashlib - -from ..__common__ import add, b, PY3 - - -if PY3: - def shake_hash(i): - def _hash_transform(l): - l = (l or str(i)).lstrip("_-") - def _encode(data, error="strict"): - return getattr(hashlib, "shake_%d" % i)(b(data)).hexdigest(int(l)), len(data) - return _encode - return _hash_transform - - add("shake_128", shake_hash(128), pattern=r"^shake[-_]?128(|[-_][1-9]\d*)$", guess=None) - add("shake_256", shake_hash(256), pattern=r"^shake[-_]?256(|[-_][1-9]\d*)$", guess=None) - diff --git a/codext/stegano/hexagram.py b/codext/stegano/hexagram.py deleted file mode 100755 index 4c32095..0000000 --- a/codext/stegano/hexagram.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: UTF-8 -*- -"""Hexagram Codec - hexagram content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -if PY3: - __examples__ = { - 'enc(hexagram|iching|i-ching-hexagrams)': {'this is a test': "䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯"}, - } - - ENCMAP = {c1: c2 for c1, c2 in zip("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=", - "䷁䷗䷆䷒䷎䷣䷭䷊䷏䷲䷧䷵䷽䷶䷟䷡䷇䷂䷜䷻䷦䷾䷯䷄䷬䷐䷮䷹䷞䷰䷛䷪䷖䷚䷃䷨䷳䷕" - "䷑䷙䷢䷔䷿䷥䷷䷝䷱䷍䷓䷩䷺䷼䷴䷤䷸䷈䷋䷘䷅䷉䷠䷌䷫䷀☯")} - DECMAP = {c2: c1 for c1, c2 in ENCMAP.items()} - - def hexagram_encode(input, errors="strict"): - return "".join(ENCMAP[c] for c in codecs.encode(input, "base64")), len(input) - - def hexagram_decode(input, errors="strict"): - r, ehandler = "", handle_error("hexagram", errors, decode=True) - for i, c in enumerate(input): - try: - r += DECMAP[c] - except KeyError: - r += ehandler(c, i, r) - return codecs.decode(r, "base64"), len(input) - - add("hexagram", hexagram_encode, hexagram_decode, printables_rate=0., - pattern=r"^(?:(?:i-ching-)?hexagrams?|i-?ching)$") - diff 
--git a/docs/coverage.svg b/docs/coverage.svg new file mode 100644 index 0000000..efa3c52 --- /dev/null +++ b/docs/coverage.svg @@ -0,0 +1 @@ +coverage: 98.83%coverage98.83% \ No newline at end of file diff --git a/docs/imgs/logo.png b/docs/imgs/logo.png deleted file mode 100644 index d14178d..0000000 Binary files a/docs/imgs/logo.png and /dev/null differ diff --git a/docs/js/collapsible-navbar.js b/docs/js/collapsible-navbar.js deleted file mode 100644 index b1e1593..0000000 --- a/docs/js/collapsible-navbar.js +++ /dev/null @@ -1,54 +0,0 @@ -String.prototype.format = function() { - a = this; - for (k in arguments) { - a = a.replace("{" + k + "}", arguments[k]) - } - return a -} - -$(document).ready(function () { - $('li.toctree-l1').each(function () { - var parent = $(this); - var span = parent.find('span:first'); - var sibling = null; - var remove = true; - $('li.toctree-l1').each(function() { - var a = $(this).find('a:first'); - if (a.text() != '' && a.text() == span.text()) { - parent.prepend(a); - span.remove(); - span = a; - if ($(this).hasClass('current')) parent.addClass('current'); - sibling = $(this); - return false - } - }); - if (sibling === null && parent.find('ul.subnav:not(li.toctree-l2)').children('li').length) { - sibling = parent; - remove = false; - } - if (sibling !== null) { - var ul = parent.find('ul.subnav:not(li.toctree-l2)'); - var new_a = ''; - if (!ul.children('li.current').length && !parent.hasClass('current')) { - ul.hide(); - $(new_a.format("left")).insertBefore(span); - } else { - $(new_a.format("down")).insertBefore(span); - } - if (remove) sibling.remove(); - } - }); - $('a.collapse-navbar').click(function () { - var parent = $(this).closest('li.toctree-l1'); - var subnav = parent.find('ul.subnav:not(li.toctree-l2)'); - if ($(this).hasClass('fa-caret-left')) { - subnav.show(); - $(this).removeClass('fa-caret-left'); - $(this).addClass('fa-caret-down'); - } else { - subnav.hide(); - $(this).addClass('fa-caret-left'); - 
$(this).removeClass('fa-caret-down'); - } -});}); diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml new file mode 100644 index 0000000..af5a4b3 --- /dev/null +++ b/docs/mkdocs.yml @@ -0,0 +1,58 @@ +site_author: dhondta +site_name: "Codext - Extension of native codecs for Python" +repo_url: https://github.com/dhondta/python-codext +copyright: Copyright © 2021-2026 Alexandre D'Hondt +docs_dir: pages +nav: + - Introduction: index.md + - Features: features.md + - 'Guess mode': guessing.md + - Encodings: + - Base: enc/base.md + - Binary: enc/binary.md + - Common: enc/common.md + - Compressions: enc/compressions.md + - Cryptography: enc/crypto.md + - Hashing: enc/hashing.md + - Languages: enc/languages.md + - Others: enc/others.md + - Steganography: enc/stegano.md + - 'String manipulations': manipulations.md + - 'CLI tool': cli.md + - 'Create your codec': howto.md +extra: + generator: false + social: + - icon: fontawesome/solid/paper-plane + link: mailto:alexandre.dhondt@gmail.com + name: Contact Alex + - icon: fontawesome/brands/github + link: https://github.com/dhondta + name: Alex on GitHub + - icon: fontawesome/brands/linkedin + link: https://www.linkedin.com/in/alexandre-d-2ab2aa14/ + name: Alex on LinkedIn + - icon: fontawesome/brands/twitter + link: https://twitter.com/alex_dhondt + name: Alex on Twitter +extra_css: + - css/extra.css +plugins: + - search +theme: + name: material + palette: + - scheme: default + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - scheme: slate + toggle: + icon: material/brightness-4 + name: Switch to light mode + logo: img/logo.png + favicon: img/icon.png +markdown_extensions: + - toc: + permalink: true + - admonition diff --git a/docs/cli.md b/docs/pages/cli.md similarity index 97% rename from docs/cli.md rename to docs/pages/cli.md index 111913c..4b22cd4 100644 --- a/docs/cli.md +++ b/docs/pages/cli.md @@ -1,184 +1,182 @@ -## CLI Tool - -`codext` has a Command-Line Interface tool. 
- ------ - -### Using Codext from the terminal - -The help message describes everything to know: - -```sh -usage: codext [-h] [-i INFILE] [-o OUTFILE] [-s] {encode,decode,guess,search} ... - -Codecs Extension (CodExt) 1.8.1 - -Author : Alexandre D'Hondt (alexandre.dhondt@gmail.com) -Copyright: © 2019-2021 A. D'Hondt -License : GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html) -Source : https://github.com/dhondta/python-codext - -This tool allows to encode/decode input strings/files with an extended set of codecs. - -positional arguments: - {encode,decode,guess,search} - command to be executed - encode encode input using the specified codecs - decode decode input using the specified codecs - guess try guessing the decoding codecs - search search for codecs - -optional arguments: - -h, --help show this help message and exit - -i INFILE, --input-file INFILE - input file (if none, take stdin as input) - -o OUTFILE, --output-file OUTFILE - output file (if none, display result to stdout) - -s, --strip-newlines strip newlines from input - -usage examples: -- codext search bitcoin -- codext decode base32 -i file.b32 -- codext encode morse < to_be_encoded.txt -- echo "test" | codext encode base100 -- echo -en "test" | codext encode braille -o test.braille -- codext encode base64 < to_be_encoded.txt > text.b64 -- echo -en "test" | codext encode base64 | codext encode base32 -- echo -en "mrdvm6teie6t2cq=" | codext encode upper | codext decode base32 | codext decode base64 -- echo -en "test" | codext encode upper reverse base32 | codext decode base32 reverse lower -- echo -en "test" | codext encode upper reverse base32 base64 morse -- echo -en "test" | codext encode base64 gzip | codext guess -- echo -en "test" | codext encode base64 gzip | codext guess gzip -c base -``` - -!!! note "Input/output" - - STDIN can be used as shown in an example from the help message, like when using the common Linux tool `base64`. 
- - Unless an output file is specified, the result is displayed in STDOUT. - -!!! note "Encodings chaining" - - Encodings can be chained as shown in the last examples of the help message. This can be practical for quickly manipulating data. - -### Execution examples - -**Scenario 1**: 2-stages encoded flag - -Creating the payload: - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 -pwTDSWRUbXTuMQs5EDgKpjgW8MiJVw1 -``` - -From this point, the only thing we know is that we are searching for "*flag*" (with eventually other characters, i.e. leetspeak). - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag -Codecs: base58, rotate-3 -A somewhat weird F1@9 ! -``` - -Executing the previous command will take a few tens of seconds. With few stages to be guessed, using the scoring heuristic can be far quicker to get to the right output. The following takes less than a second. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag --heuristic -Codecs: base58, rotate-3 -A somewhat weird F1@9 ! -``` - -**Scenario 2**: Multi-stage-encoded flag - -Creating the payload: - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse -.... -.-- --.- --. -- ....- - -.- -- . ..... -..- --. ..--- .-.. .. . .- ..... .-- -.-. ..... -.. --- -. --.- --.- . --. -- .-. --... ..-. ..- --.- -.-. -- -...- -...- -...- -``` - -When looking at the string, it is easy to figure out it is morse. The problem, at this point, is that this codec is case-insensitive and always returns lowercase characters, as shown hereafter. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse -hyqgm4tkme5xg2liea5wc5donqqegmr7fuqcm=== -``` - -In order to get it guessed as Base32, it is necessary to put it back to uppercase (in other words, decode from lowercase). - -```session -$ echo "A somewhat weird F1@9 !" 
| codext encode barbie-1 base32 morse | codext decode morse lowercase -HYQGM4TKME5XG2LIEA5WC5DONQQEGMR7FUQCM=== -``` - -Now that we know we are searching for something with "*flag*" (with eventually other characters), we can use the predefined "`flag`" stop function. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase | codext guess -f flag -Codecs: base32, barbie -A somewhat weird F1@9 ! -``` - -**Scenario 3**: Base-encoded rotated shifted secret (English) message - -Creating the payload: - -```session -$ echo "My super secret string" | codext encode shift-1 rotate-2 base58 base64 -NDNxaFdieXh0Z29XOVZpWWpjRGNpRWgyZE44Z2FNU0g= -``` - -First, we shall simplify as much as possible ; we can easily guess that Base64 was used as the first encoding scheme: - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext rank -[+] 1.00002: base62 -[+] 0.99401: base64 -[+] 0.70806: rotate-1 -[+] 0.70806: rotate-2 -[+] 0.70806: rotate-3 -[+] 0.70806: rotate-4 -[+] 0.70806: rotate-5 -[+] 0.70806: rotate-6 -[+] 0.70806: rotate-7 -[+] 0.70806: rotate-left-1 - -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base62 -%¤q ´!.[æ&[fÿhbð^ - -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 -h4nRqFifSnRjFfQxRHuVpxjxpP8cCR -``` - -Afterwards, we can still try to simplify ; - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext rank -[+] 1.00185: base58 -[+] 0.99091: base62 -[+] 0.67001: rotate-1 -[+] 0.67001: rotate-2 -[+] 0.67001: rotate-3 -[+] 0.67001: rotate-4 -[+] 0.67001: rotate-5 -[+] 0.67001: rotate-6 -[+] 0.67001: rotate-7 -[+] 0.67001: rotate-left-1 -``` - -From here, let us assume that `base58` is effectively the right second-stage encoding. Guessing the two remaining encodings with no more information will now take a few seconds. 
As multiple outputs can be recognized as normal text, we will use the "`-s`" option not to stop on the first output successfully decoded as text. Moreover, if we have the intuition that the output shall be English text, we can use a more refined stop function like "`lang_en`" with the "`-f`" option. - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext decode base58 | codext guess -s -f lang_en -[...] -[+] rotate-2, rot-1: My!super!secret!string -[+] rotate-2, rot-23: Qc!wytiv!wigvix!wxvmrk -[+] rotate-2, shift-1: My super secret string -[+] rotate-2, shift-20: :f\r`b]R_\r`RP_Ra\r`a_V[T -[...] -[+] rotate-left-6, shift-1: My super secret string -^C^C^C -``` - -We can then stop the research with Ctrl+C. The right output has been found ! - +`codext` has a Command-Line Interface tool. + +----- + +### Using Codext from the terminal + +The help message describes everything to know: + +```sh +usage: codext [-h] [-i INFILE] [-o OUTFILE] [-s] {encode,decode,guess,search} ... + +Codecs Extension (CodExt) 1.8.1 + +Author : Alexandre D'Hondt (alexandre.dhondt@gmail.com) +Copyright: © 2019-2021 A. D'Hondt +License : GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html) +Source : https://github.com/dhondta/python-codext + +This tool allows to encode/decode input strings/files with an extended set of codecs. 
+ +positional arguments: + {encode,decode,guess,search} + command to be executed + encode encode input using the specified codecs + decode decode input using the specified codecs + guess try guessing the decoding codecs + search search for codecs + +optional arguments: + -h, --help show this help message and exit + -i INFILE, --input-file INFILE + input file (if none, take stdin as input) + -o OUTFILE, --output-file OUTFILE + output file (if none, display result to stdout) + -s, --strip-newlines strip newlines from input + +usage examples: +- codext search bitcoin +- codext decode base32 -i file.b32 +- codext encode morse < to_be_encoded.txt +- echo "test" | codext encode base100 +- echo -en "test" | codext encode braille -o test.braille +- codext encode base64 < to_be_encoded.txt > text.b64 +- echo -en "test" | codext encode base64 | codext encode base32 +- echo -en "mrdvm6teie6t2cq=" | codext encode upper | codext decode base32 | codext decode base64 +- echo -en "test" | codext encode upper reverse base32 | codext decode base32 reverse lower +- echo -en "test" | codext encode upper reverse base32 base64 morse +- echo -en "test" | codext encode base64 gzip | codext guess +- echo -en "test" | codext encode base64 gzip | codext guess gzip -c base +``` + +!!! note "Input/output" + + STDIN can be used as shown in an example from the help message, like when using the common Linux tool `base64`. + + Unless an output file is specified, the result is displayed in STDOUT. + +!!! note "Encodings chaining" + + Encodings can be chained as shown in the last examples of the help message. This can be practical for quickly manipulating data. + +### Execution examples + +**Scenario 1**: 2-stages encoded flag + +Creating the payload: + +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 +pwTDSWRUbXTuMQs5EDgKpjgW8MiJVw1 +``` + +From this point, the only thing we know is that we are searching for "*flag*" (with eventually other characters, i.e. leetspeak). 
+ +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag +Codecs: base58, rotate-3 +A somewhat weird F1@9 ! +``` + +Executing the previous command will take a few tens of seconds. With few stages to be guessed, using the scoring heuristic can be far quicker to get to the right output. The following takes less than a second. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag --heuristic +Codecs: base58, rotate-3 +A somewhat weird F1@9 ! +``` + +**Scenario 2**: Multi-stage-encoded flag + +Creating the payload: + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse +.... -.-- --.- --. -- ....- - -.- -- . ..... -..- --. ..--- .-.. .. . .- ..... .-- -.-. ..... -.. --- -. --.- --.- . --. -- .-. --... ..-. ..- --.- -.-. -- -...- -...- -...- +``` + +When looking at the string, it is easy to figure out it is Morse. The problem, at this point, is that this codec is case-insensitive and always returns lowercase characters, as shown hereafter. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse +hyqgm4tkme5xg2liea5wc5donqqegmr7fuqcm=== +``` + +In order to get it guessed as Base32, it is necessary to put it back to uppercase (in other words, decode from lowercase). + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase +HYQGM4TKME5XG2LIEA5WC5DONQQEGMR7FUQCM=== +``` + +Now that we know we are searching for something with "*flag*" (possibly with other characters), we can use the predefined "`flag`" stop function. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase | codext guess -f flag +Codecs: base32, barbie +A somewhat weird F1@9 !
+``` + +**Scenario 3**: Base-encoded rotated shifted secret (English) message + +Creating the payload: + +```session +$ echo "My super secret string" | codext encode shift-1 rotate-2 base58 base64 +NDNxaFdieXh0Z29XOVZpWWpjRGNpRWgyZE44Z2FNU0g= +``` + +First, we shall simplify as much as possible ; we can easily guess that Base64 was used as the first encoding scheme: + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext rank +[+] 1.00002: base62 +[+] 0.99401: base64 +[+] 0.70806: rotate-1 +[+] 0.70806: rotate-2 +[+] 0.70806: rotate-3 +[+] 0.70806: rotate-4 +[+] 0.70806: rotate-5 +[+] 0.70806: rotate-6 +[+] 0.70806: rotate-7 +[+] 0.70806: rotate-left-1 + +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base62 +%¤q ´!.[æ&[fÿhbð^ + +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 +h4nRqFifSnRjFfQxRHuVpxjxpP8cCR +``` + +Afterwards, we can still try to simplify ; + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext rank +[+] 1.00185: base58 +[+] 0.99091: base62 +[+] 0.67001: rotate-1 +[+] 0.67001: rotate-2 +[+] 0.67001: rotate-3 +[+] 0.67001: rotate-4 +[+] 0.67001: rotate-5 +[+] 0.67001: rotate-6 +[+] 0.67001: rotate-7 +[+] 0.67001: rotate-left-1 +``` + +From here, let us assume that `base58` is effectively the right second-stage encoding. Guessing the two remaining encodings with no more information will now take a few seconds. As multiple outputs can be recognized as normal text, we will use the "`-s`" option not to stop on the first output successfully decoded as text. Moreover, if we have the intuition that the output shall be English text, we can use a more refined stop function like "`lang_en`" with the "`-f`" option. + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext decode base58 | codext guess -s -f lang_en +[...] 
+[+] rotate-2, rot-1: My!super!secret!string +[+] rotate-2, rot-23: Qc!wytiv!wigvix!wxvmrk +[+] rotate-2, shift-1: My super secret string +[+] rotate-2, shift-20: :f\r`b]R_\r`RP_Ra\r`a_V[T +[...] +[+] rotate-left-6, shift-1: My super secret string +^C^C^C +``` + +We can then stop the search with Ctrl+C. The right output has been found! + diff --git a/docs/pages/css/extra.css b/docs/pages/css/extra.css new file mode 100644 index 0000000..c78f454 --- /dev/null +++ b/docs/pages/css/extra.css @@ -0,0 +1,26 @@ +/* Full width (only works for some themes, including 'material') */ +@media only screen and (min-width: 76.25em) { + .md-main__inner { + max-width: none; + } + .md-sidebar--primary { + left: 0; + } + .md-sidebar--secondary { + right: 0; + margin-left: 0; + -webkit-transform: none; + transform: none; + } +} + +/* See https://github.com/mkdocs/mkdocs/wiki/MkDocs-Recipes */ +/* Add Support for Checkbox Lists */ +.task-list-item { + list-style-type: none; +} + +.task-list-item input { + margin: 0 4px 0.25em -20px; + vertical-align: middle; +} diff --git a/docs/demos/using-bases.gif b/docs/pages/demos/using-bases.gif similarity index 100% rename from docs/demos/using-bases.gif rename to docs/pages/demos/using-bases.gif diff --git a/docs/demos/using-codext.gif b/docs/pages/demos/using-codext.gif similarity index 100% rename from docs/demos/using-codext.gif rename to docs/pages/demos/using-codext.gif diff --git a/docs/demos/using-debase.gif b/docs/pages/demos/using-debase.gif similarity index 100% rename from docs/demos/using-debase.gif rename to docs/pages/demos/using-debase.gif diff --git a/docs/enc/base.md b/docs/pages/enc/base.md similarity index 97% rename from docs/enc/base.md rename to docs/pages/enc/base.md index 757965e..dc7b26c 100644 --- a/docs/enc/base.md +++ b/docs/pages/enc/base.md @@ -1,174 +1,172 @@ -## Base - -`codext` defines a far broader set of Base-encodings than in the original library.
- ------ - -### Classical base 2^N encodings - -This namely adds the classical BaseXX encodings like 16 (hexadecimal) and 32 (RFC 3548), which are not available in the native codecs. - -Common base encodings with N a power of 2: - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_AB`) -`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_6VC9`) -`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`) -`base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | -`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... | Also supports Base32 Crockford, Geohash and Hex -`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32 -`base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | - -!!! note "Aliases" - - All the aliases are case insensitive for base encodings. - -```python ->>> codext.encode("test", "base2") -'01110100011001010111001101110100' ->>> codext.encode("test", "base2-inv") -'10001011100110101000110010001011' -``` - -```python ->>> codecs.encode("this is a test", "base16") -'7468697320697320612074657374' ->>> codecs.decode("7468697320697320612074657374", "base16") -'this is a test' ->>> codecs.encode("this is a test", "base16-inv") -'1E02031DCA031DCA0BCA1E0F1D1E' -``` - -```python ->>> codext.encode("this is a test", "base32") -'ORUGS4ZANFZSAYJAORSXG5A=' ->>> codext.decode("ORUGS4ZANFZSAYJAORSXG5A=", "base32") -'this is a test' -``` - -Note that for `base64`, it overwrites the native `base64_codec` to also support en/decoding from str. 
- -```python ->>> codecs.encode("this is a test", "base64") -'dGhpcyBpcyBhIHRlc3Q=' ->>> codecs.decode("dGhpcyBpcyBhIHRlc3Q=", "base64") -'this is a test' -``` - ------ - -### Generic base encodings - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_C2Z`) -`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | -`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | -`base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | -`base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | -`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL -`base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | -`base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | -`base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | -`base91-alt` | text <-> Alternate Base91 encoded text | `base[-_]?91[-_]alt(?:ernate)?(|[-_]inv(erted)?)` | Another version of Base91 - -```python ->>> codext.encode("test", "base3") -'23112113223321323322' -``` - -```python ->>> codecs.encode("test", "base36") -'WANEK4' ->>> codecs.decode("4WMHTK6UZL044O91NKCEB8", "base36") -'this is a test' -``` - -```python ->>> codext.encode("this is a test!", "base45") -'AWE+EDH44.OEOCC7WE QEX0' ->>> codext.decode('AWE+EDH44.OEOCC7WE QEX0', "base45") -'this is a test!' 
-``` - -```python ->>> codext.encode("this is a test", "base58") -'jo91waLQA1NNeBmZKUF' ->>> codext.encode("this is a test", "base58-ripple") -'jo9rA2LQwr44eBmZK7E' ->>> codext.encode("this is a test", "base58-url") -'JN91Wzkpa1nnDbLyjtf' -``` - -```python ->>> codecs.encode("test", "base62") -'289lyu' ->>> codecs.encode("this is a test", "base62") -'CsoB4HQ5gmgMyCenF7E' -``` - -```python ->>> codecs.encode("This is a test !", "base91") -'nX,<:WRT%yxth90oZB^C' ->>> codext.encode("This is a test !", "base91-alt") -'?a&[jv4S3Wg>,71@Jo#K' -``` - -!!! note "Generic encodings" - - Base encodings are available for any N other than the ones explicitely specified using the "`-generic`" suffix. Their charsets consist of printable characters from the `string` module for N up to 100 and for characters composed from the 256 possible ordinals for a greater N. - - :::python - >>> codext.encode("test", "base3-generic") - '12001002112210212211' - >>> codext.encode("test", "base17-generic") - '4cf60456' - ------ - -### Base85 - -This encoding implements various different versions of Base85. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | - -```python ->>> codext.encode("this is a test", "ascii85") -"FD,B0+DGm>@3BZ'F*%" ->>> codext.decode("FD,B0+DGm>@3BZ'F*%", "ascii85") -'this is a test' ->>> with open("ascii85.txt", 'w', encoding="ascii85") as f: - f.write("this is a test") -14 ->>> with open("ascii85.txt", encoding="ascii85") as f: - f.read() -'this is a test' -``` - ------ - -### Other base encodings - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only -`base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only -`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset - -```python ->>> codecs.encode("this is a test", "base100") -'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' ->>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") -'this is a test' -``` - -```python ->>> codecs.encode("this is a test", "base122") -':\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft' ->>> codecs.decode(":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", "base122") -'this is a test' -``` - +`codext` defines a far broader set of Base-encodings than in the original library. + +----- + +### Classical base 2^N encodings + +This namely adds the classical BaseXX encodings like 16 (hexadecimal) and 32 (RFC 3548), which are not available in the native codecs. + +Common base encodings with N a power of 2: + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_AB`) +`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. 
`_6VC9`) +`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`) +`base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | +`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... | Also supports Base32 Crockford, Geohash and Hex +`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32 +`base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | + +!!! note "Aliases" + + All the aliases are case insensitive for base encodings. + +```python +>>> codext.encode("test", "base2") +'01110100011001010111001101110100' +>>> codext.encode("test", "base2-inv") +'10001011100110101000110010001011' +``` + +```python +>>> codecs.encode("this is a test", "base16") +'7468697320697320612074657374' +>>> codecs.decode("7468697320697320612074657374", "base16") +'this is a test' +>>> codecs.encode("this is a test", "base16-inv") +'1E02031DCA031DCA0BCA1E0F1D1E' +``` + +```python +>>> codext.encode("this is a test", "base32") +'ORUGS4ZANFZSAYJAORSXG5A=' +>>> codext.decode("ORUGS4ZANFZSAYJAORSXG5A=", "base32") +'this is a test' +``` + +Note that for `base64`, it overwrites the native `base64_codec` to also support en/decoding from str. + +```python +>>> codecs.encode("this is a test", "base64") +'dGhpcyBpcyBhIHRlc3Q=' +>>> codecs.decode("dGhpcyBpcyBhIHRlc3Q=", "base64") +'this is a test' +``` + +----- + +### Generic base encodings + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. 
`_C2Z`) +`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | +`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | +`base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | +`base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | +`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL +`base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | +`base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | +`base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | +`base91-alt` | text <-> Alternate Base91 encoded text | `base[-_]?91[-_]alt(?:ernate)?(|[-_]inv(erted)?)` | Another version of Base91 + +```python +>>> codext.encode("test", "base3") +'23112113223321323322' +``` + +```python +>>> codecs.encode("test", "base36") +'WANEK4' +>>> codecs.decode("4WMHTK6UZL044O91NKCEB8", "base36") +'this is a test' +``` + +```python +>>> codext.encode("this is a test!", "base45") +'AWE+EDH44.OEOCC7WE QEX0' +>>> codext.decode('AWE+EDH44.OEOCC7WE QEX0', "base45") +'this is a test!' +``` + +```python +>>> codext.encode("this is a test", "base58") +'jo91waLQA1NNeBmZKUF' +>>> codext.encode("this is a test", "base58-ripple") +'jo9rA2LQwr44eBmZK7E' +>>> codext.encode("this is a test", "base58-url") +'JN91Wzkpa1nnDbLyjtf' +``` + +```python +>>> codecs.encode("test", "base62") +'289lyu' +>>> codecs.encode("this is a test", "base62") +'CsoB4HQ5gmgMyCenF7E' +``` + +```python +>>> codecs.encode("This is a test !", "base91") +'nX,<:WRT%yxth90oZB^C' +>>> codext.encode("This is a test !", "base91-alt") +'?a&[jv4S3Wg>,71@Jo#K' +``` + +!!! note "Generic encodings" + + Base encodings are available for any N other than the ones explicitly specified, using the "`-generic`" suffix.
Their charsets consist of printable characters from the `string` module for N up to 100 and for characters composed from the 256 possible ordinals for a greater N. + + :::python + >>> codext.encode("test", "base3-generic") + '12001002112210212211' + >>> codext.encode("test", "base17-generic") + '4cf60456' + +----- + +### Base85 + +This encoding implements various different versions of Base85. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | + +```python +>>> codext.encode("this is a test", "ascii85") +"FD,B0+DGm>@3BZ'F*%" +>>> codext.decode("FD,B0+DGm>@3BZ'F*%", "ascii85") +'this is a test' +>>> with open("ascii85.txt", 'w', encoding="ascii85") as f: + f.write("this is a test") +14 +>>> with open("ascii85.txt", encoding="ascii85") as f: + f.read() +'this is a test' +``` + +----- + +### Other base encodings + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only +`base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only +`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset + +```python +>>> codecs.encode("this is a test", "base100") +'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' +>>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") +'this is a test' +``` + +```python +>>> codecs.encode("this is a test", "base122") +':\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft' +>>> codecs.decode(":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", "base122") +'this is a test' +``` + diff --git a/docs/enc/binary.md b/docs/pages/enc/binary.md similarity index 97% rename from docs/enc/binary.md rename to docs/pages/enc/binary.md index 745ef82..0ed7fb0 100644 --- a/docs/enc/binary.md +++ b/docs/pages/enc/binary.md @@ -1,168 +1,166 @@ -## Binary - -`codext` also adds common binary encodings. 
For instance, the Manchester code, that encodes digits, is applied to the ordinals of the input text and the resulting binary stream is converted back to characters. - ------ - -### Baudot - -It supports various formats such as CCITT-1 and CCITT-2, ITA1 and ITA2, and some others. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`baudot` | text <-> text | Baudot code bits | `baudot-ccitt1`, `baudot_ccitt2_lsb`, ... | supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... -`baudot-spaced` | text <-> Baudot code groups of bits | `baudot-spaced-ita1_lsb`, `baudot_spaced_ita2_msb`, ... | groups of 5 bits are whitespace-separated -`baudot-tape` | text <-> Baudot code tape | `baudot-tape-mtk2`, `baudot_tape_murray`, ... | outputs a string that looks like a perforated tape - -!!! note "LSB / MSB" - - "`_lsb`" or "`_msb`" can be specified in the codec name to set the bits order. If not specified, it defaults to MSB. - - -```python ->>> codext.encode("12345", "baudot-fr") -'010000000100010001000010100111' ->>> codext.decode("010000000100010001000010100111", "baudot-fr") -'12345' -``` - -```python ->>> codext.encode("TEST", "baudot-spaced_uk") -'10101 00010 10100 10101' ->>> codext.decode("10101 00010 10100 10101", "baudot-spaced_uk") -'TEST' -``` - -```python ->>> s = codext.encode("HELLO WORLD!", "baudot-tape_ita2") ->>> print(s) -***.** -* *. - . * -* .* -* .* -** . - *. -* .** -** . - * .* -* .* - * . * -** .** - **. * ->>> codext.decode(s, "baudot-tape_ita2") -'HELLO WORLD!' -``` - ------ - -### Binary Coded Decimal (BCD) - -It converts characters to their odrinals, left-pads with zeros, converts digits to 4-bits groups and then make characters with the assembled groups. It can also use a 4-bits prefix for making new characters. It then allows to define extended versions of BCD. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`bcd` | text <-> BCD encoded text | `binary_coded_decimals` | -`bcd-extended0` | text <-> BCD encoded text using prefix `0000` | `bcd_ext0`, `bcd-extended-zeros`, `binary_coded_decimals_extended_0` | -`bcd-extended1` | text <-> BCD encoded text using prefix `1111` | `bcd_ext1`, `bcd-extended-ones`, `binary_coded_decimals_extended_1` | - -```python ->>> codext.encode("Test", "bcd") -'\x08A\x01\x11Q\x16' ->>> codext.decode("\x08A\x01\x11Q\x16", "binary_coded_decimal") -'Test' ->>> codext.encode("Test", "bcd_ext_zero") -'\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00' ->>> codext.decode("\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", "bcd-ext0") -'Test' ->>> codext.encode("Test", "bcd_extended_ones") -'\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0' ->>> codext.decode("\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", "bcd_ext1") -'Test' -``` - ------ - -### Excess-3 - -Also called *Stibitz code*, it converts characters to ordinals, left-pads with zeros and then applies Excess-3 (Stibitz) code to get groups of 4 bits that are finally reassembled into bytes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`excess3` | text <-> XS3 encoded text | `excess-3`, `xs3`, `stibitz` | - -```python ->>> codext.encode("This is a test!", "excess-3") -';t7C\x84H6T8D\x83e<£eD\x944D\x84I6`' ->>> codext.decode(";t7C\x84H6T8D\x83e<£eD\x944D\x84I6`", "stibitz") -'This is a test!' -``` - ------ - -### Gray - -Also called *reflected binary code*, it implements the Gray code applied to characters while converted to bytes. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`gray` | text <-> gray encoded text | `reflected-bin`, `reflected_binary` | - -```python ->>> codext.encode("this is a test", "gray") -'N\\]J0]J0Q0NWJN' ->>> codext.decode("N\\]J0]J0Q0NWJN", "gray") -'this is a test' ->>> codext.encode("THIS IS A TEST", "gray") -'~lmz0mz0a0~gz~' ->>> codext.decode("~lmz0mz0a0~gz~", "gray") -'THIS IS A TEST' -``` - ------ - -### Manchester - -This codec XORes each group of 4 bits of the input text with a 1-byte clock signal, e.g. `0x55` giving in binary `01010101`. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`manchester` | text <-> manchester encoded text | | clock signal is `0x55` (`01010101`) -`manchester-inverted` | text <-> manchester encoded text | `ethernet`, `ieee802.4` | clock signal is `0xaa` (`10101010`) - -```python ->>> codext.encode("This is a test!", "manchester") -'fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV' ->>> codext.decode("fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV", "manchester") -'This is a test!' ->>> codext.encode("This is a test!", "manchester-inverted") -'\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©' ->>> codext.decode("\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©", "ethernet") -'This is a test!' -``` - ------ - -### Rotate N bits - -This codec rotates of N bits each byte of an input string. - -!!! note "Lossless" - - This codec does not use the "`<<`" and "`>>`" operators as it is lossy in some cases. Instead, it rotates per group of 8 bits. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rotate` | text <-> N-bits-rotated text | `rotate-N`, `rotate_bits-N`, `rotate-right-N`, `rotate_left_N` | N belongs to [1,7] ; when nothing specified, it rotates to the right - -```python ->>> codext.encode("test", "rotate-1") -':29:' ->>> codext.encode("test", "rotatebits-1") -':29:' ->>> codext.encode("test", "rotate_right-1") -':29:' ->>> codext.encode("test", "rotate_left_1") -'èÊæè' -``` - +`codext` also adds common binary encodings. For instance, the Manchester code, that encodes digits, is applied to the ordinals of the input text and the resulting binary stream is converted back to characters. + +----- + +### Baudot + +It supports various formats such as CCITT-1 and CCITT-2, ITA1 and ITA2, and some others. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`baudot` | text <-> text | Baudot code bits | `baudot-ccitt1`, `baudot_ccitt2_lsb`, ... | supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... +`baudot-spaced` | text <-> Baudot code groups of bits | `baudot-spaced-ita1_lsb`, `baudot_spaced_ita2_msb`, ... | groups of 5 bits are whitespace-separated +`baudot-tape` | text <-> Baudot code tape | `baudot-tape-mtk2`, `baudot_tape_murray`, ... | outputs a string that looks like a perforated tape + +!!! note "LSB / MSB" + + "`_lsb`" or "`_msb`" can be specified in the codec name to set the bits order. If not specified, it defaults to MSB. + + +```python +>>> codext.encode("12345", "baudot-fr") +'010000000100010001000010100111' +>>> codext.decode("010000000100010001000010100111", "baudot-fr") +'12345' +``` + +```python +>>> codext.encode("TEST", "baudot-spaced_uk") +'10101 00010 10100 10101' +>>> codext.decode("10101 00010 10100 10101", "baudot-spaced_uk") +'TEST' +``` + +```python +>>> s = codext.encode("HELLO WORLD!", "baudot-tape_ita2") +>>> print(s) +***.** +* *. + . * +* .* +* .* +** . + *. +* .** +** . 
+ * .* +* .* + * . * +** .** + **. * +>>> codext.decode(s, "baudot-tape_ita2") +'HELLO WORLD!' +``` + +----- + +### Binary Coded Decimal (BCD) + +It converts characters to their ordinals, left-pads with zeros, converts digits to 4-bit groups and then makes characters from the assembled groups. It can also use a 4-bit prefix for making new characters, which allows defining extended versions of BCD. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`bcd` | text <-> BCD encoded text | `binary_coded_decimals` | +`bcd-extended0` | text <-> BCD encoded text using prefix `0000` | `bcd_ext0`, `bcd-extended-zeros`, `binary_coded_decimals_extended_0` | +`bcd-extended1` | text <-> BCD encoded text using prefix `1111` | `bcd_ext1`, `bcd-extended-ones`, `binary_coded_decimals_extended_1` | + +```python +>>> codext.encode("Test", "bcd") +'\x08A\x01\x11Q\x16' +>>> codext.decode("\x08A\x01\x11Q\x16", "binary_coded_decimal") +'Test' +>>> codext.encode("Test", "bcd_ext_zero") +'\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00' +>>> codext.decode("\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", "bcd-ext0") +'Test' +>>> codext.encode("Test", "bcd_extended_ones") +'\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0' +>>> codext.decode("\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", "bcd_ext1") +'Test' +``` + +----- + +### Excess-3 + +Also called *Stibitz code*, it converts characters to ordinals, left-pads with zeros and then applies Excess-3 (Stibitz) code to get groups of 4 bits that are finally reassembled into bytes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`excess3` | text <-> XS3 encoded text | `excess-3`, `xs3`, `stibitz` | + +```python +>>> codext.encode("This is a test!", "excess-3") +';t7C\x84H6T8D\x83e<£eD\x944D\x84I6`' +>>> codext.decode(";t7C\x84H6T8D\x83e<£eD\x944D\x84I6`", "stibitz") +'This is a test!' 
+``` + +----- + +### Gray + +Also called *reflected binary code*, it implements the Gray code applied to characters while converted to bytes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`gray` | text <-> gray encoded text | `reflected-bin`, `reflected_binary` | + +```python +>>> codext.encode("this is a test", "gray") +'N\\]J0]J0Q0NWJN' +>>> codext.decode("N\\]J0]J0Q0NWJN", "gray") +'this is a test' +>>> codext.encode("THIS IS A TEST", "gray") +'~lmz0mz0a0~gz~' +>>> codext.decode("~lmz0mz0a0~gz~", "gray") +'THIS IS A TEST' +``` + +----- + +### Manchester + +This codec XORes each group of 4 bits of the input text with a 1-byte clock signal, e.g. `0x55` giving in binary `01010101`. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`manchester` | text <-> manchester encoded text | | clock signal is `0x55` (`01010101`) +`manchester-inverted` | text <-> manchester encoded text | `ethernet`, `ieee802.4` | clock signal is `0xaa` (`10101010`) + +```python +>>> codext.encode("This is a test!", "manchester") +'fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV' +>>> codext.decode("fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV", "manchester") +'This is a test!' +>>> codext.encode("This is a test!", "manchester-inverted") +'\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©' +>>> codext.decode("\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©", "ethernet") +'This is a test!' +``` + +----- + +### Rotate N bits + +This codec rotates each byte of an input string by N bits. + +!!! note "Lossless" + + This codec does not use the "`<<`" and "`>>`" operators as they are lossy in some cases. Instead, it rotates per group of 8 bits. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rotate` | text <-> N-bits-rotated text | `rotate-N`, `rotate_bits-N`, `rotate-right-N`, `rotate_left_N` | N belongs to [1,7] ; when nothing specified, it rotates to the right + +```python +>>> codext.encode("test", "rotate-1") +':29:' +>>> codext.encode("test", "rotatebits-1") +':29:' +>>> codext.encode("test", "rotate_right-1") +':29:' +>>> codext.encode("test", "rotate_left_1") +'èÊæè' +``` + diff --git a/docs/enc/common.md b/docs/pages/enc/common.md similarity index 97% rename from docs/enc/common.md rename to docs/pages/enc/common.md index 34a566c..1739ca8 100644 --- a/docs/enc/common.md +++ b/docs/pages/enc/common.md @@ -1,71 +1,69 @@ -## Common - -`codext` also provides some very common encodings, for the sake of simplicity (e.g. while chaining codecs with [the CLI tool](../cli.html)). - ------ - -### A1Z26 - -This simple codec converts letters to their order number in the alphabet using a separator between characters and keeping words separated by a whitespace. It is similar to the [`consonant-vowel-indices`](others.html#letter-indices) encoding. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`a1z26` | text <-> alphabet order numbers | `a1z26`, `a1z26-/`, `a1z26-,`, ... | this codec does not preserve the case and is dynamic (separator of characters in each word can be customized among these: "`-_/|,;:*`") - -```python ->>> codext.encode("This is a test", "a1z26") -'20-8-9-19 9-19 1 20-5-19-20' ->>> codext.decode("20-8-9-19 9-19 1 20-5-19-20", "a1z26") -'this is a test' -``` - ------ - -### Octal - -This simple codec converts characters into their octal values. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`octal` | text <-> octal digits | `octals` | groups of 3-chars octal values when encoded -`octal-spaced` | text <-> spaced octal digits | `octals-spaced` | whitespace-separated suite of variable-length groups of octal digits when encoded - -```python ->>> codext.encode("this is a test", "octal") -'164150151163040151163040141040164145163164' ->>> codext.decode("164150151163040151163040141040164145163164", "octals") -'this is a test' -``` - -```python ->>> codext.encode("this is a test", "octal-spaced") -'164 150 151 163 40 151 163 40 141 40 164 145 163 164' ->>> codext.decode("164 150 151 163 40 151 163 40 141 40 164 145 163 164", "octals-spaced") -'this is a test' -``` - ------ - -### Ordinal - -This simple codec converts characters into their ordinals. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`ordinal` | text <-> ordinal digits | `ordinals` | groups of 3-chars ordinal values when encoded -`ordinal-spaced` | text <-> spaced ordinal digits | `ordinals-spaced` | whitespace-separated suite of variable-length groups of ordinal digits when encoded - -```python ->>> codext.encode("this is a test", "ordinal") -'116104105115032105115032097032116101115116' ->>> codext.decode("116104105115032105115032097032116101115116", "ordinals") -'this is a test' -``` - -```python ->>> codext.encode("this is a test", "ordinal-spaced") -'116 104 105 115 32 105 115 32 97 32 116 101 115 116' ->>> codext.decode("116 104 105 115 32 105 115 32 97 32 116 101 115 116", "ordinals-spaced") -'this is a test' -``` - +`codext` also provides some very common encodings, for the sake of simplicity (e.g. while chaining codecs with [the CLI tool](../cli.html)). + +----- + +### A1Z26 + +This simple codec converts letters to their order number in the alphabet using a separator between characters and keeping words separated by a whitespace. 
It is similar to the [`consonant-vowel-indices`](others.html#letter-indices) encoding. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`a1z26` | text <-> alphabet order numbers | `a1z26`, `a1z26-/`, `a1z26-,`, ... | this codec does not preserve the case and is dynamic (separator of characters in each word can be customized among these: "`-_/|,;:*`") + +```python +>>> codext.encode("This is a test", "a1z26") +'20-8-9-19 9-19 1 20-5-19-20' +>>> codext.decode("20-8-9-19 9-19 1 20-5-19-20", "a1z26") +'this is a test' +``` + +----- + +### Octal + +This simple codec converts characters into their octal values. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`octal` | text <-> octal digits | `octals` | groups of 3-chars octal values when encoded +`octal-spaced` | text <-> spaced octal digits | `octals-spaced` | whitespace-separated suite of variable-length groups of octal digits when encoded + +```python +>>> codext.encode("this is a test", "octal") +'164150151163040151163040141040164145163164' +>>> codext.decode("164150151163040151163040141040164145163164", "octals") +'this is a test' +``` + +```python +>>> codext.encode("this is a test", "octal-spaced") +'164 150 151 163 40 151 163 40 141 40 164 145 163 164' +>>> codext.decode("164 150 151 163 40 151 163 40 141 40 164 145 163 164", "octals-spaced") +'this is a test' +``` + +----- + +### Ordinal + +This simple codec converts characters into their ordinals. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`ordinal` | text <-> ordinal digits | `ordinals` | groups of 3-chars ordinal values when encoded +`ordinal-spaced` | text <-> spaced ordinal digits | `ordinals-spaced` | whitespace-separated suite of variable-length groups of ordinal digits when encoded + +```python +>>> codext.encode("this is a test", "ordinal") +'116104105115032105115032097032116101115116' +>>> codext.decode("116104105115032105115032097032116101115116", "ordinals") +'this is a test' +``` + +```python +>>> codext.encode("this is a test", "ordinal-spaced") +'116 104 105 115 32 105 115 32 97 32 116 101 115 116' +>>> codext.decode("116 104 105 115 32 105 115 32 97 32 116 101 115 116", "ordinals-spaced") +'this is a test' +``` + diff --git a/docs/enc/compressions.md b/docs/pages/enc/compressions.md similarity index 99% rename from docs/enc/compressions.md rename to docs/pages/enc/compressions.md index a5437cf..5c4fd2e 100644 --- a/docs/enc/compressions.md +++ b/docs/pages/enc/compressions.md @@ -1,5 +1,3 @@ -## Compressions - `codext` provides a few common compression codecs. ----- diff --git a/docs/enc/crypto.md b/docs/pages/enc/crypto.md similarity index 68% rename from docs/enc/crypto.md rename to docs/pages/enc/crypto.md index e59ab0f..c065569 100644 --- a/docs/enc/crypto.md +++ b/docs/pages/enc/crypto.md @@ -1,206 +1,295 @@ -## Cryptography - -`codext` also implements several simple cryptographic ciphers. But how does it relate to encoding while a key is required ? `codext` focuses on ciphers that have a weak key. With dynamically named encodings, it is then possible to define a bunch of encodings, one for each value of the key. For instance, Barbie Typewriter has a key with only 4 possible values. The `barbie` codec can then be `barbie-1`, ..., `barbie-4`. - -!!! note "Available masks" - - Some cipher codecs use character masks to generate their alphabets. 
Groups of characters are indicated using a headin "`?`". - - `a`: printable characters - `b`: all 8-bits chars - `d`: digits - `h`: lowercase hexadecimal - `H`: uppercase hexadecimal - `l`: lowercase letters - `p`: punctuation characters - `s`: whitespace - `u`: uppercase letters - - When combining masks, only one occurrence of each character is taken in the final alphabet. - - So, for instance, the following masks yield the following alphabets: - - - `?l?u?d?s`: "`abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 `" - - `?s.,?!?u?d`: "` .,?!ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`" - ------ - -### Affine Cipher - -This codec implements the Affine monoalphabetic substitution cipher. It is parametrizable with a mask for generating the alphabet and the parameters `a` and `b`. By default, it uses mask "`lus`" and parameters `a=1` and `b=2` but it can be set as in the examples hereafter. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`affine` | text <-> affine ciphertext | `affine`, `affine_cipher-?l?u?d?s-5,8`, `affine-?s.,?!?u?d-23,6`, ... | Mask-generated alphabet ; uses default mask "`?l?u?s`" with `a=1` and `b=2` - -```python ->>> codext.encode("this is a test", "affine") -'vjkubkubcbvguv' ->>> codext.decode("vjkubkubcbvguv", "affine") -'this is a test' ->>> codext.encode("this is a test", "affine-?l?u?d?s-5,8") -'ORWJdWJdidOCJO' ->>> codext.decode("ORWJdWJdidOCJO", "affine-?l?u?d?s-5,8") -'this is a test' ->>> codext.encode("THIS IS A TEST", "affine-?s.,?!?u?d-5,8") -'AW1 D1 D2DAH A' ->>> codext.decode("AW1 D1 D2DAH A", "affine-?s.,?!?u?d-5,8") -'THIS IS A TEST' -``` - -!!! warning "Parameters `a` and `b`" - - Not all values are suitable for `a` and `b`. If a generated encoding map has mapping collisions, an exception is raised telling that `a` and `b` are bad. - ------ - -### Atbash Cipher - -It implements the monoalphabetic substitution cipher used for the Hebrew alphabet. 
By default, it considers the lowercase and uppercase letters, inverted per group, as the alphabet. It can also use a mask to extend it. Note that it does not generate any error for characters that are not part of the alphabet. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`atbash` | text <-> Atbash ciphertext | `atbash`, `atbash_cipher-?l?d?s`, ... | Mask-generated alphabet ; uses default mask "`?u?l`" - -```python ->>> codext.encode("this is a test", "atbash") -'gsrh rh z gvhg' ->>> codext.encode("this is a test", "atbash-[?l?u?p?s]") -'.^]/a]/a a.{/.' ->>> codext.decode(".^]/a]/a a.{/.", "atbash_cipher_[?l?u?p?s]") -'this is a test' -``` - ------ - -### Baconian Cipher - -It support only letters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`bacon` | text <-> Bacon ciphertext | `bacon-cipher`, `baconian_cipher`, `bacon-01`, `bacon-10` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `ab`) - -```python ->>> codext.encode("this is a test", "bacon") -'baaba aabbb abaaa baaab abaaa baaab aaaaa baaba aabaa baaab baaba' ->>> codext.encode("this is a test", "bacon_01") -'10010 00111 01000 10001 01000 10001 00000 10010 00100 10001 10010' ->>> codext.decode("-..-. ..--- .-... -...- .-... -...- ..... -..-. ..-.. -...- -..-.", "bacon_.-") -'THIS IS A TEST' -``` - ------ - -### Barbie Typewriter - -It implements the cipher for its 4 different keys. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`barbie` | text <-> Barbie ciphertext | `barbie-1`, `barbie-2`, `barbie-3`, `barbie-4` - -```python ->>> codext.encode("this is a test", "barbie-1") -'hstf tf i hafh' ->>> codext.encode("this is a test", "barbie_3") -'fpsu su h ftuf' ->>> codext.decode("fpsu su h ftuf", "barbie-3") -'this is a test' -``` - ------ - -### Citrix CTX1 - -This implements the Citrix CTX1 password encoding algorithm. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`citrix` | text <-> Citrix CTX1 ciphertext | `citrix`, `citrix-1`, `citrix_ctx1` | - -```python ->>> codext.encode("this is a test", "citrix-ctx1") -'NBBMNAAGIDEPJJBMNIFNIMEMJKEL' ->>> codext.decode("NBBMNAAGIDEPJJBMNIFNIMEMJKEL", "citrix-ctx1") -'this is a test' -``` - ------ - -### Rail Fence Cipher - -This implements the Rail Fence encoding algorithm, using 3 rails and offset 0 as the default parameters. The encoding fence is built from the top ; the `up` flag can be used to build the fence from the bottom. Note that trying parameters that do not fit the input length will trigger a `ValueError` mentioning the bad value. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`, `zigzag`, ... | - -```python ->>> codext.encode("this is a test", "zigzag") -'t ashsi etist' ->>> codext.encode("this is a test", "rail-5-3") -'it sss etiath ' ->>> codext.decode("it sss etiath ", "zigzag_5-3") -'this is a test' -``` - ------ -### ROT N - -This is a dynamic encoding, that is, it can be called with an integer to define the ROT offset. Encoding will apply a positive offset, decoding will apply a negative one. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter ; belongs to [1, 26[ -`rot47` | text <-> rot47 ciphertext | | - -```python ->>> codext.encode("this is a test", "rot-15") -'iwxh xh p ithi' ->>> codext.encode("iwxh xh p ithi", "rot20") -'cqrb rb j cnbc' ->>> codext.decode("cqrb rb j cnbc", "rot_9") -'this is a test' -``` - ------ - -### Shift - -This is a dynamic encoding, that is, it can be called with an integer to define the shift offset. Encoding will apply a positive offset, decoding will apply a negative one. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter ; belongs to [1, 256[ - -```python ->>> codext.encode("this is a test", "shift-3") -'wklv#lv#d#whvw' ->>> codext.decode("wklv#lv#d#whvw", "shift10") -'mabl\x19bl\x19Z\x19m^lm' ->>> codext.encode("mabl\x19bl\x19Z\x19m^lm", "ordshift_7") -'this is a test' -``` - ------ - -### XOR with 1 byte - -This is a dynamic encoding, that is, it can be called with an integer to define the ordinal of the byte to XOR with the input text. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter ; belongs to [1, 256[ - -```python ->>> codext.encode("this is a test", "xor-10") -'~bcy*cy*k*~oy~' ->>> codext.encode("this is a test", "xor-30") -'jvwm>wm>\x7f>j{mj' ->>> codext.decode("this is a test", "xor-30") -'jvwm>wm>\x7f>j{mj' ->>> codext.encode("~bcy*cy*k*~oy~", "xor-10") -'this is a test' -``` - +`codext` also implements several simple cryptographic ciphers. But how does it relate to encoding while a key is required ? `codext` focuses on ciphers that have a weak key. With dynamically named encodings, it is then possible to define a bunch of encodings, one for each value of the key. For instance, Barbie Typewriter has a key with only 4 possible values. The `barbie` codec can then be `barbie-1`, ..., `barbie-4`. + +!!! note "Available masks" + + Some cipher codecs use character masks to generate their alphabets. Groups of characters are indicated using a headin "`?`". 
+ + `a`: printable characters + `b`: all 8-bits chars + `d`: digits + `h`: lowercase hexadecimal + `H`: uppercase hexadecimal + `l`: lowercase letters + `p`: punctuation characters + `s`: whitespace + `u`: uppercase letters + + When combining masks, only one occurrence of each character is taken in the final alphabet. + + So, for instance, the following masks yield the following alphabets: + + - `?l?u?d?s`: "`abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 `" + - `?s.,?!?u?d`: "` .,?!ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`" + +----- + +### Affine Cipher + +This codec implements the Affine monoalphabetic substitution cipher. It is parametrizable with a mask for generating the alphabet and the parameters `a` and `b`. By default, it uses mask "`?l?u?s`" and parameters `a=1` and `b=2` but these can be set as in the examples hereafter. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`affine` | text <-> affine ciphertext | `affine`, `affine_cipher-?l?u?d?s-5,8`, `affine-?s.,?!?u?d-23,6`, ... | Mask-generated alphabet ; uses default mask "`?l?u?s`" with `a=1` and `b=2` + +```python +>>> codext.encode("this is a test", "affine") +'vjkubkubcbvguv' +>>> codext.decode("vjkubkubcbvguv", "affine") +'this is a test' +>>> codext.encode("this is a test", "affine-?l?u?d?s-5,8") +'ORWJdWJdidOCJO' +>>> codext.decode("ORWJdWJdidOCJO", "affine-?l?u?d?s-5,8") +'this is a test' +>>> codext.encode("THIS IS A TEST", "affine-?s.,?!?u?d-5,8") +'AW1 D1 D2DAH A' +>>> codext.decode("AW1 D1 D2DAH A", "affine-?s.,?!?u?d-5,8") +'THIS IS A TEST' +``` + +!!! warning "Parameters `a` and `b`" + + Not all values are suitable for `a` and `b`. If a generated encoding map has mapping collisions, an exception is raised telling that `a` and `b` are bad. + +----- + +### Atbash Cipher + +It implements the monoalphabetic substitution cipher used for the Hebrew alphabet. 
By default, it considers the lowercase and uppercase letters, inverted per group, as the alphabet. It can also use a mask to extend it. Note that it does not generate any error for characters that are not part of the alphabet. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`atbash` | text <-> Atbash ciphertext | `atbash`, `atbash_cipher-?l?d?s`, ... | Mask-generated alphabet ; uses default mask "`?u?l`" + +```python +>>> codext.encode("this is a test", "atbash") +'gsrh rh z gvhg' +>>> codext.encode("this is a test", "atbash-[?l?u?p?s]") +'.^]/a]/a a.{/.' +>>> codext.decode(".^]/a]/a a.{/.", "atbash_cipher_[?l?u?p?s]") +'this is a test' +``` + +----- + +### Autoclave/Autokey Cipher + +This is a variant of the [Vigenere Cipher](#vigenere-cipher) using a key stream generated from the primer key and the message appended. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`autoclave` | text <-> Autoclave ciphertext | `autoclave-cipher`, `autokey` | + +```python +>>> codext.encode("This is a test !", "autoclave-test") +'Mlal bz i lmkt !' +>>> codext.decode("Mlal bz i lmkt !", "autokey_cipher-test") +'This is a test !' +``` + +----- + +### Baconian Cipher + +It supports only letters. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`bacon` | text <-> Bacon ciphertext | `bacon-cipher`, `baconian_cipher`, `bacon-01`, `bacon-10` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `ab`) + +```python +>>> codext.encode("this is a test", "bacon") +'baaba aabbb abaaa baaab abaaa baaab aaaaa baaba aabaa baaab baaba' +>>> codext.encode("this is a test", "bacon_01") +'10010 00111 01000 10001 01000 10001 00000 10010 00100 10001 10010' +>>> codext.decode("-..-. ..--- .-... -...- .-... -...- ..... -..-. ..-.. 
-...- -..-.", "bacon_.-") +'THIS IS A TEST' +``` + +----- + +### Barbie Typewriter + +It implements the cipher for its 4 different keys. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`barbie` | text <-> Barbie ciphertext | `barbie-1`, `barbie-2`, `barbie-3`, `barbie-4` + +```python +>>> codext.encode("this is a test", "barbie-1") +'hstf tf i hafh' +>>> codext.encode("this is a test", "barbie_3") +'fpsu su h ftuf' +>>> codext.decode("fpsu su h ftuf", "barbie-3") +'this is a test' +``` + +----- + +### Beaufort Cipher + +This is a variant of the [Vigenere Cipher](#vigenere-cipher). There is no default key, meaning that `beaufort` as the encoding scheme throws a `LookupError` indicating that the _key must be a non-empty alphabetic string_. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`beaufort` | text <-> Beaufort ciphertext | `beaufort-abcdef`, `beaufort_MySuperSecret` | key only consists of characters, not digits + +```python +>>> codext.encode("This is a test !", "beaufort-abababa") +'Husj sj a hxii !' +>>> codext.encode("This is a test !", "beaufort_MySuperSecret") +'Trkc hm r zaky !' +>>> codext.decode("Husj sj a hxii !", "beaufort-abababa") +'This is a test !' +``` + +----- + +### Citrix CTX1 + +This implements the Citrix CTX1 password encoding algorithm. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`citrix` | text <-> Citrix CTX1 ciphertext | `citrix`, `citrix-1`, `citrix_ctx1` | + +```python +>>> codext.encode("this is a test", "citrix-ctx1") +'NBBMNAAGIDEPJJBMNIFNIMEMJKEL' +>>> codext.decode("NBBMNAAGIDEPJJBMNIFNIMEMJKEL", "citrix-ctx1") +'this is a test' +``` + +----- + +### Polybius Square Cipher + +This implements the well-known Polybius Square cipher, using the square with the alphabet in normal order as the default. It can be used dynamically with a custom alphabet. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`polybius` | text <-> polybius square ciphertext | `polybius-square`, `polybius_BACDEFGHIKLMNOPQRSTUVWXYZ`, ... | + +```python +>>> codext.encode("this is a test", "polybius") +'44232443 2443 11 44154344' +>>> codext.encode("this is a test", "polybius_BACDEFGHIKLMNOPQRSTUVWXYZ") +'44232443 2443 12 44154344' +>>> codext.decode("44232443 2443 11 441543445", "polybius-square", errors="replace") +'THIS IS A TEST?' +``` + +----- + +### Rail Fence Cipher + +This implements the Rail Fence encoding algorithm, using 3 rails and offset 0 as the default parameters. The encoding fence is built from the top ; the `up` flag can be used to build the fence from the bottom. Note that trying parameters that do not fit the input length will trigger a `ValueError` mentioning the bad value. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`, `zigzag`, ... | + +```python +>>> codext.encode("this is a test", "zigzag") +'t ashsi etist' +>>> codext.encode("this is a test", "rail-5-3") +'it sss etiath ' +>>> codext.decode("it sss etiath ", "zigzag_5-3") +'this is a test' +``` + +----- +### ROT N + +This is a dynamic encoding, that is, it can be called with an integer to define the ROT offset. Encoding will apply a positive offset, decoding will apply a negative one. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter ; belongs to [1, 26[ +`rot47` | text <-> rot47 ciphertext | | + +```python +>>> codext.encode("this is a test", "rot-15") +'iwxh xh p ithi' +>>> codext.encode("iwxh xh p ithi", "rot20") +'cqrb rb j cnbc' +>>> codext.decode("cqrb rb j cnbc", "rot_9") +'this is a test' +``` + +----- + +### Shift + +This is a dynamic encoding, that is, it can be called with an integer to define the shift offset. Encoding will apply a positive offset, decoding will apply a negative one. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter ; belongs to [1, 256[ + +```python +>>> codext.encode("this is a test", "shift-3") +'wklv#lv#d#whvw' +>>> codext.decode("wklv#lv#d#whvw", "shift10") +'mabl\x19bl\x19Z\x19m^lm' +>>> codext.encode("mabl\x19bl\x19Z\x19m^lm", "ordshift_7") +'this is a test' +``` + +----- + +### Trithemius Cipher + +This is a variant of the [Vigenere Cipher](#vigenere-cipher) with key `"ABCDEFGHIJKLMNOPQRSTUVWXYZ"`. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`trithemius` | text <-> Trithemius ciphertext | `trithemius`, `trithemius_cipher` | + +```python +>>> codext.encode("This is a test !", "trithemius") +'Tikv mx g ambd !' +>>> codext.decode("Tikv mx g ambd !", "trithemius") +'This is a test !' +``` + +----- + +### Vigenere Cipher + +This is a dynamic encoding, that is, it holds the key. There is no default key, meaning that `vigenere` as the encoding scheme throws a `LookupError` indicating that the _key must be a non-empty alphabetic string_. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`vigenere` | text <-> Vigenere ciphertext | `vigenere-abcdef`, `vigenere_MySuperSecret` | key only consists of characters, not digits + +```python +>>> codext.encode("This is a test !", "vigenere-abababa") +'Tiit it a tfsu !' +>>> codext.encode("This is a test !", "vigenere_MySuperSecret") +'Ffam xw r liuk !' +>>> codext.decode("Tiit it a tfsu !", "vigenere-abababa") +'This is a test !' +``` + +----- + +### XOR with 1 byte + +This is a dynamic encoding, that is, it can be called with an integer to define the ordinal of the byte to XOR with the input text. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter ; belongs to [1, 256[ + +```python +>>> codext.encode("this is a test", "xor-10") +'~bcy*cy*k*~oy~' +>>> codext.encode("this is a test", "xor-30") +'jvwm>wm>\x7f>j{mj' +>>> codext.decode("this is a test", "xor-30") +'jvwm>wm>\x7f>j{mj' +>>> codext.encode("~bcy*cy*k*~oy~", "xor-10") +'this is a test' +``` + diff --git a/docs/enc/hashing.md b/docs/pages/enc/hashing.md similarity index 99% rename from docs/enc/hashing.md rename to docs/pages/enc/hashing.md index d1b0298..0f6f151 100644 --- a/docs/enc/hashing.md +++ b/docs/pages/enc/hashing.md @@ -1,5 +1,3 @@ -## Hashing - `codext` provides hash functions through the `.encode(...)` API for convenience (e.g. while chaining codecs with [the CLI tool](../cli.html)). ----- diff --git a/docs/enc/languages.md b/docs/pages/enc/languages.md similarity index 97% rename from docs/enc/languages.md rename to docs/pages/enc/languages.md index 3735d15..9aa805c 100644 --- a/docs/enc/languages.md +++ b/docs/pages/enc/languages.md @@ -1,199 +1,197 @@ -## Languages - -`codext` also adds some common languages for encoding. - ------ - -### Braille - -It supports letters, digits and some special characters. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`braille` | text <-> braille symbols | | Python 3 only - -```python ->>> codext.encode("this is a test", "braille") -'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' ->>> codext.encode("THIS IS A TEST", "braille") -'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' ->>> codext.decode("⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞", "braille") -'this is a test' -``` - ------ - -### Galactic - -This implements the [Minecraft's enchanting table](https://www.thegamer.com/minecraft-enchantment-table-language-guide/) using resembling Unicode characters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`galactic` | text <-> Minecraft enchantment symbols | `galactic-alphabet`, `minecraft_enchantment`, `minecraft-enchanting-language` | Python 3 only - -```python ->>> codext.encode("this is a test", "galactic") -'ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ' ->>> codext.decode("ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ", "galactic") -'this is a test' -``` - ------ - -### Ipsum - -This implements a codec that uses lorem ipsum words. It selects random words per letter and keeps the following punctuations: "`.,:;+=-*/\\`". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`ipsum` | text <-> latin words | `loremipsum`, `lorem-ipsum` | words from the classical lorem ipsum - -```python ->>> codext.encode("This is a test.", "ipsum") -'Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.' ->>> codext.decode("Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.", "lorem-ipsum") -'This is a test.' -``` - ------ - -### Leetspeak - -This implements a very basic ruleset of elite speaking. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`leetspeak` | text <-> leetspeak encoded text | `leet`, `1337`, `leetspeak` | based on minimalistic elite speaking rules - -```python ->>> codext.encode("this is a test", "leetspeak") -'7h15 15 4 7357' ->>> codext.decode("7h15 15 4 7357", "leetspeak") -'ThIS IS A TEST' -``` - ------ - -### Morse - -It supports of course letters and digits, but also a few special characters: `.,;:?!/\\@&=-_'" $()`. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`morse` | text <-> morse encoded text | none | uses whitespace as a separator, dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `/-.`) - -```python ->>> codext.encode("this is a test", "morse") -'- .... .. ... / .. ... / .- / - . ... -' ->>> codext.encode("this is a test", "morse/-.") -'- .... .. ... / .. ... / .- / - . ... -' ->>> codext.encode("this is a test", "morse_ABC") -'B CCCC CC CCC A CC CCC A CB A B C CCC B' ->>> codext.decode("- .... .. ... / .. ... / .- / - . ... -", "morse") -'this is a test' ->>> with codext.open("morse.txt", 'w', encoding="morse") as f: - f.write("this is a test") -14 ->>> with codext.open("morse.txt", encoding="morse") as f: - f.read() -'this is a test' -``` - ------ - -### Navajo - -It implements the letters from the [Navajo Code Talkers' Dictionary](https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html). It conserves digits and newlines. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`navajo` | text <-> Navajo | | - -```python ->>> import codext ->>> codext.encode("this is a test 123", "navajo") -'a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3' ->>> codext.decode("a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3", "navajo") -'this is a test 123' -``` - ------ - -### Radio Alphabet - -This is also known as the [NATO phonetic alphabet](https://en.wikipedia.org/wiki/NATO_phonetic_alphabet). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`radio` | text <-> radio alphabet words | `military_alphabet`, `nato-phonetic-alphabet`, `radio-alphabet` | - -```python ->>> codext.encode("foobar", "nato_phonetic_alphabet") -'Foxtrot Oscar Oscar Bravo Alpha Romeo' ->>> codext.decode("Foxtrot Oscar Oscar Bravo Alpha Romeo", "radio-alphabet") -'FOOBAR' -``` - ------ - -### Southpark - -This encodes text according to Kenny's language in Southpark. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`southpark` | text <-> Kenny's language | `kenny` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `fFMmpP`) -`southpark-icase` | text <-> Kenny's language | `kenny_icase` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `FMP`) - -```python ->>> codext.encode("This is a Test", "southpark") -'FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp' ->>> codext.decode('FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp', "kenny") -'This is a Test' ->>> codext.encode("This is a test", "kenny_123456") -'245415411144111411144211444111145455144145' ->>> codext.decode("245415411144111411144211444111145455144145", "kenny-123456") -'This is a test' ->>> codext.encode("this is a test", "kenny_icase") -'FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP' ->>> codext.decode("FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP", "southpark-icase") -'this is a test' ->>> codext.encode("this is a test", "southpark-icase_123") -'123213211122111211122111222111123233122123' ->>> codext.decode('123213211122111211122111222111123233122123', "kenny_icase-123") -'this is a test' -``` - ------ - -### Tap - -This codec implements the [tap/knock code](https://en.wikipedia.org/wiki/Tap_code) commonly used by prisoners. It uses 25 letters, "*k*" is encoded to the same token than "*c*". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`tap` | text <-> tap/knock encoded text | `knock`, `tap-code` | uses a large Unicode whitespace as a token separator ; Python 3 only - -```python ->>> codext.encode("this is a test", "tap") -'.... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....' ->>> codext.decode(".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....", "knock") -'this is a test' -``` - ------ - -### Tom-Tom - -This codec is similar to morse. 
It converts text into slashes and backslashes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`tomtom` | text <-> tom-tom encoded text | `tom-tom` | uses "`|`" as a separator - -```python ->>> codext.encode("this is a test", "tom-tom") -'\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\' ->>> codext.decode("\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\", "tomtom") -'THIS IS A TEST' -``` +`codext` also adds some common languages for encoding. + +----- + +### Braille + +It supports letters, digits and some special characters. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`braille` | text <-> braille symbols | | Python 3 only + +```python +>>> codext.encode("this is a test", "braille") +'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' +>>> codext.encode("THIS IS A TEST", "braille") +'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' +>>> codext.decode("⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞", "braille") +'this is a test' +``` + +----- + +### Galactic + +This implements the [Minecraft's enchanting table](https://www.thegamer.com/minecraft-enchantment-table-language-guide/) using resembling Unicode characters. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`galactic` | text <-> Minecraft enchantment symbols | `galactic-alphabet`, `minecraft_enchantment`, `minecraft-enchanting-language` | Python 3 only + +```python +>>> codext.encode("this is a test", "galactic") +'ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ' +>>> codext.decode("ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ", "galactic") +'this is a test' +``` + +----- + +### Ipsum + +This implements a codec that uses lorem ipsum words. It selects random words per letter and keeps the following punctuations: "`.,:;+=-*/\\`". 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`ipsum` | text <-> latin words | `loremipsum`, `lorem-ipsum` | words from the classical lorem ipsum + +```python +>>> codext.encode("This is a test.", "ipsum") +'Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.' +>>> codext.decode("Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.", "lorem-ipsum") +'This is a test.' +``` + +----- + +### Leetspeak + +This implements a very basic ruleset of elite speaking. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`leetspeak` | text <-> leetspeak encoded text | `leet`, `1337`, `leetspeak` | based on minimalistic elite speaking rules + +```python +>>> codext.encode("this is a test", "leetspeak") +'7h15 15 4 7357' +>>> codext.decode("7h15 15 4 7357", "leetspeak") +'ThIS IS A TEST' +``` + +----- + +### Morse + +It supports of course letters and digits, but also a few special characters: `.,;:?!/\\@&=-_'" $()`. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`morse` | text <-> morse encoded text | none | uses whitespace as a separator, dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `/-.`) + +```python +>>> codext.encode("this is a test", "morse") +'- .... .. ... / .. ... / .- / - . ... -' +>>> codext.encode("this is a test", "morse/-.") +'- .... .. ... / .. ... / .- / - . ... -' +>>> codext.encode("this is a test", "morse_ABC") +'B CCCC CC CCC A CC CCC A CB A B C CCC B' +>>> codext.decode("- .... .. ... / .. ... / .- / - . ... 
-", "morse") +'this is a test' +>>> with codext.open("morse.txt", 'w', encoding="morse") as f: + f.write("this is a test") +14 +>>> with codext.open("morse.txt", encoding="morse") as f: + f.read() +'this is a test' +``` + +----- + +### Navajo + +It implements the letters from the [Navajo Code Talkers' Dictionary](https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html). It conserves digits and newlines. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`navajo` | text <-> Navajo | | + +```python +>>> import codext +>>> codext.encode("this is a test 123", "navajo") +'a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3' +>>> codext.decode("a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3", "navajo") +'this is a test 123' +``` + +----- + +### Radio Alphabet + +This is also known as the [NATO phonetic alphabet](https://en.wikipedia.org/wiki/NATO_phonetic_alphabet). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`radio` | text <-> radio alphabet words | `military_alphabet`, `nato-phonetic-alphabet`, `radio-alphabet` | + +```python +>>> codext.encode("foobar", "nato_phonetic_alphabet") +'Foxtrot Oscar Oscar Bravo Alpha Romeo' +>>> codext.decode("Foxtrot Oscar Oscar Bravo Alpha Romeo", "radio-alphabet") +'FOOBAR' +``` + +----- + +### Southpark + +This encodes text according to Kenny's language in Southpark. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`southpark` | text <-> Kenny's language | `kenny` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `fFMmpP`) +`southpark-icase` | text <-> Kenny's language | `kenny_icase` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `FMP`) + +```python +>>> codext.encode("This is a Test", "southpark") +'FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp' +>>> codext.decode('FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp', "kenny") +'This is a Test' +>>> codext.encode("This is a test", "kenny_123456") +'245415411144111411144211444111145455144145' +>>> codext.decode("245415411144111411144211444111145455144145", "kenny-123456") +'This is a test' +>>> codext.encode("this is a test", "kenny_icase") +'FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP' +>>> codext.decode("FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP", "southpark-icase") +'this is a test' +>>> codext.encode("this is a test", "southpark-icase_123") +'123213211122111211122111222111123233122123' +>>> codext.decode('123213211122111211122111222111123233122123', "kenny_icase-123") +'this is a test' +``` + +----- + +### Tap + +This codec implements the [tap/knock code](https://en.wikipedia.org/wiki/Tap_code) commonly used by prisoners. It uses 25 letters, "*k*" is encoded to the same token than "*c*". + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`tap` | text <-> tap/knock encoded text | `knock`, `tap-code` | uses a large Unicode whitespace as a token separator ; Python 3 only + +```python +>>> codext.encode("this is a test", "tap") +'.... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....' +>>> codext.decode(".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....", "knock") +'this is a test' +``` + +----- + +### Tom-Tom + +This codec is similar to morse. 
It converts text into slashes and backslashes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`tomtom` | text <-> tom-tom encoded text | `tom-tom` | uses "`|`" as a separator + +```python +>>> codext.encode("this is a test", "tom-tom") +'\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\' +>>> codext.decode("\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\", "tomtom") +'THIS IS A TEST' +``` diff --git a/docs/enc/others.md b/docs/pages/enc/others.md similarity index 97% rename from docs/enc/others.md rename to docs/pages/enc/others.md index 3470611..408ac07 100644 --- a/docs/enc/others.md +++ b/docs/pages/enc/others.md @@ -1,79 +1,79 @@ -## Others - -All kinds of other codecs are categorized in "*Others*". - ------ - -### DNA - -This implements the 8 methods of ATGC nucleotides following the rule of complementary pairing, according the literature about coding and computing of DNA sequences. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`dna` (rule 1) | text <-> DNA-1 | `dna1`, `dna-1`, `dna_1` | -`dna` (rule X) | text <-> DNA-X | ... 
| -`dna` (rule 8) | text <-> DNA-8 | `dna8`, `dna-8`, `dna_8` | - -```python ->>> for i in range(8): - print(codext.encode("this is a test", "dna-%d" % (i + 1))) -GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA -CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA -ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG -AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC -TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG -TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC -GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT -CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT ->>> codext.decode("GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA", "dna-1") -'this is a test' -``` - ------ - -### Letter indices - -This encodes consonants and/or vowels with their respective indices. This codec is case insensitive, strips white spaces and only applies to letters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`consonant-indices` | text <-> text with consonant indices | `consonants_indices`, `consonants_index` | while decoding, searches from the longest match, possibly not producing the original input -`vowel-indices` | text <-> text with vowel indices | `vowels_indices`, `vowels_index` | -`consonant-vowel-indices` | text <-> text with consonant and vowel indices | `consonants-vowels_index` | prefixes consonants with `C` and vowels with `V` - -```python ->>> codext.encode("This is a test", "consonant-index") -'166I15I15A16E1516' ->>> codext.decode("166I15I15A16E1516", "consonant-index") -'THISISATEST' -``` - -```python ->>> codext.encode("This is a test", "vowel-index") -'TH3S3S1T2ST' ->>> codext.decode("TH3S3S1T2ST", "vowel-index") -'THISISATEST' -``` - -```python ->>> codext.encode("This is a test", "consonant-vowel-index") -'C16C6V3C15V3C15V1C16V2C15C16' ->>> codext.decode("C16C6V3C15V3C15V1C16V2C15C16", "consonant-vowel-index") -'THISISATEST' -``` - ------ - -### Markdown - 
-This is only for "encoding" (converting) Markdown to HTML. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`markdown` | Markdown --> HTML | `markdown`, `Markdown`, `md` | unidirectional ! - -```python ->>> codext.encode("# Test\nparagraph", "markdown") -'

<h1>Test</h1>

\n\n

<p>paragraph</p>

\n' -``` - +## Others + +All kinds of other codecs are categorized in "*Others*". + +----- + +### DNA + +This implements the 8 methods of ATGC nucleotides following the rule of complementary pairing, according to the literature about coding and computing of DNA sequences. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`dna` (rule 1) | text <-> DNA-1 | `dna1`, `dna-1`, `dna_1` | +`dna` (rule X) | text <-> DNA-X | ... | +`dna` (rule 8) | text <-> DNA-8 | `dna8`, `dna-8`, `dna_8` | + +```python +>>> for i in range(8): + print(codext.encode("this is a test", "dna-%d" % (i + 1))) +GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA +CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA +ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG +AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC +TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG +TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC +GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT +CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT +>>> codext.decode("GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA", "dna-1") +'this is a test' +``` + +----- + +### Letter indices + +This encodes consonants and/or vowels with their respective indices. This codec is case insensitive, strips white spaces and only applies to letters. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`consonant-indices` | text <-> text with consonant indices | `consonants_indices`, `consonants_index` | while decoding, searches from the longest match, possibly not producing the original input +`vowel-indices` | text <-> text with vowel indices | `vowels_indices`, `vowels_index` | +`consonant-vowel-indices` | text <-> text with consonant and vowel indices | `consonants-vowels_index` | prefixes consonants with `C` and vowels with `V` + +```python +>>> codext.encode("This is a test", "consonant-index") +'166I15I15A16E1516' +>>> codext.decode("166I15I15A16E1516", "consonant-index") +'THISISATEST' +``` + +```python +>>> codext.encode("This is a test", "vowel-index") +'TH3S3S1T2ST' +>>> codext.decode("TH3S3S1T2ST", "vowel-index") +'THISISATEST' +``` + +```python +>>> codext.encode("This is a test", "consonant-vowel-index") +'C16C6V3C15V3C15V1C16V2C15C16' +>>> codext.decode("C16C6V3C15V3C15V1C16V2C15C16", "consonant-vowel-index") +'THISISATEST' +``` + +----- + +### Markdown + +This is only for "encoding" (converting) Markdown to HTML. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`markdown` | Markdown --> HTML | `markdown`, `Markdown`, `md` | unidirectional ! + +```python +>>> codext.encode("# Test\nparagraph", "markdown") +'

<h1>Test</h1>

\n\n

<p>paragraph</p>

\n' +``` + diff --git a/docs/enc/stegano.md b/docs/pages/enc/stegano.md similarity index 97% rename from docs/enc/stegano.md rename to docs/pages/enc/stegano.md index 57dfb18..1a3a5fa 100644 --- a/docs/enc/stegano.md +++ b/docs/pages/enc/stegano.md @@ -1,123 +1,121 @@ -## Steganography - -`codext` defines a few steganography-related encodings. While encoding is not really steganography (that is, concealing data within data), the following codecs are worth creating this category as they relate to converting data into something that could mislead the unaware reader. - ------ - -### Hexagrams (I Ching) - -This uses Base64 and then encodes output characters to [I Ching Hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) such that implemented [here](https://github.com/qntm/hexagram-encode). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`hexagram` | text <-> hexagrams-encoded Base64 | `hexagrams`, `i-ching-hexagrams`, `iching` | Python3 only - -```python ->>> codext.encode("this is a test", "hexagram") -'䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯' ->>> codext.decode("䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯", "iching") -'this is a test' -``` - ------ - -### Klopf Code - -This is a Polybius code with the trivial alphabetical distribution ("A" -> (1,1), "B" -> (2,1), ...). This can be tested [here](https://gc.de/gc/klopfcode/). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`klopf` | text <-> klopf encoded text | `klopfcode` | - -```python ->>> codext.encode("this is a test", "klopf") -'44324234 4234 11 44513444' ->>> codext.decode("44324234 4234 11 44513444", "klopf") -'THIS IS A TEST' -``` - ------ - -### Resistor Color Codes - -This uses the [electronic color code](https://en.wikipedia.org/wiki/Electronic_color_code#Resistor_color-coding) to encode digits, displaying colors in the terminal with ANSI color codes. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`resistor` | text <-> resistor colors | `condensator`, `resistors-color`, `resistor_color_code` | visually, it only works in a terminal supporting ANSI color codes - -```python ->>> codext.encode("1234", "resistor") -'\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m' ->>> codext.decode("\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m", "condensators_color") -'1234' -``` - ------ - -### Rick Cipher - -This converts letters to words from Rick Astley's famous song "*Never gonna give you up*". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rick` | text <-> words from Risk's song | `rick-astley`, `rick_cipher`, `rick-astley-cipher` | case-insensitive while encoding - -```python ->>> codext.encode("Test String", "rick") -'TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna' ->>> codext.decode("TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna", "rick") -'TEST STRING' -``` - ------ - -### SMS (T9) - -This codec implements the SMS encoding, also caled T9, that is the conversion from characters to their corresponding phone keystrokes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`sms` | text <-> phone keystrokes | `nokia`, `nokia_3310`, `t9` | uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding - -```python ->>> codext.encode("this is a test", "sms") -'8-44-444-7777-0-444-7777-0-2-0-8-33-7777-8' ->>> codext.decode("8_44_444_7777_0_444_7777_0_2_0_8_33_7777_8", "nokia") -'this is a test' ->>> codext.decode("8_44_444_7777_0-444-7777_0-2_0_8_33-7777-8", "t9") -'this is a test' -``` - ------ - -### Whitespaces - -This simple encoding replaces zeros and ones of the binary version of the input text with spaces and tabs. 
It is supported either with its original mapping or with the inverted mapping. - -!!! warning "Encoding, not programming !" - - This should not be confused with the [whitespace esoteric language](https://en.wikipedia.org/wiki/Whitespace_(programming_language)). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`whitespace` | text <-> whitespaces and tabs | `whitespaces?-inv(erted)?` | The default encoding uses tabs for zeros and spaces for ones -`whitespace_after_before` | text <-> whitespaces[letter]whitespaces | | This codec encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`") - -```python ->>> codext.encode("test", "whitespace") -'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' ->>> codext.encode("test", "whitespaces") -'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' ->>> codext.encode("test", "whitespaces-inv") -' \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ' ->>> codext.decode(" \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ", "whitespaces_inverted") -'test' -``` - -```python ->>> codext.encode("test", "whitespace+after-before") -' m \n l \n u \n m ' ->>> codext.decode(" m \n l \n u \n m ", "whitespace+after-before") -'test' -``` +`codext` defines a few steganography-related encodings. While encoding is not really steganography (that is, concealing data within data), the following codecs are worth creating this category as they relate to converting data into something that could mislead the unaware reader. + +----- + +### Hexagrams (I Ching) + +This uses Base64 and then encodes output characters to [I Ching Hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) as implemented [here](https://github.com/qntm/hexagram-encode). 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`hexagram` | text <-> hexagrams-encoded Base64 | `hexagrams`, `i-ching-hexagrams`, `iching` | Python3 only + +```python +>>> codext.encode("this is a test", "hexagram") +'䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯' +>>> codext.decode("䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯", "iching") +'this is a test' +``` + +----- + +### Klopf Code + +This is a Polybius code with the trivial alphabetical distribution ("A" -> (1,1), "B" -> (2,1), ...). This can be tested [here](https://gc.de/gc/klopfcode/). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`klopf` | text <-> klopf encoded text | `klopfcode` | + +```python +>>> codext.encode("this is a test", "klopf") +'44324234 4234 11 44513444' +>>> codext.decode("44324234 4234 11 44513444", "klopf") +'THIS IS A TEST' +``` + +----- + +### Resistor Color Codes + +This uses the [electronic color code](https://en.wikipedia.org/wiki/Electronic_color_code#Resistor_color-coding) to encode digits, displaying colors in the terminal with ANSI color codes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`resistor` | text <-> resistor colors | `condensator`, `resistors-color`, `resistor_color_code` | visually, it only works in a terminal supporting ANSI color codes + +```python +>>> codext.encode("1234", "resistor") +'\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m' +>>> codext.decode("\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m", "condensators_color") +'1234' +``` + +----- + +### Rick Cipher + +This converts letters to words from Rick Astley's famous song "*Never gonna give you up*". 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rick` | text <-> words from Rick's song | `rick-astley`, `rick_cipher`, `rick-astley-cipher` | case-insensitive while encoding + +```python +>>> codext.encode("Test String", "rick") +'TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna' +>>> codext.decode("TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna", "rick") +'TEST STRING' +``` + +----- + +### SMS (T9) + +This codec implements the SMS encoding, also called T9, that is the conversion from characters to their corresponding phone keystrokes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`sms` | text <-> phone keystrokes | `nokia`, `nokia_3310`, `t9` | uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding + +```python +>>> codext.encode("this is a test", "sms") +'8-44-444-7777-0-444-7777-0-2-0-8-33-7777-8' +>>> codext.decode("8_44_444_7777_0_444_7777_0_2_0_8_33_7777_8", "nokia") +'this is a test' +>>> codext.decode("8_44_444_7777_0-444-7777_0-2_0_8_33-7777-8", "t9") +'this is a test' +``` + +----- + +### Whitespaces + +This simple encoding replaces zeros and ones of the binary version of the input text with spaces and tabs. It is supported either with its original mapping or with the inverted mapping. + +!!! warning "Encoding, not programming !" + + This should not be confused with the [whitespace esoteric language](https://en.wikipedia.org/wiki/Whitespace_(programming_language)). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`whitespace` | text <-> whitespaces and tabs | `whitespaces?-inv(erted)?` | The default encoding uses tabs for zeros and spaces for ones +`whitespace_after_before` | text <-> whitespaces[letter]whitespaces | | This codec encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. 
"`whitespace+2*after-3*before`") + +```python +>>> codext.encode("test", "whitespace") +'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' +>>> codext.encode("test", "whitespaces") +'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' +>>> codext.encode("test", "whitespaces-inv") +' \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ' +>>> codext.decode(" \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ", "whitespaces_inverted") +'test' +``` + +```python +>>> codext.encode("test", "whitespace+after-before") +' m \n l \n u \n m ' +>>> codext.decode(" m \n l \n u \n m ", "whitespace+after-before") +'test' +``` diff --git a/docs/enc/web.md b/docs/pages/enc/web.md similarity index 97% rename from docs/enc/web.md rename to docs/pages/enc/web.md index 80c6a20..4477a1f 100644 --- a/docs/enc/web.md +++ b/docs/pages/enc/web.md @@ -1,40 +1,38 @@ -## Web - -`codext` implements some common Web-related encodings. - ------ - -### HTML Entities - -This implements the full list of characters available at [this reference](https://dev.w3.org/html5/html-author/charref). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`html` | text <-> HTML entities | `html-entity`, `html_entities` | implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) - -```python ->>> codext.encode("Тħĩş Їś ą Ţêšŧ", "html") -'Тħĩş Їś ą Ţêšŧ' ->>> codext.decode("Тħĩş Їś ą Ţêšŧ", "html-entities") -'Тħĩş Їś ą Ţêšŧ' -``` - ------ - -### URL - -This handles URL encoding, regardless of the case when decoding and with no error. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`url` | text <-> URL encoded text | `url`, `urlencode` | - -```python ->>> codecs.encode("?=this/is-a_test/../", "url") -'%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F' ->>> codext.decode("%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F", "urlencode") -'?=this/is-a_test/../' ->>> codext.decode("%3f%3dthis%2fis-a_test%2f%2e%2e%2f", "urlencode") -'?=this/is-a_test/../' -``` - +`codext` implements some common Web-related encodings. + +----- + +### HTML Entities + +This implements the full list of characters available at [this reference](https://dev.w3.org/html5/html-author/charref). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`html` | text <-> HTML entities | `html-entity`, `html_entities` | implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) + +```python +>>> codext.encode("Тħĩş Їś ą Ţêšŧ", "html") +'Тħĩş Їś ą Ţêšŧ' +>>> codext.decode("Тħĩş Їś ą Ţêšŧ", "html-entities") +'Тħĩş Їś ą Ţêšŧ' +``` + +----- + +### URL + +This handles URL encoding, regardless of the case when decoding and with no error. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`url` | text <-> URL encoded text | `url`, `urlencode` | + +```python +>>> codecs.encode("?=this/is-a_test/../", "url") +'%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F' +>>> codext.decode("%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F", "urlencode") +'?=this/is-a_test/../' +>>> codext.decode("%3f%3dthis%2fis-a_test%2f%2e%2e%2f", "urlencode") +'?=this/is-a_test/../' +``` + diff --git a/docs/features.md b/docs/pages/features.md similarity index 97% rename from docs/features.md rename to docs/pages/features.md index 11316f0..02b375b 100644 --- a/docs/features.md +++ b/docs/pages/features.md @@ -1,338 +1,336 @@ -## Features - -Basically, the `codecs` library provides a series of functions from the built-in `_codecs` library which maintains a registry of search functions (a simple list) that maps ancodings to the right de/encode functions by returning a `CodecInfo` object once first matched. - -`codext` hooks `codecs`'s functions to insert its own proxy registry between the function calls and the native registry so that new encodings can be added or replace existing ones while using `code[cs|xt].open`. Indeed, as the proxy registry is called first, the first possible match occurs in a custom codec, while if not existing, the native registry is used. - -!!! note "The `open` built-in function" - - Two behaviors are to be considered when using `codext`: - - 1. Encodings added from `codext` are only added to the proxy codecs registry of `codext` and are NOT available using `open(...)` (but well using `code[cs|xt].open(...)`. - 2. Encodings added from `codecs` are added to the proxy registry AND ALSO to the native registry and are therefore available using `open(...)`. - - This difference allows to keep encodings added from `codext` removable while these added from `codecs` are not. This is the consequence from the fact that there is no unregister function in the native `_codecs` library. - -!!! 
warning "Lossy conversion" - - Some encodings are lossy, meaning that it is not always possible to decode back to the exact start string. This should be considered especially when chaining codecs. - ------ - -### Add a custom encoding - -New codecs can be added easily using the new function `add`. - -```python ->>> import codext ->>> help(codext.add) -Help on function add in module codext.__common__: - -add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False) - This adds a new codec to the codecs module setting its encode and/or decode - functions, eventually dynamically naming the encoding with a pattern and - with file handling (if text is True). - - :param ename: encoding name - :param encode: encoding function or None - :param decode: decoding function or None - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the - built-in open(...) but will make it impossible - to remove the codec later - -``` - -Here is a simple example of how to add a basic codec: - -```python -import codext - -def mycodec_encode(text, errors="strict"): - # do some encoding stuff - return encoded, len(text) - -def mycodec_decode(text, errors="strict"): - # do some decoding stuff - return decoded, len(text) - -codext.add("mycodec", mycodec_encode, mycodec_decode) -``` - -In this first example, we can see that: - -- The `decode`/`encode` functions have a signature holding a keyword-argument "`errors`" for error handling. This comes from the syntax for making a codec for the `codecs` native library. 
This argument can have multiple values, namely "`strict`" for raising an exception when an de/encoding error occurs, while "`replace`" allows to replace the character at the position of the error with a generic character and also "`ignore`" that simply ignores the error and continues without adding anything to the resulting string. -- These functions always return a pair with the resulting string and the length of consumed input text. - -Another example for a more complex and dynamic codec: - -```python -import codext - -def mydyncodec_encode(i): - def encode(text, error="strict"): - # do somthing depending on i - return result, len(text) - return encode - -codext.add("mydyncodec", mydyncodec_encode, pattern=r"mydyn-(\d+)$") -``` - -In this second example, we can see that: - -- Only the encoding function is defined. -- A pattern is defined to match the prefix "`mydyn-`" and then an integer which is captured and used with `mydyncodec_encode(i)`. - -!!! warning "Pattern capture group" - - A capture group means that the parameter will be used with a dynamic (decorated) encoding function. In order to avoid this, i.e. for matching multiple names leading to the same encoding while calling a static encoding function, we can simply define a non-capturing group, e.g. "`(?:my|special_)codec`". - ------ - -### Add a custom map encoding - -New codecs using encoding maps can be added easily using the new function `add_map`. - -```python ->>> import codext ->>> help(codext.add) -Help on function add_map in module codext.__common__: - -add_map(ename, encmap, repl_char='?', sep='', ignore_case=None, no_error=False, intype=None, outype=None, **kwargs) - This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs module - dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern - and with file handling (if text is True). 
- - :param ename: encoding name - :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture - group of the regex pattern) or a function building the encoding map - :param repl_char: replacement char (used when errors handling is set to "replace") - :param sep: string of possible character separators (hence, only single-char separators are considered) ; - - while encoding, the first separator is used - - while decoding, separators can be mixed in the input text - :param ignore_case: ignore text case while encoding and/or decoding - :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) - :param intype: specify the input type for pre-transforming the input text - :param outype: specify the output type for post-transforming the output text - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - -``` - -This relies on the [`add`](#add-a-custom-encoding) function and simplifies creating new encodings when they can be described as a mapping dictionary. - -Here is a simple example of how to add a map codec: - -```python -import codext - -ENCMAP = {'a': "A", 'b': "B", 'c': "C"} - -codext.add_map("mycodec", ENCMAP) -``` - -In this first example, we can see that: - -- The `decode`/`encode` functions do not have to be declared anymore. -- `ENCMAP` is the mapping between characters, it is also used to compute the decoding function. 
- -Another example for a more complex and dynamic codec: - -```python -import codext - -ENCMAP = [ - {'00': "A", '01': "B", '10': "C", '11': "D"}, - {'00': "D", '01': "C", '10': "B", '11': "A"}, -] - -codext.add("mydyncodec", ENCMAP, "#", ignore_case=True, intype="bin", pattern=r"mydyn-(\d+)$") -``` - -In this second example, we can see that: - -- `ENCMAP` is now a list of mappings. The capture group in the pattern is used to select the right encoding map. Consequently, using encoding "`mydyn-8`" will fail with a `LookupError` as the only possibility are "`mydyn-1`" and "`mydyn-2`". Note that the index begins at 1 in the encoding name. -- Instead of using the default character "`?`" for replacements, we use "`#`". -- The case is ignored ; decoding either "`abcd`" or "`ABCD`" will succeed. -- The binary mode is enabled, meaning that the input text is converted to a binary string for encoding, while it is converted from binary to text when decoding. - -!!! warning "Input/Output types" - - By default, when `intype` is defined, `outype` takes the same value. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). - ------ - -### Add a macro - -**Macros** are chains of encodings. It is possible to define own macros with this feature. It works by giving the precedence to user's macros saved in `~/.codext-macros.json` then using embedded macros from the `codext` package. - -Here is an example of adding a macro (and verifying it was indeed added): - -```python ->>> codext.list_macros() -['example-macro'] ->>> codext.add_macro("test-macro", "gzip", "base64") ->>> codext.list_macros() -['example-macro', 'test-macro'] -``` - -!!! note "Removing a macro" - - As macros are resolved like codecs (with the precedence for codecs), they can be removed the same way as a codec. 
- - :::python - >>> codext.remove("test-macro") - - If this is a built-in macro, it will removed from the runtime list within the `codext` package. Next time this will be loaded, it will reset the builtin list of macros. Otherwise, if this is a custom macro, it will removed from the list of custom macros AND removed from `~/.codext-macros.json`. - ------ - -### List codecs - -Codecs can be listed with the `list` function, either the whole codecs or only some categories. - -```python ->>> codext.list() -['affine', 'ascii', 'ascii85', 'atbash', 'bacon', ..., 'base36', 'base58', 'base62', 'base64', 'base64_codec', ..., 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'big5', 'big5hkscs', 'braille', 'bz2_codec', 'capitalize', 'cp037', ...] -``` - -!!! note "Codecs categories" - - - `native`: the built-in codecs from the original `codecs` package - - `non-native`: this special category regroups all the categories mentioned hereafter - - `base`: baseX codecs (e.g. `base`, `base100`) - - `binary`: codecs working on strings but applying their algorithms on their binary forms (e.g. `baudot`, `manchester`) - - `common`: common codecs not included in the native ones or simly added for the purpose of standardization (e.g. `octal`, `ordinal`) - - `crypto`: codecs related to cryptography algorithms (e.g. `barbie`, `rot`, `xor`) - - `language`: language-related codecs (e.g. `morse`, `navajo`) - - `other`: uncategorized codecs (e.g. `letters`, `url`) - - `stegano`: steganography-related codecs (e.g. `sms`, `resistor`) - - Except the `native` and `non-native` categories, the other ones are simply the name of the subdirectories (with "`s`" right-stripped) of the `codext` package. 
- -```python ->>> codext.list("binary") -['baudot', 'baudot-spaced', 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'excess3', 'gray', 'manchester', 'manchester-inverted'] ->>> codext.list("language") -['braille', 'leet', 'morse', 'navajo', 'radio', 'southpark', 'southpark-icase', 'tom-tom'] ->>> codext.list("native") -['ascii', 'base64_codec', 'big5', 'big5hkscs', 'bz2_codec', 'cp037', 'cp273', 'cp424', 'cp437', 'cp500', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', ...] -``` - -!!! warning "Codecs listed, not encodings" - - Beware that this function only lists the codecs, not the encodings. This means that, for instance, it only lists `base` (codecs' name) instead of `base17`, `base61`, `base97`, ... (the valid encoding names related to the `base` codec). - ------ - -### Search for encodings - -Natively, `codecs` provides a `lookup` function that allows to get the `CodecInfo` object for the desired encoding. This performs a lookup in the registry based on an exact match. Sometimes, it can be useful to search for available encodings based on a regular expression. Therefore, a `search` function is added by `codext` to allow to get a list of encoding names matching the input regex. - -```python ->>> codext.search("baudot") -['baudot', 'baudot_spaced', 'baudot_tape'] ->>> codext.search("al") -['capitalize', 'octal', 'octal_spaced', 'ordinal', 'ordinal_spaced', 'radio'] ->>> codext.search("white") -['whitespace', 'whitespace_after_before'] -``` - -Also, `codext` provides an `examples` function to get some examples of valid encoding names. This is especially useful when it concerns dynamicly named encodings (e.g. `rot`, `shift` or `dna`). 
- -```python ->>> codext.examples("rot") -['rot-14', 'rot-24', 'rot-7', 'rot18', 'rot3', 'rot4', 'rot6', 'rot_1', 'rot_12', 'rot_2'] ->>> codext.examples("dna") -['dna-1', 'dna-2', 'dna-5', 'dna1', 'dna4', 'dna5', 'dna6', 'dna8', 'dna_3', 'dna_5'] ->>> codext.examples("barbie", 5) -['barbie-1', 'barbie1', 'barbie4', 'barbie_2', 'barbie_4'] -``` - ------ - -### Remove a custom encoding or macro - -New codecs can be removed easily using the new function `remove`, which will only remove every codec matching the given encoding name in the proxy codecs registry and NOT in the native one. - -```python ->>> codext.encode("test", "bin") -'01110100011001010111001101110100' ->>> codext.remove("bin") ->>> codext.encode("test", "bin") - -Traceback (most recent call last): - [...] -LookupError: unknown encoding: bin -``` - -Trying to remove a codec that is in the native registry won't raise a `LookupError`. - -```python ->>> codext.remove("utf-8") ->>> codext.encode("test", "utf-8") -b'test' -``` - -Removing a macro works exactly the same way as for a codec. - -```python ->>> codext.remove("test-macro") -``` - ------ - -### Remove or restore `codext` encodings and macros - -It can be useful while playing with encodings and/or macros e.g. from Idle to be able to remove or restore `codext`'s encodings and macros. This can be achieved using respectively the new `clear` and `reset` functions. - -```python ->>> codext.clear() ->>> codext.encode("test", "bin") - -Traceback (most recent call last): - [...] -LookupError: unknown encoding: bin -``` - -```python ->>> codext.reset() ->>> codext.encode("test", "bin") -'01110100011001010111001101110100' -``` - ------ - -### Multi-rounds encoding - -It is possible to use multiple times the same encoding through the following convention: `encoding[X]` - -A simple example for a 1-round and a 2-rounds morse-encoded string: - -```python ->>> codext.encode("This is a test", "morse") -'- .... .. ... / .. ... / .- / - . ... 
-' ->>> codext.encode("This is a test", "morse[2]") -'-....- / .-.-.- .-.-.- .-.-.- .-.-.- / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- -....- / -..-. / -....- / .-.-.- / .-.-.- .-.-.- .-.-.- / -....-' -``` - -Another example using 5-rounds base58: - -```python ->>> codext.encode("Sup3rS3cr3t", "base58[5]") -'3YrjaeeJE1qfUVkpUbMymEMLJenvRrtcZ4vaDQ3httdiqWV8wGYFpqw' -``` - ------ - -### Hooked `codecs` functions - -In order to select the right de/encoding function and avoid any conflict, the native `codecs` library registers search functions (using the `register(search_function)` function), called in order of registration while searching for a codec. - -While being imported, `codext` hooks the following base functions of `codecs` dealing with the codecs registry: `encode`, `decode`, `lookup` and `register`. This way, `codext` holds a private registry that is called before reaching out to the native one, causing the codecs defined in `codext` to override native codecs with a matching registry search function. - +Basically, the `codecs` library provides a series of functions from the built-in `_codecs` library which maintains a registry of search functions (a simple list) that maps encodings to the right de/encode functions by returning a `CodecInfo` object once first matched. + +`codext` hooks `codecs`'s functions to insert its own proxy registry between the function calls and the native registry so that new encodings can be added or replace existing ones while using `code[cs|xt].open`. Indeed, as the proxy registry is called first, the first possible match occurs in a custom codec, while if not existing, the native registry is used. + +!!! note "The `open` built-in function" + + Two behaviors are to be considered when using `codext`: + + 1. Encodings added from `codext` are only added to the proxy codecs registry of `codext` and are NOT available using `open(...)` (but are available using `code[cs|xt].open(...)`). + 2. 
Encodings added from `codecs` are added to the proxy registry AND ALSO to the native registry and are therefore available using `open(...)`. + + This difference allows to keep encodings added from `codext` removable while these added from `codecs` are not. This is the consequence from the fact that there is no unregister function in the native `_codecs` library. + +!!! warning "Lossy conversion" + + Some encodings are lossy, meaning that it is not always possible to decode back to the exact start string. This should be considered especially when chaining codecs. + +----- + +### Add a custom encoding + +New codecs can be added easily using the new function `add`. + +```python +>>> import codext +>>> help(codext.add) +Help on function add in module codext.__common__: + +add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False) + This adds a new codec to the codecs module setting its encode and/or decode + functions, eventually dynamically naming the encoding with a pattern and + with file handling (if text is True). + + :param ename: encoding name + :param encode: encoding function or None + :param decode: decoding function or None + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the + built-in open(...) 
but will make it impossible + to remove the codec later + +``` + +Here is a simple example of how to add a basic codec: + +```python +import codext + +def mycodec_encode(text, errors="strict"): + # do some encoding stuff + return encoded, len(text) + +def mycodec_decode(text, errors="strict"): + # do some decoding stuff + return decoded, len(text) + +codext.add("mycodec", mycodec_encode, mycodec_decode) +``` + +In this first example, we can see that: + +- The `decode`/`encode` functions have a signature holding a keyword-argument "`errors`" for error handling. This comes from the syntax for making a codec for the `codecs` native library. This argument can have multiple values, namely "`strict`" for raising an exception when a de/encoding error occurs, while "`replace`" replaces the character at the position of the error with a generic character and also "`ignore`" that simply ignores the error and continues without adding anything to the resulting string. +- These functions always return a pair with the resulting string and the length of consumed input text. + +Another example for a more complex and dynamic codec: + +```python +import codext + +def mydyncodec_encode(i): + def encode(text, errors="strict"): + # do something depending on i + return result, len(text) + return encode + +codext.add("mydyncodec", mydyncodec_encode, pattern=r"mydyn-(\d+)$") +``` + +In this second example, we can see that: + +- Only the encoding function is defined. +- A pattern is defined to match the prefix "`mydyn-`" and then an integer which is captured and used with `mydyncodec_encode(i)`. + +!!! warning "Pattern capture group" + + A capture group means that the parameter will be used with a dynamic (decorated) encoding function. In order to avoid this, i.e. for matching multiple names leading to the same encoding while calling a static encoding function, we can simply define a non-capturing group, e.g. "`(?:my|special_)codec`". 
+ +----- + +### Add a custom map encoding + +New codecs using encoding maps can be added easily using the new function `add_map`. + +```python +>>> import codext +>>> help(codext.add_map) +Help on function add_map in module codext.__common__: + +add_map(ename, encmap, repl_char='?', sep='', ignore_case=None, no_error=False, intype=None, outype=None, **kwargs) + This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs module + dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern + and with file handling (if text is True). + + :param ename: encoding name + :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture + group of the regex pattern) or a function building the encoding map + :param repl_char: replacement char (used when errors handling is set to "replace") + :param sep: string of possible character separators (hence, only single-char separators are considered) ; + - while encoding, the first separator is used + - while decoding, separators can be mixed in the input text + :param ignore_case: ignore text case while encoding and/or decoding + :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) + :param intype: specify the input type for pre-transforming the input text + :param outype: specify the output type for post-transforming the output text + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + +``` + +This relies on the [`add`](#add-a-custom-encoding) function and simplifies creating new encodings when they can be described as a mapping dictionary. 
+ +Here is a simple example of how to add a map codec: + +```python +import codext + +ENCMAP = {'a': "A", 'b': "B", 'c': "C"} + +codext.add_map("mycodec", ENCMAP) +``` + +In this first example, we can see that: + +- The `decode`/`encode` functions do not have to be declared anymore. +- `ENCMAP` is the mapping between characters, it is also used to compute the decoding function. + +Another example for a more complex and dynamic codec: + +```python +import codext + +ENCMAP = [ + {'00': "A", '01': "B", '10': "C", '11': "D"}, + {'00': "D", '01': "C", '10': "B", '11': "A"}, +] + +codext.add_map("mydyncodec", ENCMAP, "#", ignore_case=True, intype="bin", pattern=r"mydyn-(\d+)$") +``` + +In this second example, we can see that: + +- `ENCMAP` is now a list of mappings. The capture group in the pattern is used to select the right encoding map. Consequently, using encoding "`mydyn-8`" will fail with a `LookupError` as the only possibilities are "`mydyn-1`" and "`mydyn-2`". Note that the index begins at 1 in the encoding name. +- Instead of using the default character "`?`" for replacements, we use "`#`". +- The case is ignored ; decoding either "`abcd`" or "`ABCD`" will succeed. +- The binary mode is enabled, meaning that the input text is converted to a binary string for encoding, while it is converted from binary to text when decoding. + +!!! warning "Input/Output types" + + By default, when `intype` is defined, `outype` takes the same value. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). + +----- + +### Add a macro + +**Macros** are chains of encodings. It is possible to define your own macros with this feature. It works by giving the precedence to user's macros saved in `~/.codext-macros.json` then using embedded macros from the `codext` package. 
+ +Here is an example of adding a macro (and verifying it was indeed added): + +```python +>>> codext.list_macros() +['example-macro'] +>>> codext.add_macro("test-macro", "gzip", "base64") +>>> codext.list_macros() +['example-macro', 'test-macro'] +``` + +!!! note "Removing a macro" + + As macros are resolved like codecs (with the precedence for codecs), they can be removed the same way as a codec. + + :::python + >>> codext.remove("test-macro") + + If this is a built-in macro, it will be removed from the runtime list within the `codext` package. Next time this will be loaded, it will reset the builtin list of macros. Otherwise, if this is a custom macro, it will be removed from the list of custom macros AND removed from `~/.codext-macros.json`. + +----- + +### List codecs + +Codecs can be listed with the `list` function, either the whole codecs or only some categories. + +```python +>>> codext.list() +['affine', 'ascii', 'ascii85', 'atbash', 'bacon', ..., 'base36', 'base58', 'base62', 'base64', 'base64_codec', ..., 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'big5', 'big5hkscs', 'braille', 'bz2_codec', 'capitalize', 'cp037', ...] +``` + +!!! note "Codecs categories" + + - `native`: the built-in codecs from the original `codecs` package + - `non-native`: this special category regroups all the categories mentioned hereafter + - `base`: baseX codecs (e.g. `base`, `base100`) + - `binary`: codecs working on strings but applying their algorithms on their binary forms (e.g. `baudot`, `manchester`) + - `common`: common codecs not included in the native ones or simply added for the purpose of standardization (e.g. `octal`, `ordinal`) + - `crypto`: codecs related to cryptography algorithms (e.g. `barbie`, `rot`, `xor`) + - `language`: language-related codecs (e.g. `morse`, `navajo`) + - `other`: uncategorized codecs (e.g. `letters`, `url`) + - `stegano`: steganography-related codecs (e.g. 
`sms`, `resistor`) + + Except the `native` and `non-native` categories, the other ones are simply the name of the subdirectories (with "`s`" right-stripped) of the `codext` package. + +```python +>>> codext.list("binary") +['baudot', 'baudot-spaced', 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'excess3', 'gray', 'manchester', 'manchester-inverted'] +>>> codext.list("language") +['braille', 'leet', 'morse', 'navajo', 'radio', 'southpark', 'southpark-icase', 'tom-tom'] +>>> codext.list("native") +['ascii', 'base64_codec', 'big5', 'big5hkscs', 'bz2_codec', 'cp037', 'cp273', 'cp424', 'cp437', 'cp500', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', ...] +``` + +!!! warning "Codecs listed, not encodings" + + Beware that this function only lists the codecs, not the encodings. This means that, for instance, it only lists `base` (codecs' name) instead of `base17`, `base61`, `base97`, ... (the valid encoding names related to the `base` codec). + +----- + +### Search for encodings + +Natively, `codecs` provides a `lookup` function that allows to get the `CodecInfo` object for the desired encoding. This performs a lookup in the registry based on an exact match. Sometimes, it can be useful to search for available encodings based on a regular expression. Therefore, a `search` function is added by `codext` to allow to get a list of encoding names matching the input regex. + +```python +>>> codext.search("baudot") +['baudot', 'baudot_spaced', 'baudot_tape'] +>>> codext.search("al") +['capitalize', 'octal', 'octal_spaced', 'ordinal', 'ordinal_spaced', 'radio'] +>>> codext.search("white") +['whitespace', 'whitespace_after_before'] +``` + +Also, `codext` provides an `examples` function to get some examples of valid encoding names. This is especially useful when it concerns dynamically named encodings (e.g. `rot`, `shift` or `dna`). 
+ +```python +>>> codext.examples("rot") +['rot-14', 'rot-24', 'rot-7', 'rot18', 'rot3', 'rot4', 'rot6', 'rot_1', 'rot_12', 'rot_2'] +>>> codext.examples("dna") +['dna-1', 'dna-2', 'dna-5', 'dna1', 'dna4', 'dna5', 'dna6', 'dna8', 'dna_3', 'dna_5'] +>>> codext.examples("barbie", 5) +['barbie-1', 'barbie1', 'barbie4', 'barbie_2', 'barbie_4'] +``` + +----- + +### Remove a custom encoding or macro + +New codecs can be removed easily using the new function `remove`, which will only remove every codec matching the given encoding name in the proxy codecs registry and NOT in the native one. + +```python +>>> codext.encode("test", "bin") +'01110100011001010111001101110100' +>>> codext.remove("bin") +>>> codext.encode("test", "bin") + +Traceback (most recent call last): + [...] +LookupError: unknown encoding: bin +``` + +Trying to remove a codec that is in the native registry won't raise a `LookupError`. + +```python +>>> codext.remove("utf-8") +>>> codext.encode("test", "utf-8") +b'test' +``` + +Removing a macro works exactly the same way as for a codec. + +```python +>>> codext.remove("test-macro") +``` + +----- + +### Remove or restore `codext` encodings and macros + +It can be useful while playing with encodings and/or macros e.g. from Idle to be able to remove or restore `codext`'s encodings and macros. This can be achieved using respectively the new `clear` and `reset` functions. + +```python +>>> codext.clear() +>>> codext.encode("test", "bin") + +Traceback (most recent call last): + [...] +LookupError: unknown encoding: bin +``` + +```python +>>> codext.reset() +>>> codext.encode("test", "bin") +'01110100011001010111001101110100' +``` + +----- + +### Multi-rounds encoding + +It is possible to use multiple times the same encoding through the following convention: `encoding[X]` + +A simple example for a 1-round and a 2-rounds morse-encoded string: + +```python +>>> codext.encode("This is a test", "morse") +'- .... .. ... / .. ... / .- / - . ... 
-' +>>> codext.encode("This is a test", "morse[2]") +'-....- / .-.-.- .-.-.- .-.-.- .-.-.- / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- -....- / -..-. / -....- / .-.-.- / .-.-.- .-.-.- .-.-.- / -....-' +``` + +Another example using 5-rounds base58: + +```python +>>> codext.encode("Sup3rS3cr3t", "base58[5]") +'3YrjaeeJE1qfUVkpUbMymEMLJenvRrtcZ4vaDQ3httdiqWV8wGYFpqw' +``` + +----- + +### Hooked `codecs` functions + +In order to select the right de/encoding function and avoid any conflict, the native `codecs` library registers search functions (using the `register(search_function)` function), called in order of registration while searching for a codec. + +While being imported, `codext` hooks the following base functions of `codecs` dealing with the codecs registry: `encode`, `decode`, `lookup` and `register`. This way, `codext` holds a private registry that is called before reaching out to the native one, causing the codecs defined in `codext` to override native codecs with a matching registry search function. + diff --git a/docs/guessing.md b/docs/pages/guessing.md similarity index 98% rename from docs/guessing.md rename to docs/pages/guessing.md index 9bac11c..5745918 100644 --- a/docs/guessing.md +++ b/docs/pages/guessing.md @@ -1,172 +1,170 @@ -## Guess Mode - -For decoding multiple layers of codecs, `codext` features a guess mode relying on an Artificial Intelligence algorithm, the Breadth-First tree Search (BFS). For many cases, the default parameters are sufficient for guess-decoding things. But it may require parameters tuning. - ------ - -### Parameters - -BFS stops when a given condition, in the form of a function applied to the decoded string at the current depth, is met. It returns two results: the decoded string and a tuple with the related encoding names in order of application. 
- -The following parameters are tunable: - -- `stop_func`: can be a function or a regular expression to be matched (automatically converted to a function that uses the `re` module) ; by default, checks if all input characters are printable. -- `min_depth`: the minimum depth for the tree search (allows to avoid a bit of overhead while checking the current decoded output at a depth with the stop function when we are sure it should not be the right result) ; by default 0. -- `max_depth`: the maximum depth for the tree search ; by default 5. -- `codec_categories`: a string indicating a codec [category](#list-codecs) or a list of [category](#list-codecs) strings ; by default, `None`, meaning the whole [categories](#list-codecs) (very slow). -- `found`: a list or tuple of currently found encodings that can be used to save time if the first decoding steps are known ; by default, an empty tuple. - -A simple example for a 1-stage base64-encoded string: - -```python ->>> codext.guess("VGhpcyBpcyBhIHRlc3Q=") -{('base64',): 'This is a test'} -``` - -An example of a 2-stages base64- then base62-encoded string: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7") -{('base62',): 'VGhpcyBpcyBhIHRlc3Q='} -``` - -In the second example, we can see that the given encoded string is not decoded as expected. This is the case because the (default) stop condition is too broad and stops if all the characters of the output are printable. If we have a prior knowledge on what we should expect, we can input a simple string or a regex: - -!!! note "Default stop function" - - :::python - >>> codext.stopfunc.default.__name__ - '...' - - The output depends on whether you have a language detection backend library installed ; see section [*Natural Language Detection*](#natural-language-detection). If no such library is installed, the default function is "`text`". 
- -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test") -{('base62', 'base64'): 'This is a test'} -``` - -In this example, the string "*test*" is converted to a function that uses this string as regular expression. Instead of a string, we can also pass a function. For this purpose, standard [stop functions](#available-stop-functions) are predefined. So, we can for instance use `stopfunc.lang_en` to stop when we find something that is English. Note that working this way gives lots of false positives if the text is very short like in the example case. That's why the `codec_categories` argument is used to only consider baseX codecs. This is also demonstrated in the next examples. - -```python ->>> codext.stopfunc._reload_lang("langdetect") ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", codext.stopfunc.lang_en, codec_categories="base") -('This is a test', ('base62', 'base64')) -``` - -If we know the first encoding, we can set this in the `found` parameter to save time: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", found=["base62"]) -('This is a test', ('base62', 'base64')) -``` - -If we are sure that only `base` (which is a valid [category](#list-codecs)) encodings are used, we can restrict the tree search using the `codec_categories` parameter to save time: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", codec_categories="base") -('This is a test', ('base62', 'base64')) -``` - -Another example of 2-stages encoded string: - -```python ->>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test") -('this is a test', ('base64', 'morse')) ->>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test", codec_categories=["base", "language"]) -('this is a test', ('base64', 'morse')) -``` - -When multiple results are expected, `stop` and `show` arguments can be used respectively to avoid stopping while finding a result and to display the intermediate result. - -!!! 
warning "Computation time" - - Note that, in the very last examples, the first call takes much longer than the second one but requires no knowledge about the possible [categories](#list-codecs) of encodings. - ------ - -### Available Stop Functions - -A few stop functions are predefined in the `stopfunc` submodule. - -```python ->>> import codext ->>> dir(codext.stopfunc) -['LANG_BACKEND', 'LANG_BACKENDS', ..., '_reload_lang', 'default', 'flag', ..., 'printables', 'regex', 'text'] -``` - -Currently, the following stop functions are provided: - -- `flag`: searches for the pattern "`[Ff][Ll1][Aa4@][Gg9]`" (either UTF-8 or UTF-16) -- `lang_**`: checks if the given lang is detected (note that it first checks if all characters are text ; see `text` hereafter) -- `printables`: checks that every output character is in the set of printables -- `regex(pattern)`: takes one argument, the regular expression, for checking a string against the given pattern -- `text`: checks for printables and an entropy less than 4.6 (empirically determined) - -A stop function can be used as the second argument of the `guess` function or as a keyword-argument, as shown in the following examples: - -```python ->>> codext.guess("...", codext.stopfunc.text) -[...] ->>> codext.guess("...", [...], stop_func=codext.stopfunc.text) -[...] -``` - -When a string is given, it is automatically converted to a `regex` stop function. - -```python ->>> s = codext.encode("pattern testing", "leetspeak") ->>> s -'p4773rn 73571n9' ->>> stop_func = codext.stopfunc.regex("p[a4@][t7]{2}[e3]rn") ->>> stop_func(s) -True ->>> codext.guess(s, stop_func) -[...] -``` - -Additionally, a simple stop function is predefined for CTF players, matching various declinations of the word *flag*. Alternatively, a pattern can always be used when flags have a particular format. 
- -```python ->>> codext.stopfunc.flag("test string") -False ->>> codext.stopfunc.flag("test f1@9") -True ->>> codext.stopfunc.regex(r"^CTF\{.*?\}$")("CTF{098f6bcd4621d373cade4e832627b4f6}") -True -``` - -The particular type of stop function `lang_**` is explained in the [next section](#natural-language-detection). - ------ - -### Natural Language Detection - -As in many cases, we are trying to decode inputs to readable text, it is necessary to narrow the scope while searching for valid decoded outputs. As matching printables and even text (as defined here before as printables with an entropy of less than 4.6) is too broad for many cases, it may be very useful to apply natural language detection. In `codext`, this is done by relying on Natural Language Processing (NLP) backend libraries, loaded only if they were separately installed. - -Currently, the following backends are supported, in order of precedence (this order was empirically determined by testing): - -- [`langid`](https://github.com/saffsd/langid.py): *Standalone Language Identification (LangID) tool.* -- [`langdetect`](https://github.com/Mimino666/langdetect): *Port of Nakatani Shuyo's language-detection library (version from 03/03/2014) to Python.* -- [`pycld2`](https://github.com/aboSamoor/pycld2): *Python bindings for the Compact Langauge Detect 2 (CLD2).* -- [`cld3`](https://github.com/bsolomon1124/pycld3): *Python bindings to the Compact Language Detector v3 (CLD3).* -- [`textblob`](https://github.com/sloria/TextBlob): *Python (2 and 3) library for processing textual data.* - -The way NLP is used is to check that these libraries exist and to take the first one by default. This sets up the `stopfunc.default` for the guess mode. This behavior aims to keep language detection as optional and to avoid multiple specific requirements having the same purpose. 
- -While loaded, the default backend can be switched to another one by using the `_reload_lang` function: - -```python ->>> codext.stopfunc._reload_lang("pycld2") # this loads pycld2 and attaches lang_** functions to the stopfunc submodule ->>> codext.stopfunc._reload_lang() # this unloads any loaded backend -``` - -Each time a backend is loaded, it gets `lang_**` stop functions attached to the `stopfunc` submodule for each supported language. - ------ - -### Ranking Heuristic - -!!! warning "Work in progress" - - This part is still in progress and shall be improved with better features and/or using machine learning. - +For decoding multiple layers of codecs, `codext` features a guess mode relying on an Artificial Intelligence algorithm, the Breadth-First tree Search (BFS). In many cases, the default parameters are sufficient for guess-decoding, but some inputs may require parameter tuning. + +----- + +### Parameters + +BFS stops when a given condition, in the form of a function applied to the decoded string at the current depth, is met. It returns two results: the decoded string and a tuple with the related encoding names in order of application. + +The following parameters are tunable: + +- `stop_func`: can be a function or a regular expression to be matched (automatically converted to a function that uses the `re` module) ; by default, checks if all input characters are printable. +- `min_depth`: the minimum depth for the tree search (this avoids the overhead of applying the stop function to decoded outputs at depths where we are sure the right result cannot be found) ; by default 0. +- `max_depth`: the maximum depth for the tree search ; by default 5. +- `codec_categories`: a string indicating a codec [category](#list-codecs) or a list of [category](#list-codecs) strings ; by default, `None`, meaning all [categories](#list-codecs) (very slow).
+- `found`: a list or tuple of currently found encodings that can be used to save time if the first decoding steps are known ; by default, an empty tuple. + +A simple example for a 1-stage base64-encoded string: + +```python +>>> codext.guess("VGhpcyBpcyBhIHRlc3Q=") +{('base64',): 'This is a test'} +``` + +An example of a 2-stage base64- then base62-encoded string: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7") +{('base62',): 'VGhpcyBpcyBhIHRlc3Q='} +``` + +In the second example, we can see that the given encoded string is not decoded as expected. This is the case because the (default) stop condition is too broad and stops if all the characters of the output are printable. If we have prior knowledge of what to expect, we can input a simple string or a regex: + +!!! note "Default stop function" + + :::python + >>> codext.stopfunc.default.__name__ + '...' + + The output depends on whether you have a language detection backend library installed ; see section [*Natural Language Detection*](#natural-language-detection). If no such library is installed, the default function is "`text`". + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test") +{('base62', 'base64'): 'This is a test'} +``` + +In this example, the string "*test*" is converted to a function that uses this string as a regular expression. Instead of a string, we can also pass a function. For this purpose, standard [stop functions](#available-stop-functions) are predefined. So, we can for instance use `stopfunc.lang_en` to stop when we find something that is English. Note that working this way gives lots of false positives if the text is very short like in the example case. That's why the `codec_categories` argument is used to only consider baseX codecs. This is also demonstrated in the next examples.
+ +```python +>>> codext.stopfunc._reload_lang("langdetect") +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", codext.stopfunc.lang_en, codec_categories="base") +('This is a test', ('base62', 'base64')) +``` + +If we know the first encoding, we can set this in the `found` parameter to save time: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", found=["base62"]) +('This is a test', ('base62', 'base64')) +``` + +If we are sure that only `base` (which is a valid [category](#list-codecs)) encodings are used, we can restrict the tree search using the `codec_categories` parameter to save time: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", codec_categories="base") +('This is a test', ('base62', 'base64')) +``` + +Another example of 2-stages encoded string: + +```python +>>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test") +('this is a test', ('base64', 'morse')) +>>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test", codec_categories=["base", "language"]) +('this is a test', ('base64', 'morse')) +``` + +When multiple results are expected, `stop` and `show` arguments can be used respectively to avoid stopping while finding a result and to display the intermediate result. + +!!! warning "Computation time" + + Note that, in the very last examples, the first call takes much longer than the second one but requires no knowledge about the possible [categories](#list-codecs) of encodings. + +----- + +### Available Stop Functions + +A few stop functions are predefined in the `stopfunc` submodule. 
+ +```python +>>> import codext +>>> dir(codext.stopfunc) +['LANG_BACKEND', 'LANG_BACKENDS', ..., '_reload_lang', 'default', 'flag', ..., 'printables', 'regex', 'text'] +``` + +Currently, the following stop functions are provided: + +- `flag`: searches for the pattern "`[Ff][Ll1][Aa4@][Gg9]`" (either UTF-8 or UTF-16) +- `lang_**`: checks if the given language is detected (note that it first checks if all characters are text ; see `text` hereafter) +- `printables`: checks that every output character is in the set of printables +- `regex(pattern)`: takes one argument, the regular expression, for checking a string against the given pattern +- `text`: checks for printables and an entropy less than 4.6 (empirically determined) + +A stop function can be used as the second argument of the `guess` function or as a keyword-argument, as shown in the following examples: + +```python +>>> codext.guess("...", codext.stopfunc.text) +[...] +>>> codext.guess("...", [...], stop_func=codext.stopfunc.text) +[...] +``` + +When a string is given, it is automatically converted to a `regex` stop function. + +```python +>>> s = codext.encode("pattern testing", "leetspeak") +>>> s +'p4773rn 73571n9' +>>> stop_func = codext.stopfunc.regex("p[a4@][t7]{2}[e3]rn") +>>> stop_func(s) +True +>>> codext.guess(s, stop_func) +[...] +``` + +Additionally, a simple stop function is predefined for CTF players, matching various variations of the word *flag*. Alternatively, a pattern can always be used when flags have a particular format. + +```python +>>> codext.stopfunc.flag("test string") +False +>>> codext.stopfunc.flag("test f1@9") +True +>>> codext.stopfunc.regex(r"^CTF\{.*?\}$")("CTF{098f6bcd4621d373cade4e832627b4f6}") +True +``` + +The particular type of stop function `lang_**` is explained in the [next section](#natural-language-detection).
+ +----- + +### Natural Language Detection + +Since, in many cases, we are trying to decode inputs into readable text, it is necessary to narrow the scope while searching for valid decoded outputs. As matching printables and even text (defined above as printables with an entropy of less than 4.6) is too broad for many cases, it may be very useful to apply natural language detection. In `codext`, this is done by relying on Natural Language Processing (NLP) backend libraries, loaded only if they were separately installed. + +Currently, the following backends are supported, in order of precedence (this order was empirically determined by testing): + +- [`langid`](https://github.com/saffsd/langid.py): *Standalone Language Identification (LangID) tool.* +- [`langdetect`](https://github.com/Mimino666/langdetect): *Port of Nakatani Shuyo's language-detection library (version from 03/03/2014) to Python.* +- [`pycld2`](https://github.com/aboSamoor/pycld2): *Python bindings for the Compact Language Detector 2 (CLD2).* +- [`cld3`](https://github.com/bsolomon1124/pycld3): *Python bindings to the Compact Language Detector v3 (CLD3).* +- [`textblob`](https://github.com/sloria/TextBlob): *Python (2 and 3) library for processing textual data.* + +`codext` checks which of these libraries are installed and takes the first one by default. This sets up the `stopfunc.default` for the guess mode. This behavior aims to keep language detection optional and to avoid multiple specific requirements having the same purpose. + +While loaded, the default backend can be switched to another one by using the `_reload_lang` function: + +```python +>>> codext.stopfunc._reload_lang("pycld2") # this loads pycld2 and attaches lang_** functions to the stopfunc submodule +>>> codext.stopfunc._reload_lang() # this unloads any loaded backend +``` + +Each time a backend is loaded, it gets `lang_**` stop functions attached to the `stopfunc` submodule for each supported language.
+ +----- + +### Ranking Heuristic + +!!! warning "Work in progress" + + This part is still in progress and shall be improved with better features and/or using machine learning. + diff --git a/docs/howto.md b/docs/pages/howto.md similarity index 71% rename from docs/howto.md rename to docs/pages/howto.md index 6163ef6..8cb3fc8 100644 --- a/docs/howto.md +++ b/docs/pages/howto.md @@ -1,242 +1,258 @@ -## How To Create Your Codec - -The purpose of this section is to provide a tutorial for creating new codecs accordingly. - -As explained in [this section](./features.html), `codext` provides the possibility to add new codecs in two ways: - -1. [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56): using this function, the *encode* and *decode* functions must be given as arguments. -2. [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160): using this function, an *encoding map* must be given but can be formatted in different ways to handle various use cases. - -In both cases, a *pattern* is given in argument and aims to define the set of all strings that aim to select this codec. - -!!! important "Codec precedence" - - `codext` uses a local registry that is queried first before attempting native `codecs` lookups. This means that a native codec can be overridden with a *pattern* that matches the same strings. - -The remainder of this section explains how to successfully create a new codec and/or how to make so that it can be added to the library. - -!!! reminder "Contributions welcome !" - - Remember that you can always [submit a request for a new codec](https://github.com/dhondta/python-codext/issues/new) or submit your own with a PR for improving `codext` ! - ------ - -### Generic arguments - -Whatever solution is chosen, the following arguments shall be considered: - -- `ename` (first positional argument): Choose the shortest possible encoding name. 
If it clashes with another codec, always remember that `codext` resolves codecs in order of registry, that is from the first added. Also, it resolves codecs based on the given pattern. So, a codec with a clashing name could still be selected if the pattern does not match for the codec with the precedence but matches for this codec. -- `pattern` (keyword-argument): If not defined, it defaults to the encoding name. It can be a regular expression ; in this case, it should not be too broad. A codec decode or encode function can be parametrized through the pattern using the **first capture group**. It is important to note that the first capture group is used and not any other. This means that any other group definition shall use the do-not-capture specifier, that is "`(?:...)`". - -!!! danger "Too broad pattern" - - Let us consider the following ; we add a codec that handles every character in any number of occurrence. It will then capture anything in the given encoding name and will then always resolve to this codec, preventing any other codec added afterwards to resolve. - - >>> import codext - >>> identity = lambda text, errors="strict": (text, len(text)) - >>> codext.add("everything", identity, identity, pattern=r".*") - >>> codext.encode("test string", "test-encoding-name") # r".*" matches anything, thus including "test-encoding-name" - 'test string' - >>> codext.decode("test string", "test-encoding-name") - 'test string' - >>> codext.encode("test string", "morse") # "morse" has the precedence on codec "everything" we just added - '- . ... - / ... - .-. .. -. --.' 
- >>> test = lambda text, errors="strict": ("TEST", len(t)) - >>> codext.add("test", test) # no pattern given ; should then be matched by encoding name "test" - >>> codext.encode("test string", "test") # should give "TEST" if codec "test" was selected - 'test string' # gives the output of codec "test-encoding-name", - # which has precedence on "test" and a too broad pattern - ------ - -### Which `add` function ? - -At this point, it is necessary to determine what kind of codec you want. If it is a simple map of characters, you should definitely use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160). If it is more complex and cannot be handled using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160)'s options, then you should use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) and define the encode/decode functions yourself. - -A few examples: - -- `morse` is a simple map that does not handle case ; it then uses [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `ignore_case` set to "`encode`" (not "`both`" for encoding and decoding as it does not matter anyway for decoding) -- `whitespace` has 2 codecs defined ; the simple one is a simple bit encoding map, therefore using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype` set to "`bin`" (for pre-converting characters to bits before applying the encoding map), and the complex one uses [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) with its specific endocde/decode functions -- `atbash` defines a dynamic map with a "factory" function, that creates the encoding map according to the parameters supplied in the codec name - -So, before going further, determine the following: - -- What does the new codec map from and to ? E.g. 
if binary input and ordinal output, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype="bin"` and `outype="ord"`. -- Is this codec ignoring case ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and specify which operation(s) should ignore case, e.g. `ignore_case="both"` or `ignore_case="decode"`. -- Should this codec handle no error ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) do not forget to specify `no_error=True`. -- Does the codec yields variable-length encoded tokens ? If so, you can still use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) but you should define `sep` (separator) as `codext` will not be able to handle ambiguities. - -If you find aspects that are not covered in these questions, you shall use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56), then refering to [Case 1](#case-1-generic-encoding-definition). Otherwise, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and refer -to [Case 2](#case-2-encoding-map). - ------ - -### Case 1: Generic encoding definition - -This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) - -The following shall be considered: - -- `encode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot encode. -- `decode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot decode. - -Both functions must take 2 arguments and return 2 values (in order to stick to `codec`'s encode/decode function format): - -- Inputs: `text`, `errors="strict"` ; respectively the text to encode/decode and the error handling mode. -- Outputs: encoded text and length of consumed input text. - -!!! 
note "Error handling mode" - - - `strict`: this is the default ; it means that any error shall raise an exception. - - `ignore`: any error is ignored, adding nothing to the output. - - `replace`: any error yields the given replacement character(s). - - `leave`: any error yields the erroneous input token in the output. - - This last mode is an addition to the native ones. It can be useful for some encodings that must cause no error while encoding and can therefore have their original characters in the output. - -Also, while defining the `encode` and/or `decode` functions, `codext.handle_error` can be used as a shortcut to handle the different modes. It returns a wrapped function that takes `token` and `position` as arguments (see [`excess3`](https://github.com/dhondta/python-codext/blob/master/codext/binary/excess3.py) for an example). - -```python ->>> help(codext.handle_error) -Help on function handle_error in module codext.__common__: - -handle_error(ename, errors, sep='', repl_char='?', repl_minlen=1, decode=False, item='position') - This shortcut function allows to handle error modes given some tuning parameters. - - :param ename: encoding name - :param errors: error handling mode - :param sep: token separator - :param repl_char: replacement character (for use when errors="replace") - :param repl_minlen: repeat number for the replacement character - :param decode: whether we are encoding or decoding - :param item: position item description (for describing the error ; e.g. "group" or "token") - ->>> err = codext.handle_error("test", "strict") ->>> help(err) -Help on function _handle_error in module codext.__common__: - -_handle_error(token, position) - This handles an encoding/decoding error according to the selected handling mode. 
- - :param token: input token to be encoded/decoded - :param position: token position index - -``` - ------ - -### Case 2: Encoding map - -This uses: [`codext.add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) - -The following options shall be considered: - -- `encmap` (second positional argument): This defines the encoding map and is the core of the codec ; 4 subcases are handled and explained hereafter. -- `repl_char` (keyword-argument ; default: "`?`"): The replacement character can be tuned, especially if the default one clashes with a character from the encoding. -- `sep` (keyword-argument ; default: ""): The separator between encoded tokens can be useful to tune, especially when the encoded tokens have a variable length. -- `ignore_case` (keyword-argument ; default: `None`): This defines where the case shall be ignored ; it can be one of the followings: "`encode`", "`decode`" or "`both`". -- `no_error` (keyword-argument ; default: `False`): This sets if errors should be handled as normal or if no error should be considered, simply leaving the input token as is in the output. -- `intype` (keyword-argument ; default: `None`): This specifies the type the input text should be converted to before applying the encoding map (pre-conversion before really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. -- `outype` (keyword-argument ; default: `None`): This specifies the type the output text of the encoding map should be converted from (post-conversion after really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. - -!!! warning "Input/Output types" - - By default, when `intype` is defined, `outype` takes the same value if left `None`. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be explicitely set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). 
- -`encmap` can be defined as follows: - -1. **Simple map**: In this case, the encoding map is a dictionary mapping each input character to an output one (see [`radio`](https://github.com/dhondta/python-codext/blob/master/codext/languages/radio.py) for an example). -2. **List of maps**: In this case, encoding maps are put in a list and referenced by their order number starting from 1, meaning that the `pattern` shall define a capture group with values from 1 to the length of this list (see [`dna`](https://github.com/dhondta/python-codext/blob/master/codext/others/dna.py) for an example). -3. **Parametrized map**: This variant defines a dictionary of regex-selected encoding maps, that is, a dictionary of dictionaries with keys matching the captured groups from codec's pattern. -4. **Map factory function**: This one is implemented by a function that returns the composed encoding map. This function takes a single argument according to the capture group from the `pattern` (see [`affine`](https://github.com/dhondta/python-codext/blob/master/codext/crypto/affine.py) for an example). - -!!! note "Mapping one input character to multiple output characters" - - In some particular cases (e.g. the `navajo` codec), a single input character can be mapped to multiple output ones. It is possible to define them in a map by simply putting them into a list (e.g. a map with `{'A': ["B", "C", "D"]}`). In this case, while encoding, the output character is randomly chosen (e.g. "`A`" will map to "`D`", another time to "`B`", ...). - ------ - -### Self-generated tests - -In order to facilitate testing, a test suite can be automatically generated from a set of *examples*. This is defined in the `__examples__` dunder inside codec's source file (see [`sms`](https://github.com/dhondta/python-codext/blob/master/codext/stegano/sms.py) for an example). 
By default, the `add`/`add_map` function will get `__examples__` from the global scope but this behavior can be overridden by specifying the keyword-argument `examples` (e.g. `add(..., examples=__examples1__)` ; see [`ordinal`](https://github.com/dhondta/python-codext/blob/master/codext/common/ordinal.py) for an example). - -A set of examples is a dictionary specifying the test cases to be considered. The keys are the descriptions of the test cases and the values can be either dictionaries of input texts and their output encoded texts or lists of input texts. Each key has the format "`operation(encodings)`". Operations can be: - -- `enc`: This is for testing the encoding of the nested values (that is, a dictionary of input/outputs). -- `dec`: This is for testing the decoding of the nested values (that is, a dictionary of input/outputs). If this is not specified, the test suite automatically tries to decode from what is defined in `enc`. -- `enc-dec`: This is for testing the encoding AND decoding of the nested values (that is, a list of inputs) ; this one does not enforce what should be the output of the encoding but checks that encoding AND decoding leads to the same input text. This is particularly useful when encoding can yield randomly chosen tokens in the encoded output. - -The `encodings` are a `|`-separated list of encoding names, compliant or not with tested codec's pattern. Faulty names can also be tested as of the examples hereafter. - -Examples of `__examples__` test suites: - -```python -__my_examples__ = { - 'enc(BAD)': None -} -``` - -!!! note "Observations" - - - `__my__examples__` is not the standard dunder, therefore requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. - - `BAD` is assumed to be a bad encoding name, therefore having a dictionary value of `None`, meaning that the test should raise a `LookupError`. - -```python -__examples__ = { - 'enc(codec)': {'string': None} -} -``` - -!!! 
note "Observations" - - - `__examples__` is the standard dunder, therefore NOT requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. - - `codec` is assumed to be a valid encoding name, therefore having a dictionary as its value, but in this special case "`string`" is assumed not to be encoded, its corresponding value is then `None`, meaning that the test should raise a `ValueError`. - -```python -__examples__ = { - 'enc-dec(codec)': ["test string", "TEST STRING", "@random", "@random{1024}"] -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc-dec` is used, meaning that a list of inputs is defined. - - So, whatever its encoded output, the input string shall give the same while applying encoding then decoding. - - The special values `@random` and `@random{1024}`, meaning that test strings are generated from any possible byte-character with a specified length (512 when not specified, otherwise specified with `{...}`). - -```python -__examples__ = { - 'enc(codec)': {"test string": "..."} -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc` only is used, meaning that a dictionary of inputs/outputs is given and `dec` is automatically handled while requiring the exact encoded text but recovering the exact same input while decoding. - -```python -__examples__ = { - 'enc(codec)': {"Test String": "..."}, - 'dec(codec)': {"...": "test string"}, -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc` and `dec` are used, meaning that dictionaries of inputs/outputs are given and the input texts are not necessarily the same (i.e. if text case is not handled by the codec). - ------ - -### Adding a new codec to `codext` - -As a checklist when making a codec for addition in `codext`, please follow these steps: - -1. 
Create your codec file (i.e. starting with a copy of an existing similar one) -2. Place it into the right category folder -3. Add it to the list in [`README.md`](https://github.com/dhondta/python-codext/blob/master/README.md#list-of-codecs) -4. Add its documentation in the [right Markdown file](https://github.com/dhondta/python-codext/tree/master/docs/enc) -5. If self-generated tests are not enough, add manual tests in [the related file](https://github.com/dhondta/python-codext/blob/master/tests/test_manual.py) - +The purpose of this section is to provide a tutorial for creating new codecs accordingly. + +As explained in [this section](./features), `codext` provides the possibility to add new codecs in two ways: + +1. [`add`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L56): using this function, the *encode* and *decode* functions must be given as arguments. +2. [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160): using this function, an *encoding map* must be given but can be formatted in different ways to handle various use cases. + +In both cases, a *pattern* is given in argument and aims to define the set of all strings that aim to select this codec. + +!!! important "Codec precedence" + + `codext` uses a local registry that is queried first before attempting native `codecs` lookups. This means that a native codec can be overridden with a *pattern* that matches the same strings. + +The remainder of this section explains how to successfully create a new codec and/or how to make so that it can be added to the library. + +!!! reminder "Contributions welcome !" + + Remember that you can always [submit a request for a new codec](https://github.com/dhondta/python-codext/issues/new) or submit your own with a PR for improving `codext` ! 
+ +----- + +### Generic arguments + +Whatever solution is chosen, the following arguments shall be considered: + +- `ename` (first positional argument): Choose the shortest possible encoding name. If it clashes with another codec, always remember that `codext` resolves codecs in order of registry, that is from the first added. Also, it resolves codecs based on the given pattern. So, a codec with a clashing name could still be selected if the pattern does not match for the codec with the precedence but matches for this codec. +- `pattern` (keyword-argument): If not defined, it defaults to the encoding name. It can be a regular expression ; in this case, it should not be too broad. A codec decode or encode function can be parametrized through the pattern using the **first capture group**. It is important to note that the first capture group is used and not any other. This means that any other group definition shall use the do-not-capture specifier, that is "`(?:...)`". + +!!! danger "Too broad pattern" + + Let us consider the following ; we add a codec that handles every character in any number of occurrences. It will then capture anything in the given encoding name and will then always resolve to this codec, preventing any codec added afterwards from resolving. + + >>> import codext + >>> identity = lambda text, errors="strict": (text, len(text)) + >>> codext.add("everything", identity, identity, pattern=r".*") + >>> codext.encode("test string", "test-encoding-name") # r".*" matches anything, thus including "test-encoding-name" + 'test string' + >>> codext.decode("test string", "test-encoding-name") + 'test string' + >>> codext.encode("test string", "morse") # "morse" has the precedence on codec "everything" we just added + '- . ... - / ... - .-. .. -. --.'
+ >>> test = lambda text, errors="strict": ("TEST", len(text)) + >>> codext.add("test", test) # no pattern given ; should then be matched by encoding name "test" + >>> codext.encode("test string", "test") # should give "TEST" if codec "test" was selected + 'test string' # gives the output of codec "everything", + # which has precedence on "test" and a too broad pattern + +----- + +### Which `add` function ? + +At this point, it is necessary to determine what kind of codec you want. If it is a simple map of characters, you should definitely use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160). If it is more complex and cannot be handled using [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160)'s options, then you should use [`add`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L56) and define the encode/decode functions yourself. + +A few examples: + +- `morse` is a simple map that does not handle case ; it then uses [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) with `ignore_case` set to "`encode`" (not "`both`" for encoding and decoding as it does not matter anyway for decoding) +- `whitespace` has 2 codecs defined ; the simple one is a simple bit encoding map, therefore using [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) with `intype` set to "`bin`" (for pre-converting characters to bits before applying the encoding map), and the complex one uses [`add`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L56) with its specific encode/decode functions +- `atbash` defines a dynamic map with a "factory" function, that creates the encoding map according to the parameters supplied in the codec name + +So, before going further, determine the following: + +- What does the new codec map from and to ? E.g.
if binary input and ordinal output, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) with `intype="bin"` and `outype="ord"`. +- Is this codec ignoring case ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) and specify which operation(s) should ignore case, e.g. `ignore_case="both"` or `ignore_case="decode"`. +- Should this codec handle no error ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) but do not forget to specify `no_error=True`. +- Does the codec yield variable-length encoded tokens ? If so, you can still use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) but you should define `sep` (separator) as `codext` will not be able to handle ambiguities. + +If you find aspects that are not covered in these questions, you shall use [`add`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L56), then referring to [Case 1](#case-1-generic-encoding-definition). Otherwise, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) and refer +to [Case 2](#case-2-encoding-map). + +----- + +### Case 1: Generic encoding definition + +This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L56) + +This applies when the codec is more complex than a mapping, as defined in _[Case 2: Encoding map](./#case-2-encoding-map)_.
+ +**Examples**: [`crypto/barbie`](https://github.com/dhondta/python-codext/blob/main/src/codext/crypto/barbie.py), [`crypto/railfence`](https://github.com/dhondta/python-codext/blob/main/src/codext/crypto/railfence.py), [`stegano/resistor`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/resistor.py), [`stegano/whitespace`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/whitespace.py) + +The following shall be considered: + +- `encode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot encode. +- `decode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot decode. + +Both functions must take 2 arguments and return 2 values (in order to stick to `codec`'s encode/decode function format): + +- Inputs: `text`, `errors="strict"` ; respectively the text to encode/decode and the error handling mode. +- Outputs: encoded text and length of consumed input text. + +!!! note "Error handling mode" + + - `strict`: this is the default ; it means that any error shall raise an exception. + - `ignore`: any error is ignored, adding nothing to the output. + - `replace`: any error yields the given replacement character(s). + - `leave`: any error yields the erroneous input token in the output. + + This last mode is an addition to the native ones. It can be useful for some encodings that must cause no error while encoding and can therefore have their original characters in the output. + +Also, while defining the `encode` and/or `decode` functions, `codext.handle_error` can be used as a shortcut to handle the different modes. It returns a wrapped function that takes `token` and `position` as arguments (see [`excess3`](https://github.com/dhondta/python-codext/blob/main/src/codext/binary/excess3.py) for an example). 
+ +```python +>>> help(codext.handle_error) +Help on function handle_error in module codext.__common__: + +handle_error(ename, errors, sep='', repl_char='?', repl_minlen=1, decode=False, item='position') + This shortcut function allows to handle error modes given some tuning parameters. + + :param ename: encoding name + :param errors: error handling mode + :param sep: token separator + :param repl_char: replacement character (for use when errors="replace") + :param repl_minlen: repeat number for the replacement character + :param decode: whether we are encoding or decoding + :param item: position item description (for describing the error ; e.g. "group" or "token") + +>>> err = codext.handle_error("test", "strict") +>>> help(err) +Help on function _handle_error in module codext.__common__: + +_handle_error(token, position) + This handles an encoding/decoding error according to the selected handling mode. + + :param token: input token to be encoded/decoded + :param position: token position index + +``` + +----- + +### Case 2: Encoding map + +This uses: [`codext.add_map`](https://github.com/dhondta/python-codext/blob/main/src/codext/__common__.py#L160) + +This applies when the codec can be defined as a simple mapping between source and destination tokens.
+ +**Examples**: [`languages/braille`](https://github.com/dhondta/python-codext/blob/main/src/codext/languages/braille.py), [`languages/morse`](https://github.com/dhondta/python-codext/blob/main/src/codext/languages/morse.py), [`languages/southpark`](https://github.com/dhondta/python-codext/blob/main/src/codext/languages/southpark.py), [`stegano/klopf`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/klopf.py), [`stegano/rick`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/rick.py) + +The following options shall be considered: + +- `encmap` (second positional argument): This defines the encoding map and is the core of the codec ; 4 subcases are handled and explained hereafter. +- `repl_char` (keyword-argument ; default: "`?`"): The replacement character can be tuned, especially if the default one clashes with a character from the encoding. +- `sep` (keyword-argument ; default: ""): The separator between encoded tokens can be useful to tune, especially when the encoded tokens have a variable length. +- `ignore_case` (keyword-argument ; default: `None`): This defines where the case shall be ignored ; it can be one of the followings: "`encode`", "`decode`" or "`both`". +- `no_error` (keyword-argument ; default: `False`): This sets if errors should be handled as normal or if no error should be considered, simply leaving the input token as is in the output. +- `intype` (keyword-argument ; default: `None`): This specifies the type the input text should be converted to before applying the encoding map (pre-conversion before really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. +- `outype` (keyword-argument ; default: `None`): This specifies the type the output text of the encoding map should be converted from (post-conversion after really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. + +!!! 
warning "Input/Output types" + + By default, when `intype` is defined, `outype` takes the same value if left `None`. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be explicitly set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). + +`encmap` can be defined as follows: + +1. **Simple map**: In this case, the encoding map is a dictionary mapping each input character to an output one (see [`radio`](https://github.com/dhondta/python-codext/blob/main/src/codext/languages/radio.py) for an example). +2. **List of maps**: In this case, encoding maps are put in a list and referenced by their order number starting from 1, meaning that the `pattern` shall define a capture group with values from 1 to the length of this list (see [`dna`](https://github.com/dhondta/python-codext/blob/main/src/codext/others/dna.py) for an example). +3. **Parametrized map**: This variant defines a dictionary of regex-selected encoding maps, that is, a dictionary of dictionaries with keys matching the captured groups from codec's pattern. +4. **Map factory function**: This one is implemented by a function that returns the composed encoding map. This function takes a single argument according to the capture group from the `pattern` (see [`affine`](https://github.com/dhondta/python-codext/blob/main/src/codext/crypto/affine.py) for an example). + +!!! note "Mapping one input character to multiple output characters" + + In some particular cases (e.g. the `navajo` codec), a single input character can be mapped to multiple output ones. It is possible to define them in a map by simply putting them into a list (e.g. a map with `{'A': ["B", "C", "D"]}`). In this case, while encoding, the output character is randomly chosen (e.g. "`A`" will map to "`D`", another time to "`B`", ...).
+ +----- + +### Self-generated tests + +In order to facilitate testing, a test suite can be automatically generated from a set of *examples*. This is defined in the `__examples__` dunder inside codec's source file (see [`sms`](https://github.com/dhondta/python-codext/blob/main/src/codext/stegano/sms.py) for an example). By default, the `add`/`add_map` function will get `__examples__` from the global scope but this behavior can be overridden by specifying the keyword-argument `examples` (e.g. `add(..., examples=__examples1__)` ; see [`ordinal`](https://github.com/dhondta/python-codext/blob/main/src/codext/common/ordinal.py) for an example). + +A set of examples is a dictionary specifying the test cases to be considered. The keys are the descriptions of the test cases and the values can be either dictionaries of input texts and their output encoded texts or lists of input texts. Each key has the format "`operation(encodings)`". Operations can be: + +- `enc`: This is for testing the encoding of the nested values (that is, a dictionary of input/outputs). +- `dec`: This is for testing the decoding of the nested values (that is, a dictionary of input/outputs). If this is not specified, the test suite automatically tries to decode from what is defined in `enc`. +- `enc-dec`: This is for testing the encoding AND decoding of the nested values (that is, a list of inputs) ; this one does not enforce what should be the output of the encoding but checks that encoding AND decoding leads to the same input text. This is particularly useful when encoding can yield randomly chosen tokens in the encoded output. + +The `encodings` are a `|`-separated list of encoding names, compliant or not with tested codec's pattern. Faulty names can also be tested as of the examples hereafter. + +Examples of `__examples__` test suites: + +```python +__my_examples__ = { + 'enc(BAD)': None +} +``` + +!!! 
note "Observations" + + - `__my_examples__` is not the standard dunder, therefore requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. + - `BAD` is assumed to be a bad encoding name, therefore having a dictionary value of `None`, meaning that the test should raise a `LookupError`. + +```python +__examples__ = { + 'enc(codec)': {'string': None} +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, therefore NOT requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. + - `codec` is assumed to be a valid encoding name, therefore having a dictionary as its value, but in this special case "`string`" is assumed not to be encodable, its corresponding value is then `None`, meaning that the test should raise a `ValueError`. + +```python +__examples__ = { + 'enc-dec(codec)': ["test string", "TEST STRING", "@random", "@random{1024}"] +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc-dec` is used, meaning that a list of inputs is defined. + - So, whatever its encoded output, the input string shall give the same while applying encoding then decoding. + - The special values `@random` and `@random{1024}` mean that test strings are generated from any possible byte-character with a specified length (512 when not specified, otherwise specified with `{...}`). + +```python +__examples__ = { + 'enc(codec)': {"test string": "..."} +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc` only is used, meaning that a dictionary of inputs/outputs is given and `dec` is automatically handled while requiring the exact encoded text but recovering the exact same input while decoding. + +```python +__examples__ = { + 'enc(codec)': {"Test String": "..."}, + 'dec(codec)': {"...": "test string"}, +} +``` + +!!!
note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc` and `dec` are used, meaning that dictionaries of inputs/outputs are given and the input texts are not necessarily the same (e.g. if text case is not handled by the codec). + +----- + +### Codec names for the guessing mode + +The `__guess__` list of codec names is used to limit the possibilities in the tree search from the [guessing mode](./guessing). Especially when the codec is dynamic and may have a large (or even infinite) number of dynamic names, it is necessary to set a limited number in order to avoid exponentially increasing computation time. This list, when relevant, shall be used with due care. + +!!! note "Limiting the number of static names" + + As a best practice, static names for the [guessing mode](./guessing) should be limited to 16, in order to avoid exponential computation time in the search tree algorithm. + +----- + +### Adding a new codec to `codext` + +As a checklist when making a codec for addition in `codext`, please follow these steps: + +1. Create your codec file (e.g. starting with a copy of an existing similar one) +2. Place it into the right category folder (when a codec cannot be put in one of the folders under the root of [`codext`](https://github.com/dhondta/python-codext/blob/main/src/codext), it shall be put by default in [`others`](https://github.com/dhondta/python-codext/blob/main/src/codext/others)) +3. Add it to the list in [`README.md`](https://github.com/dhondta/python-codext/blob/main/README.md#list-of-codecs) +4. Add its documentation in the [right Markdown file](https://github.com/dhondta/python-codext/tree/main/docs/pages/enc) +5.
If self-generated tests are not enough, add manual tests in [the related file](https://github.com/dhondta/python-codext/blob/main/tests/test_manual.py) + diff --git a/docs/imgs/banner.png b/docs/pages/img/banner.png similarity index 100% rename from docs/imgs/banner.png rename to docs/pages/img/banner.png diff --git a/docs/pages/img/icon.png b/docs/pages/img/icon.png new file mode 100644 index 0000000..da3cb31 Binary files /dev/null and b/docs/pages/img/icon.png differ diff --git a/docs/pages/img/logo.png b/docs/pages/img/logo.png new file mode 100644 index 0000000..a1827f8 Binary files /dev/null and b/docs/pages/img/logo.png differ diff --git a/docs/index.md b/docs/pages/index.md similarity index 97% rename from docs/index.md rename to docs/pages/index.md index 185dd25..2579b17 100644 --- a/docs/index.md +++ b/docs/pages/index.md @@ -1,11 +1,9 @@ -## Introduction - -Codext, contraction of "*codecs*" and "*extension*", is a library that gathers many additional encodings for use with [`codecs`](https://docs.python.org/3/library/codecs.html). When imported, it registers new encodings to an extended codecs registry for making the encodings available from the `codecs.(decode|encode|open)` API. It also features [CLI tools](./cli.html) and a [guess mode](./features.html#guess-decode-an-arbitrary-input) for decoding multiple layers of codecs. - -### Setup - -This library is available on [PyPi](https://pypi.python.org/pypi/codext/) and can be simply installed using Pip: - -```sh -pip install codext -``` +Codext, contraction of "*codecs*" and "*extension*", is a library that gathers many additional encodings for use with [`codecs`](https://docs.python.org/3/library/codecs.html). When imported, it registers new encodings to an extended codecs registry for making the encodings available from the `codecs.(decode|encode|open)` API.
It also features [CLI tools](./cli.html) and a [guess mode](./features.html#guess-decode-an-arbitrary-input) for decoding multiple layers of codecs. + +### Setup + +This library is available on [PyPi](https://pypi.python.org/pypi/codext/) and can be simply installed using Pip: + +```sh +pip install codext +``` diff --git a/docs/manipulations.md b/docs/pages/manipulations.md similarity index 83% rename from docs/manipulations.md rename to docs/pages/manipulations.md index 7962278..340f89c 100644 --- a/docs/manipulations.md +++ b/docs/pages/manipulations.md @@ -1,67 +1,74 @@ -## String tranformations - -`codext` also defines multiple dummy string manipulation/transformation codecs, essentially for use with the CLI tool and for the sake of simplicity. - ------ - -### Case-related operations - -These transformation functions are simple string transformations, including `str`'s methods. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`camelcase` | text --> camel-case text | `camel` | no decoding -`capitalize` | text <-> capitalized text | | decoding "uncapitalizes" the text -`lowercase` | text <-> lowercase text | `lower` | decoding is `uppercase` -`pascalcase` | text --> pascal-case text | `pascal` | no decoding -`slugify` | text --> slug | `slug`, `kebab`, `kebabcase` | no decoding -`snakecase` | text --> snake-case text | `snake` | no decoding -`swapcase` | text <-> case-swapped text | `swap`, `invert`, `invertcase` | -`title` | text <-> titled text | | decoding "untitles" the text -`uppercase` | text <-> uppercase text | `upper` | decoding is `lowercase` - -Of course, these transformations have no interest while using them in Python as the `str` methods can be called. It can be useful while using `codext` from the terminal (see [*CLI tool*](cli.html)). 
- -Some simple examples: - -```sh -$ echo -en "test string" | codext encode swap-case -TEST STRING - -$ echo -en "test string" | codext encode camel_case -testString - -$ echo -en "test string" | codext encode kebab_case -test-string -``` - ------ - -### Dummy string operations - -These transformation functions are simple string transformations. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`replace` | text <-> text with single-char replaced | | -`reverse` | text <-> reversed text | | -`reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace) -`strip-spaces` | text <-> all whitespaces stripped | | -`substitute` | text <-> text with token substituted | | - -As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)). - -A simple example: - -```sh -$ echo -en "test string" | codext encode reverse-words | codext encode reverse replace-\ _ -string_test -``` - -Or using encodings chaining: - -```sh -$ echo -en "test string" | codext encode reverse-words reverse substitute-string/phrase -phrase test -``` - +`codext` also defines multiple dummy string manipulation/transformation codecs, essentially for use with the CLI tool and for the sake of simplicity. + +----- + +### Case-related operations + +These transformation functions are simple string transformations, including `str`'s methods. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`camelcase` | text --> camel-case text | `camel` | no decoding +`capitalize` | text <-> capitalized text | | decoding "uncapitalizes" the text +`lowercase` | text <-> lowercase text | `lower` | decoding is `uppercase` +`pascalcase` | text --> pascal-case text | `pascal` | no decoding +`screamingsnakecase` | text --> screaming-snake-case text | `screaming-snake`, `screaming_snake_case` | no decoding +`slugify` | text --> slug | `slug`, `kebab`, `kebabcase` | no decoding +`snakecase` | text --> snake-case text | `snake` | no decoding +`swapcase` | text <-> case-swapped text | `swap`, `invert`, `invertcase` | +`title` | text <-> titled text | | decoding "untitles" the text +`uppercase` | text <-> uppercase text | `upper` | decoding is `lowercase` + +Of course, these transformations have no interest while using them in Python as the `str` methods can be called. It can be useful while using `codext` from the terminal (see [*CLI tool*](cli.html)). + +Some simple examples: + +```sh +$ echo -en "test string" | codext encode swap-case +TEST STRING + +$ echo -en "test string" | codext encode camel_case +testString + +$ echo -en "test string" | codext encode kebab_case +test-string +``` + +----- + +### Dummy string operations + +These transformation functions are simple string transformations. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`replace` | text <-> text with multi-chars replaced | | parametrized with a _string_ and its _replacement_ +`reverse` | text <-> reversed text | | +`reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace) +`strip-spaces` | text <-> all whitespaces stripped | | +`substitute` | text <-> text with token substituted | | +`tokenize` | text <-> text split in tokens of length N | | parametrized with _N_ + +As in the previous section, these transformations are of little interest when used from Python but are useful when using `codext` from the terminal (see [*CLI tool*](cli.html)). + +A simple example: + +```sh +$ echo -en "test string" | codext encode reverse-words | codext encode reverse replace-\ _ +string_test +``` + +Another example: + +```sh +$ echo -en "3132333435" | codext encode tokenize-2 +31 32 33 34 35 +``` + +Or using encoding chaining: + +```sh +$ echo -en "test string" | codext encode reverse-words reverse substitute-string/phrase +phrase test +``` + diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..ebcf1c7 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,5 @@ +jinja2>=3.1 +markdown>=3.5 +mkdocs>=1.5 +mkdocs-material>=9.5 +pymdown-extensions>=10.0 diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index e9fa675..0000000 --- a/mkdocs.yml +++ /dev/null @@ -1,31 +0,0 @@ -site_name: "Codext - Extension of native codecs for Python" -repo_url: https://github.com/dhondta/python-codext -site_author: dhondta -docs_dir: docs -nav: - - Introduction: index.md - - Features: features.md - - 'Guess mode': guessing.md - - Encodings: - - Base: enc/base.md - - Binary: enc/binary.md - - Common: enc/common.md - - Compressions: enc/compressions.md - - Cryptography: enc/crypto.md - - Hashing: enc/hashing.md - - Languages: enc/languages.md - - Others:
enc/others.md - - Steganography: enc/stegano.md - - 'String manipulations': manipulations.md - - 'CLI tool': cli.md - - 'Create your codec': howto.md -extra: - mailto: alexandre.dhondt@gmail.com -theme: readthedocs -extra_javascript: - - js/collapsible-navbar.js -use_directory_urls: false -markdown_extensions: - - toc: - permalink: true - - admonition diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1644aee --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,78 @@ +[build-system] +requires = ["setuptools>=80.0.0", "setuptools-scm"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +version = {attr = "codext.__info__.__version__"} + +[tool.setuptools.packages.find] +where = ["src"] + +[project] +name = "codext" +authors = [ + {name="Alexandre D'Hondt", email="alexandre.dhondt@gmail.com"}, +] +description = "Native codecs extension" +license = {file = "LICENSE"} +keywords = ["python", "development", "programming", "codecs", "encodings"] +requires-python = ">=3.8,<4" +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Programming Language :: Python :: 3", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dependencies = [ + "legacycrypt; python_version >= '3.13'", + "markdown2>=2.5.4", +] +dynamic = ["version"] + +[project.readme] +file = "README.md" +content-type = "text/markdown" + +[project.urls] +documentation = "https://python-codext.readthedocs.io/en/latest/?badge=latest" +homepage = "https://github.com/dhondta/python-codext" +issues = "https://github.com/dhondta/python-codext/issues" +repository = "https://github.com/dhondta/python-codext" + +[project.scripts] +base1 = "codext.base.baseN:main1" +base2 = "codext.base.baseN:main2" +base3 = "codext.base.baseN:main3" +base4 = "codext.base.baseN:main4" +base8 = "codext.base.baseN:main8" 
+base10 = "codext.base.baseN:main10" +base16 = "codext.base.baseN:main16" +base26 = "codext.base.baseN:main26" +base32 = "codext.base.baseN:main32" +base32-hex = "codext.base.baseN:main32hex" +base32-geohash = "codext.base.baseN:main32geo" +base32-crockford = "codext.base.baseN:main32crk" +base32-z = "codext.base.baseN:mainz32" +base36 = "codext.base.baseN:main36" +base45 = "codext.base.base45:main" +base58-bitcoin = "codext.base.baseN:main58bc" +base58-ripple = "codext.base.baseN:main58rp" +base58-flickr = "codext.base.baseN:main58fl" +base62 = "codext.base.baseN:main62" +base63 = "codext.base.baseN:main63" +base64 = "codext.base.baseN:main64" +base64-url = "codext.base.baseN:main64url" +base67 = "codext.base.baseN:main67" +base85 = "codext.base.base85:main85" +base85-adobe = "codext.base.base85:main85adobe" +base85-xbtoa = "codext.base.base85:main85xbtoa" +base85-ipv6 = "codext.base.base85:main85rfc1924" +base85-xml = "codext.base.base85:main85xml" +base85-zeromq = "codext.base.base85:main85zeromq" +base91 = "codext.base.base91:main91" +base100 = "codext.base.base100:main100" +base122 = "codext.base.base122:main122" +codext = "codext.__init__:main" +unbase = "codext.base.__init__:main" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..fcccae1 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = src diff --git a/requirements.txt b/requirements.txt index ffe2fce..dcaadfd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -six +legacycrypt; python_version >= '3.13' +markdown2>=2.5.4 diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 958a404..0000000 --- a/setup.cfg +++ /dev/null @@ -1,80 +0,0 @@ -[metadata] -name = codext -version = file: codext/VERSION.txt -author = Alexandre D'Hondt -author-email = alexandre.dhondt@gmail.com -home-page = https://github.com/dhondta/python-codext -description = Native codecs extension -long_description = file: README.md -long_description_content_type = 
text/markdown -keywords = - python - development - programming - codecs - encodings -license = GPLv3 -license-file = LICENSE -classifier = - Development Status :: 5 - Production/Stable - Environment :: Console - Intended Audience :: Developers - License :: OSI Approved :: GNU General Public License v3 (GPLv3) - Programming Language :: Python :: 2 - Programming Language :: Python :: 2.7 - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Topic :: Software Development :: Libraries :: Python Modules - -[options] -packages = find: -include_package_data = False -install_requires = - markdown2==2.3.10; python_version=='2.7' # rq.filter: >=2.4.0 - markdown2>=2.4.0; python_version>='3.6' - six -setup-requires = setuptools -python-requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4 - -[options.package_data] -* = *.txt,*.json - -[options.entry_points] -console_scripts = - base1 = codext.base.baseN:main1 - base2 = codext.base.baseN:main2 - base3 = codext.base.baseN:main3 - base4 = codext.base.baseN:main4 - base8 = codext.base.baseN:main8 - base10 = codext.base.baseN:main10 - base16 = codext.base.baseN:main16 - base26 = codext.base.baseN:main26 - base32 = codext.base.baseN:main32 - base32-hex = codext.base.baseN:main32hex - base32-geohash = codext.base.baseN:main32geo - base32-crockford = codext.base.baseN:main32crk - base32-z = codext.base.baseN:mainz32 - base36 = codext.base.baseN:main36 - base45 = codext.base.base45:main - base58-bitcoin = codext.base.baseN:main58bc - base58-ripple = codext.base.baseN:main58rp - base58-flickr = codext.base.baseN:main58fl - base62 = codext.base.baseN:main62 - base63 = codext.base.baseN:main63 - base64 = codext.base.baseN:main64 - base64-url = codext.base.baseN:main64url - base67 = codext.base.baseN:main67 - base85 = codext.base.base85:main85 - base85-adobe = 
codext.base.base85:main85adobe - base85-xbtoa = codext.base.base85:main85xbtoa - base85-ipv6 = codext.base.base85:main85rfc1924 - base85-xml = codext.base.base85:main85xml - base85-zeromq = codext.base.base85:main85zeromq - base91 = codext.base.base91:main91 - base100 = codext.base.base100:main100 - base122 = codext.base.base122:main122 - codext = codext.__init__:main - unbase = codext.base.__init__:main diff --git a/setup.py b/setup.py deleted file mode 100644 index c823345..0000000 --- a/setup.py +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env python -from setuptools import setup - -setup() diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt new file mode 100644 index 0000000..d00a804 --- /dev/null +++ b/src/codext/VERSION.txt @@ -0,0 +1 @@ +1.16.1 diff --git a/codext/__common__.py b/src/codext/__common__.py similarity index 90% rename from codext/__common__.py rename to src/codext/__common__.py index 41cb5b2..7c3a001 100644 --- a/codext/__common__.py +++ b/src/codext/__common__.py @@ -1,1508 +1,1510 @@ -# -*- coding: UTF-8 -*- -import _codecs -import codecs -import json -import os -import random -import re -import sys -from encodings.aliases import aliases as ALIASES -from functools import reduce, update_wrapper, wraps -from importlib import import_module -from inspect import currentframe -from itertools import chain, product -from locale import getlocale -from math import log -from pkgutil import iter_modules -from platform import system -from random import randint -from six import binary_type, string_types, text_type, BytesIO -from string import * -from types import FunctionType, ModuleType -try: # Python2 - import __builtin__ as builtins -except ImportError: - import builtins -try: # Python2 - from inspect import getfullargspec -except ImportError: - from inspect import getargspec as getfullargspec -try: # Python2 - from string import maketrans -except ImportError: - maketrans = str.maketrans -try: # Python3 - from importlib import reload -except 
ImportError: - pass - - -__all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", - "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "i2s", "is_native", - "list_categories", "list_encodings", "list_macros", "lookup", "maketrans", "os", "rank", "re", "register", - "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", - "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] -CODECS_REGISTRY = None -CODECS_OVERWRITTEN = [] -CODECS_CATEGORIES = ["native", "custom"] -CODECS_CACHE = {} -LANG = getlocale() -if LANG: - LANG = (LANG[0] or "")[:2].lower() -MASKS = { - 'a': printable, - 'b': "".join(chr(i) for i in range(256)), - 'd': digits, - 'h': digits + "abcdef", - 'H': digits + "ABCDEF", - 'l': ascii_lowercase, - 'p': punctuation, - 's': " ", - 'u': ascii_uppercase, -} - -__codecs_registry = [] - -MACROS = {} -PERS_MACROS = {} -PERS_MACROS_FILE = os.path.expanduser("~/.codext-macros.json") - -DARWIN = system() == "Darwin" -LINUX = system() == "Linux" -PY3 = sys.version[0] == "3" -UNIX = DARWIN or LINUX -WINDOWS = system() == "Windows" - -entropy = lambda s: -sum([p * log(p, 2) for p in [float(s.count(c)) / len(s) for c in set(s)]]) - -isb = lambda s: isinstance(s, binary_type) -iss = lambda s: isinstance(s, string_types) -fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x - -s2i = lambda s: int(codecs.encode(s, "base16"), 16) -exc_name = lambda e: "".join(t.capitalize() for t in re.split(r"[-_+]", e)) - - -def i2s(input): - h = hex(input)[2:].rstrip("eL") - return codecs.decode(h.zfill(len(h) + len(h) % 2), "hex") - - -class CodecMacro(tuple): - """Macro details when looking up the codec registry. 
""" - def __new__(cls, name): - self = tuple.__new__(cls) - self.name = name - # get from personal macros first - try: - self.codecs = PERS_MACROS[name] - except KeyError: - try: - self.codecs = MACROS[name] - except KeyError: - raise LookupError("unknown macro: %s" % name) - if not isinstance(self.codecs, (tuple, list)): - raise ValueError("bad macro list: %s" % str(self.codecs)) - self.codecs = [lookup(e, False) for e in self.codecs] # lookup(e, False) - self.parameters = {'name': name, 'category': "macro"} # ^ means that macros won't be nestable - # test examples to check that the chain of encodings works - for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items(): - if re.match(r"enc(-dec)?\(", action): - for e in (examples.keys() if action.startswith("enc(") else examples or []): - rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e) - if rd: - for n in (rd.group(1) or "512").split(","): - self.encode("".join(chr(randint(0, 255)) for i in range(int(n)))) - continue - self.encode(e) - - class Codec: - decode = self.decode - encode = self.encode - - class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - return b(self.encode(input, self.errors)[0]) - self.incrementalencoder = IncrementalEncoder - - class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return ensure_str(self.decode(input, self.errors)[0]) - self.incrementaldecoder = IncrementalDecoder - - class StreamWriter(Codec, codecs.StreamWriter): - charbuffertype = bytes - self.streamwriter = StreamWriter - - class StreamReader(Codec, codecs.StreamReader): - charbuffertype = bytes - self.streamreader = StreamReader - - return self - - def decode(self, input, error="strict"): - """ Decode with each codec in reverse order. 
""" - for ci in self.codecs[::-1]: - input, l = ci.decode(input, error) - return input, l - - def encode(self, input, error="strict"): - """ Encode with each codec. """ - for ci in self.codecs: - input, l = ci.encode(input, error) - return input, l - - def __repr__(self): - return "" % (self.name, id(self)) - - -# inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python -class Repr(object): - def __init__(self, name, func): - self.__name = name - self.__func = func - update_wrapper(self, func) - - def __call__(self, *args, **kwargs): - return self.__func(*args, **kwargs) - - def __repr__(self): - return "" % (self.__name, id(self)) - - -def __stdin_pipe(): - """ Stdin pipe read function. """ - try: - with open(0, 'rb') as f: - for l in f: - yield l - except TypeError: - for l in sys.stdin: - yield l - - -def _input(infile): - # handle input file or stdin - c = b("") - if infile: - with open(infile, 'rb') as f: - c = f.read() - else: - for line in __stdin_pipe(): - c += line - return c - - -def _set_exc(name, etype="ValueError"): - if not hasattr(builtins, name): - exec("class %s(%s): __module__ = 'builtins'" % (name, etype)) - setattr(builtins, name, locals()[name]) -_set_exc("InputSizeLimitError") -_set_exc("ParameterError") - - -def _stripl(s, st_lines, st_crlf): - if st_crlf: - s = s.replace(b"\r\n", b"") if isb(s) else s.replace("\r\n", "") - if st_lines: - s = s.replace(b"\n", b"") if isb(s) else s.replace("\n", "") - return s - - -def _with_repr(name): - def _wrapper(f): - return Repr(name, f) - return _wrapper - - -def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False, **kwargs): - """ This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically - naming the encoding with a pattern and with file handling. 
- - :param ename: encoding name - :param encode: encoding function or None - :param decode: decoding function or None - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - """ - remove(ename) - if encode: - if not isinstance(encode, FunctionType): - raise ValueError("Bad 'encode' function") - _set_exc("%sEncodeError" % exc_name(ename)) # create the custom encode exception as a builtin - if decode: - if not isinstance(decode, FunctionType): - raise ValueError("Bad 'decode' function") - _set_exc("%sDecodeError" % exc_name(ename)) # create the custom decode exception as a builtin - if not encode and not decode: - raise ValueError("At least one en/decoding function must be defined") - for exc in kwargs.get('extra_exceptions', []): - _set_exc(exc) # create additional custom exceptions as builtins - glob = currentframe().f_back.f_globals - # search function for the new encoding - @_with_repr(ename) - def getregentry(encoding): - if encoding != ename and not (pattern and re.match(pattern, encoding)): - return - fenc, fdec, name = encode, decode, encoding - # prepare CodecInfo input arguments - if pattern: - m, args, i = re.match(pattern, encoding), [], 1 - try: - while True: - try: - g = m.group(i) or "" - if g.isdigit() and not g.startswith("0") and "".join(set(g)) != "01": - g = int(g) - args += [g] - i += 1 - except AttributeError: - # this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match - if m is not None: - raise - return - except IndexError: - # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; - # in this case, if fenc/fdec is a decorated function, execute it with no arg - if len(args) == 0: - if fenc and 
len(getfullargspec(fenc).args) == 1: - fenc = fenc() - if fdec and len(getfullargspec(fdec).args) == 1: - fdec = fdec() - else: - fenc = fenc(*args) if fenc else fenc - fdec = fdec(*args) if fdec else fdec - if fenc: - fenc = fix_inout_formats(fenc) - if fdec: - fdec = fix_inout_formats(fdec) - sl, sc = kwargs.pop('strip_lines', False), kwargs.pop('strip_crlf', False) - if sl or sc: - def _striplines(f): - def __wrapper(input, *a, **kw): - return f(_stripl(input, sc, sl), *a, **kw) - return __wrapper - # this fixes issues with wrapped encoded inputs - fdec = _striplines(fdec) - - class Codec(codecs.Codec): - def encode(self, input, errors="strict"): - if fenc is None: - raise NotImplementedError - return fenc(input, errors) - - def decode(self, input, errors="strict"): - if fdec is None: - raise NotImplementedError - return fdec(input, errors) - - class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - if fenc is None: - raise NotImplementedError - return b(fenc(input, self.errors)[0]) - - class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - if fdec is None: - raise NotImplementedError - return ensure_str(fdec(input, self.errors)[0]) - - class StreamWriter(Codec, codecs.StreamWriter): - charbuffertype = bytes - - class StreamReader(Codec, codecs.StreamReader): - charbuffertype = bytes - - ci = codecs.CodecInfo( - name=name, - encode=Codec().encode, - decode=Codec().decode, - incrementalencoder=IncrementalEncoder, - incrementaldecoder=IncrementalDecoder, - streamwriter=StreamWriter, - streamreader=StreamReader, - _is_text_encoding=text, - ) - ci.parameters = kwargs - ci.parameters['name'] = ename - ci.parameters['add_to_codecs'] = add_to_codecs - ci.parameters['pattern'] = pattern - ci.parameters['text'] = text - f = glob.get('__file__', os.path.join("custom", "_")) - cat = f.split(os.path.sep)[-2].rstrip("s") - if cat not in CODECS_CATEGORIES: - CODECS_CATEGORIES.append(cat) - 
ci.parameters['category'] = kwargs.get('category', cat) - ci.parameters['examples'] = kwargs.get('examples', glob.get('__examples__')) - ci.parameters['guess'] = kwargs.get('guess', glob.get('__guess__', [ename])) or [] - ci.parameters['module'] = kwargs.get('module', glob.get('__name__')) - ci.parameters.setdefault("scoring", {}) - for attr in ["bonus_func", "entropy", "expansion_factor", "len_charset", "penalty", "printables_rate", - "padding_char", "transitive"]: - a = kwargs.pop(attr, None) - if a is not None: - ci.parameters['scoring'][attr] = a - return ci - - getregentry.__name__ = re.sub(r"[\s\-]", "_", ename) - if kwargs.get('aliases'): - getregentry.__aliases__ = list(map(lambda n: re.sub(r"[\s\-]", "_", n), kwargs['aliases'])) - getregentry.__pattern__ = pattern - register(getregentry, add_to_codecs) - return getregentry - - -def add_macro(mname, *encodings): - """ This allows to define a macro, chaining multiple codecs one after the other. This relies on a default set of - macros from a YAML file embedded in the package and a local YAML file from the home folder that takes - precedence for defining personal macros. 
- - :param mname: macro name - :param encodings: encoding names of the encodings to be chained with the macro - """ - global PERS_MACROS - # check for name clash with alreday existing macros and codecs - if mname in MACROS or mname in PERS_MACROS: - raise ValueError("Macro name already exists") - try: - ci = lookup(mname, False) - raise ValueError("Macro name clashes with codec '%s'" % ci.name) - except LookupError: - pass - try: - PERS_MACROS[mname] = encodings - CodecMacro(mname) - with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f, indent=2) - except ValueError: - del PERS_MACROS[mname] - raise -codecs.add_macro = add_macro - - -def add_map(ename, encmap, repl_char="?", sep="", ignore_case=None, no_error=False, intype=None, outype=None, **kwargs): - """ This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs - module dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with - a pattern and with file handling (if text is True). 
- - :param ename: encoding name - :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture - group of the regex pattern) or a function building the encoding map - :param repl_char: replacement char (used when errors handling is set to "replace") - :param sep: string of possible character separators (hence, only single-char separators are considered) ; - - while encoding, the first separator is used - - while decoding, separators can be mixed in the input text - :param ignore_case: ignore text case while encoding and/or decoding - :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) - :param intype: specify the input type for pre-transforming the input text - :param outype: specify the output type for post-transforming the output text - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - """ - outype = outype or intype - if ignore_case not in [None, "encode", "decode", "both"]: - raise ValueError("Bad ignore_case parameter while creating encoding map") - if intype not in [None, "str", "bin", "ord"]: - raise ValueError("Bad input type parameter while creating encoding map") - if outype not in [None, "str", "bin", "ord"]: - raise ValueError("Bad output type parameter while creating encoding map") - - def __generic_code(decode=False): - def _wrapper(param): - """ The parameter for wrapping comes from the encoding regex pattern ; e.g. - [no pattern] => param will be None everytime - r"barbie[-_]?([1-4])$" => param could be int 1, 2, 3 or 4 - r"^morse(|[-_]?.{3})$" => param could be None, "-ABC" (for mapping to ".-/") - - In order of precedence: - 1. 
when param is a key in mapdict or mapdict is a list of encoding maps (hence in the case of "barbie...", - param MUST be an int, otherwise for the first case it could clash with a character of the encoding map) - 2. otherwise handle it as a new encoding character map "ABC" translates to ".-/" for morse - """ - p = param - if isinstance(encmap, FunctionType): - mapdict = encmap(p) - p = None - else: - mapdict = encmap - if isinstance(mapdict, dict): - smapdict = {k: v for k, v in mapdict.items()} - elif isinstance(mapdict, list) and isinstance(mapdict[0], dict): - smapdict = {k: v for k, v in mapdict[0].items()} - else: - raise ValueError("Bad mapping dictionary or list of mapping dictionaries") - if p is not None: - # case 1: param is empty string - if p == "": - if isinstance(mapdict, list): - smapdict = {k: v for k, v in mapdict[0].items()} - elif isinstance(mapdict, dict): - if '' in mapdict.keys() and isinstance(mapdict[''], dict): - smapdict = {k: v for k, v in mapdict[''].items()} - else: - smapdict = {k: v for k, v in mapdict.items()} - # no 'else' handling a LookupError here ; this case is covered by the first if/elif/else block - # case 2: list or dictionary or dictionary of numbered encodings - elif isinstance(p, int): - # if mapdict is a list, we shall align the parameter (starting from 1) as an index (starting from 0) - if isinstance(mapdict, list): - p -= 1 - if isinstance(mapdict, list) and 0 <= p < len(mapdict) or \ - isinstance(mapdict, dict) and p in mapdict.keys(): - smapdict = {k: v for k, v in mapdict[p].items()} - else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - # case 3: dictionary of regex-selected encoding mappings - elif isinstance(mapdict, dict) and isinstance(list(mapdict.values())[0], dict): - tmp = None - for r, d in mapdict.items(): - if r == '': # this is already handled in case 1 ; anyway, an empty regex always matches, hence - continue # it must be excluded - if re.match(r, p): - tmp = d - break - 
if tmp is None: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - smapdict = tmp - # case 4: encoding characters translation - else: - # collect base tokens in order of appearance in the mapping dictionary - base_tokens = "" - for _, c in sorted(mapdict.items()): - for t in c: - for st in t: - if st not in base_tokens: - base_tokens += st - if " " not in sep: - base_tokens = base_tokens.replace(" ", "") - if len(p) > 0 and p[0] in "-_" and len(p[1:]) == len(set(p[1:])) == len(base_tokens): - p = p[1:] - if len(p) == len(set(p)) == len(base_tokens): - t = maketrans(base_tokens, p) - for k, v in smapdict.items(): - smapdict[k] = [x.translate(t) for x in v] if isinstance(v, list) else v.translate(t) - else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - if ignore_case is not None: - cases = ["upper", "lower"] - case_d = cases[any(c in str(list(smapdict.values())) for c in "abcdefghijklmnopqrstuvwxyz")] - case_e = cases[any(c in str(list(smapdict.keys())) for c in "abcdefghijklmnopqrstuvwxyz")] - i = ignore_case - smapdict = {getattr(k, case_e)() if i in ["both", "encode"] else k: \ - ([getattr(x, case_d)() for x in v] if isinstance(v, list) else getattr(v, case_d)()) \ - if i in ["both", "decode"] else v for k, v in smapdict.items()} - if decode: - tmp = {} - # this has a meaning for encoding maps that could have clashes in encoded chars (e.g. 
Bacon's cipher ; - # I => abaaa but also J => abaaa, with the following, we keep I instead of letting J overwrite it) - for k, v in sorted(smapdict.items()): - if not isinstance(v, list): - v = [v] - for x in v: - if x not in tmp.keys(): - tmp[x] = k - smapdict, cs = tmp, reduce(lambda acc, x: acc + x, tmp.keys()) - kwargs['strip_lines'], kwargs['strip_crlf'] = "\n" not in set(cs), "\r\n" not in cs - # this allows to avoid an error with Python2 in the "for i, c in enumerate(parts)" loop - if '' not in smapdict.keys(): - smapdict[''] = "" - # determine token and result lengths - tmaxlen = max(map(len, smapdict.keys())) - tminlen = max(1, min(map(len, set(smapdict.keys()) - {''}))) - l = [] - for x in smapdict.values(): - getattr(l, ["append", "extend"][isinstance(x, list)])(x) - rminlen = max(1, min(map(len, set(l) - {''}))) - - # generic encoding/decoding function for map encodings - def code(text, errors="strict"): - icase = ignore_case == "both" or \ - decode and ignore_case == "decode" or \ - not decode and ignore_case == "encode" - if icase: - case = case_d if decode else case_e - if no_error: - errors = "leave" - text = ensure_str(text) - if not decode: - if intype == "bin": - text = "".join("{:0>8}".format(bin(ord(c))[2:]) for c in text) - elif intype == "ord": - text = "".join(str(ord(c)).zfill(3) for c in text) - r = "" - lsep = "" if decode else sep if len(sep) <= 1 else sep[0] - kind = ["character", "token"][tmaxlen > 1] - error_func = handle_error(ename, errors, lsep, repl_char, rminlen, decode, kind) - - # get the value from the mapping dictionary, trying the token with its inverted case if relevant - def __get_value(token, position, case_changed=False): - try: - result = smapdict[token] - except KeyError: - if icase and not case_changed: - token_inv_case = getattr(token, case)() - return __get_value(token_inv_case, position, True) - return error_func(token, position) - if isinstance(result, list): - result = result[0] - return result + lsep - - # if a 
separator is defined, rely on it by splitting the input text - if decode and len(sep) > 0: - for i, c in enumerate(re.split("[" + sep + "]", text)): - r += __get_value(c, i) - # otherwise, move through the text using a cursor for tokenizing it ; this allows defining more complex - # encodings with variable token lengths - else: - cursor, bad = 0, "" - while cursor < len(text): - token = text[cursor:cursor+1] - for l in range(tminlen, tmaxlen + 1): - token = text[cursor:cursor+l] - if token in smapdict.keys() or icase and getattr(token, case)() in smapdict.keys(): - r += __get_value(token, cursor) - cursor += l - break - else: - # collect bad chars and only move the cursor one char to the right - bad += text[cursor] - cursor += 1 - # if the number of bad chars is the minimum token length, consume it and start a new buffer - if len(bad) == tminlen or errors == "leave": - posn = cursor - len(bad) - r += error_func(bad, posn) - bad = "" - if decode: - if outype in ["bin", "ord"]: - tmp, r = "", r.replace(lsep, "") - step = [3, 8][outype == "bin"] - for i in range(0, len(r), step): - s = r[i:i+step] - try: - tmp += chr(int(s, 2) if outype == "bin" else int(s)) - except ValueError: - if len(s) > 0: - tmp += "[" + s + "]" - r = tmp + lsep - return r[:len(r)-len(lsep)], len(b(text)) - return code - if re.search(r"\([^(?:)]", kwargs.get('pattern', "")) is None: - # in this case, there is no capturing group for parametrization - return _wrapper(None) - return _wrapper - - glob = currentframe().f_back.f_globals - kwargs['category'] = glob['__file__'].split(os.path.sep)[-2].rstrip("s") - kwargs['examples'] = kwargs.get('examples', glob.get('__examples__')) - kwargs['encmap'] = encmap - kwargs['repl_char'] = repl_char - kwargs['sep'] = sep - kwargs['ignore_case'] = ignore_case - kwargs['no_error'] = no_error - kwargs['intype'] = intype - kwargs['outype'] = outype - kwargs['module'] = glob.get('__name__') - try: - if isinstance(encmap, dict): - smapdict = {k: v for k, v in 
encmap.items()} - elif isinstance(encmap, list) and isinstance(encmap[0], dict): - smapdict = {k: v for k, v in encmap[0].items()} - kwargs['repl_minlen'] = i = max(1, min(map(len, set(smapdict.values()) - {''}))) - kwargs['repl_minlen_b'] = max(1, min(map(len, map(b, set(smapdict.values()) - {''})))) - except: - pass - return add(ename, __generic_code(), __generic_code(True), **kwargs) -codecs.add_map = add_map - - -def clear(): - """ Clear codext's local registry of search functions. """ - global __codecs_registry, MACROS, PERS_MACROS - __codecs_registry, MACROS, PERS_MACROS = [], {}, {} -codecs.clear = clear - - -def examples(encoding, number=10): - """ Use the search function to get the matching encodings and provide examples of valid encoding names. """ - e = [] - for name in search(encoding): - for search_function in __codecs_registry: - n = search_function.__name__ - if name in [n, n.replace("_", "-")]: - temp = [] - for s in generate_strings_from_regex(search_function.__pattern__, yield_max=16*number): - temp.append(s) - random.shuffle(temp) - i = 0 - while i < min(number, len(temp)): - if not temp[i].isdigit(): - try: - lookup(temp[i], False) - e.append(temp[i]) - except LookupError: - pass - i += 1 - for alias, codec in ALIASES.items(): - if name == codec: - if codec not in e: - e.append(codec) - if not alias.isdigit(): - e.append(alias) - random.shuffle(e) - return sorted([e[i] for i in range(min(number, len(e)))], key=_human_keys) -codecs.examples = examples - - -def is_native(encoding): - """ Determine if a given encoding is native or not. """ - return lookup(encoding, False).parameters['category'] == "native" - - -def list_categories(): - """ Get a list of all codec categories. 
""" - c = CODECS_CATEGORIES - root = os.path.dirname(__file__) - for d in os.listdir(root): - if os.path.isdir(os.path.join(root, d)) and not d.startswith("__"): - c.append(d.rstrip("s")) - # particular category, hardcoded from base/_base.py - c += ["base-generic"] - return c -list_categories() - - -def list_encodings(*categories): - """ Get a list of all codecs. """ - # if "non-native" is in the input list, extend the list with the whole categories but "native" - categories, exclude = list(categories), [] - for c in categories[:]: - if c == "non-native": - for c in CODECS_CATEGORIES: - if c == "native" or c in categories: - continue - categories.append(c) - categories.remove("non-native") - if c.startswith("~"): - exclude.append(c[1:]) - categories.remove(c) - try: - categories.remove(c[1:]) - except ValueError: - pass - # now, filter codecs according to the input list of categories - enc = [] - if (len(categories) == 0 or "native" in categories) and "native" not in exclude: - for a in set(ALIASES.values()): - try: - ci = __orig_lookup(a) - except LookupError: - continue - if lookup(a) is ci: - enc.append(ci.name) - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - name = search_function.__name__.replace("_", "-") - p = search_function.__pattern__ - ci = search_function(name) if p is None else search_function(generate_string_from_regex(p)) - c = "other" if ci is None else ci.parameters['category'] - if (len(categories) == 0 or c in categories) and c not in exclude: - enc.append(name) - for category in categories: - if category not in CODECS_CATEGORIES: - raise ValueError("Category '%s' does not exist" % category) - return sorted(list(set(enc)), key=_human_keys) - - -def list_macros(): - """ Get a list of all macros, with the precedence on personal ones. 
""" - return sorted(list(set(list(MACROS.keys()) + list(PERS_MACROS.keys())))) - - -def remove(name): - """ Remove all search functions matching the input encoding name from codext's local registry or any macro with the - given name. """ - global __codecs_registry, MACROS, PERS_MACROS - tbr = [] - for search_function in __codecs_registry: - if search_function(name) is not None: - tbr.append(search_function) - for search_function in tbr: - __codecs_registry.remove(search_function) - try: - del MACROS[name] - except KeyError: - pass - try: - del PERS_MACROS[name] - with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f, indent=2) - except KeyError: - pass - try: - del CODECS_CACHE[name] - except KeyError: - pass - for s in ["En", "De"]: - try: - delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) - except AttributeError: - pass -codecs.remove = remove - - -def reset(): - """ Reset codext's local registry of search functions and macros. """ - global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS - clear() - d = os.path.dirname(__file__) - for pkg in sorted(os.listdir(d)): - if pkg.startswith("_") or not os.path.isdir(os.path.join(d, pkg)): - continue - reload(import_module("codext." + pkg)) - # backup codext's registry - if CODECS_REGISTRY is None: - CODECS_REGISTRY = __codecs_registry[:] - # restore codext's registry - else: - __codecs_registry = CODECS_REGISTRY[:] - # restore codext's embedded set of macros - with open(os.path.join(os.path.dirname(__file__), "macros.json")) as f: - MACROS = json.load(f) - # reload personal set of macros - PERS_MACROS = {} - if os.path.exists(PERS_MACROS_FILE): - with open(PERS_MACROS_FILE) as f: - PERS_MACROS = json.load(f) -codecs.reset = reset - - -# conversion functions -def b(s): - """ Non-crashing bytes conversion function. 
""" - if PY3: - try: - return s.encode("latin-1") - except: - pass - try: - return s.encode("utf-8") - except: - pass - return s - - -def ensure_str(s, encoding='utf-8', errors='strict'): - """ Similar to six.ensure_str. Adapted here to avoid messing up with six version errors. """ - if not PY3 and isinstance(s, text_type): - return s.encode(encoding, errors) - elif PY3 and isinstance(s, binary_type): - try: - return s.decode(encoding, errors) - except: - return s.decode("latin-1") - return s - - -# make conversion functions compatible with input/output strings/bytes -def fix_inout_formats(f): - """ This decorator ensures that the first output of f will have the same text format as the first input (str or - bytes). """ - @wraps(f) - def _wrapper(*args, **kwargs): - a0 = args[0] - a0_isb = isb(a0) - a0 = ensure_str(a0) if iss(a0) or a0_isb else a0 - r = f(a0, *args[1:], **kwargs) - # special case: input is in bytes ; ensure that the returned length is this of the bytes, not this processed by - # the decode/encode function - if isinstance(r, (tuple, list)) and isinstance(r[1], int) and a0_isb: - r = tuple([list(r)[0]] + [len(args[0])] + list(r)[2:]) - return (fix(r[0], args[0]), ) + r[1:] if isinstance(r, (tuple, list)) else fix(r, args[0]) - return _wrapper - - -# alphabet generation function from a given mask -def get_alphabet_from_mask(mask): - """ This function generates an alphabet from the given mask. The style used is similar to Hashcat ; group keys are - marked with a heading "?". """ - i, alphabet = 0, "" - while i < len(mask): - c = mask[i] - if c == "?" 
and i < len(mask) - 1 and mask[i+1] in MASKS.keys(): - for c in MASKS[mask[i+1]]: - if c not in alphabet: - alphabet += c - i += 1 - elif c not in alphabet: - alphabet += c - i += 1 - return alphabet - - -# generic error handling function -def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=False, kind="character", item="position"): - """ This shortcut function allows to handle error modes given some tuning parameters. - - :param ename: encoding name - :param errors: error handling mode - :param sep: token separator - :param repl_char: replacement character (for use when errors="replace") - :param repl_minlen: repeat number for the replacement character - :param decode: whether we are encoding or decoding - :param item: position item description (for describing the error ; e.g. "group" or "token") - """ - exc = "%s%scodeError" % (exc_name(ename), ["En", "De"][decode]) - - def _handle_error(token, position, output="", eename=None): - """ This handles an encoding/decoding error according to the selected handling mode. - - :param token: input token to be encoded/decoded - :param position: token position index - :param output: output, as decoded up to the position of the error - """ - if errors == "strict": - msg = "'%s' codec can't %scode %s '%s' in %s %d" - token = ensure_str(token) - token = token[:7] + "..." 
if len(token) > 10 else token - err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position)) - err.output = output - err.__cause__ = err - raise err - elif errors == "leave": - return token + sep - elif errors == "replace": - return repl_char * repl_minlen + sep - elif errors == "ignore": - return "" - else: - raise ValueError("Unsupported error handling '{}'".format(errors)) - return _handle_error - - -# codecs module hooks -__orig_lookup = _codecs.lookup -__orig_register = _codecs.register - - -def __add(ename, encode=None, decode=None, pattern=None, text=True, **kwargs): - kwargs.pop('add_to_codecs', None) - return add(ename, encode, decode, pattern, text, True, **kwargs) -__add.__doc__ = add.__doc__ -codecs.add = __add - - -def decode(obj, encoding='utf-8', errors='strict'): - """ Custom decode function relying on the hooked lookup function. """ - return lookup(encoding).decode(obj, errors)[0] -codecs.decode = decode - - -def encode(obj, encoding='utf-8', errors='strict'): - """ Custom encode function relying on the hooked lookup function. """ - n, m = 1, re.search(r"\[(\d+)\]$", encoding) - if m: - n = int(m.group(1)) - encoding = re.sub(r"\[(\d+)\]$", "", encoding) - ci = lookup(encoding) - for i in range(n): - obj = ci.encode(obj, errors)[0] - return obj -codecs.encode = encode - - -def lookup(encoding, macro=True): - """ Hooked lookup function for searching first for codecs in the local registry of this module. 
""" - # first, try to match the given encoding with codecs' search functions - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - codecinfo = search_function(encoding) - if codecinfo is not None: - return codecinfo - # then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - if search_function.__name__.replace("_", "-") == encoding or \ - encoding in getattr(search_function, "__aliases__", []): - codecinfo = search_function(generate_string_from_regex(search_function.__pattern__)) - if codecinfo is not None: - return codecinfo - # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters - try: - ci = __orig_lookup(encoding) - ci.parameters = {'category': "native", 'module': "codecs", 'name': ALIASES.get(ci.name, ci.name)} - return ci - except LookupError: - if not macro: - raise - try: - return CodecMacro(encoding) - except LookupError: - e = LookupError("unknown encoding: %s" % encoding) - e.__cause__ = e # stop exception chaining - raise e -codecs.lookup = lookup - - -def register(search_function, add_to_codecs=False): - """ Register function for registering new codecs in the local registry of this module and, if required, in the - native codecs registry (for use with the built-in 'open' function). - - :param search_function: search function for the codecs registry - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) 
but will make it impossible - to remove the codec later - """ - if search_function not in __codecs_registry: - try: - __orig_lookup(search_function.__name__) - l = CODECS_OVERWRITTEN - except LookupError: - l = __codecs_registry - l.append(search_function) - if add_to_codecs: - __orig_register(search_function) - - -def __register(search_function): - """ Same as register(...), but with add_to_codecs set by default to True. """ - register(search_function, True) -codecs.register = __register - - -def search(encoding_regex, extended=True): - """ Function similar to lookup but allows to search for an encoding based on a regex instead. It searches this way - into the local registry but also tries a simple lookup with the original lookup function. """ - matches = [] - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - n = search_function.__name__ - for name in [n, n.replace("_", "-")]: - if re.search(encoding_regex, name): - matches.append(n.replace("_", "-")) - continue - if extended: - # in some cases, encoding_regex can match a generated string that uses a particular portion of its - # generating pattern ; e.g. 
we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also - # find "morse" or "atbash" very rarely because of their dynamic patterns and the limited number of randomly - # generated strings - # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of - # matches ; executing 5 times the string generation for a given codec but adding the codec to the list of - # matches only if we get at least 3 matches ensures that we consider up to 2 failures that could be - # stochastic, therefore drastically decreasing the probability to get a "junk" encoding in the matches list - c = 0 - for i in range(5): - for s in generate_strings_from_regex(search_function.__pattern__): - if re.search(encoding_regex, s): - c += 1 - break - if c >= 3: - matches.append(n) - break - for s, n in ALIASES.items(): - if re.search(encoding_regex, s) or re.search(encoding_regex, n): - matches.append(n) - return sorted(list(set(matches)), key=_human_keys) -codecs.search = search - - -# utility function for the search feature -CATEGORIES = { - 'digit': digits, - 'not_digit': reduce(lambda x, c: x.replace(c, ""), digits, printable), - 'space': whitespace, - 'not_space': reduce(lambda x, c: x.replace(c, ""), whitespace, printable), - 'word': ascii_letters + digits + '_', - 'not_word': reduce(lambda x, c: x.replace(c, ""), ascii_letters + digits + '_', printable), -} -REPEAT_MAX = 10 -STAR_PLUS_MAX = 10 -YIELD_MAX = 100 - - -def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False): - """ Recursive function to generate strings from a regex pattern. 
""" - if regex is None: - return - __groups = {} - tokens = [] - negate, last_rand = False, None - for state in (regex if parsed else re.sre_parse.parse(b(getattr(regex, "pattern", regex)))): - code = getattr(state[0], "name", state[0]).lower() - value = getattr(state[1], "name", state[1]) - value = value.lower() if isinstance(value, str) else value - if code in ["assert_not", "at"]: - continue - elif code == "any": - charset = list(printable.replace("\n", "")) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) # should be ord(x) with x belongs to [0, 256[ - elif code == "assert": - tokens.append(list(__gen_str_from_re(value[1], star_plus_max, repeat_max, yield_max, True))) - elif code == "branch": - result = [] - for r in value[1]: - result += list(__gen_str_from_re(r, star_plus_max, repeat_max, yield_max, True)) or [""] - tokens.append(result) - elif code == "category": - charset = list(CATEGORIES[value[9:]]) - if negate: - negate = False - charset = list(set(printable).difference(charset)) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) - elif code == "groupref": - tokens.extend(__groups[value]) - elif code == "in": - subtokens = list(__gen_str_from_re(value, star_plus_max, repeat_max, yield_max, True)) - subtokens = [x for l in subtokens for x in l] - tokens.append(subtokens) - elif code == "literal": - tokens.append(chr(value)) - elif code in ["max_repeat", "min_repeat"]: - start, end = value[:2] - end = min(end, star_plus_max) - start = min(start, end) - charset = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) - subtokens = [] - if start == 0 and end == 1: - subtokens.append("") - subtokens.extend(charset) - elif len(charset) ** end > repeat_max: - for i in range(min(repeat_max, 10 * len(charset))): - n = random.randint(start, end + 1) - token = "" if n == 0 else 
"".join(random.choice(charset) for i in range(n)) - if token not in subtokens: - subtokens.append(token) - else: - i -= 1 - else: - for n in range(start, end + 1): - for c in product(charset, repeat=n): - subtokens.append("".join(c)) - tokens.append(subtokens) - elif code == "negate": - negate = True - elif code == "not_literal": - charset = list(printable.replace(chr(value), "")) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) - elif code == "range": - tokens.append("".join(chr(i) for i in range(value[0], value[1] + 1))) - elif code == "subpattern": - result = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) - if value[0]: - __groups[value[0]] = result - tokens.append(result) - else: - raise NotImplementedError("Unhandled code '{}'".format(code)) - if len(tokens) == 0: - tokens = [""] - i = 0 - for result in product(*tokens): - yield "".join(result) - i += 1 - if i >= yield_max: - break - - -def _human_keys(text): - """ Sorting function for considering strings with numbers (e.g. base2, base10, base100) """ - tokens = [] - for s in re.split(r"(\d+|\D+)", text): - tokens.append(int(s) if s.isdigit() else s) - return tokens - - -def generate_string_from_regex(regex): - """ Utility function to generate a single string from a regex pattern. """ - if regex: - return list(generate_strings_from_regex(regex, yield_max=1))[0] - - -def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=REPEAT_MAX, yield_max=YIELD_MAX): - """ Utility function to generate strings from a regex pattern. 
""" - i = 0 - for result in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max): - yield result - - -# guess feature objects -__module_exists = lambda n: n in [x[1] for x in iter_modules()] -stopfunc = ModuleType("stopfunc", """ - Predefined stop functions - ~~~~~~~~~~~~~~~~~~~~~~~~~ - - This submodule contains stop functions for the guess feature of codext. - - - `flag`: searches for the pattern "[Ff][Ll1][Aa4@][Gg9]" (either UTF-8 or UTF-16) - - `lang_**`: checks if the given lang (any from the PROFILES_DIRECTORY of the langdetect module) is detected - - `printables`: checks that every output character is in the set of printables - - `regex`: takes one argument, the regular expression, for checking a string against the given pattern - - `text`: checks for printables and an entropy less than 4.6 (empirically determined) -""") -stopfunc.printables = lambda s: all(c in printable for c in ensure_str(s)) -stopfunc.printables.__name__ = stopfunc.printables.__qualname__ = "printables" -stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None -stopfunc.regex.__name__ = stopfunc.regex.__qualname__ = "regex" -stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6 -stopfunc.text.__name__ = stopfunc.text.__qualname__ = "text" -stopfunc.flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", ensure_str(x)) is not None -stopfunc.flag.__name__ = stopfunc.flag.__qualname__ = "flag" -stopfunc.default = stopfunc.text - -stopfunc.LANG_BACKEND = None -stopfunc.LANG_BACKENDS = [n for n in ["pycld2", "langdetect", "langid", "cld3", "textblob"] if __module_exists(n)] -if len(stopfunc.LANG_BACKENDS) > 0: - stopfunc.LANG_BACKEND = stopfunc.LANG_BACKENDS[0] -if "cld3" in stopfunc.LANG_BACKENDS: - stopfunc.CLD3_LANGUAGES = "af|am|ar|bg|bn|bs|ca|ce|co|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|fy|ga|gd|gl|gu|ha|" \ - "hi|hm|hr|ht|hu|hy|id|ig|is|it|iw|ja|jv|ka|kk|km|kn|ko|ku|ky|la|lb|lo|lt|lv|mg|mi|mk|" \ - 
"ml|mn|mr|ms|mt|my|ne|nl|no|ny|pa|pl|ps|pt|ro|ru|sd|si|sk|sl|sm|sn|so|sq|sr|st|su|sv|" \ - "sw|ta|te|tg|th|tr|uk|ur|uz|vi|xh|yi|yo|zh|zu".split("|") -if "textblob" in stopfunc.LANG_BACKENDS: - stopfunc.TEXTBLOB_LANGUAGES = "af|ar|az|be|bg|bn|ca|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|ga|gl|gu|hi|hr|ht|hu|" \ - "id|is|it|iw|ja|ka|kn|ko|la|lt|lv|mk|ms|mt|nl|no|pl|pt|ro|ru|sk|sl|sq|sr|sv|sw|ta|" \ - "te|th|tl|tr|uk|ur|vi|yi|zh".split("|") - - -def _detect(text): - _lb, t = stopfunc.LANG_BACKEND, ensure_str(text) - if _lb is None: - raise ValueError("No language backend %s" % ["selected", "installed"][len(stopfunc.LANG_BACKENDS) == 0]) - return langid.classify(t)[0] if _lb == "langid" else \ - langdetect.detect(t) if _lb == "langdetect" else \ - pycld2.detect(t)[2][0][1] if _lb == "pycld2" else \ - cld3.get_language(t).language[:2] if _lb == "cld3" else \ - textblob.TextBlob(t).detect_language()[:2] - - -def _lang(lang): - def _test(s): - if not stopfunc.text(s): - return False - try: - return _detect(ensure_str(s))[:2] == lang - except: - return False - return _test - - -def _load_lang_backend(backend=None): - # import the requested backend library if not imported yet - if backend is None or backend in stopfunc.LANG_BACKENDS: - stopfunc.LANG_BACKEND = backend - if backend: - globals()[backend] = __import__(backend) - else: - raise ValueError("Unsupported language detection backend") - # remove language-related stop functions - for attr in dir(stopfunc): - if attr.startswith("_") or not isinstance(getattr(stopfunc, attr), FunctionType): - continue - if re.match(r"lang_[a-z]{2}$", attr): - delattr(stopfunc, attr) - # rebind applicable language-related stop functions - if stopfunc.LANG_BACKEND: - _lb = stopfunc.LANG_BACKEND - if _lb == "langid": - langid.langid.load_model() - for lang in ( - langid.langid.identifier.nb_classes if _lb == "langid" else \ - list(set(p[:2] for p in os.listdir(langdetect.PROFILES_DIRECTORY))) if _lb == "langdetect" else \ - list(set(x[1][:2] 
for x in pycld2.LANGUAGES if x[0] in pycld2.DETECTED_LANGUAGES)) if _lb == "pycld2" else \ - stopfunc.CLD3_LANGUAGES if _lb == "cld3" else \ - stopfunc.TEXTBLOB_LANGUAGES if _lb == "textblob" else \ - []): - n = "lang_%s" % lang - setattr(stopfunc, n, _lang(lang)) - getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n - if LANG: - flng = "lang_%s" % LANG - if getattr(stopfunc, flng, None): - stopfunc.default = getattr(stopfunc, flng) -stopfunc._reload_lang = _load_lang_backend - - -def _validate(stop_function, lang_backend="none"): - s, lb = stop_function, lang_backend - if isinstance(s, string_types): - if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ - all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): - stopfunc._reload_lang(lb) - f = getattr(stopfunc, s, None) - if f: - return f - elif not isinstance(s, FunctionType): - raise ValueError("Bad stop function") - return s -stopfunc._validate = _validate - - -def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(), - stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): - """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. 
""" - if depth > min_depth and stop_func(input): - if not stop and (show or debug) and found not in result: - s = repr(input) - s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s - s = "[+] %s: %s" % (", ".join(found), s) - print(s if len(s) <= 80 else s[:77] + "...") - result[found] = input - if depth >= max_depth or len(result) > 0 and stop: - return - prev_enc = found[-1] if len(found) > 0 else "" - e = encodings.get(depth, encodings.get(-1, [])) - for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended): - if len(result) > 0 and stop: - return - if debug: - print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) - __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), - stop, show, scoring_heuristic, extended, debug) - - -def __make_encodings_dict(include, exclude): - """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible - encoding names. It also creates a cache with the CodecInfo objects for improving performance. """ - def _develop(d, keep=True): - d = d or {} - for k, v in d.items(): - l, cc = [], [e for e in v if e in CODECS_CATEGORIES] - # list from in-scope categories and then everything that is not a category - for enc in ((list_encodings(*cc) if len(cc) > 0 or keep else []) + \ - [e for e in v if e not in CODECS_CATEGORIES]): - g = [] - for e in (search(enc, False) or [enc]): - try: - ci = lookup(e, False) - g.extend(ci.parameters['guess']) - except: - pass - if enc in g: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected - l.append(enc) - else: # e.g. "rot" => ["rot-1", "rot-2", ...] 
; all the "rot-N" shall be selected - l.extend(g) - d[k] = list(set(l)) - return d - exclude = _develop(exclude, False) - return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()} - - -def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): - """ Filter valid encodings and rank them by relevance. """ - ranking = {} - for e in encodings: - try: - codec = CODECS_CACHE[e] - except KeyError: - CODECS_CACHE[e] = codec = lookup(e, False) - t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) - if t: - ranking[e] = t - for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])): - yield result if yield_score else result[1], encoding - - -class _Text(object): - __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] - - def __init__(self, text, pad_char=None): - self.text = ensure_str(text) - c = self.text[-1] - pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) - self.padding = pad_char is not None and last_char == pad_char - if self.padding: - text = text.rstrip(pad_char) - self.len = len(self.text) - self.lcharset = len(set(self.text)) - self.printables = float(len([c for c in self.text if c in printable])) / self.len - self.entropy = entropy(self.text) - - -def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): - """ Score relevant encodings given an input. 
""" - obj = None - sc = codec.parameters.get('scoring', {}) - no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) - # ignore encodings that fail to decode with their default errors handling value - try: - new_input = codec.decode(input)[0] - except: - return - # ignore encodings that give an output identical to the input (identity transformation) or to the previous input - if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): - return - # ignore encodings that transitively give the same output (identity transformation by chaining twice a same - # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) - if transitive and prev_encoding: - ci_prev = lookup(prev_encoding, False) - if ci_prev.parameters['name'] == codec.parameters['name']: - return - # compute input's characteristics only once and only if the control flow reaches this point - pad = sc.get('padding_char') - if obj is None: - obj = _Text(input, pad) - if heuristic: - # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. 
multiple base - # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates - s = -sc.get('penalty', .0) - # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; - # on the contrary, if the length of input text's charset is strictly greater, give a penalty - lcs = sc.get('len_charset', 256) - if isinstance(lcs, type(lambda: None)): - lcs = int(lcs(encoding)) - if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: - s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) - elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: - s -= .2 # this can occur for encodings with no_error set to True - # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, - # or a penalty when it should not be encountered but it is present - if pad and obj.padding: - s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus - elif not pad and obj.padding: - s -= .1 # it could arise a padding character is encountered while not being padding => small penalty - # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when - # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) - if not no_error: - pr = sc.get('printables_rate', 0) - if isinstance(pr, type(lambda: None)): - pr = float(pr(obj.printables)) - if obj.printables - pr <= .05: - s += .1 - expf = sc.get('expansion_factor', 1.) 
- if expf: - f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f - if isinstance(expf, type(lambda: None)): - try: # this case allows to consider the current encoding name from the current codec - expf = expf(f, encoding) - except TypeError: - expf = expf(f) - if isinstance(expf, (int, float)): - tmp = expf - expf = (1/f - .1 <= 1/expf <= 1/f + .1) - elif isinstance(expf, (tuple, list)) and len(expf) == 2: - expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] - s += [-1., .1][expf] - # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the - # number of input characters to take bad entropies of shorter strings into account - entr = sc.get('entropy', lambda e: e) - entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr - if isinstance(entr, type(lambda: None)): - try: # this case allows to consider the current encoding name from the current codec - entr = entr(obj.entropy, encoding) - except TypeError: - entr = entr(obj.entropy) - if entr is not None: - # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) - d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy) - if d_entr <= .5: - s += .5 - d_entr - # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) - bonus = sc.get('bonus_func') - if bonus is not None: - if isinstance(bonus, type(lambda: None)): - bonus = bonus(obj, codec, encoding) - if bonus: - s += .2 - else: - s = 1. - # exclude negative (and eventually null) scores as they are (hopefully) not relevant - if extended and s >= .0 or not extended and s > .0: - return s, new_input - - -def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(), - stop=True, show=False, scoring_heuristic=True, extended=False, debug=False): - """ Try decoding without the knowledge of the encoding(s). 
- - :param input: input text to be guessed - :param stop_func: function defining the stop condition - :param min_depth: minimum search depth - :param max_depth: maximum search depth - ;param include: inclusion item OR list with category, codec or encoding names OR dictionary with lists per - depth (nothing means include every encoding) - :param exclude: exclusion item OR list with category, codec or encoding names OR dictionary with lists per - depth (nothing means exclude no encoding) - :param found: tuple of already found encodings - :param stop: whether to stop or not when a valid solution is found - :param show: whether to immediately show once a solution is found - :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1., - meaning that every non-failing encoding will be considered with no order of precedence) - :param extended: whether to also consider null scores with the heuristic - :param debug: whether to show each attempt at each depth during computation - """ - if len(input) == 0: - return "" - # check for min and max depths - if max_depth <= 0: - raise ValueError("Depth must be a non-null positive integer") - if min_depth > max_depth: - raise ValueError("Min depth shall be less than or equal to the max depth") - # take the tuple of found encodings into account - if len(found) > 0: - for encoding in found: - input = decode(input, encoding) - # handle the stop function as a regex if a string was given - if isinstance(stop_func, string_types): - stop_func = stopfunc.regex(stop_func) - # reformat include and exclude arguments ; supported formats: - for n, l in zip(["inc", "exc"], [include, exclude]): - if l is None: - if n == "inc": - include = l = {-1: CODECS_CATEGORIES} - else: - exclude = l = {} - # "category" OR "enc_name" OR whatever => means a single item for all depths - if isinstance(l, string_types): - if n == "inc": - include = l = {-1: [l]} - else: - exclude = l = {-1: [l]} - # 
["enc_name1", "enc_name2", ...] => means for all depths - if isinstance(l, (list, tuple)): - if n == "inc": - include = l = {-1: l} - else: - exclude = l = {-1: l} - # {-1: [...], 2: [...], ...} => means prefedined depths with their lists of in-/excluded encodings - if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): - raise ValueError("Include argument shall be a list or a dictionary with integer keys") - # precompute encodings lists per depth and cache the related CodecInfo objects - encodings, result = __make_encodings_dict(include, exclude), {} - try: - # breadth-first search - for d in range(max_depth): - __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show, - scoring_heuristic, extended, debug) - if stop and len(result) > 0: - break - except KeyboardInterrupt: - pass - CODECS_CACHE = {} - return result -codecs.guess = guess - - -def rank(input, extended=False, limit=-1, include=None, exclude=None): - """ Rank the most probable encodings based on the given input. - - :param input: input text to be evaluated - :param extended: whether to consider null scores too (NB: negative scores are not output !) 
- :param limit: number of encodings to be returned (-1 means all of them) - :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) - :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) - """ - encodings = __make_encodings_dict({-1: include or CODECS_CATEGORIES}, {-1: exclude or []}) - r = list(__rank(None, input, "", encodings[-1], True, extended, True)) - return r[:limit] if len(r) > 1 else r -codecs.rank = rank - +# -*- coding: UTF-8 -*- +import _codecs +import builtins +import codecs +import hashlib +import json +import os +import random +import re +import sys +from encodings.aliases import aliases as ALIASES +from functools import reduce, update_wrapper, wraps +from importlib import import_module +from inspect import currentframe +from io import BytesIO +from itertools import chain, product +from locale import getlocale +from math import log +from pkgutil import iter_modules +from platform import system +from random import randint +from string import * +from types import FunctionType, ModuleType +try: + import re._parser as sre_parse +except ImportError: + import sre_parse + +# from Python 3.11, 'sre_parse' is bound as '_parser' ; monkey-patch it for backward-compatibility +re.sre_parse = sre_parse + +maketrans = str.maketrans + + +__all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", + "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "hashlib", "i2s", + "is_native", "list_categories", "list_encodings", "list_macros", "lookup", "maketrans", "os", "rank", "re", + "register", "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", + "DARWIN", "LANG", "LINUX", "MASKS", "UNIX", "WINDOWS"] +CODECS_REGISTRY = None +CODECS_OVERWRITTEN = [] +CODECS_CATEGORIES = ["native", "custom"] +CODECS_CACHE = {} +LANG = getlocale() +if LANG: 
+ LANG = (LANG[0] or "")[:2].lower() +MASKS = { + 'a': printable, + 'b': "".join(chr(i) for i in range(256)), + 'd': digits, + 'h': digits + "abcdef", + 'H': digits + "ABCDEF", + 'l': ascii_lowercase, + 'p': punctuation, + 's': " ", + 'u': ascii_uppercase, +} + +__codecs_registry = [] + +MACROS = {} +PERS_MACROS = {} +PERS_MACROS_FILE = os.path.expanduser("~/.codext-macros.json") + +DARWIN = system() == "Darwin" +LINUX = system() == "Linux" +UNIX = DARWIN or LINUX +WINDOWS = system() == "Windows" + +entropy = lambda s: -sum([p * log(p, 2) for p in [float(s.count(c)) / len(s) for c in set(s)]]) + +isb = lambda s: isinstance(s, bytes) +iss = lambda s: isinstance(s, str) +fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x + +s2i = lambda s: int(codecs.encode(s, "base16"), 16) +exc_name = lambda e: "".join(t.capitalize() for t in re.split(r"[-_+]", e)) + + +def i2s(input): + h = hex(input)[2:].rstrip("eL") + return codecs.decode(h.zfill(len(h) + len(h) % 2), "hex") + + +class CodecMacro(tuple): + """Macro details when looking up the codec registry. 
""" + def __new__(cls, name): + self = tuple.__new__(cls) + self.name = name + # get from personal macros first + try: + self.codecs = PERS_MACROS[name] + except KeyError: + try: + self.codecs = MACROS[name] + except KeyError: + raise LookupError(f"unknown macro: {name}") + if not isinstance(self.codecs, (tuple, list)): + raise ValueError(f"bad macro list: {self.codecs}") + self.codecs = [lookup(e, False) for e in self.codecs] # lookup(e, False) + self.parameters = {'name': name, 'category': "macro"} # ^ means that macros won't be nestable + # test examples to check that the chain of encodings works + for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items(): + if re.match(r"enc(-dec)?\(", action): + for e in (examples.keys() if action.startswith("enc(") else examples or []): + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + if rd: + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + self.encode(s.lower() if rd.group(1) else s) + continue + self.encode(e) + + class Codec: + decode = self.decode + encode = self.encode + + class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return b(self.encode(input, self.errors)[0]) + self.incrementalencoder = IncrementalEncoder + + class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + return ensure_str(self.decode(input, self.errors)[0]) + self.incrementaldecoder = IncrementalDecoder + + class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + self.streamwriter = StreamWriter + + class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + self.streamreader = StreamReader + + return self + + def decode(self, input, error="strict"): + """ Decode with each codec in reverse order. 
""" + for ci in self.codecs[::-1]: + input, l = ci.decode(input, error) + return input, l + + def encode(self, input, error="strict"): + """ Encode with each codec. """ + for ci in self.codecs: + input, l = ci.encode(input, error) + return input, l + + def __repr__(self): + return f"" + + +# inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python +class Repr(object): + def __init__(self, name, func): + self.__name = name + self.__func = func + update_wrapper(self, func) + + def __call__(self, *args, **kwargs): + return self.__func(*args, **kwargs) + + def __repr__(self): + return f"" + + +def __stdin_pipe(): + """ Stdin pipe read function. """ + try: + with open(0, 'rb') as f: + for l in f: + yield l + except TypeError: + for l in sys.stdin: + yield l + + +def _input(infile): + # handle input file or stdin + c = b("") + if infile: + with open(infile, 'rb') as f: + c = f.read() + else: + for line in __stdin_pipe(): + c += line + return c + + +def _set_exc(name, etype="ValueError"): + if not hasattr(builtins, name): + ns = {} + exec(f"class {name}({etype}): __module__ = 'builtins'", {}, ns) + setattr(builtins, name, ns[name]) +_set_exc("InputSizeLimitError") +_set_exc("ParameterError") + + +def _stripl(s, st_lines, st_crlf): + if st_crlf: + s = s.replace(b"\r\n", b"") if isb(s) else s.replace("\r\n", "") + if st_lines: + s = s.replace(b"\n", b"") if isb(s) else s.replace("\n", "") + return s + + +def _with_repr(name): + def _wrapper(f): + return Repr(name, f) + return _wrapper + + +def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False, **kwargs): + """ This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically + naming the encoding with a pattern and with file handling. 
+ + :param ename: encoding name + :param encode: encoding function or None + :param decode: decoding function or None + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + """ + remove(ename) + if encode: + if not isinstance(encode, FunctionType): + raise ValueError("Bad 'encode' function") + _set_exc(f"{exc_name(ename)}EncodeError") # create the custom encode exception as a builtin + if decode: + if not isinstance(decode, FunctionType): + raise ValueError("Bad 'decode' function") + _set_exc(f"{exc_name(ename)}DecodeError") # create the custom decode exception as a builtin + if not encode and not decode: + raise ValueError("At least one en/decoding function must be defined") + for exc in kwargs.get('extra_exceptions', []): + _set_exc(exc) # create additional custom exceptions as builtins + glob = currentframe().f_back.f_globals + # search function for the new encoding + @_with_repr(ename) + def getregentry(encoding): + if encoding != ename and not (pattern and re.match(pattern, encoding)): + return + fenc, fdec, name = encode, decode, encoding + # prepare CodecInfo input arguments + if pattern: + m, args, i = re.match(pattern, encoding), [], 1 + try: + while True: + try: + g = m.group(i) or "" + if g.isdigit() and not g.startswith("0") and (re.match(r"10+", g) or "".join(set(g)) != "01"): + g = int(g) + args += [g] + i += 1 + except AttributeError: + # this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match + if m is not None: + raise + return + except IndexError: + # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; + # in this case, if fenc/fdec is a decorated function, execute it with no arg + if len(args) == 0: + from 
inspect import getfullargspec + if fenc and len(getfullargspec(fenc).args) == 1: + fenc = fenc() + if fdec and len(getfullargspec(fdec).args) == 1: + fdec = fdec() + else: + fenc = fenc(*args) if fenc else fenc + fdec = fdec(*args) if fdec else fdec + if fenc: + fenc = fix_inout_formats(fenc) + if fdec: + fdec = fix_inout_formats(fdec) + sl, sc = kwargs.pop('strip_lines', False), kwargs.pop('strip_crlf', False) + if sl or sc: + def _striplines(f): + def __wrapper(input, *a, **kw): + return f(_stripl(input, sc, sl), *a, **kw) + return __wrapper + # this fixes issues with wrapped encoded inputs + fdec = _striplines(fdec) + + class Codec(codecs.Codec): + def encode(self, input, errors="strict"): + if fenc is None: + raise NotImplementedError + return fenc(input, errors) + + def decode(self, input, errors="strict"): + if fdec is None: + raise NotImplementedError + return fdec(input, errors) + + class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + if fenc is None: + raise NotImplementedError + return b(fenc(input, self.errors)[0]) + + class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + if fdec is None: + raise NotImplementedError + return ensure_str(fdec(input, self.errors)[0]) + + class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + + class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + + ci = codecs.CodecInfo( + name=name, + encode=Codec().encode, + decode=Codec().decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamwriter=StreamWriter, + streamreader=StreamReader, + _is_text_encoding=text, + ) + ci.parameters = kwargs + ci.parameters['name'] = ename + ci.parameters['add_to_codecs'] = add_to_codecs + ci.parameters['pattern'] = pattern + ci.parameters['text'] = text + f = glob.get('__file__', os.path.join("custom", "_")) + cat = f.split(os.path.sep)[-2].rstrip("s") + if cat not in 
CODECS_CATEGORIES: + CODECS_CATEGORIES.append(cat) + ci.parameters['category'] = kwargs.get('category', cat) + ci.parameters['examples'] = kwargs.get('examples', glob.get('__examples__')) + ci.parameters['guess'] = kwargs.get('guess', glob.get('__guess__', [ename])) or [] + ci.parameters['module'] = kwargs.get('module', glob.get('__name__')) + ci.parameters.setdefault("scoring", {}) + for attr in ["bonus_func", "entropy", "expansion_factor", "len_charset", "penalty", "printables_rate", + "padding_char", "transitive"]: + a = kwargs.pop(attr, None) + if a is not None: + ci.parameters['scoring'][attr] = a + return ci + + getregentry.__name__ = re.sub(r"[\s\-]", "_", ename) + if kwargs.get('aliases'): + getregentry.__aliases__ = list(map(lambda n: re.sub(r"[\s\-]", "_", n), kwargs['aliases'])) + getregentry.__pattern__ = pattern + register(getregentry, add_to_codecs) + return getregentry + + +def add_macro(mname, *encodings): + """ This allows to define a macro, chaining multiple codecs one after the other. This relies on a default set of + macros from a YAML file embedded in the package and a local YAML file from the home folder that takes + precedence for defining personal macros. 
+ + :param mname: macro name + :param encodings: encoding names of the encodings to be chained with the macro + """ + global PERS_MACROS # noqa: F824 + # check for name clash with alreday existing macros and codecs + if mname in MACROS or mname in PERS_MACROS: + raise ValueError("Macro name already exists") + try: + ci = lookup(mname, False) + raise ValueError(f"Macro name clashes with codec '{ci.name}'") + except LookupError: + pass + try: + PERS_MACROS[mname] = encodings + CodecMacro(mname) + with open(PERS_MACROS_FILE, 'w') as f: + json.dump(PERS_MACROS, f, indent=2) + except ValueError: + del PERS_MACROS[mname] + raise +codecs.add_macro = add_macro + + +def add_map(ename, encmap, repl_char="?", sep="", ignore_case=None, no_error=False, intype=None, outype=None, **kwargs): + """ This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs + module dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with + a pattern and with file handling (if text is True). 
+ + :param ename: encoding name + :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture + group of the regex pattern) or a function building the encoding map + :param repl_char: replacement char (used when errors handling is set to "replace") + :param sep: string of possible character separators (hence, only single-char separators are considered) ; + - while encoding, the first separator is used + - while decoding, separators can be mixed in the input text + :param ignore_case: ignore text case while encoding and/or decoding + :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) + :param intype: specify the input type for pre-transforming the input text + :param outype: specify the output type for post-transforming the output text + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + """ + outype = outype or intype + if ignore_case not in [None, "encode", "decode", "both"]: + raise ValueError("Bad ignore_case parameter while creating encoding map") + if intype not in [None, "str", "bin", "ord"]: + raise ValueError("Bad input type parameter while creating encoding map") + if outype not in [None, "str", "bin", "ord"]: + raise ValueError("Bad output type parameter while creating encoding map") + + def __generic_code(decode=False): + def _wrapper(param): + """ The parameter for wrapping comes from the encoding regex pattern ; e.g. + [no pattern] => param will be None everytime + r"barbie[-_]?([1-4])$" => param could be int 1, 2, 3 or 4 + r"^morse(|[-_]?.{3})$" => param could be None, "-ABC" (for mapping to ".-/") + + In order of precedence: + 1. 
when param is a key in mapdict or mapdict is a list of encoding maps (hence in the case of "barbie...", + param MUST be an int, otherwise for the first case it could clash with a character of the encoding map) + 2. otherwise handle it as a new encoding character map "ABC" translates to ".-/" for morse + """ + p = param + if isinstance(encmap, FunctionType): + mapdict = encmap(p) + p = None + else: + mapdict = encmap + if isinstance(mapdict, dict): + smapdict = {k: v for k, v in mapdict.items()} + elif isinstance(mapdict, list) and isinstance(mapdict[0], dict): + smapdict = {k: v for k, v in mapdict[0].items()} + else: + raise ValueError("Bad mapping dictionary or list of mapping dictionaries") + if p is not None: + # case 1: param is empty string + if p == "": + if isinstance(mapdict, list): + smapdict = {k: v for k, v in mapdict[0].items()} + elif isinstance(mapdict, dict): + if '' in mapdict.keys() and isinstance(mapdict[''], dict): + smapdict = {k: v for k, v in mapdict[''].items()} + else: + smapdict = {k: v for k, v in mapdict.items()} + # no 'else' handling a LookupError here ; this case is covered by the first if/elif/else block + # case 2: list or dictionary or dictionary of numbered encodings + elif isinstance(p, int): + # if mapdict is a list, we shall align the parameter (starting from 1) as an index (starting from 0) + if isinstance(mapdict, list): + p -= 1 + if isinstance(mapdict, list) and 0 <= p < len(mapdict) or \ + isinstance(mapdict, dict) and p in mapdict.keys(): + smapdict = {k: v for k, v in mapdict[p].items()} + else: + raise LookupError(f"Bad parameter for encoding '{ename}': '{p}'") + # case 3: dictionary of regex-selected encoding mappings + elif isinstance(mapdict, dict) and isinstance(list(mapdict.values())[0], dict): + tmp = None + for r, d in mapdict.items(): + if r == '': # this is already handled in case 1 ; anyway, an empty regex always matches, hence + continue # it must be excluded + if re.match(r, p): + tmp = d + break + if tmp is 
None: + raise LookupError(f"Bad parameter for encoding '{ename}': '{p}'") + smapdict = tmp + # case 4: encoding characters translation + else: + # collect base tokens in order of appearance in the mapping dictionary + base_tokens = "" + for _, c in sorted(mapdict.items()): + for t in c: + for st in t: + if st not in base_tokens: + base_tokens += st + if " " not in sep: + base_tokens = base_tokens.replace(" ", "") + if len(p) > 0 and p[0] in "-_" and len(p[1:]) == len(set(p[1:])) == len(base_tokens): + p = p[1:] + if len(p) == len(set(p)) == len(base_tokens): + t = maketrans(base_tokens, p) + for k, v in smapdict.items(): + smapdict[k] = [x.translate(t) for x in v] if isinstance(v, list) else v.translate(t) + else: + raise LookupError(f"Bad parameter for encoding '{ename}': '{p}'") + if ignore_case is not None: + cases = ["upper", "lower"] + case_d = cases[any(c in str(list(smapdict.values())) for c in "abcdefghijklmnopqrstuvwxyz")] + case_e = cases[any(c in str(list(smapdict.keys())) for c in "abcdefghijklmnopqrstuvwxyz")] + i = ignore_case + smapdict = {getattr(k, case_e)() if i in ["both", "encode"] else k: \ + ([getattr(x, case_d)() for x in v] if isinstance(v, list) else getattr(v, case_d)()) \ + if i in ["both", "decode"] else v for k, v in smapdict.items()} + if decode: + tmp = {} + # this has a meaning for encoding maps that could have clashes in encoded chars (e.g. 
Bacon's cipher ; + # I => abaaa but also J => abaaa, with the following, we keep I instead of letting J overwrite it) + for k, v in sorted(smapdict.items()): + if not isinstance(v, list): + v = [v] + for x in v: + if x not in tmp.keys(): + tmp[x] = k + smapdict, cs = tmp, reduce(lambda acc, x: acc + x, tmp.keys()) + kwargs['strip_lines'], kwargs['strip_crlf'] = "\n" not in set(cs), "\r\n" not in cs + # this allows to avoid an error with Python2 in the "for i, c in enumerate(parts)" loop + if '' not in smapdict.keys(): + smapdict[''] = "" + # determine token and result lengths + tmaxlen = max(map(len, smapdict.keys())) + tminlen = max(1, min(map(len, set(smapdict.keys()) - {''}))) + l = [] + for x in smapdict.values(): + getattr(l, ["append", "extend"][isinstance(x, list)])(x) + rminlen = max(1, min(map(len, set(l) - {''}))) + + # generic encoding/decoding function for map encodings + def code(text, errors="strict"): + icase = ignore_case == "both" or \ + decode and ignore_case == "decode" or \ + not decode and ignore_case == "encode" + if icase: + case = case_d if decode else case_e + if no_error: + errors = "leave" + text = ensure_str(text) + if not decode: + if intype == "bin": + text = "".join(f"{bin(ord(c))[2:]:0>8}" for c in text) + elif intype == "ord": + text = "".join(str(ord(c)).zfill(3) for c in text) + r = "" + lsep = "" if decode else sep if len(sep) <= 1 else sep[0] + kind = ["character", "token"][tmaxlen > 1] + error_func = handle_error(ename, errors, lsep, repl_char, rminlen, decode, kind) + + # get the value from the mapping dictionary, trying the token with its inverted case if relevant + def __get_value(token, position, case_changed=False): + try: + result = smapdict[token] + except KeyError: + if icase and not case_changed: + token_inv_case = getattr(token, case)() + return __get_value(token_inv_case, position, True) + return error_func(token, position) + if isinstance(result, list): + result = result[0] + return result + lsep + + # if a 
separator is defined, rely on it by splitting the input text + if decode and len(sep) > 0: + for i, c in enumerate(re.split("[" + sep + "]", text)): + r += __get_value(c, i) + # otherwise, move through the text using a cursor for tokenizing it ; this allows defining more complex + # encodings with variable token lengths + else: + cursor, bad = 0, "" + while cursor < len(text): + token = text[cursor:cursor+1] + for l in range(tminlen, tmaxlen + 1): + token = text[cursor:cursor+l] + if token in smapdict.keys() or icase and getattr(token, case)() in smapdict.keys(): + r += __get_value(token, cursor) + cursor += l + break + else: + # collect bad chars and only move the cursor one char to the right + bad += text[cursor] + cursor += 1 + # if the number of bad chars is the minimum token length, consume it and start a new buffer + if len(bad) == tminlen or errors == "leave": + posn = cursor - len(bad) + r += error_func(bad, posn) + bad = "" + if decode: + if outype in ["bin", "ord"]: + tmp, r = "", r.replace(lsep, "") + step = [3, 8][outype == "bin"] + for i in range(0, len(r), step): + s = r[i:i+step] + try: + tmp += chr(int(s, 2) if outype == "bin" else int(s)) + except ValueError: + if len(s) > 0: + tmp += "[" + s + "]" + r = tmp + lsep + return r[:len(r)-len(lsep)], len(b(text)) + return code + if re.search(r"\([^(?:)]", kwargs.get('pattern', "")) is None: + # in this case, there is no capturing group for parametrization + return _wrapper(None) + return _wrapper + + glob = currentframe().f_back.f_globals + kwargs['category'] = glob['__file__'].split(os.path.sep)[-2].rstrip("s") + kwargs['examples'] = kwargs.get('examples', glob.get('__examples__')) + kwargs['encmap'] = encmap + kwargs['repl_char'] = repl_char + kwargs['sep'] = sep + kwargs['ignore_case'] = ignore_case + kwargs['no_error'] = no_error + kwargs['intype'] = intype + kwargs['outype'] = outype + kwargs['module'] = glob.get('__name__') + try: + if isinstance(encmap, dict): + smapdict = {k: v for k, v in 
encmap.items()} + elif isinstance(encmap, list) and isinstance(encmap[0], dict): + smapdict = {k: v for k, v in encmap[0].items()} + kwargs['repl_minlen'] = i = max(1, min(map(len, set(smapdict.values()) - {''}))) + kwargs['repl_minlen_b'] = max(1, min(map(len, map(b, set(smapdict.values()) - {''})))) + except: + pass + return add(ename, __generic_code(), __generic_code(True), **kwargs) +codecs.add_map = add_map + + +def clear(): + """ Clear codext's local registry of search functions. """ + global __codecs_registry, MACROS, PERS_MACROS # noqa: F824 + __codecs_registry, MACROS, PERS_MACROS = [], {}, {} +codecs.clear = clear + + +def examples(encoding, number=10): + """ Use the search function to get the matching encodings and provide examples of valid encoding names. """ + e = [] + for name in search(encoding): + for search_function in __codecs_registry: + n = search_function.__name__ + if name in [n, n.replace("_", "-")]: + temp = [] + for s in generate_strings_from_regex(search_function.__pattern__, yield_max=16*number): + temp.append(s) + random.shuffle(temp) + i = 0 + while i < min(number, len(temp)): + if not temp[i].isdigit(): + try: + lookup(temp[i], False) + e.append(temp[i]) + except LookupError: + pass + i += 1 + for alias, codec in ALIASES.items(): + if name == codec: + if codec not in e: + e.append(codec) + if not alias.isdigit(): + e.append(alias) + random.shuffle(e) + return sorted([e[i] for i in range(min(number, len(e)))], key=_human_keys) +codecs.examples = examples + + +def is_native(encoding): + """ Determine if a given encoding is native or not. """ + return lookup(encoding, False).parameters['category'] == "native" + + +def list_categories(): + """ Get a list of all codec categories. 
""" + c = CODECS_CATEGORIES + root = os.path.dirname(__file__) + for d in os.listdir(root): + if os.path.isdir(os.path.join(root, d)) and not d.startswith("__"): + c.append(d.rstrip("s")) + # particular category, hardcoded from base/_base.py + c += ["base-generic"] + return list(set(c)) +list_categories() + + +def list_encodings(*categories): + """ Get a list of all codecs. """ + # if "non-native" is in the input list, extend the list with the whole categories but "native" + categories, exclude = list(categories), [] + for c in categories[:]: + if c == "non-native": + for c in CODECS_CATEGORIES: + if c == "native" or c in categories: + continue + categories.append(c) + categories.remove("non-native") + if c.startswith("~"): + exclude.append(c[1:]) + categories.remove(c) + try: + categories.remove(c[1:]) + except ValueError: + pass + # now, filter codecs according to the input list of categories + enc = [] + if (len(categories) == 0 or "native" in categories) and "native" not in exclude: + for a in set(ALIASES.values()): + try: + ci = __orig_lookup(a) + except LookupError: + continue + if lookup(a) is ci: + enc.append(ci.name) + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + name = search_function.__name__.replace("_", "-") + p = search_function.__pattern__ + ci = search_function(name) if p is None else search_function(generate_string_from_regex(p)) + c = "other" if ci is None else ci.parameters['category'] + if (len(categories) == 0 or c in categories) and c not in exclude: + enc.append(name) + for category in categories: + if category not in CODECS_CATEGORIES: + raise ValueError(f"Category '{category}' does not exist") + return sorted(list(set(enc)), key=_human_keys) + + +def list_macros(): + """ Get a list of all macros, with the precedence on personal ones. 
""" + return sorted(list(set(list(MACROS.keys()) + list(PERS_MACROS.keys())))) + + +def remove(name): + """ Remove all search functions matching the input encoding name from codext's local registry or any macro with the + given name. """ + global __codecs_registry, MACROS, PERS_MACROS # noqa: F824 + tbr = [] + for search_function in __codecs_registry: + if search_function(name) is not None: + tbr.append(search_function) + for search_function in tbr: + __codecs_registry.remove(search_function) + try: + del MACROS[name] + except KeyError: + pass + try: + del PERS_MACROS[name] + with open(PERS_MACROS_FILE, 'w') as f: + json.dump(PERS_MACROS, f, indent=2) + except KeyError: + pass + try: + del CODECS_CACHE[name] + except KeyError: + pass + for s in ["En", "De"]: + try: + delattr(builtins, f"{name.capitalize()}{s}codeError") + except AttributeError: + pass +codecs.remove = remove + + +def reset(): + """ Reset codext's local registry of search functions and macros. """ + from importlib import reload + global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS # noqa: F824 + clear() + d = os.path.dirname(__file__) + for pkg in sorted(os.listdir(d)): + if pkg.startswith("_") or not os.path.isdir(os.path.join(d, pkg)): + continue + reload(import_module("codext." + pkg)) + # backup codext's registry + if CODECS_REGISTRY is None: + CODECS_REGISTRY = __codecs_registry[:] + # restore codext's registry + else: + __codecs_registry = CODECS_REGISTRY[:] + # restore codext's embedded set of macros + with open(os.path.join(os.path.dirname(__file__), "macros.json")) as f: + MACROS = json.load(f) + # reload personal set of macros + PERS_MACROS = {} + if os.path.exists(PERS_MACROS_FILE): + with open(PERS_MACROS_FILE) as f: + PERS_MACROS = json.load(f) +codecs.reset = reset + + +# conversion functions +def b(s): + """ Non-crashing bytes conversion function. 
""" + try: + return s.encode("latin-1") + except: + pass + try: + return s.encode("utf-8") + except: + pass + return s + + +def ensure_str(s, encoding="utf-8", errors='strict'): + """ Dummy str conversion function. """ + if isinstance(s, bytes): + try: + return s.decode(encoding, errors) + except: + return s.decode("latin-1") + return s + + +# make conversion functions compatible with input/output strings/bytes +def fix_inout_formats(f): + """ This decorator ensures that the first output of f will have the same text format as the first input (str or + bytes). """ + @wraps(f) + def _wrapper(*args, **kwargs): + a0 = args[0] + a0_isb = isb(a0) + a0 = ensure_str(a0) if iss(a0) or a0_isb else a0 + r = f(a0, *args[1:], **kwargs) + # special case: input is in bytes ; ensure that the returned length is this of the bytes, not this processed by + # the decode/encode function + if isinstance(r, (tuple, list)) and isinstance(r[1], int) and a0_isb: + r = tuple([list(r)[0]] + [len(args[0])] + list(r)[2:]) + return (fix(r[0], args[0]), ) + r[1:] if isinstance(r, (tuple, list)) else fix(r, args[0]) + return _wrapper + + +# alphabet generation function from a given mask +def get_alphabet_from_mask(mask): + """ This function generates an alphabet from the given mask. The style used is similar to Hashcat ; group keys are + marked with a heading "?". """ + i, alphabet = 0, "" + while i < len(mask): + c = mask[i] + if c == "?" and i < len(mask) - 1 and mask[i+1] in MASKS.keys(): + for c in MASKS[mask[i+1]]: + if c not in alphabet: + alphabet += c + i += 1 + elif c not in alphabet: + alphabet += c + i += 1 + return alphabet + + +# generic error handling function +def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=False, kind="character", item="position"): + """ This shortcut function allows to handle error modes given some tuning parameters. 
+ + :param ename: encoding name + :param errors: error handling mode + :param sep: token separator + :param repl_char: replacement character (for use when errors="replace") + :param repl_minlen: repeat number for the replacement character + :param decode: whether we are encoding or decoding + :param item: position item description (for describing the error ; e.g. "group" or "token") + """ + exc = f"{exc_name(ename)}{['En','De'][decode]}codeError" + + def _handle_error(token, position, output="", eename=None): + """ This handles an encoding/decoding error according to the selected handling mode. + + :param token: input token to be encoded/decoded + :param position: token position index + :param output: output, as decoded up to the position of the error + """ + if errors == "strict": + token = f"{token[:7]}..." if len(token := ensure_str(token)) > 10 else token + err = getattr(builtins, exc)(f"'{eename or ename}' codec can't {['en','de'][decode]}code {kind} '{token}' " + f"in {item} {position}") + err.output = output + err.__cause__ = err + raise err + elif errors == "leave": + return token + sep + elif errors == "replace": + return repl_char * repl_minlen + sep + elif errors == "ignore": + return "" + else: + raise ValueError(f"Unsupported error handling '{errors}'") + return _handle_error + + +# codecs module hooks +__orig_lookup = _codecs.lookup +__orig_register = _codecs.register + + +def __add(ename, encode=None, decode=None, pattern=None, text=True, **kwargs): + kwargs.pop('add_to_codecs', None) + return add(ename, encode, decode, pattern, text, True, **kwargs) +__add.__doc__ = add.__doc__ +codecs.add = __add + + +def decode(obj, encoding='utf-8', errors='strict'): + """ Custom decode function relying on the hooked lookup function. """ + return lookup(encoding).decode(obj, errors)[0] +codecs.decode = decode + + +def encode(obj, encoding='utf-8', errors='strict'): + """ Custom encode function relying on the hooked lookup function. 
""" + n, m = 1, re.search(r"\[(\d+)\]$", encoding) + if m: + n = int(m.group(1)) + encoding = re.sub(r"\[(\d+)\]$", "", encoding) + ci = lookup(encoding) + for i in range(n): + try: + obj = ci.encode(obj, errors)[0] + except (AttributeError, TypeError) as e: # occurs for encodings that require str as input while 'obj' is bytes + if str(e) not in ["'bytes' object has no attribute 'encode'", + "ord() expected string of length 1, but int found"] or \ + encoding in ["latin-1", "utf-8"]: # encodings considered when using b(...) + raise + obj = ci.encode(ensure_str(obj), errors)[0] + return obj +codecs.encode = encode + + +def lookup(encoding, macro=True): + """ Hooked lookup function for searching first for codecs in the local registry of this module. """ + # first, try to match the given encoding with codecs' search functions + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + codecinfo = search_function(encoding) + if codecinfo is not None: + return codecinfo + # then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + if search_function.__name__.replace("_", "-") == encoding or \ + encoding in getattr(search_function, "__aliases__", []): + codecinfo = search_function(generate_string_from_regex(search_function.__pattern__)) + if codecinfo is not None: + return codecinfo + # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters + try: + ci = __orig_lookup(encoding) + ci.parameters = {'category': "native", 'module': "codecs", 'name': ALIASES.get(ci.name, ci.name)} + return ci + except LookupError: + if not macro: + raise + try: + return CodecMacro(encoding) + except LookupError: + e = LookupError(f"unknown encoding: {encoding}") + e.__cause__ = e # stop exception chaining + raise e +codecs.lookup = lookup + + +def register(search_function, add_to_codecs=False): + """ Register function for 
registering new codecs in the local registry of this module and, if required, in the + native codecs registry (for use with the built-in 'open' function). + + :param search_function: search function for the codecs registry + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + """ + if search_function not in __codecs_registry: + try: + __orig_lookup(search_function.__name__) + l = CODECS_OVERWRITTEN + except LookupError: + l = __codecs_registry + l.append(search_function) + if add_to_codecs: + __orig_register(search_function) + + +def __register(search_function): + """ Same as register(...), but with add_to_codecs set by default to True. """ + register(search_function, True) +codecs.register = __register + + +def search(encoding_regex, extended=True): + """ Function similar to lookup but allows to search for an encoding based on a regex instead. It searches this way + into the local registry but also tries a simple lookup with the original lookup function. """ + matches = [] + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + n = search_function.__name__ + for name in [n, n.replace("_", "-")]: + if re.search(encoding_regex, name): + matches.append(n.replace("_", "-")) + continue + if extended: + # in some cases, encoding_regex can match a generated string that uses a particular portion of its + # generating pattern ; e.g. 
we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also + # find "morse" or "atbash" very rarely because of their dynamic patterns and the limited number of randomly + # generated strings + # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of + # matches ; executing 5 times the string generation for a given codec but adding the codec to the list of + # matches only if we get at least 3 matches ensures that we consider up to 2 failures that could be + # stochastic, therefore drastically decreasing the probability to get a "junk" encoding in the matches list + c = 0 + for i in range(5): + for s in generate_strings_from_regex(search_function.__pattern__): + if re.search(encoding_regex, s): + c += 1 + break + if c >= 3: + matches.append(n) + break + for s, n in ALIASES.items(): + if re.search(encoding_regex, s) or re.search(encoding_regex, n): + matches.append(n) + return sorted(list(set(matches)), key=_human_keys) +codecs.search = search + + +# utility function for the search feature +CATEGORIES = { + 'digit': digits, + 'not_digit': reduce(lambda x, c: x.replace(c, ""), digits, printable), + 'space': whitespace, + 'not_space': reduce(lambda x, c: x.replace(c, ""), whitespace, printable), + 'word': ascii_letters + digits + '_', + 'not_word': reduce(lambda x, c: x.replace(c, ""), ascii_letters + digits + '_', printable), +} +REPEAT_MAX = 10 +STAR_PLUS_MAX = 10 +YIELD_MAX = 100 + + +def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False): + """ Recursive function to generate strings from a regex pattern. 
""" + if regex is None: + return + __groups = {} + tokens = [] + negate, last_rand = False, None + for state in (regex if parsed else re.sre_parse.parse(b(getattr(regex, "pattern", regex)))): + code = getattr(state[0], "name", state[0]).lower() + value = getattr(state[1], "name", state[1]) + value = value.lower() if isinstance(value, str) else value + if code in ["assert_not", "at"]: + continue + elif code == "any": + charset = list(printable.replace("\n", "")) + while charset[0] == last_rand and len(charset) > 1: + random.shuffle(charset) + last_rand = charset[0] + tokens.append(charset) # should be ord(x) with x belongs to [0, 256[ + elif code == "assert": + tokens.append(list(__gen_str_from_re(value[1], star_plus_max, repeat_max, yield_max, True))) + elif code == "branch": + result = [] + for r in value[1]: + result += list(__gen_str_from_re(r, star_plus_max, repeat_max, yield_max, True)) or [""] + tokens.append(result) + elif code == "category": + charset = list(CATEGORIES[value[9:]]) + if negate: + negate = False + charset = list(set(printable).difference(charset)) + while charset[0] == last_rand and len(charset) > 1: + random.shuffle(charset) + last_rand = charset[0] + tokens.append(charset) + elif code == "groupref": + tokens.extend(__groups[value]) + elif code == "in": + subtokens = list(__gen_str_from_re(value, star_plus_max, repeat_max, yield_max, True)) + subtokens = [x for l in subtokens for x in l] + tokens.append(subtokens) + elif code == "literal": + tokens.append(chr(value)) + elif code in ["max_repeat", "min_repeat"]: + start, end = value[:2] + end = min(end, star_plus_max) + start = min(start, end) + charset = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) + subtokens = [] + if start == 0 and end == 1: + subtokens.append("") + subtokens.extend(charset) + elif len(charset) ** end > repeat_max: + for i in range(min(repeat_max, 10 * len(charset))): + n = random.randint(start, end + 1) + token = "" if n == 0 else 
"".join(random.choice(charset) for i in range(n)) + if token not in subtokens: + subtokens.append(token) + else: + i -= 1 + else: + for n in range(start, end + 1): + for c in product(charset, repeat=n): + subtokens.append("".join(c)) + tokens.append(subtokens) + elif code == "negate": + negate = True + elif code == "not_literal": + charset = list(printable.replace(chr(value), "")) + while charset[0] == last_rand and len(charset) > 1: + random.shuffle(charset) + last_rand = charset[0] + tokens.append(charset) + elif code == "range": + tokens.append("".join(chr(i) for i in range(value[0], value[1] + 1))) + elif code == "subpattern": + result = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) + if value[0]: + __groups[value[0]] = result + tokens.append(result) + else: + raise NotImplementedError(f"Unhandled code '{code}'") + if len(tokens) == 0: + tokens = [""] + i = 0 + for result in product(*tokens): + yield "".join(result) + i += 1 + if i >= yield_max: + break + + +def _human_keys(text): + """ Sorting function for considering strings with numbers (e.g. base2, base10, base100) """ + tokens = [] + for s in re.split(r"(\d+|\D+)", text): + tokens.append(int(s) if s.isdigit() else s) + return tokens + + +def generate_string_from_regex(regex): + """ Utility function to generate a single string from a regex pattern. """ + if regex: + return list(generate_strings_from_regex(regex, yield_max=1))[0] + + +def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=REPEAT_MAX, yield_max=YIELD_MAX): + """ Utility function to generate strings from a regex pattern. """ + for r in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max): + yield r + + +# guess feature objects +__module_exists = lambda n: n in [x[1] for x in iter_modules()] +stopfunc = ModuleType("stopfunc", """ + Predefined stop functions + ~~~~~~~~~~~~~~~~~~~~~~~~~ + + This submodule contains stop functions for the guess feature of codext. 
+ + - `flag`: searches for the pattern "[Ff][Ll1][Aa4@][Gg9]" (either UTF-8 or UTF-16) + - `lang_**`: checks if the given lang (any from the PROFILES_DIRECTORY of the langdetect module) is detected + - `printables`: checks that every output character is in the set of printables + - `regex`: takes one argument, the regular expression, for checking a string against the given pattern + - `text`: checks for printables and an entropy less than 4.6 (empirically determined) +""") +stopfunc.printables = lambda s: all(c in printable for c in ensure_str(s)) +stopfunc.printables.__name__ = stopfunc.printables.__qualname__ = "printables" +stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None +stopfunc.regex.__name__ = stopfunc.regex.__qualname__ = "regex" +stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6 +stopfunc.text.__name__ = stopfunc.text.__qualname__ = "text" +stopfunc.flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", ensure_str(x)) is not None +stopfunc.flag.__name__ = stopfunc.flag.__qualname__ = "flag" +stopfunc.default = stopfunc.text + +stopfunc.LANG_BACKEND = None +stopfunc.LANG_BACKENDS = [n for n in ["pycld2", "langdetect", "langid", "cld3", "textblob"] if __module_exists(n)] +if len(stopfunc.LANG_BACKENDS) > 0: + stopfunc.LANG_BACKEND = stopfunc.LANG_BACKENDS[0] +if "cld3" in stopfunc.LANG_BACKENDS: + stopfunc.CLD3_LANGUAGES = "af|am|ar|bg|bn|bs|ca|ce|co|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|fy|ga|gd|gl|gu|ha|" \ + "hi|hm|hr|ht|hu|hy|id|ig|is|it|iw|ja|jv|ka|kk|km|kn|ko|ku|ky|la|lb|lo|lt|lv|mg|mi|mk|" \ + "ml|mn|mr|ms|mt|my|ne|nl|no|ny|pa|pl|ps|pt|ro|ru|sd|si|sk|sl|sm|sn|so|sq|sr|st|su|sv|" \ + "sw|ta|te|tg|th|tr|uk|ur|uz|vi|xh|yi|yo|zh|zu".split("|") +if "textblob" in stopfunc.LANG_BACKENDS: + stopfunc.TEXTBLOB_LANGUAGES = "af|ar|az|be|bg|bn|ca|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|ga|gl|gu|hi|hr|ht|hu|" \ + "id|is|it|iw|ja|ka|kn|ko|la|lt|lv|mk|ms|mt|nl|no|pl|pt|ro|ru|sk|sl|sq|sr|sv|sw|ta|" \ + 
"te|th|tl|tr|uk|ur|vi|yi|zh".split("|") + + +def _detect(text): + _lb, t = stopfunc.LANG_BACKEND, ensure_str(text) + if _lb is None: + raise ValueError("No language backend %s" % ["selected", "installed"][len(stopfunc.LANG_BACKENDS) == 0]) + return langid.classify(t)[0] if _lb == "langid" else \ + langdetect.detect(t) if _lb == "langdetect" else \ + pycld2.detect(t)[2][0][1] if _lb == "pycld2" else \ + cld3.get_language(t).language[:2] if _lb == "cld3" else \ + textblob.TextBlob(t).detect_language()[:2] + + +def _lang(lang): + def _test(s): + if not stopfunc.text(s): + return False + try: + return _detect(ensure_str(s))[:2] == lang + except: + return False + return _test + + +def _load_lang_backend(backend=None): + # import the requested backend library if not imported yet + if backend is None or backend in stopfunc.LANG_BACKENDS: + stopfunc.LANG_BACKEND = backend + if backend: + globals()[backend] = __import__(backend) + else: + raise ValueError("Unsupported language detection backend") + # remove language-related stop functions + for attr in dir(stopfunc): + if attr.startswith("_") or not isinstance(getattr(stopfunc, attr), FunctionType): + continue + if re.match(r"lang_[a-z]{2}$", attr): + delattr(stopfunc, attr) + # rebind applicable language-related stop functions + if stopfunc.LANG_BACKEND: + _lb = stopfunc.LANG_BACKEND + if _lb == "langid": + langid.langid.load_model() + for lang in ( + langid.langid.identifier.nb_classes if _lb == "langid" else \ + list(set(p[:2] for p in os.listdir(langdetect.PROFILES_DIRECTORY))) if _lb == "langdetect" else \ + list(set(x[1][:2] for x in pycld2.LANGUAGES if x[0] in pycld2.DETECTED_LANGUAGES)) if _lb == "pycld2" else \ + stopfunc.CLD3_LANGUAGES if _lb == "cld3" else \ + stopfunc.TEXTBLOB_LANGUAGES if _lb == "textblob" else \ + []): + n = f"lang_{lang}" + setattr(stopfunc, n, _lang(lang)) + getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n + if LANG: + flng = f"lang_{LANG}" + if getattr(stopfunc, flng, 
None): + stopfunc.default = getattr(stopfunc, flng) +stopfunc._reload_lang = _load_lang_backend + + +def _validate(stop_function, lang_backend="none"): + s, lb = stop_function, lang_backend + if isinstance(s, str): + if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ + all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(lb) + f = getattr(stopfunc, s, None) + if f: + return f + elif not isinstance(s, FunctionType): + raise ValueError("Bad stop function") + return s +stopfunc._validate = _validate + + +def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(), + stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): + """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """ + if depth > min_depth and stop_func(input): + if not stop and (show or debug) and found not in result: + s = repr(input) + s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s + s = f"[+] {', '.join(found)}: {s}" + print(s if len(s) <= 80 else f"{s[:77]}...") + result[found] = input + if depth >= max_depth or len(result) > 0 and stop: + return + prev_enc = found[-1] if len(found) > 0 else "" + e = encodings.get(depth, encodings.get(-1, [])) + for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended): + if len(result) > 0 and stop: + return + if debug: + print(f"[*] Depth {depth+1:0{len(str(max_depth))}}/{max_depth}: {encoding}") + __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), + stop, show, scoring_heuristic, extended, debug) + + +def __make_encodings_dict(include, exclude): + """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible + encoding names. It also creates a cache with the CodecInfo objects for improving performance. 
""" + def _develop(d, keep=True): + d = d or {} + for k, v in d.items(): + l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES] + # list from in-scope categories and then everything that is not a category + for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc): + g = [] + for e in (search(enc, False) or [enc]): + try: + ci = lookup(e, False) + g.extend(ci.parameters['guess']) + except: + pass + if enc in g: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected + l.append(enc) + else: # e.g. "rot" => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected + l.extend(g) + d[k] = list(set(l)) + return d + _excl, _incl = _develop(exclude, False), _develop(include) + return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()} + + +def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): + """ Filter valid encodings and rank them by relevance. 
""" + ranking = {} + for e in encodings: + try: + codec = CODECS_CACHE[e] + except KeyError: + try: + CODECS_CACHE[e] = codec = lookup(e, False) + except LookupError: + continue + t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) + if t: + ranking[e] = t + for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])): + yield result if yield_score else result[1], encoding + + +class _Text(object): + __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] + + def __init__(self, text, pad_char=None): + self.text = ensure_str(text) + c = self.text[-1] + pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) + self.padding = pad_char is not None and last_char == pad_char + if self.padding: + text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char) + self.len = len(self.text) + self.lcharset = len(set(self.text)) + self.printables = float(len([c for c in self.text if c in printable])) / self.len + self.entropy = entropy(self.text) + + +def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): + """ Score relevant encodings given an input. """ + obj = None + sc = codec.parameters.get('scoring', {}) + no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) + # ignore encodings that fail to decode with their default errors handling value + try: + new_input = codec.decode(input)[0] + except: + return + # ignore encodings that give an output identical to the input (identity transformation) or to the previous input + if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): + return + # ignore encodings that transitively give the same output (identity transformation by chaining twice a same + # codec (e.g. 
rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) + if transitive and prev_encoding: + ci_prev = lookup(prev_encoding, False) + if ci_prev.parameters['name'] == codec.parameters['name']: + return + # compute input's characteristics only once and only if the control flow reaches this point + pad = sc.get('padding_char') + if obj is None: + obj = _Text(input, pad) + if heuristic: + # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. multiple base + # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates + s = -sc.get('penalty', .0) + # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; + # on the contrary, if the length of input text's charset is strictly greater, give a penalty + lcs = sc.get('len_charset', 256) + if isinstance(lcs, type(lambda: None)): + lcs = int(lcs(encoding)) + if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: + s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) + elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: + s -= .2 # this can occur for encodings with no_error set to True + # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, + # or a penalty when it should not be encountered but it is present + if pad and obj.padding: + s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus + elif not pad and obj.padding: + s -= .1 # it could arise a padding character is encountered while not being padding => small penalty + # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when + # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) + if not no_error: + pr = sc.get('printables_rate', 0) + if isinstance(pr, type(lambda: None)): + pr = 
float(pr(obj.printables)) + if obj.printables - pr <= .05: + s += .1 + expf = sc.get('expansion_factor', 1.) + if expf: + f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f + if isinstance(expf, type(lambda: None)): + try: # this case allows to consider the current encoding name from the current codec + expf = expf(f, encoding) + except TypeError: + expf = expf(f) + if isinstance(expf, (int, float)): + expf = 1/f - .1 <= 1/expf <= 1/f + .1 + elif isinstance(expf, (tuple, list)) and len(expf) == 2: + expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] + s += [-1., .1][expf] + # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the + # number of input characters to take bad entropies of shorter strings into account + entr = sc.get('entropy', lambda e: e) + entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr + if isinstance(entr, type(lambda: None)): + try: # this case allows to consider the current encoding name from the current codec + entr = entr(obj.entropy, encoding) + except TypeError: + entr = entr(obj.entropy) + if entr is not None: + # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) + d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy) + if d_entr <= .5: + s += .5 - d_entr + # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) + bonus = sc.get('bonus_func') + if bonus is not None: + if isinstance(bonus, type(lambda: None)): + bonus = bonus(obj, codec, encoding) + if bonus: + s += .2 + else: + s = 1. 
+ # exclude negative (and eventually null) scores as they are (hopefully) not relevant + if extended and s >= .0 or not extended and s > .0: + return s, new_input + + +def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(), + stop=True, show=False, scoring_heuristic=True, extended=False, debug=False): + """ Try decoding without the knowledge of the encoding(s). + + :param input: input text to be guessed + :param stop_func: function defining the stop condition + :param min_depth: minimum search depth + :param max_depth: maximum search depth + ;param include: inclusion item OR list with category, codec or encoding names OR dictionary with lists per + depth (nothing means include every encoding) + :param exclude: exclusion item OR list with category, codec or encoding names OR dictionary with lists per + depth (nothing means exclude no encoding) + :param found: tuple of already found encodings + :param stop: whether to stop or not when a valid solution is found + :param show: whether to immediately show once a solution is found + :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1., + meaning that every non-failing encoding will be considered with no order of precedence) + :param extended: whether to also consider null scores with the heuristic + :param debug: whether to show each attempt at each depth during computation + """ + if len(input) == 0: + return "" + # check for min and max depths + if max_depth <= 0: + raise ValueError("Depth must be a non-null positive integer") + if min_depth > max_depth: + raise ValueError("Min depth shall be less than or equal to the max depth") + # take the tuple of found encodings into account + if len(found) > 0: + for encoding in found: + input = decode(input, encoding) + # handle the stop function as a regex if a string was given + if isinstance(stop_func, str): + stop_func = stopfunc.regex(stop_func) + # reformat 
include and exclude arguments ; supported formats: + for n, l in zip(["inc", "exc"], [include, exclude]): + if l is None: + if n == "inc": + include = l = {-1: CODECS_CATEGORIES} + else: + exclude = l = {} + # "category" OR "enc_name" OR whatever => means a single item for all depths + if isinstance(l, str): + if n == "inc": + include = l = {-1: [l]} + else: + exclude = l = {-1: [l]} + # ["enc_name1", "enc_name2", ...] => means for all depths + if isinstance(l, (list, tuple)): + if n == "inc": + include = l = {-1: l} + else: + exclude = l = {-1: l} + # {-1: [...], 2: [...], ...} => means prefedined depths with their lists of in-/excluded encodings + if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): + raise ValueError("Include argument shall be a list or a dictionary with integer keys") + # precompute encodings lists per depth and cache the related CodecInfo objects + encodings, result = __make_encodings_dict(include, exclude), {} + try: + # breadth-first search + for d in range(max_depth): + __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show, + scoring_heuristic, extended, debug) + if stop and len(result) > 0: + break + except KeyboardInterrupt: + pass + CODECS_CACHE = {} + return result +codecs.guess = guess + + +def rank(input, extended=False, limit=-1, include=None, exclude=None): + """ Rank the most probable encodings based on the given input. + + :param input: input text to be evaluated + :param extended: whether to consider null scores too (NB: negative scores are not output !) 
+ :param limit: number of encodings to be returned (-1 means all of them) + :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) + :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) + """ + encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES}, + exclude if isinstance(exclude, dict) else {-1: exclude or []}) + r = list(__rank(None, input, "", encodings[-1], True, extended, True)) + return r[:limit] if len(r) > 1 else r +codecs.rank = rank + diff --git a/codext/__info__.py b/src/codext/__info__.py similarity index 96% rename from codext/__info__.py rename to src/codext/__info__.py index f299990..85c3966 100644 --- a/codext/__info__.py +++ b/src/codext/__info__.py @@ -1,16 +1,16 @@ -# -*- coding: UTF-8 -*- -"""Codext package information. - -""" -import os -from datetime import datetime - -__author__ = "Alexandre D'Hondt" -__copyright__ = "© 2019-{} A. D'Hondt".format(datetime.now().year) -__email__ = "alexandre.dhondt@gmail.com" -__license__ = "GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html)" -__source__ = "https://github.com/dhondta/python-codext" - -with open(os.path.join(os.path.dirname(__file__), "VERSION.txt")) as f: - __version__ = f.read().strip() - +# -*- coding: UTF-8 -*- +"""Codext package information. + +""" +import os +from datetime import datetime + +__author__ = "Alexandre D'Hondt" +__copyright__ = "© 2019-{} A. 
D'Hondt".format(datetime.now().year) +__email__ = "alexandre.dhondt@gmail.com" +__license__ = "GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html)" +__source__ = "https://github.com/dhondta/python-codext" + +with open(os.path.join(os.path.dirname(__file__), "VERSION.txt")) as f: + __version__ = f.read().strip() + diff --git a/codext/__init__.py b/src/codext/__init__.py similarity index 93% rename from codext/__init__.py rename to src/codext/__init__.py index f95abb8..c503d03 100644 --- a/codext/__init__.py +++ b/src/codext/__init__.py @@ -1,255 +1,257 @@ -# -*- coding: UTF-8 -*- -"""Codecs extension module. - -""" -from __future__ import print_function -from _codecs import lookup as orig_lookup -from ast import literal_eval -from six import binary_type, text_type - -from .__common__ import * -from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__ - - -__all__ = ["add", "add_map", "clear", "decode", "encode", "guess", "lookup", "open", "rank", "register", "remove", - "reset"] - -decode = codecs.decode -encode = codecs.encode -guess = codecs.guess -lookup = codecs.lookup -open = codecs.open - -_lst = list -list = list_encodings # not included in __all__ because of shadow name - - -reset() - - -def __format_list(items, include=True): - if items is None: - return - d = {-1: list_encodings() if include else []} - for n, i in enumerate(items): - try: - depth, i = i.split(":") - depth = int(depth.strip().replace("~", "-")) - if depth < 0: - depth = -1 - except ValueError: - if n == 0: - d[-1] = [] - depth = -1 - d.setdefault(depth, []) - d[depth].append(i.strip()) - return d - - -def __print_tabular(lst, space=4): - try: - cols, _ = os.get_terminal_size() - # first, convert the list to a table that fits into the terminal - i, line, w = 0, "", [] - while i < len(lst): - x = lst[i] - l = len(x) - col = "%-{}s".format(l + space) % x - i += 1 - w.append(l) - if len(line) + len(col) > cols: - break - line += col - while True: - t = 
[lst[j:j+i] for j in range(0, len(lst), i)] - w = [max(0 if j+k*i >= len(lst) else len(lst[j+k*i]) for k in range(len(t))) for j, _ in enumerate(w)] - if sum(w) + space * len(w) >= cols: - i -= 1 - w.pop() - else: - break - print("\n".join("".join("%-{}s".format(w[n] + space) % x for n, x in enumerate(r)) for r in t) + "\n") - except (AttributeError, OSError): - print(", ".join(lst) + "\n") - - -def main(): - import argparse, os - - class _CustomFormatter(argparse.RawTextHelpFormatter): - def __init__(self, prog, **kwargs): - kwargs['max_help_position'] = 32 - super(_CustomFormatter, self).__init__(prog, **kwargs) - - def _format_action_invocation(self, action): - if not action.option_strings: - metavar, = self._metavar_formatter(action, action.dest)(1) - return metavar - else: - return ", ".join(action.option_strings) - - descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \ - "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \ - .format(__version__, __author__, __email__, __copyright__, __license__, __source__) - examples = "usage examples:\n- " + "\n- ".join([ - "codext search bitcoin", - "codext decode base32 -i file.b32", - "codext encode morse < to_be_encoded.txt", - "echo \"test\" | codext encode base100", - "echo -en \"test\" | codext encode braille -o test.braille", - "codext encode base64 < to_be_encoded.txt > text.b64", - "echo -en \"test\" | codext encode base64 | codext encode base32", - "echo -en \"mrdvm6teie6t2cq=\" | codext encode upper | codext decode base32 | codext decode base64", - "echo -en \"test\" | codext encode upper reverse base32 | codext decode base32 reverse lower", - "echo -en \"test\" | codext encode upper reverse base32 base64 morse", - "echo -en \"test\" | codext encode base64 gzip | codext guess", - "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", - ]) - kw = {'formatter_class': _CustomFormatter} - parser = 
argparse.ArgumentParser(description=descr, epilog=examples, **kw) - kw2 = {'required': True} if PY3 else {} - sparsers = parser.add_subparsers(dest="command", help="command to be executed", **kw2) - parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") - parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") - parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", - help="strip newlines from input (default: False)") - encode = sparsers.add_parser("encode", help="encode input using the specified codecs", **kw) - encode.add_argument("encoding", nargs="+", help="list of encodings to apply") - encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], - help="error handling (default: strict)") - decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw) - decode.add_argument("encoding", nargs="+", help="list of encodings to apply") - decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], - help="error handling (default: strict)") - guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw) - guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)") - guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely not used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - guess.add_argument("-E", "--extended", action="store_true", - help="while using the scoring heuristic, also consider null scores (default: False)") - lng = "lang_%s" % LANG - def_func = lng if getattr(stopfunc, lng, None) else "text" - guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function " - "(default: 
%s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-" - "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression" - % def_func) - guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" - " the search but may be more accurate (default: False)") - guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true", - help="while using the regex stop function, set it as case-insensitive (default: False)") - if len(stopfunc.LANG_BACKENDS) > 0: - _lb = stopfunc.LANG_BACKEND - guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], - help="natural language detection backend (default: %s)" % _lb) - guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT", - help="minimum codec search depth before triggering results (default: 0)") - guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT", - help="maximum codec search depth (default: 5)") - guess.add_argument("-s", "--do-not-stop", action="store_true", - help="do not stop if a valid output is found (default: False)") - guess.add_argument("-v", "--verbose", action="store_true", - help="show guessing information and steps (default: False)") - rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) - rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely not used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - rank.add_argument("-E", "--extended", action="store_true", - help="while using the scoring heuristic, 
also consider null scores (default: False)") - rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") - search = sparsers.add_parser("search", help="search for codecs") - search.add_argument("pattern", nargs="+", help="encoding pattern to search") - listi = sparsers.add_parser("list", help="list items") - lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", **kw2) - liste = lsparsers.add_parser("encodings", help="list encodings") - liste.add_argument("category", nargs="+", help="selected categories") - listm = lsparsers.add_parser("macros", help="list macros") - addm = sparsers.add_parser("add-macro", help="add a macro to the registry") - addm.add_argument("name", help="macro's name") - addm.add_argument("encoding", nargs="+", help="list of encodings to chain") - remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") - remm.add_argument("name", help="macro's name") - args = parser.parse_args() - if args.command in ["guess", "rank"]: - args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) - try: - # if a search pattern is given, only handle it - if args.command == "search": - results = [] - for enc in args.pattern: - results.extend(codecs.search(enc)) - print(", ".join(results) or "No encoding found") - return 0 - # add/remove macros (not requiring to input a file or text) - elif args.command == "add-macro": - add_macro(args.name, *args.encoding) - return 0 - elif args.command == "remove-macro": - remove_macro(args.name) - return 0 - # list encodings or macros - elif args.command == "list": - if args.type == "encodings": - cats = args.category or list_categories() - for c in sorted(cats): - l = 
list_encodings(c) - if len(l) > 0: - if len(cats) > 0: - print(c.upper() + ":") - __print_tabular(l) - elif args.type == "macros": - l = list_macros() - if len(l) > 0: - __print_tabular(l) - return 0 - # handle input file or stdin - c =_input(args.infile) - c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") - # strip any other (CR)LF - if args.strip: - c = re.sub(r"\r?\n", "", c) if isinstance(c, str) else c.replace(b"\r\n", b"").replace(b"\n", b"") - if args.command in ["decode", "encode"]: - # encode or decode - for encoding in args.encoding: - c = getattr(codecs, ["encode", "decode"][args.command == "decode"])(c, encoding, args.errors) - # handle output file or stdout - if args.outfile: - with open(args.outfile, 'wb') as f: - f.write(c) - else: - print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") - elif args.command == "guess": - s, lb = args.stop_function, args.lang_backend - if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ - all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): - stopfunc._reload_lang(lb) - r = codecs.guess(c, - getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, - args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show - not args.no_heuristic, args.extended, args.verbose) - for i, o in enumerate(r.items()): - e, out = o - if len(e) > 0: - if args.outfile: - n, ext = os.path.splitext(args.outfile) - fn = args.outfile if len(r) == 1 else "%s-%d%s" % (n, i+1, ext) - else: - print("Codecs: %s" % ", ".join(e)) - print(ensure_str(out)) - if len(r) == 0: - print("Could not decode :-(") - elif args.command == "rank": - for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude): - s = "[+] %.5f: %s" % (i[0], e) - print(s if len(s) <= 80 else s[:77] + "...") - except Exception as e: - raise e - m = str(e) - print("codext: " + m[0].lower() + m[1:]) - +# -*- coding: UTF-8 -*- +"""Codecs extension 
module. + +""" +from .__common__ import * +from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__ + + +__all__ = ["add", "add_map", "clear", "decode", "encode", "guess", "lookup", "open", "rank", "register", "remove", + "reset"] + +decode = codecs.decode +encode = codecs.encode +guess = codecs.guess +lookup = codecs.lookup +open = codecs.open + +_lst = list +list = list_encodings # not included in __all__ because of shadow name + + +reset() + + +# populate codext with attributes from codecs that were not modified +for attr in codecs.__all__: + if attr in __all__: + continue + locals()[attr] = getattr(codecs, attr) + __all__.append(attr) + + +def __format_list(items, include=True): + if items is None: + return + d = {-1: list_encodings() if include else []} + for n, i in enumerate(items): + try: + depth, i = i.split(":") + depth = int(depth.strip().replace("~", "-")) + if depth < 0: + depth = -1 + except ValueError: + if n == 0: + d[-1] = [] + depth = -1 + d.setdefault(depth, []) + d[depth].append(i.strip()) + return d + + +def __print_tabular(lst, space=4): + try: + cols, _ = os.get_terminal_size() + # first, convert the list to a table that fits into the terminal + i, line, w = 0, "", [] + while i < len(lst): + x = lst[i] + l = len(x) + col = "%-{}s".format(l + space) % x + i += 1 + w.append(l) + if len(line) + len(col) > cols: + break + line += col + while True: + t = [lst[j:j+i] for j in range(0, len(lst), i)] + w = [max(0 if j+k*i >= len(lst) else len(lst[j+k*i]) for k in range(len(t))) for j, _ in enumerate(w)] + if sum(w) + space * len(w) >= cols: + i -= 1 + w.pop() + else: + break + print("\n".join("".join("%-{}s".format(w[n] + space) % x for n, x in enumerate(r)) for r in t) + "\n") + except (AttributeError, OSError): + print(", ".join(lst) + "\n") + + +def main(): + import argparse, os + + class _CustomFormatter(argparse.RawTextHelpFormatter): + def __init__(self, prog, **kwargs): + kwargs['max_help_position'] = 32 
+ super(_CustomFormatter, self).__init__(prog, **kwargs) + + def _format_action_invocation(self, action): + if not action.option_strings: + metavar, = self._metavar_formatter(action, action.dest)(1) + return metavar + else: + return ", ".join(action.option_strings) + + descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \ + "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \ + .format(__version__, __author__, __email__, __copyright__, __license__, __source__) + examples = "usage examples:\n- " + "\n- ".join([ + "codext search bitcoin", + "codext decode base32 -i file.b32", + "codext encode morse < to_be_encoded.txt", + "echo \"test\" | codext encode base100", + "echo -en \"test\" | codext encode braille -o test.braille", + "codext encode base64 < to_be_encoded.txt > text.b64", + "echo -en \"test\" | codext encode base64 | codext encode base32", + "echo -en \"mrdvm6teie6t2cq=\" | codext encode upper | codext decode base32 | codext decode base64", + "echo -en \"test\" | codext encode upper reverse base32 | codext decode base32 reverse lower", + "echo -en \"test\" | codext encode upper reverse base32 base64 morse", + "echo -en \"test\" | codext encode base64 gzip | codext guess", + "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", + ]) + kw = {'formatter_class': _CustomFormatter} + parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw) + sparsers = parser.add_subparsers(dest="command", help="command to be executed", required=True) + parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") + parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") + parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", + help="strip newlines from input (default: False)") + encode = sparsers.add_parser("encode", 
help="encode input using the specified codecs", **kw) + encode.add_argument("encoding", nargs="+", help="list of encodings to apply") + encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], + help="error handling (default: strict)") + decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw) + decode.add_argument("encoding", nargs="+", help="list of encodings to apply") + decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], + help="error handling (default: strict)") + guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw) + guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)") + guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + guess.add_argument("-E", "--extended", action="store_true", + help="while using the scoring heuristic, also consider null scores (default: False)") + lng = "lang_%s" % LANG + def_func = lng if getattr(stopfunc, lng, None) else "text" + guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function " + "(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-" + "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression" + % def_func) + guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" + " the search but may be more accurate (default: False)") + guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR 
depth:[category|codec|encoding]") + guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true", + help="while using the regex stop function, set it as case-insensitive (default: False)") + if len(stopfunc.LANG_BACKENDS) > 0: + _lb = stopfunc.LANG_BACKEND + guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], + help="natural language detection backend (default: %s)" % _lb) + guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT", + help="minimum codec search depth before triggering results (default: 0)") + guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT", + help="maximum codec search depth (default: 5)") + guess.add_argument("-s", "--do-not-stop", action="store_true", + help="do not stop if a valid output is found (default: False)") + guess.add_argument("-v", "--verbose", action="store_true", + help="show guessing information and steps (default: False)") + rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) + rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + rank.add_argument("-E", "--extended", action="store_true", + help="while using the scoring heuristic, also consider null scores (default: False)") + rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") + search = sparsers.add_parser("search", help="search for codecs") + search.add_argument("pattern", nargs="+", help="encoding pattern to search") + listi = sparsers.add_parser("list", 
help="list items") + lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", required=True) + liste = lsparsers.add_parser("encodings", help="list encodings") + liste.add_argument("category", nargs="*", help="selected categories") + listm = lsparsers.add_parser("macros", help="list macros") + addm = sparsers.add_parser("add-macro", help="add a macro to the registry") + addm.add_argument("name", help="macro's name") + addm.add_argument("encoding", nargs="+", help="list of encodings to chain") + remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") + remm.add_argument("name", help="macro's name") + args = parser.parse_args() + if args.command in ["guess", "rank"]: + args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) + try: + # if a search pattern is given, only handle it + if args.command == "search": + results = [] + for enc in args.pattern: + results.extend(codecs.search(enc)) + print(", ".join(results) or "No encoding found") + return 0 + # add/remove macros (not requiring to input a file or text) + elif args.command == "add-macro": + add_macro(args.name, *args.encoding) + return 0 + elif args.command == "remove-macro": + remove_macro(args.name) + return 0 + # list encodings or macros + elif args.command == "list": + if args.type == "encodings": + if args.category: + for c in sorted(args.category): + if len(l := list_encodings(c)) > 0: + print(c.upper() + ":") + __print_tabular(l) + else: + __print_tabular(list_encodings()) + elif args.type == "macros": + l = list_macros() + if len(l) > 0: + __print_tabular(l) + return 0 + # handle input file or stdin + c =_input(args.infile) + c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") + # strip any other (CR)LF + if args.strip: + c = re.sub(r"\r?\n", "", c) if isinstance(c, str) else c.replace(b"\r\n", b"").replace(b"\n", b"") + if args.command in ["decode", "encode"]: + # encode or decode + for encoding in 
args.encoding: + c = getattr(codecs, ["encode", "decode"][args.command == "decode"])(c, encoding, args.errors) + # handle output file or stdout + if args.outfile: + with open(args.outfile, 'wb') as f: + f.write(c) + else: + print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") + elif args.command == "guess": + s, lb = args.stop_function, getattr(args, "lang_backend", "none") + if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ + all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(lb) + r = codecs.guess(c, + getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, + args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show + not args.no_heuristic, args.extended, args.verbose) + for i, o in enumerate(r.items()): + e, out = o + if len(e) > 0: + if args.outfile: + n, ext = os.path.splitext(args.outfile) + fn = args.outfile if len(r) == 1 else "%s-%d%s" % (n, i+1, ext) + else: + print("Codecs: %s" % ", ".join(e)) + print(ensure_str(out)) + if len(r) == 0: + print("Could not decode :-(") + elif args.command == "rank": + for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude): + s = "[+] %.5f: %s" % (i[0], e) + print(s if len(s) <= 80 else s[:77] + "...") + except Exception as e: + raise e + m = str(e) + print("codext: " + m[0].lower() + m[1:]) + diff --git a/codext/base/__init__.py b/src/codext/base/__init__.py old mode 100755 new mode 100644 similarity index 97% rename from codext/base/__init__.py rename to src/codext/base/__init__.py index 8c0d220..79deab5 --- a/codext/base/__init__.py +++ b/src/codext/base/__init__.py @@ -1,64 +1,64 @@ -# -*- coding: UTF-8 -*- -from argparse import ArgumentParser, RawTextHelpFormatter -from types import MethodType - -from .base45 import * -from .base85 import * -from .base91 import * -from .base100 import * -from .base122 import * -from .baseN import * -from ..__common__ 
import * -from ..__info__ import __version__ - - -def main(): - descr = """Usage: unbase [OPTION]... [FILE] -Decode multi-layer base encoded FILE, or standard input, to standard output. - -With no FILE, or when FILE is -, read standard input. - -Optional arguments: - -E, --extended also consider generic base codecs while guess-decoding - -f, --stop-function set the result chceking function (default: text) - format: printables|text|flag|lang_[bigram] - -M, --max-depth maximum codec search depth (default: 5) - -m, --min-depth minimum codec search depth (default: 0) - -p, --pattern pattern to be matched while searching - -s, --show show the decoding chain - - --help display this help and exit - --verbose show guessing information and steps - --version output version information and exit - -Report unbase bugs to -Full documentation at: -""" - parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) - parser.format_help = MethodType(lambda s: s.description, parser) - group = parser.add_mutually_exclusive_group() - parser.add_argument("file", nargs="?") - parser.add_argument("-E", "--extended", action="store_true") - group.add_argument("-f", "--stop-function", default="text") - parser.add_argument("-M", "--max-depth", type=int, default=10) - parser.add_argument("-m", "--min-depth", type=int, default=0) - group.add_argument("-p", "--pattern") - parser.add_argument("-s", "--show", action="store_true") - parser.add_argument("--help", action="help") - parser.add_argument("--version", action="version") - parser.add_argument("--verbose", action="store_true") - parser.version = "CodExt " + __version__ - args = parser.parse_args() - c, e = _input(args.file), [["base%d-generic" % i for i in range(2, 256)], []][args.extended] - c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") - r = codecs.guess(c, stopfunc._validate(args.stop_function), 0, args.max_depth, "base", tuple(e), stop=False, - show=args.verbose, debug=args.verbose) 
- if len(r) == 0: - print("Could not decode :-(") - return 0 - ans = max(r.items(), key=lambda x: len(x[0])) - if args.show: - print(" - ".join(ans[0])) - print(ensure_str(ans[1])) - return 0 - +# -*- coding: UTF-8 -*- +from argparse import ArgumentParser, RawTextHelpFormatter +from types import MethodType + +from .base45 import * +from .base85 import * +from .base91 import * +from .base100 import * +from .base122 import * +from .baseN import * +from ..__common__ import * +from ..__info__ import __version__ + + +def main(): + descr = """Usage: unbase [OPTION]... [FILE] +Decode multi-layer base encoded FILE, or standard input, to standard output. + +With no FILE, or when FILE is -, read standard input. + +Optional arguments: + -E, --extended also consider generic base codecs while guess-decoding + -f, --stop-function set the result chceking function (default: text) + format: printables|text|flag|lang_[bigram] + -M, --max-depth maximum codec search depth (default: 5) + -m, --min-depth minimum codec search depth (default: 0) + -p, --pattern pattern to be matched while searching + -s, --show show the decoding chain + + --help display this help and exit + --verbose show guessing information and steps + --version output version information and exit + +Report unbase bugs to +Full documentation at: +""" + parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) + parser.format_help = MethodType(lambda s: s.description, parser) + group = parser.add_mutually_exclusive_group() + parser.add_argument("file", nargs="?") + parser.add_argument("-E", "--extended", action="store_true") + group.add_argument("-f", "--stop-function", default="text") + parser.add_argument("-M", "--max-depth", type=int, default=10) + parser.add_argument("-m", "--min-depth", type=int, default=0) + group.add_argument("-p", "--pattern") + parser.add_argument("-s", "--show", action="store_true") + parser.add_argument("--help", action="help") + 
parser.add_argument("--version", action="version") + parser.add_argument("--verbose", action="store_true") + parser.version = "CodExt " + __version__ + args = parser.parse_args() + c, e = _input(args.file), [["base%d-generic" % i for i in range(2, 256)], []][args.extended] + c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") + r = codecs.guess(c, stopfunc._validate(args.stop_function), 0, args.max_depth, "base", tuple(e), stop=False, + show=args.verbose, debug=args.verbose) + if len(r) == 0: + print("Could not decode :-(") + return 0 + ans = max(r.items(), key=lambda x: len(x[0])) + if args.show: + print(" - ".join(ans[0])) + print(ensure_str(ans[1])) + return 0 + diff --git a/codext/base/_base.py b/src/codext/base/_base.py old mode 100755 new mode 100644 similarity index 95% rename from codext/base/_base.py rename to src/codext/base/_base.py index fce8b9a..f41df0b --- a/codext/base/_base.py +++ b/src/codext/base/_base.py @@ -1,291 +1,294 @@ -# -*- coding: UTF-8 -*- -"""Generic baseN functions. 
- -""" -from argparse import ArgumentParser, RawTextHelpFormatter -from math import log -from six import integer_types, string_types -from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable -from textwrap import wrap as wraptext -from types import FunctionType, MethodType - -from ..__common__ import * -from ..__common__ import _set_exc -from ..__info__ import __version__ - - -_set_exc("BaseError") -_set_exc("BaseEncodeError") -_set_exc("BaseDecodeError") -""" -Curve fitting: - ->>> import matplotlib.pyplot as plt ->>> import pandas as pd ->>> import scipy.optimize ->>> from statistics import mean ->>> from tinyscript import random ->>> x, y = [], [] ->>> for i in range(2, 256): - v = [] - for j in range(16, 2048, 16): - s = random.randstr(j) - v.append(float(len(codext.encode(s, "base%d-generic" % i))) / len(s)) - x.append(i) - y.append(mean(v)) ->>> data = pd.DataFrame({'base': x, 'expf': y}) ->>> def fit(x, y, func, params): - params, cv = scipy.optimize.curve_fit(func, x, y, params) - print(params) - y2 = func(x, *params) - plt.clf() - plt.plot(x, y, ".", color="blue", alpha=.3) - plt.plot(x, y2, color="red", linewidth=3.0) - plt.show() ->>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (1, 1, 1, 1)) -[ 0.02841434 0.00512664 -0.99999984 0.01543879] ->>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (.028, .005, -1, .015)) -[ 0.02827357 0.00510124 -0.99999984 0.01536941] -""" -EXPANSION_FACTOR = lambda base: 0.02827357 / (base**0.00510124-0.99999984) + 0.01536941 -SIZE_LIMIT = 1024 * 1024 * 1024 - - -def _generate_charset(n): - """ Generate a characters set. - - :param n: size of charset - """ - if 1 < n <= len(printable): - return printable[:n] - elif len(printable) < n < 256: - return "".join(chr(i) for i in range(n)) - raise ValueError("Bad size of character set") - - -def _get_charset(charset, p=""): - """ Characters set selection function. 
It allows to define charsets in many different ways. - - :param charset: charset object, can be a string (the charset itself), a function (that chooses the right charset - depending on the input parameter) or a dictionary (either by exact key or by pattern matching) - :param p: the parameter for choosing the charset - """ - # case 1: charset is a function, so return its result - if isinstance(charset, FunctionType): - return charset(p) - # case 2: charset is a string, so return it - elif isinstance(charset, string_types): - return charset - # case 3: charset is a dict with keys '' and 'inv', typically for a charset using lowercase and uppercase characters - # that can be inverted - elif isinstance(charset, dict) and list(charset.keys()) == ["", "inv"]: - return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""] - # case 4: charset is a dict, but not with the specific keys '' and 'inv', so consider it as pattern-charset pairs - elif isinstance(charset, dict): - # try to handle [p]arameter as a simple key - try: - return charset[p] - except KeyError: - pass - # or handle [p]arameter as a pattern - default, n, best = None, None, None - for pattern, cset in charset.items(): - n = len(cset) - if re.match(pattern, ""): - default = cset - continue - m = re.match(pattern, p) - if m: # find the longest match from the patterns - s, e = m.span() - if e - s > len(best or ""): - best = pattern - if best: - return charset[best] - # special case: the given [p]arameter can be the charset itself if it has the right length - p = re.sub(r"^[-_]+", "", p) - if len(p) == n: - return p - # or simply rely on key '' - if default is not None: - return default - raise ValueError("Bad charset descriptor ('%s')" % p) - - -# generic base en/decoding functions -def base_encode(input, charset, errors="strict", exc=BaseEncodeError): - """ Base-10 to base-N encoding. 
- - :param input: input (str or int) to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - :param exc: exception to be raised in case of error - """ - i, n, r = input if isinstance(input, integer_types) else s2i(input), len(charset), "" - if n == 1: - if i > SIZE_LIMIT: - raise InputSizeLimitError("Input exceeded size limit") - return i * charset[0] - if n == 10: - return str(i) if charset == digits else "".join(charset[int(x)] for x in str(i)) - while i > 0: - i, c = divmod(i, n) - r = charset[c] + r - return r - - -def base_decode(input, charset, errors="strict", exc=BaseDecodeError): - """ Base-N to base-10 decoding. - - :param input: input to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - :param exc: exception to be raised in case of error - """ - i, n, dec = 0, len(charset), lambda n: base_encode(n, [chr(x) for x in range(256)], errors, exc) - if n == 1: - return i2s(len(input)) - if n == 10: - return i2s(int(input)) if charset == digits else "".join(str(charset.index(c)) for c in input) - for k, c in enumerate(input): - try: - i = i * n + charset.index(c) - except ValueError: - handle_error("base", errors, exc, decode=True)(c, k, dec(i), "base%d" % n) - return dec(i) - - -# base codec factory functions -def base(charset, pattern, pow2=False, encode_template=base_encode, decode_template=base_decode, name=None, **kwargs): - """ Base-N codec factory. 
- - :param charset: charset selection function - :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting - the charset) - :param pow2: whether the base codec's N is a power of 2 - """ - cs = _get_charset(charset) - n = len(cs) - nb = log(n, 2) - if pow2 and nb != int(nb): - raise BaseError("Bad charset ; {} is not a power of 2".format(n)) - - def encode(param="", *args): - a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) - def _encode(input, errors="strict"): - if len(input) == 0: - return "", 0 - return encode_template(input, a, errors), len(input) - return _encode - - def decode(param="", *args): - a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) - sl, sc = "\n" not in a, "\n" not in a and not "\r" in a - def _decode(input, errors="strict"): - if len(input) == 0: - return "", 0 - input = _stripl(input, sc, sl) - return decode_template(input, a, errors), len(input) - return _decode - - kwargs['len_charset'] = n - kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs) - kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05)) - n = "base{}".format(n) if name is None else name - try: - g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n] - except AttributeError: - g = [n] - kwargs['guess'] = kwargs.get('guess', g) - add(n, encode, decode, pattern, entropy=nb, **kwargs) - - -def base_generic(): - """ Base-N generic codec. 
""" - def encode(n): - a = _generate_charset(int(n)) - def _encode(input, errors="strict"): - return base_encode(input, a, errors), len(input) - return _encode - - def decode(n): - a = _generate_charset(int(n)) - sl, sc = "\n" not in a, "\n" not in a and not "\r" in a - def _decode(input, errors="strict"): - input = _stripl(input, sc, sl) - return base_decode(input, a, errors), len(input) - return _decode - - add("base", encode, decode, r"^base[-_]?([2-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:[-_]generic)?$", - guess=["base%d-generic" % i for i in range(2, 255)], entropy=lambda e, n: log(int(n.split("-")[0][4:]), 2), - len_charset=lambda n: int(n.split("-")[0][4:]), printables_rate=1., category="base-generic", penalty=.4, - expansion_factor=lambda f, n: (EXPANSION_FACTOR(int(n.split("-")[0][4:])), .05)) - - -def main(n, ref=None, alt=None, inv=True, swap=True, wrap=True): - base = str(n) + ("-" + alt.lstrip("-") if alt else "") - src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \ - {'base': base, 'reference': "\n" + ref if len(ref) > 20 else ref} if ref else "" - text = "%(source)sWhen decoding, the input may contain newlines in addition to the bytes of the formal base" \ - "%(base)s alphabet. Use --ignore-garbage to attempt to recover from any other non-alphabet bytes in the" \ - " encoded stream." % {'base': base, 'source': src} - text = "\n".join(x for x in wraptext(text, 74)) - descr = """Usage: base%(base)s [OPTION]... [FILE] -Base%(base)s encode or decode FILE, or standard input, to standard output. - -With no FILE, or when FILE is -, read standard input. - -Mandatory arguments to long options are mandatory for short options too. 
- -d, --decode decode data - -i, --ignore-garbage when decoding, ignore non-alphabet characters -%(inv)s%(swap)s%(wrap)s - - --help display this help and exit - --version output version information and exit - -%(text)s - -Report base%(base)s translation bugs to -Full documentation at: -""" % {'base': base, 'text': text, - 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. digits and letters)\n"][inv], - 'swap': ["", " -s, --swapcase swap the case\n"][swap], - 'wrap': ["", " -w, --wrap=COLS wrap encoded lines after COLS character (default 76).\n"+ 26 * " " + \ - "Use 0 to disable line wrapping"][wrap]} - - def _main(): - p = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) - p.format_help = MethodType(lambda s: s.description, p) - p.add_argument("file", nargs="?") - p.add_argument("-d", "--decode", action="store_true") - p.add_argument("-i", "--ignore-garbage", action="store_true") - if inv: - p.add_argument("-I", "--invert", action="store_true") - if swap: - p.add_argument("-s", "--swapcase", action="store_true") - if wrap: - p.add_argument("-w", "--wrap", type=int, default=76) - p.add_argument("--help", action="help") - p.add_argument("--version", action="version") - p.version = "CodExt " + __version__ - args = p.parse_args() - if args.decode: - args.wrap = 0 - args.invert = getattr(args, "invert", False) - c, f = _input(args.file), [encode, decode][args.decode] - if swap and args.swapcase and args.decode: - c = codecs.decode(c, "swapcase") - c = b(c).rstrip(b"\r\n") - try: - c = f(c, "base" + base + ["", "-inv"][getattr(args, "invert", False)], - ["strict", "ignore"][args.ignore_garbage]) - except Exception as err: - print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) - return 1 - c = ensure_str(c) - if swap and args.swapcase and not args.decode: - c = codecs.encode(c, "swapcase") - for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap else c.split("\n"): - print(l) - 
return 0 - return _main - +# -*- coding: UTF-8 -*- +"""Generic baseN functions. + +""" +from argparse import ArgumentParser, RawTextHelpFormatter +from math import log +from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable +from sys import stdout +from textwrap import wrap as wraptext +from types import FunctionType, MethodType + +from ..__common__ import * +from ..__common__ import _set_exc +from ..__info__ import __version__ + + +_set_exc("BaseError") +_set_exc("BaseEncodeError") +_set_exc("BaseDecodeError") +""" +Curve fitting: + +>>> import matplotlib.pyplot as plt +>>> import pandas as pd +>>> import scipy.optimize +>>> from statistics import mean +>>> from tinyscript import random +>>> x, y = [], [] +>>> for i in range(2, 256): + v = [] + for j in range(16, 2048, 16): + s = random.randstr(j) + v.append(float(len(codext.encode(s, "base%d-generic" % i))) / len(s)) + x.append(i) + y.append(mean(v)) +>>> data = pd.DataFrame({'base': x, 'expf': y}) +>>> def fit(x, y, func, params): + params, cv = scipy.optimize.curve_fit(func, x, y, params) + print(params) + y2 = func(x, *params) + plt.clf() + plt.plot(x, y, ".", color="blue", alpha=.3) + plt.plot(x, y2, color="red", linewidth=3.0) + plt.show() +>>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (1, 1, 1, 1)) +[ 0.02841434 0.00512664 -0.99999984 0.01543879] +>>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (.028, .005, -1, .015)) +[ 0.02827357 0.00510124 -0.99999984 0.01536941] +""" +EXPANSION_FACTOR = lambda base: 0.02827357 / (base**0.00510124-0.99999984) + 0.01536941 +SIZE_LIMIT = 1024 * 1024 * 1024 + + +def _generate_charset(n): + """ Generate a characters set. 
+ + :param n: size of charset + """ + if 1 < n <= len(printable): + return printable[:n] + elif len(printable) < n < 256: + return "".join(chr(i) for i in range(n)) + raise ValueError("Bad size of character set") + + +def _get_charset(charset, p=""): + """ Characters set selection function. It allows to define charsets in many different ways. + + :param charset: charset object, can be a string (the charset itself), a function (that chooses the right charset + depending on the input parameter) or a dictionary (either by exact key or by pattern matching) + :param p: the parameter for choosing the charset + """ + # case 1: charset is a function, so return its result + if isinstance(charset, FunctionType): + return charset(p) + # case 2: charset is a string, so return it + elif isinstance(charset, str): + return charset + # case 3: charset is a dict with keys '' and 'inv', typically for a charset using lowercase and uppercase characters + # that can be inverted + elif isinstance(charset, dict) and list(charset.keys()) == ["", "inv"]: + return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""] + # case 4: charset is a dict, but not with the specific keys '' and 'inv', so consider it as pattern-charset pairs + elif isinstance(charset, dict): + # try to handle [p]arameter as a simple key + try: + return charset[p] + except KeyError: + pass + # or handle [p]arameter as a pattern + default, n, best = None, None, None + for pattern, cset in charset.items(): + n = len(cset) + if re.match(pattern, ""): + default = cset + continue + m = re.match(pattern, p) + if m: # find the longest match from the patterns + s, e = m.span() + if e - s > len(best or ""): + best = pattern + if best: + return charset[best] + # special case: the given [p]arameter can be the charset itself if it has the right length + p = re.sub(r"^[-_]+", "", p) + if len(p) == n: + return p + # or simply rely on key '' + if default is not None: + return default + raise ValueError("Bad charset descriptor 
('%s')" % p) + + +# generic base en/decoding functions +def base_encode(input, charset, errors="strict", exc=BaseEncodeError): + """ Base-10 to base-N encoding. + + :param input: input (str or int) to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + :param exc: exception to be raised in case of error + """ + i, n, r = input if isinstance(input, int) else s2i(input), len(charset), "" + if n == 1: + if i > SIZE_LIMIT: + raise InputSizeLimitError("Input exceeded size limit") + return i * charset[0] + if n == 10: + return str(i) if charset == digits else "".join(charset[int(x)] for x in str(i)) + while i > 0: + i, c = divmod(i, n) + r = charset[c] + r + return r + + +def base_decode(input, charset, errors="strict", exc=BaseDecodeError): + """ Base-N to base-10 decoding. + + :param input: input to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + :param exc: exception to be raised in case of error + """ + i, n, dec = 0, len(charset), lambda n: base_encode(n, [chr(x) for x in range(256)], errors, exc) + if n == 1: + return i2s(len(input)) + if n == 10: + return i2s(int(input)) if charset == digits else "".join(str(charset.index(c)) for c in input) + for k, c in enumerate(input): + try: + i = i * n + charset.index(c) + except ValueError: + handle_error("base", errors, exc, decode=True)(c, k, dec(i), "base%d" % n) + return dec(i) + + +# base codec factory functions +def base(charset, pattern, pow2=False, encode_template=base_encode, decode_template=base_decode, name=None, **kwargs): + """ Base-N codec factory. 
+ + :param charset: charset selection function + :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting + the charset) + :param pow2: whether the base codec's N is a power of 2 + """ + cs = _get_charset(charset) + n = len(cs) + nb = log(n, 2) + if pow2 and nb != int(nb): + raise BaseError("Bad charset ; {} is not a power of 2".format(n)) + + def encode(param="", *args): + a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) + def _encode(input, errors="strict"): + if len(input) == 0: + return "", 0 + return encode_template(input, a, errors), len(input) + return _encode + + def decode(param="", *args): + a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) + sl, sc = "\n" not in a, "\n" not in a and not "\r" in a + def _decode(input, errors="strict"): + if len(input) == 0: + return "", 0 + input = _stripl(input, sc, sl) + return decode_template(input, a, errors), len(input) + return _decode + + kwargs['len_charset'] = n + kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs) + kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05)) + n = "base{}".format(n) if name is None else name + try: + g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n] + except AttributeError: + g = [n] + kwargs['guess'] = kwargs.get('guess', g) + add(n, encode, decode, pattern, entropy=nb, **kwargs) + + +def base_generic(): + """ Base-N generic codec. 
""" + def encode(n): + a = _generate_charset(int(n)) + def _encode(input, errors="strict"): + return base_encode(input, a, errors), len(input) + return _encode + + def decode(n): + a = _generate_charset(int(n)) + sl, sc = "\n" not in a, "\n" not in a and not "\r" in a + def _decode(input, errors="strict"): + input = _stripl(input, sc, sl) + return base_decode(input, a, errors), len(input) + return _decode + + add("base", encode, decode, r"^base[-_]?([2-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:[-_]generic)?$", + guess=["base%d-generic" % i for i in range(2, 255)], entropy=lambda e, n: log(int(n.split("-")[0][4:]), 2), + len_charset=lambda n: int(n.split("-")[0][4:]), printables_rate=1., category="base-generic", penalty=.4, + expansion_factor=lambda f, n: (EXPANSION_FACTOR(int(n.split("-")[0][4:])), .05)) + + +def main(n, ref=None, alt=None, inv=True, swap=True, wrap=True): + base = str(n) + ("-" + alt.lstrip("-") if alt else "") + src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \ + {'base': base, 'reference': "\n" + ref if len(ref) > 20 else ref} if ref else "" + text = "%(source)sWhen decoding, the input may contain newlines in addition to the bytes of the formal base" \ + "%(base)s alphabet. Use --ignore-garbage to attempt to recover from any other non-alphabet bytes in the" \ + " encoded stream." % {'base': base, 'source': src} + text = "\n".join(x for x in wraptext(text, 74)) + descr = """Usage: base%(base)s [OPTION]... [FILE] +Base%(base)s encode or decode FILE, or standard input, to standard output. + +With no FILE, or when FILE is -, read standard input. + +Mandatory arguments to long options are mandatory for short options too. 
+ -d, --decode decode data + -i, --ignore-garbage when decoding, ignore non-alphabet characters +%(inv)s%(swap)s%(wrap)s + + --help display this help and exit + --version output version information and exit + +%(text)s + +Report base%(base)s translation bugs to +Full documentation at: +""" % {'base': base, 'text': text, + 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. digits and letters)\n"][inv], + 'swap': ["", " -s, --swapcase swap the case\n"][swap], + 'wrap': ["", " -w, --wrap=COLS wrap encoded lines after COLS character (default 76).\n"+ 26 * " " + \ + "Use 0 to disable line wrapping"][wrap]} + + def _main(): + p = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) + p.format_help = MethodType(lambda s: s.description, p) + p.add_argument("file", nargs="?") + p.add_argument("-d", "--decode", action="store_true") + p.add_argument("-i", "--ignore-garbage", action="store_true") + if inv: + p.add_argument("-I", "--invert", action="store_true") + if swap: + p.add_argument("-s", "--swapcase", action="store_true") + if wrap: + p.add_argument("-w", "--wrap", type=int, default=76) + p.add_argument("--help", action="help") + p.add_argument("--version", action="version") + p.version = "CodExt " + __version__ + args = p.parse_args() + if args.decode: + args.wrap = 0 + args.invert = getattr(args, "invert", False) + c, f = _input(args.file), [encode, decode][args.decode] + if swap and args.swapcase and args.decode: + c = codecs.decode(c, "swapcase") + c = b(c).rstrip(b"\r\n") + try: + c = f(c, "base" + base + ["", "-inv"][getattr(args, "invert", False)], + ["strict", "ignore"][args.ignore_garbage]) + except Exception as err: + print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) + return 1 + if args.decode: + stdout.buffer.write(c) + return 0 + c = ensure_str(c) + if swap and args.swapcase: + c = codecs.encode(c, "swapcase") + for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap 
else c.split("\n"): + print(l) + return 0 + return _main + diff --git a/codext/base/_base2n.py b/src/codext/base/_base2n.py old mode 100755 new mode 100644 similarity index 97% rename from codext/base/_base2n.py rename to src/codext/base/_base2n.py index d34072d..0e1f2d2 --- a/codext/base/_base2n.py +++ b/src/codext/base/_base2n.py @@ -1,112 +1,112 @@ -# -*- coding: UTF-8 -*- -"""BaseN functions with N a power of 2. - -""" -from math import ceil, log - -from ..__common__ import * -from ..__common__ import _set_exc -from ._base import base, _get_charset - - -_bin = lambda x: bin(x if isinstance(x, int) else ord(x)) - - -# base en/decoding functions for N a power of 2 -_set_exc("Base2NDecodeError") -_set_exc("Base2NEncodeError") - - -def base2n(charset, pattern=None, name=None, **kwargs): - """ Base-N codec factory for N a power of 2. - - :param charset: charset selection function - :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting - the charset) - :param name: forced encoding name (useful e.g. for zbase32) - """ - base(charset, pattern, True, base2n_encode, base2n_decode, name, **kwargs) - - -def base2n_encode(string, charset, errors="strict"): - """ 8-bits characters to base-N encoding for N a power of 2. 
- - :param string: string to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - """ - bs, r, n = "", "", len(charset) - # find the number of bits for the given character set and the quantum - nb_out = int(log(n, 2)) - q = nb_out - while q % 8 != 0: - q += nb_out - # iterate over the characters, gathering bits to be mapped to the charset - for i, c in enumerate(b(string)): - bs += "{:0>8}".format(_bin(c)[2:]) - while len(bs) >= nb_out: - r += charset[int(bs[:nb_out], 2)] - bs = bs[nb_out:] - if len(bs) > 0: - for i in range(0, len(bs), nb_out): - c = ("{:0<%d}" % nb_out).format(bs[i:i+nb_out]) - p = len(c) - len(bs[i:i+nb_out]) - r += charset[int(c, 2)] - l = len(r) * nb_out - while l % q != 0: - l += nb_out - return r + int(l / nb_out - len(r)) * "=" - - -def base2n_decode(string, charset, errors="strict"): - """ Base-N to 8-bits characters decoding for N a power of 2. - - :param string: string to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - """ - bs, r, n = "", "", len(charset) - # particular case: for hex, ensure the right case in the charset ; not that this way, if mixed cases are used, it - # will trigger an error (this is the expected behavior) - if n == 16: - if any(c in string for c in "abcdef"): - charset = charset.lower() - elif any(c in string for c in "ABCDEF"): - charset = charset.upper() - string = re.sub(r"\s", "", string) - # find the number of bits for the given character set and the number of padding characters - nb_in = int(log(n, 2)) - n_pad = len(string) - len(string.rstrip("=")) - # iterate over the characters, mapping them to the character set and converting the resulting bits to 8-bits chars - for i, c in enumerate(string): - if c == "=": - bs += "0" * nb_in - else: - try: - bs += ("{:0>%d}" % nb_in).format(_bin(charset.index(c))[2:]) - except ValueError: - if errors == "strict": - e = Base2NDecodeError("'base%d' codec can't decode character '%s' in 
position %d" % (n, c, i)) - e.__cause__ = e # block exceptions chaining - raise e - elif errors == "replace": - bs += "0" * nb_in - elif errors == "ignore": - continue - else: - raise ValueError("Unsupported error handling {}".format(errors)) - if len(bs) > 8: - r += chr(int(bs[:8], 2)) - bs = bs[8:] - # if the number of bits is not multiple of 8 bits, it could mean a bad padding - if len(bs) != 8: - if errors == "strict": - raise Base2NDecodeError("Incorrect padding") - elif errors in ["replace", "ignore"]: - pass - else: - raise ValueError("Unsupported error handling {}".format(errors)) - r += chr(int(bs, 2)) - np = int(ceil(n_pad * nb_in / 8.0)) - return r[:-np] if np > 0 else r - +# -*- coding: UTF-8 -*- +"""BaseN functions with N a power of 2. + +""" +from math import ceil, log + +from ..__common__ import * +from ..__common__ import _set_exc +from ._base import base, _get_charset + + +_bin = lambda x: bin(x if isinstance(x, int) else ord(x)) + + +# base en/decoding functions for N a power of 2 +_set_exc("Base2NDecodeError") +_set_exc("Base2NEncodeError") + + +def base2n(charset, pattern=None, name=None, **kwargs): + """ Base-N codec factory for N a power of 2. + + :param charset: charset selection function + :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting + the charset) + :param name: forced encoding name (useful e.g. for zbase32) + """ + base(charset, pattern, True, base2n_encode, base2n_decode, name, **kwargs) + + +def base2n_encode(string, charset, errors="strict"): + """ 8-bits characters to base-N encoding for N a power of 2. 
+ + :param string: string to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + """ + bs, r, n = "", "", len(charset) + # find the number of bits for the given character set and the quantum + nb_out = int(log(n, 2)) + q = nb_out + while q % 8 != 0: + q += nb_out + # iterate over the characters, gathering bits to be mapped to the charset + for i, c in enumerate(b(string)): + bs += "{:0>8}".format(_bin(c)[2:]) + while len(bs) >= nb_out: + r += charset[int(bs[:nb_out], 2)] + bs = bs[nb_out:] + if len(bs) > 0: + for i in range(0, len(bs), nb_out): + c = ("{:0<%d}" % nb_out).format(bs[i:i+nb_out]) + p = len(c) - len(bs[i:i+nb_out]) + r += charset[int(c, 2)] + l = len(r) * nb_out + while l % q != 0: + l += nb_out + return r + int(l / nb_out - len(r)) * "=" + + +def base2n_decode(string, charset, errors="strict"): + """ Base-N to 8-bits characters decoding for N a power of 2. + + :param string: string to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + """ + bs, r, n = "", "", len(charset) + # particular case: for hex, ensure the right case in the charset ; not that this way, if mixed cases are used, it + # will trigger an error (this is the expected behavior) + if n == 16: + if any(c in string for c in "abcdef"): + charset = charset.lower() + elif any(c in string for c in "ABCDEF"): + charset = charset.upper() + string = re.sub(r"\s", "", string) + # find the number of bits for the given character set and the number of padding characters + nb_in = int(log(n, 2)) + n_pad = len(string) - len(string.rstrip("=")) + # iterate over the characters, mapping them to the character set and converting the resulting bits to 8-bits chars + for i, c in enumerate(string): + if c == "=": + bs += "0" * nb_in + else: + try: + bs += ("{:0>%d}" % nb_in).format(_bin(charset.index(c))[2:]) + except ValueError: + if errors == "strict": + e = Base2NDecodeError("'base%d' codec can't decode character '%s' in 
position %d" % (n, c, i)) + e.__cause__ = e # block exceptions chaining + raise e + elif errors == "replace": + bs += "0" * nb_in + elif errors == "ignore": + continue + else: + raise ValueError("Unsupported error handling {}".format(errors)) + if len(bs) > 8: + r += chr(int(bs[:8], 2)) + bs = bs[8:] + # if the number of bits is not multiple of 8 bits, it could mean a bad padding + if len(bs) != 8: + if errors == "strict": + raise Base2NDecodeError("Incorrect padding") + elif errors in ["replace", "ignore"]: + pass + else: + raise ValueError("Unsupported error handling {}".format(errors)) + r += chr(int(bs, 2)) + np = int(ceil(n_pad * nb_in / 8.0)) + return r[:-np] if np > 0 else r + diff --git a/src/codext/base/base100.py b/src/codext/base/base100.py new file mode 100644 index 0000000..2287463 --- /dev/null +++ b/src/codext/base/base100.py @@ -0,0 +1,47 @@ +# -*- coding: UTF-8 -*- +"""Base100 Codec - base100 content encoding. + +Note: only works in Python3 ; strongly inspired from https://github.com/MasterGroosha/pybase100 + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import main +from ..__common__ import * + +# no __examples__ ; handled manually in tests/test_base.py + +class Base100DecodeError(ValueError): + __module__ = "builtins" + + +def base100_encode(input, errors="strict"): + input = b(input) + r = [240, 159, 0, 0] * len(input) + for i, c in enumerate(input): + r[4*i+2] = (c + 55) // 64 + 143 + r[4*i+3] = (c + 55) % 64 + 128 + return bytes(r), len(input) + + +def base100_decode(input, errors="strict"): + input = b(_stripl(input, True, True)) + if errors == "ignore": + input = input.replace(b"\n", b"") + if len(input) % 4 != 0: + raise Base100DecodeError("Bad input (length should be multiple of 4)") + r = [None] * (len(input) // 4) + for i, c in enumerate(input): + if i % 4 == 2: + tmp = ((c - 143) * 64) 
% 256 + elif i % 4 == 3: + r[i//4] = (c - 128 + tmp - 55) & 0xff + return bytes(r), len(input) + + +add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$", expansion_factor=1.) +main100 = main(100, "") + diff --git a/src/codext/base/base122.py b/src/codext/base/base122.py new file mode 100644 index 0000000..b326341 --- /dev/null +++ b/src/codext/base/base122.py @@ -0,0 +1,98 @@ +# -*- coding: UTF-8 -*- +"""Base122 Codec - base122 content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import main +from ..__common__ import * + + +__examples__ = { + 'enc(base122|base-122)': { + 'this is a test': ":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", + b'This is another longer test string with d1g1t5 and sp3c141 characters !\n': \ + b"*\x1a\xca\x97\x19\x01Rs\x10\x18-f{QPe9\x08\xcb\x86{9Ne9\x08\x0eF+Mh 9]\x0e\xd3\x8b" + b"9N ;Z.FA\x01H13L.C)\x01Bn2\x08\x0e7\x01MF1\x1a\x0c$\x06\x1b!Br0XnF+If \x10B@" + }, + 'enc-dec(base_122)': ["@random"], +} + + +_BAD = [0, 10, 13, 34, 38, 92] +_i = lambda c: c if isinstance(c, int) else ord(c) + + +# inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js +def base122_encode(input, errors="strict"): + idx, bit, r, l = 0, 0, [], len(input) + + def _get_7bits(idx, bit): + if idx >= l: + return idx, bit, False + B1 = _i(input[idx]) + p1 = (((254 >> bit) & B1) << bit) >> 1 + bit += 7 + if bit < 8: + return idx, bit, p1 + bit -= 8 + idx += 1 + if idx >= l: + return idx, bit, p1 + B2 = _i(input[idx]) + p2 = (((65280 >> bit) & B2) & 255) >> (8 - bit) + return idx, bit, (p1 | p2) + + while True: + if idx >= l: + break + # get seven bits of input data + idx, bit, B = _get_7bits(idx, bit) + # check for illegal chars + try: + bad_idx = _BAD.index(B) + except ValueError: + r.append(B) + continue + idx, bit, nB = _get_7bits(idx, bit) + if nB is False: + nB, 
bad_idx = B, 7 + B1, B2 = 194, 128 + B1 |= (7 & bad_idx) << 2 + B1 |= int((nB & 64) > 0) + B2 |= nB & 63 + r.extend([B1, B2]) + return "".join(map(chr, r)).encode("latin-1"), len(input) + + +# inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js +def base122_decode(input, errors="strict"): + currB, bob, r, input = 0, 0, [], list(map(ord, input)) + + def _get_7bits(currB, bob, B, decoded): + B <<= 1 + currB |= (B % 0x100000000) >> bob + bob += 7 + if bob >= 8: + decoded += [currB] + bob -= 8 + return (B << (7 - bob)) & 255, bob + + for i in range(len(input)): + if input[i] >= 128: + try: + currB, bob = _get_7bits(currB, bob, _BAD[(input[i] >> 8) & 7], r) + except IndexError: + pass + currB, bob = _get_7bits(currB, bob, input[i] & 127, r) + else: + currB, bob = _get_7bits(currB, bob, input[i], r) + return "".join(map(chr, r)).rstrip("\0"), len(input) + + +add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) +main122 = main(122, "", wrap=False) + diff --git a/codext/base/base45.py b/src/codext/base/base45.py old mode 100755 new mode 100644 similarity index 96% rename from codext/base/base45.py rename to src/codext/base/base45.py index 6f15150..272c3e9 --- a/codext/base/base45.py +++ b/src/codext/base/base45.py @@ -1,84 +1,84 @@ -# -*- coding: UTF-8 -*- -"""Base45 Codec - base45 content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import _get_charset, digits, lower, main, upper -from ..__common__ import * - - -__examples__ = { - 'enc(base45|base-45|base_45)': {'this is a test!': "AWE+EDH44.OEOCC7WE QEX0"}, - 'enc(base45-inv|base_45_inv)': {'this is a test!': "K6O+ONREE.YOYMMH6O 0O7A"}, - 'dec(base45)': {'BAD STRING\00': None, 'AWE+EDH44.OEOCC7WE QEX000': None}, -} -__guess__ = ["base45", "base45-inv"] - - -B45 = { - '': digits + upper + " $%*+-./:", - '[-_]inv(?:erted)?$': upper + digits + " $%*+-./:", -} - - -__chr = lambda c: chr(c >> 8) + chr(c & 0xff) if isinstance(c, int) and 256 <= c <= 65535 else \ - chr(c) if isinstance(c, int) else c -__ord = lambda c: ord(c) if not isinstance(c, int) else c - - -def base45_encode(mode): - b45 = _get_charset(B45, mode) - def encode(text, errors="strict"): - t, s = b(text), "" - for i in range(0, len(text), 2): - n = 256 * __ord(t[i]) - try: - n += __ord(t[i+1]) - except IndexError: - n = __ord(t[i]) - s += b45[n % 45] + b45[n // 45] - break - m = n // 45**2 - n -= m * 45**2 - s += b45[n % 45] + b45[n // 45] + b45[m] - return s, len(text) - return encode - - -def base45_decode(mode): - b45 = {c: i for i, c in enumerate(_get_charset(B45, mode))} - def decode(text, errors="strict"): - t, s = b(text), "" - ehandler = handle_error("base45", errors, decode=True) - for i in range(0, len(text), 3): - try: - n = b45[__chr(t[i])] - except KeyError: - ehandler(__chr(t[i]), i, s) - try: - j = i + 1 - n += 45 * b45[__chr(t[j])] - except KeyError: - ehandler(__chr(t[j]), j, s) - except IndexError: - ehandler(__chr(t[i]), i, s) - try: - k = i + 2 - n += 45 ** 2 * b45[__chr(t[k])] - except KeyError: - ehandler(__chr(t[k]), k, s) - except IndexError: - s += __chr(n) - continue - s += __chr(n // 256) + __chr(n % 256) - return s, len(text) - return decode - - 
-add("base45", base45_encode, base45_decode, r"^base[-_]?45(|[-_]inv(?:erted)?)$", expansion_factor=1.5) -main = main(45, "") - +# -*- coding: UTF-8 -*- +"""Base45 Codec - base45 content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import _get_charset, digits, lower, main, upper +from ..__common__ import * + + +__examples__ = { + 'enc(base45|base-45|base_45)': {'this is a test!': "AWE+EDH44.OEOCC7WE QEX0"}, + 'enc(base45-inv|base_45_inv)': {'this is a test!': "K6O+ONREE.YOYMMH6O 0O7A"}, + 'dec(base45)': {'BAD STRING\00': None, 'AWE+EDH44.OEOCC7WE QEX000': None}, +} +__guess__ = ["base45", "base45-inv"] + + +B45 = { + '': digits + upper + " $%*+-./:", + '[-_]inv(?:erted)?$': upper + digits + " $%*+-./:", +} + + +__chr = lambda c: chr(c >> 8) + chr(c & 0xff) if isinstance(c, int) and 256 <= c <= 65535 else \ + chr(c) if isinstance(c, int) else c +__ord = lambda c: ord(c) if not isinstance(c, int) else c + + +def base45_encode(mode): + b45 = _get_charset(B45, mode) + def encode(text, errors="strict"): + t, s = b(text), "" + for i in range(0, len(text), 2): + n = 256 * __ord(t[i]) + try: + n += __ord(t[i+1]) + except IndexError: + n = __ord(t[i]) + s += b45[n % 45] + b45[n // 45] + break + m = n // 45**2 + n -= m * 45**2 + s += b45[n % 45] + b45[n // 45] + b45[m] + return s, len(text) + return encode + + +def base45_decode(mode): + b45 = {c: i for i, c in enumerate(_get_charset(B45, mode))} + def decode(text, errors="strict"): + t, s = b(text), "" + ehandler = handle_error("base45", errors, decode=True) + for i in range(0, len(text), 3): + try: + n = b45[__chr(t[i])] + except KeyError: + ehandler(__chr(t[i]), i, s) + try: + j = i + 1 + n += 45 * b45[__chr(t[j])] + except KeyError: + ehandler(__chr(t[j]), j, s) + except IndexError: + ehandler(__chr(t[i]), i, s) + try: + k = i + 2 + n += 45 ** 2 * 
b45[__chr(t[k])] + except KeyError: + ehandler(__chr(t[k]), k, s) + except IndexError: + s += __chr(n) + continue + s += __chr(n // 256) + __chr(n % 256) + return s, len(text) + return decode + + +add("base45", base45_encode, base45_decode, r"^base[-_]?45(|[-_]inv(?:erted)?)$", expansion_factor=1.5) +main = main(45, "") + diff --git a/codext/base/base85.py b/src/codext/base/base85.py old mode 100755 new mode 100644 similarity index 97% rename from codext/base/base85.py rename to src/codext/base/base85.py index bc6d8b2..22aad28 --- a/codext/base/base85.py +++ b/src/codext/base/base85.py @@ -1,186 +1,185 @@ -# -*- coding: UTF-8 -*- -"""Base85 Codec - base85 content encoding. - -This is a simple wrapper for adding base64.b85**code to the codecs. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import base64 -from six import integer_types - -from ._base import _get_charset, digits, lower, main, upper -from ..__common__ import * - - -__examples__ = { - 'enc-dec(base85|z85|base85-ipv6)': ["@random{512,1024,2048}"], - 'enc-dec(base85-btoa|base85-xbtoa)': ["@random{512,1024,2048}"], - 'enc(base85|ascii85)': {'this is a test': "FD,B0+DGm>@3BZ'F*%"}, - 'enc(base85-adobe)': {'this is a test': "<~FD,B0+DGm>@3BZ'F*%~>", - 'this is a test\0\0\0\0\0\0': "<~FD,B0+DGm>@3BZ'F*%B^z~>"}, - 'enc(z85|base85-z)': {'this is a test': "BzbxfazC)tvixV6B94"}, - 'enc(base85-ipv6|base85_rfc1924)': {'this is a test': "bZBXFAZc?TVIXv6b94"}, - 'enc(base85_btoa)': {'this is a test': "FD,B0+DGm>@3BZ'F*%B^"}, - 'enc(base85_btoa)': {'this\0\0\0\0test': "FD,B0+DGm>@3BZ'F*%B^"}, - 'enc(base85_btoa)': {'this is a test\0\0\0\0': "FD,B0+DGm>y@3BZ'F*%B^z"}, - 'enc(base85-xbtoa)': {'this is a test': "xbtoa Begin\nFD,B0+DGm>@3BZ'F*%B^\nxbtoa End N 14 e E 4b" \ - " S 523 R 1b132e"}, - 'dec(base85-xbtoa)': {'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End': None, - 'xbtoa 
Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End N 14 e E 4b S 523 R 000bad': - None}, - 'enc(base85-xml)': {'this is a test': "bZBXFAZc@TVIXv6b94"}, - 'enc(base85|ascii85)': {'this\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0test': "FD,B0zzz!!!\"@ATMq"}, -} -__guess__ = ["ascii85", "z85", "base85-ipv6", "base85-xml", "base85-adobe", "base85-xbtoa"] - - -B85 = { - r'(base[-_]?85([-_]ascii)?|ascii85)$': "!\"#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_`" + lower[:21], - r'(z85|base[-_]?85[-_]z(eromq)?)$': digits + lower + upper + ".-:+=^!/*?&<>()[]{}@%$#", - r'base[-_]?85[-_](rfc1924|ipv6)$': digits + upper + lower + "!#$%&()*+-;<=>?@^_`{|}~", - r'base[-_]?85[-_]xml$': digits + upper + lower[:-1] + "!#$()*+,-./:;=?@^`{|}~z_", -} -B85[r'(base[-_]?85[-_]adobe)$'] = B85[r'(base[-_]?85[-_]x?btoa)$'] = B85[r'(base[-_]?85([-_]ascii)?|ascii85)$'] -POW85 = [85 ** i for i in range(5)] - - -def __format(text, mode, decode=False, **kwargs): - if "adobe" in mode: - if decode: - if text.startswith("<~") and text.endswith("~>"): - text = text[2:-2] - else: - text = "<~" + text + "~>" - elif "xbtoa" in mode: - sp, ep = "xbtoa [bB]egin\n", "xbtoa [eE]nd" - if decode: - if re.match(r"^xbtoa\s+[bB]egin\n", text) and \ - re.search(r"\nxbtoa\s+[eE]nd N \d+{h} E{h} S{h} R{h}\s*$".format(h=" [0-9a-fA-F]+"), text): - text = "".join(text.split("\n")[1:-1]).replace(" ", "") - elif not decode: - l, t = kwargs['length'], "\n".join(text[i:i+78] for i in range(0, len(text), 78)) - text = "xbtoa Begin\n%s\nxbtoa End N %d %x E %x S %x R %x" % \ - (t, l, l, kwargs['c_xor'], kwargs['c_sum'], kwargs['c_rot']) - return text - - -def __xbtoa_values(text): - try: - hr = "[0-9a-fA-F]+" - return re.search(r"\nxbtoa\s+[eE]nd N (\d+) ({h}) E ({h}) S ({h}) R ({h})\s*$".format(h=hr), text).groups() - except: - raise Base85DecodeError("Bad or missing xbtoa parameters") - - -def base85_encode(mode): - b85 = _get_charset(B85, mode) - def encode(input, errors="strict"): - r, l, kw = "", len(input), {} - if l == 0: - return 
input, 0 - if "xbtoa" in mode: - kw['length'] = l - kw['c_xor'], kw['c_sum'], kw['c_rot'] = 0, 0, 0 - n_pad = (4 - l % 4) % 4 - for i in range(0, l, 4): - block = input[i:i+4] - if block == "\0\0\0\0" and b85[-3:] == "stu": - r += "z" - if block == "\x20\x20\x20\x20" and "btoa" in mode: - r += "y" - if "xbtoa" in mode: - for c in block: - k = ord(c) - kw['c_xor'] ^= k - kw['c_sum'] += k + 1 - kw['c_rot'] <<= 1 - if kw['c_rot'] & 0x80000000: - kw['c_rot'] += 1 - kw['c_rot'] += k - if block == "\0\0\0\0" and b85[-3:] == "stu" or block == "\x20\x20\x20\x20" and "btoa" in mode: - continue - if len(block) < 4: - block += n_pad * "\0" - n, bl = s2i(block), "" - for _ in range(5): - n, k = divmod(n, 85) - bl = b85[k] + bl - r += bl - if "btoa" not in mode and n_pad: - r = r[:-n_pad] - if b85[-3:] == "stu" and r[-5:] == "!!!!!": - r = r[:-5] + "z" - return __format(r, mode, **kw), l - return encode - - -def base85_decode(mode): - b85 = _get_charset(B85, mode) - def decode(input, errors="strict"): - r, l, i, n_pad = "", len(input), 0, 0 - if l == 0: - return input, 0 - if "xbtoa" in mode: - v = __xbtoa_values(input) - n_last = int(v[0]) % 4 - c_xor, c_sum, c_rot = 0, 0, 0 - input = __format(input, mode, True) - ehandler = handle_error("base85", errors, decode=True) - if b85[-3:] == "stu" and input[-1] == "z": - input = input[:-1] + "!!!!!" 
- l = len(input) - while i < l: - n, incr = 0, 5 - if input[i] == "z" and b85[-3:] == "stu": - bl, incr = "\0\0\0\0", 1 - elif input[i] == "y" and "btoa" in mode: - bl, incr = "\x20\x20\x20\x20", 1 - else: - block = input[i:i+5] - if len(block) < 5: - n_pad = 5 - len(block) % 5 - block += n_pad * "\0" - for k, c in enumerate(block[::-1]): - try: - n += (b85.index(c) if c != "\0" else 255) * POW85[k] - except ValueError: - r += ehandler(c, i + k, r) - bl = codecs.decode("{:0>8}".format(hex(n & 0xffffffff)[2:]), "hex") - if "xbtoa" in mode: - if i + 5 == l and n_last > 0: - bl = bl[:n_last] - for c in bl: - k = ord(c) - c_xor ^= k - c_sum += k + 1 - c_rot <<= 1 - if c_rot & 0x80000000: - c_rot += 1 - c_rot += k - r += bl - i += incr - if n_pad > 0: - r = r[:-n_pad] - if "xbtoa" in mode: - chkv = ["%d" % len(r), "%x" % len(r), "%x" % c_xor, "%x" % c_sum, "%x" % c_rot] - if any(v1 != v2 for v1, v2 in zip(v, chkv)) and errors == "strict": - raise Base85ValueError("A check value does not match (%s != %s)" % (str(list(v)).replace("'", ""), - str(chkv).replace("'", ""))) - return r, l - return decode - - -add("base85", base85_encode, base85_decode, expansion_factor=lambda f, ename: f if "xbtoa" in ename else 1.25, - pattern=r"^(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)$", - extra_exceptions=["Base85ValueError"]) -main85 = main(85, None) -main85adobe = main(85, None, "adobe") -main85xbtoa = main(85, None, "xbtoa", wrap=False) -main85rfc1924 = main(85, "RFC 1924", "ipv6") -main85xml = main(85, "", "xml") -main85zeromq = main(85, "", "zeromq") - +# -*- coding: UTF-8 -*- +"""Base85 Codec - base85 content encoding. + +This is a simple wrapper for adding base64.b85**code to the codecs. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import base64 + +from ._base import _get_charset, digits, lower, main, upper +from ..__common__ import * + + +__examples__ = { + 'enc-dec(base85|z85|base85-ipv6)': ["@random{512,1024,2048}"], + 'enc-dec(base85-btoa|base85-xbtoa)': ["@random{512,1024,2048}"], + 'enc(base85|ascii85)': {'this is a test': "FD,B0+DGm>@3BZ'F*%"}, + 'enc(base85-adobe)': {'this is a test': "<~FD,B0+DGm>@3BZ'F*%~>", + 'this is a test\0\0\0\0\0\0': "<~FD,B0+DGm>@3BZ'F*%B^z~>"}, + 'enc(z85|base85-z)': {'this is a test': "BzbxfazC)tvixV6B94"}, + 'enc(base85-ipv6|base85_rfc1924)': {'this is a test': "bZBXFAZc?TVIXv6b94"}, + 'enc(base85_btoa)': {'this is a test': "FD,B0+DGm>@3BZ'F*%B^"}, + 'enc(base85_btoa)': {'this\0\0\0\0test': "FD,B0+DGm>@3BZ'F*%B^"}, + 'enc(base85_btoa)': {'this is a test\0\0\0\0': "FD,B0+DGm>y@3BZ'F*%B^z"}, + 'enc(base85-xbtoa)': {'this is a test': "xbtoa Begin\nFD,B0+DGm>@3BZ'F*%B^\nxbtoa End N 14 e E 4b" \ + " S 523 R 1b132e"}, + 'dec(base85-xbtoa)': {'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End': None, + 'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End N 14 e E 4b S 523 R 000bad': + None}, + 'enc(base85-xml)': {'this is a test': "bZBXFAZc@TVIXv6b94"}, + 'enc(base85|ascii85)': {'this\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0test': "FD,B0zzz!!!\"@ATMq"}, +} +__guess__ = ["ascii85", "z85", "base85-ipv6", "base85-xml", "base85-adobe", "base85-xbtoa"] + + +B85 = { + r'(base[-_]?85([-_]ascii)?|ascii85)$': "!\"#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_`" + lower[:21], + r'(z85|base[-_]?85[-_]z(eromq)?)$': digits + lower + upper + ".-:+=^!/*?&<>()[]{}@%$#", + r'base[-_]?85[-_](rfc1924|ipv6)$': digits + upper + lower + "!#$%&()*+-;<=>?@^_`{|}~", + r'base[-_]?85[-_]xml$': digits + upper + lower[:-1] + "!#$()*+,-./:;=?@^`{|}~z_", +} +B85[r'(base[-_]?85[-_]adobe)$'] = 
B85[r'(base[-_]?85[-_]x?btoa)$'] = B85[r'(base[-_]?85([-_]ascii)?|ascii85)$'] +POW85 = [85 ** i for i in range(5)] + + +def __format(text, mode, decode=False, **kwargs): + if "adobe" in mode: + if decode: + if text.startswith("<~") and text.endswith("~>"): + text = text[2:-2] + else: + text = "<~" + text + "~>" + elif "xbtoa" in mode: + sp, ep = "xbtoa [bB]egin\n", "xbtoa [eE]nd" + if decode: + if re.match(r"^xbtoa\s+[bB]egin\n", text) and \ + re.search(r"\nxbtoa\s+[eE]nd N \d+{h} E{h} S{h} R{h}\s*$".format(h=" [0-9a-fA-F]+"), text): + text = "".join(text.split("\n")[1:-1]).replace(" ", "") + elif not decode: + l, t = kwargs['length'], "\n".join(text[i:i+78] for i in range(0, len(text), 78)) + text = "xbtoa Begin\n%s\nxbtoa End N %d %x E %x S %x R %x" % \ + (t, l, l, kwargs['c_xor'], kwargs['c_sum'], kwargs['c_rot']) + return text + + +def __xbtoa_values(text): + try: + hr = "[0-9a-fA-F]+" + return re.search(r"\nxbtoa\s+[eE]nd N (\d+) ({h}) E ({h}) S ({h}) R ({h})\s*$".format(h=hr), text).groups() + except: + raise Base85DecodeError("Bad or missing xbtoa parameters") + + +def base85_encode(mode): + b85 = _get_charset(B85, mode) + def encode(input, errors="strict"): + r, l, kw = "", len(input), {} + if l == 0: + return input, 0 + if "xbtoa" in mode: + kw['length'] = l + kw['c_xor'], kw['c_sum'], kw['c_rot'] = 0, 0, 0 + n_pad = (4 - l % 4) % 4 + for i in range(0, l, 4): + block = input[i:i+4] + if block == "\0\0\0\0" and b85[-3:] == "stu": + r += "z" + if block == "\x20\x20\x20\x20" and "btoa" in mode: + r += "y" + if "xbtoa" in mode: + for c in block: + k = ord(c) + kw['c_xor'] ^= k + kw['c_sum'] += k + 1 + kw['c_rot'] <<= 1 + if kw['c_rot'] & 0x80000000: + kw['c_rot'] += 1 + kw['c_rot'] += k + if block == "\0\0\0\0" and b85[-3:] == "stu" or block == "\x20\x20\x20\x20" and "btoa" in mode: + continue + if len(block) < 4: + block += n_pad * "\0" + n, bl = s2i(block), "" + for _ in range(5): + n, k = divmod(n, 85) + bl = b85[k] + bl + r += bl + if "btoa" not in mode 
and n_pad: + r = r[:-n_pad] + if b85[-3:] == "stu" and r[-5:] == "!!!!!": + r = r[:-5] + "z" + return __format(r, mode, **kw), l + return encode + + +def base85_decode(mode): + b85 = _get_charset(B85, mode) + def decode(input, errors="strict"): + r, l, i, n_pad = "", len(input), 0, 0 + if l == 0: + return input, 0 + if "xbtoa" in mode: + v = __xbtoa_values(input) + n_last = int(v[0]) % 4 + c_xor, c_sum, c_rot = 0, 0, 0 + input = __format(input, mode, True) + ehandler = handle_error("base85", errors, decode=True) + if b85[-3:] == "stu" and input[-1] == "z": + input = input[:-1] + "!!!!!" + l = len(input) + while i < l: + n, incr = 0, 5 + if input[i] == "z" and b85[-3:] == "stu": + bl, incr = "\0\0\0\0", 1 + elif input[i] == "y" and "btoa" in mode: + bl, incr = "\x20\x20\x20\x20", 1 + else: + block = input[i:i+5] + if len(block) < 5: + n_pad = 5 - len(block) % 5 + block += n_pad * "\0" + for k, c in enumerate(block[::-1]): + try: + n += (b85.index(c) if c != "\0" else 255) * POW85[k] + except ValueError: + r += ehandler(c, i + k, r) + bl = codecs.decode("{:0>8}".format(hex(n & 0xffffffff)[2:]), "hex") + if "xbtoa" in mode: + if i + 5 == l and n_last > 0: + bl = bl[:n_last] + for c in bl: + k = ord(c) + c_xor ^= k + c_sum += k + 1 + c_rot <<= 1 + if c_rot & 0x80000000: + c_rot += 1 + c_rot += k + r += bl + i += incr + if n_pad > 0: + r = r[:-n_pad] + if "xbtoa" in mode: + chkv = ["%d" % len(r), "%x" % len(r), "%x" % c_xor, "%x" % c_sum, "%x" % c_rot] + if any(v1 != v2 for v1, v2 in zip(v, chkv)) and errors == "strict": + raise Base85ValueError("A check value does not match (%s != %s)" % (str(list(v)).replace("'", ""), + str(chkv).replace("'", ""))) + return r, l + return decode + + +add("base85", base85_encode, base85_decode, expansion_factor=lambda f, ename: f if "xbtoa" in ename else 1.25, + pattern=r"^(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)$", + extra_exceptions=["Base85ValueError"]) +main85 = main(85, None) +main85adobe = 
main(85, None, "adobe") +main85xbtoa = main(85, None, "xbtoa", wrap=False) +main85rfc1924 = main(85, "RFC 1924", "ipv6") +main85xml = main(85, "", "xml") +main85zeromq = main(85, "", "zeromq") + diff --git a/codext/base/base91.py b/src/codext/base/base91.py old mode 100755 new mode 100644 similarity index 97% rename from codext/base/base91.py rename to src/codext/base/base91.py index 21a21d5..4082256 --- a/codext/base/base91.py +++ b/src/codext/base/base91.py @@ -1,113 +1,113 @@ -# -*- coding: UTF-8 -*- -"""Base91 Codec - base91 content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import _get_charset, digits, lower, main, upper -from ..__common__ import * - -# no __examples__ ; handled manually in tests/test_base.py -__guess__ = ["base91", "base91-inv", "base91-alt", "base91-alt-inv"] - - -B91 = { - r'': upper + lower + digits + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", - r'[-_]inv(erted)?$': digits + upper + lower + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", - r'[-_]alt(ernate)?$': "!#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_" + lower + "{|}", - r'[-_]alt(ernate)?[-_]inv(erted)?$': "!#$%&'()*+,-./" + upper + ":;<=>?@" + lower + "[\\]^_" + digits + "{|}", -} - - -__chr = lambda c: chr(c) if isinstance(c, int) else c -__ord = lambda c: ord(c) if not isinstance(c, int) else c - - -def base91_encode(mode): - b91 = _get_charset(B91, mode) - def encode(text, errors="strict"): - t, s, bits = b(text), "", "" - if re.search(r'[-_]alt(ernate)?$', mode): - while len(bits) < 13 and t: - bits += "{:08b}".format(__ord(t[0])) - t = t[1:] - while len(bits) > 13 or t: - n = int(bits[:13], 2) - s += b91[n // 91] + b91[n % 91] - bits = bits[13:] - while len(bits) < 13 and t: - bits += "{:08b}".format(__ord(t[0])) - t = t[1:] - if len(bits) > 0: - if len(bits) < 7: - bits += "0" * (6 - len(bits)) - s += b91[int(bits, 2)] - 
else: - bits += "0" * (13 - len(bits)) - n = int(bits, 2) - s += b91[n // 91] + b91[n % 91] - else: - for c in t: - bits = bin(__ord(c))[2:].zfill(8) + bits - if len(bits) > 13: - n = int(bits[-13:], 2) - if n > 88: - bits = bits[:-13] - else: - n = int(bits[-14:], 2) - bits = bits[:-14] - s += b91[n % 91] + b91[n // 91] - if len(bits) > 0: - n = int(bits, 2) - s += b91[n % 91] - if len(bits) > 7 or n > 90: - s += b91[n // 91] - return s, len(t) - return encode - - -def base91_decode(mode): - b91 = {c: i for i, c in enumerate(_get_charset(B91, mode))} - def decode(text, errors="strict"): - t, s, bits, alt = b(_stripl(text, True, True)), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None - ehandler = handle_error("base91", errors, decode=True) - for i in range(0, len(t), 2): - try: - n = b91[__chr(t[i])] * [1, 91][alt] - except KeyError: - ehandler(__chr(t[i]), i, s) - try: - j = i + 1 - n += b91[__chr(t[j])] * [91, 1][alt] - except IndexError: - pass - except KeyError: - ehandler(__chr(t[j]), j, s) - if alt: - bits += "{:013b}".format(n) - while 8 <= len(bits): - s += chr(int(bits[0:8], 2)) - bits = bits[8:] - else: - bits = bin(n)[2:].zfill([14, 13][n & 8191 > 88]) + bits - while len(bits) > 8: - s += chr(int(bits[-8:], 2)) - bits = bits[:-8] - if alt and len(t) % 2 == 1: - bits += "{:06b}".format(b91[__chr(t[-1])]) - while 8 <= len(bits): - s += chr(int(bits[:8], 2)) - bits = bits[8:] - elif not alt and len(bits) > 0 and not set(bits) == {"0"}: - s += chr(int(bits, 2)) - return s.rstrip("\0"), len(t) - return decode - - -add("base91", base91_encode, base91_decode, r"^base[-_]?91((?:|[-_]alt(?:ernate)?)(?:|[-_]inv(?:erted)?)?)$", - entropy=6.5, expansion_factor=1.231) -main91 = main(91, "") - +# -*- coding: UTF-8 -*- +"""Base91 Codec - base91 content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import _get_charset, digits, lower, main, upper +from ..__common__ import * + +# no __examples__ ; handled manually in tests/test_base.py +__guess__ = ["base91", "base91-inv", "base91-alt", "base91-alt-inv"] + + +B91 = { + r'': upper + lower + digits + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", + r'[-_]inv(erted)?$': digits + upper + lower + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", + r'[-_]alt(ernate)?$': "!#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_" + lower + "{|}", + r'[-_]alt(ernate)?[-_]inv(erted)?$': "!#$%&'()*+,-./" + upper + ":;<=>?@" + lower + "[\\]^_" + digits + "{|}", +} + + +__chr = lambda c: chr(c) if isinstance(c, int) else c +__ord = lambda c: ord(c) if not isinstance(c, int) else c + + +def base91_encode(mode): + b91 = _get_charset(B91, mode) + def encode(text, errors="strict"): + t, s, bits = b(text), "", "" + if re.search(r'[-_]alt(ernate)?$', mode): + while len(bits) < 13 and t: + bits += "{:08b}".format(__ord(t[0])) + t = t[1:] + while len(bits) > 13 or t: + n = int(bits[:13], 2) + s += b91[n // 91] + b91[n % 91] + bits = bits[13:] + while len(bits) < 13 and t: + bits += "{:08b}".format(__ord(t[0])) + t = t[1:] + if len(bits) > 0: + if len(bits) < 7: + bits += "0" * (6 - len(bits)) + s += b91[int(bits, 2)] + else: + bits += "0" * (13 - len(bits)) + n = int(bits, 2) + s += b91[n // 91] + b91[n % 91] + else: + for c in t: + bits = bin(__ord(c))[2:].zfill(8) + bits + if len(bits) > 13: + n = int(bits[-13:], 2) + if n > 88: + bits = bits[:-13] + else: + n = int(bits[-14:], 2) + bits = bits[:-14] + s += b91[n % 91] + b91[n // 91] + if len(bits) > 0: + n = int(bits, 2) + s += b91[n % 91] + if len(bits) > 7 or n > 90: + s += b91[n // 91] + return s, len(t) + return encode + + +def base91_decode(mode): + b91 = {c: i for i, c in enumerate(_get_charset(B91, 
mode))} + def decode(text, errors="strict"): + t, s, bits, alt = b(_stripl(text, True, True)), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None + ehandler = handle_error("base91", errors, decode=True) + for i in range(0, len(t), 2): + try: + n = b91[__chr(t[i])] * [1, 91][alt] + except KeyError: + ehandler(__chr(t[i]), i, s) + try: + j = i + 1 + n += b91[__chr(t[j])] * [91, 1][alt] + except IndexError: + pass + except KeyError: + ehandler(__chr(t[j]), j, s) + if alt: + bits += "{:013b}".format(n) + while 8 <= len(bits): + s += chr(int(bits[0:8], 2)) + bits = bits[8:] + else: + bits = bin(n)[2:].zfill([14, 13][n & 8191 > 88]) + bits + while len(bits) > 8: + s += chr(int(bits[-8:], 2)) + bits = bits[:-8] + if alt and len(t) % 2 == 1: + bits += "{:06b}".format(b91[__chr(t[-1])]) + while 8 <= len(bits): + s += chr(int(bits[:8], 2)) + bits = bits[8:] + elif not alt and len(bits) > 0 and not set(bits) == {"0"}: + s += chr(int(bits, 2)) + return s.rstrip("\0"), len(t) + return decode + + +add("base91", base91_encode, base91_decode, r"^base[-_]?91((?:|[-_]alt(?:ernate)?)(?:|[-_]inv(?:erted)?)?)$", + entropy=6.5, expansion_factor=1.231) +main91 = main(91, "") + diff --git a/codext/base/baseN.py b/src/codext/base/baseN.py old mode 100755 new mode 100644 similarity index 97% rename from codext/base/baseN.py rename to src/codext/base/baseN.py index cf4abe4..c3965c7 --- a/codext/base/baseN.py +++ b/src/codext/base/baseN.py @@ -1,132 +1,132 @@ -# -*- coding: UTF-8 -*- -"""BaseN Codecs - base content encodings. 
- -These codecs: -- en/decode strings from str to str -- en/decode strings from bytes to bytes -- decode file content to str (read) -- encode file content from str to bytes (write) -""" -from ..__common__ import * -from ._base import base, base_generic, digits, lower, main, upper -from ._base2n import base2n - - -B1 = {chr(i): chr(i) for i in range(2**8)} -B1[''] = "A" -base(B1, r"^(?:base[-_]?1(|[-_].)|unary)$", guess=[]) -main1 = main(1) - - -B2 = {r'': "01", r'[-_]inv(erted)?$': "10"} -base2n(B2, r"^(?:base[-_]?2|bin(?:ary)?)(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{2})$", expansion_factor=8.) -main2 = main(2) - - -B3 = {r'': "123", r'[-_]inv(erted)?$': "321"} -base(B3, r"^base[-_]?3(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{3})$", expansion_factor=5.) -main3 = main(3) - - -B4 = {r'': "1234", r'[-_]inv(erted)?$': "4321"} -base2n(B4, r"^base[-_]?4(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{4})$", expansion_factor=4.) -main4 = main(4) - - -B8 = {r'': "abcdefgh", r'[-_]inv(erted)?$': "hgfedcba"} -base2n(B8, r"^base[-_]?8(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{8})$") -main8 = main(8) - - -B10 = {r'': "0123456789"} -base(B10, r"^(?:base[-_]?10|int(?:eger)?|dec(?:imal)?)$") -main10 = main(10) - - -B11 = {r'': "0123456789a", r'[-_]inv(erted)?$': "a0123456789"} -base(B11, r"^base[-_]?11(|[-_]inv(?:erted)?)$") -main11 = main(11) - - -B16 = {'': digits + "ABCDEF", '[-_]inv(erted)?$': "ABCDEF" + digits} -base2n(B16, r"^(?:base[-_]?16|hex)(|[-_]inv(?:erted)?)$", expansion_factor=2.) 
-main16 = main(16, "RFC 4648") - - -B26 = {'': upper} -base(B26, r"^base[-_]?26$") -main26 = main(26, inv=False) - - -B32 = { - r'': upper + "234567", - r'[-_]?z(?:base32)?$': "ybndrfg8ejkmcpqxot1uwisza345h769", - r'[-_]inv(erted)?$': "234567" + upper, - r'(?:[-_](ext(ended)?)?)?[-_]hex$': digits + upper[:22], - r'[-_]?crockford': digits + "ABCDEFGHJKMNPQRSTVWXYZ", - r'[-_]?geohash': digits + "bcdefghjkmnpqrstuvwxyz", -} -base2n(B32, r"^(?:base[-_]?32(|[-_]inv(?:erted)?|(?:[-_]ext(?:ended)?)?[-_]hex|[-_](?:z|geohash|crockford))|" - r"(zbase32|geohash|crockford))$", padding_char="=", - guess=["base32", "base32-inv", "base32-hex", "base32-geohash", "base32-crockford"]) -main32 = main(32, "RFC 4648") -main32hex = main(32, "RFC 4648", "hex", False) -main32geo = main(32, "", "geohash", False) -main32crk = main(32, "", "crockford", False) -mainz32 = main(32, "", "z", False) - - -B36 = {'': digits + upper, '[-_]inv(erted)?$': upper + digits} -base(B36, r"^base[-_]?36(|[-_]inv(?:erted)?)$") -main36 = main(36, "") - - -B58 = { - r'(|[-_]?(bc|bitcoin))$': "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz", - r'[-_]?(rp|ripple)$': "rpshnaf39wBUDNEGHJKLM4PQRST7VWXYZ2bcdeCg65jkm8oFqi1tuvAxyz", - r'[-_]?(fl|flickr|short[-]?url|url)$': "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ", -} -base(B58, r"^base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))$", - guess=["base58-bitcoin", "base58-ripple", "base58-flickr"]) -main58bc = main(58, "", "bitcoin") -main58rp = main(58, "", "ripple") -main58fl = main(58, "", "flickr") - - -B62 = {'': digits + upper + lower, '[-_]inv(erted)?$': upper + lower + digits} -base(B62, r"^base[-_]?62(|[-_]inv(?:erted)?)$") -main62 = main(62, "") - - -B63 = {'': digits + upper + lower + "_", 'inv': upper + lower + digits + "_"} -base(B63, r"^base[-_]?63(|[-_]inv(?:erted)?)$") -main63 = main(63) - - -B64 = { - r'': upper + lower + digits + "+/", - r'[-_]inv(erted)?$': digits + upper + lower + "+/", - 
r'[-_]?(file|url)(safe)?$': upper + lower + digits + "-_", -} -base2n(B64, r"^base[-_]?64(|[-_]inv(?:erted)?|[-_]?(?:file|url)(?:safe)?)$", padding_char="=", - guess=["base64", "base64-inv", "base64-url"]) -main64 = main(64, "RFC 4648") -main64url = main(64, "RFC 4648 / Base64URL", "url", False) - - -B67 = { - r'': upper + lower + digits + "-_.!~", - r'[-_]inv(erted)?$': lower + upper + digits + "-_.!~", -} -base(B67, r"^base[-_]?67(|[-_]inv(?:erted)?)$") -main67 = main(67) - - -B128 = {r'': "".join(chr(i) for i in range(128))} -base(B128, r"^base[-_]?128$", padding_char="=") -main128 = main(128, None, False, wrap=False) - - -# generic base encodings, to be added after all others as they have the precedence -base_generic() - +# -*- coding: UTF-8 -*- +"""BaseN Codecs - base content encodings. + +These codecs: +- en/decode strings from str to str +- en/decode strings from bytes to bytes +- decode file content to str (read) +- encode file content from str to bytes (write) +""" +from ..__common__ import * +from ._base import base, base_generic, digits, lower, main, upper +from ._base2n import base2n + + +B1 = {chr(i): chr(i) for i in range(2**8)} +B1[''] = "A" +base(B1, r"^(?:base[-_]?1(|[-_].)|unary)$", guess=[]) +main1 = main(1) + + +B2 = {r'': "01", r'[-_]inv(erted)?$': "10"} +base2n(B2, r"^(?:base[-_]?2|bin(?:ary)?)(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{2})$", expansion_factor=8.) +main2 = main(2) + + +B3 = {r'': "123", r'[-_]inv(erted)?$': "321"} +base(B3, r"^base[-_]?3(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{3})$", expansion_factor=5.) +main3 = main(3) + + +B4 = {r'': "1234", r'[-_]inv(erted)?$': "4321"} +base2n(B4, r"^base[-_]?4(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{4})$", expansion_factor=4.) 
+main4 = main(4) + + +B8 = {r'': "abcdefgh", r'[-_]inv(erted)?$': "hgfedcba"} +base2n(B8, r"^base[-_]?8(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{8})$") +main8 = main(8) + + +B10 = {r'': "0123456789"} +base(B10, r"^(?:base[-_]?10|int(?:eger)?|dec(?:imal)?)$") +main10 = main(10) + + +B11 = {r'': "0123456789a", r'[-_]inv(erted)?$': "a0123456789"} +base(B11, r"^base[-_]?11(|[-_]inv(?:erted)?)$") +main11 = main(11) + + +B16 = {'': digits + "ABCDEF", '[-_]inv(erted)?$': "ABCDEF" + digits} +base2n(B16, r"^(?:base[-_]?16|hex)(|[-_]inv(?:erted)?)$", expansion_factor=2.) +main16 = main(16, "RFC 4648") + + +B26 = {'': upper} +base(B26, r"^base[-_]?26$") +main26 = main(26, inv=False) + + +B32 = { + r'': upper + "234567", + r'[-_]?z(?:base32)?$': "ybndrfg8ejkmcpqxot1uwisza345h769", + r'[-_]inv(erted)?$': "234567" + upper, + r'(?:[-_](ext(ended)?)?)?[-_]hex$': digits + upper[:22], + r'[-_]?crockford': digits + "ABCDEFGHJKMNPQRSTVWXYZ", + r'[-_]?geohash': digits + "bcdefghjkmnpqrstuvwxyz", +} +base2n(B32, r"^(?:base[-_]?32(|[-_]inv(?:erted)?|(?:[-_]ext(?:ended)?)?[-_]hex|[-_](?:z|geohash|crockford))|" + r"(zbase32|geohash|crockford))$", padding_char="=", + guess=["base32", "base32-inv", "base32-hex", "base32-geohash", "base32-crockford"]) +main32 = main(32, "RFC 4648") +main32hex = main(32, "RFC 4648", "hex", False) +main32geo = main(32, "", "geohash", False) +main32crk = main(32, "", "crockford", False) +mainz32 = main(32, "", "z", False) + + +B36 = {'': digits + upper, '[-_]inv(erted)?$': upper + digits} +base(B36, r"^base[-_]?36(|[-_]inv(?:erted)?)$") +main36 = main(36, "") + + +B58 = { + r'(|[-_]?(bc|bitcoin))$': "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz", + r'[-_]?(rp|ripple)$': "rpshnaf39wBUDNEGHJKLM4PQRST7VWXYZ2bcdeCg65jkm8oFqi1tuvAxyz", + r'[-_]?(fl|flickr|short[-]?url|url)$': "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ", +} +base(B58, r"^base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))$", + 
guess=["base58-bitcoin", "base58-ripple", "base58-flickr"]) +main58bc = main(58, "", "bitcoin") +main58rp = main(58, "", "ripple") +main58fl = main(58, "", "flickr") + + +B62 = {'': digits + upper + lower, '[-_]inv(erted)?$': upper + lower + digits} +base(B62, r"^base[-_]?62(|[-_]inv(?:erted)?)$") +main62 = main(62, "") + + +B63 = {'': digits + upper + lower + "_", 'inv': upper + lower + digits + "_"} +base(B63, r"^base[-_]?63(|[-_]inv(?:erted)?)$") +main63 = main(63) + + +B64 = { + r'': upper + lower + digits + "+/", + r'[-_]inv(erted)?$': digits + upper + lower + "+/", + r'[-_]?(file|url)(safe)?$': upper + lower + digits + "-_", +} +base2n(B64, r"^base[-_]?64(|[-_]inv(?:erted)?|[-_]?(?:file|url)(?:safe)?)$", padding_char="=", + guess=["base64", "base64-inv", "base64-url"]) +main64 = main(64, "RFC 4648") +main64url = main(64, "RFC 4648 / Base64URL", "url", False) + + +B67 = { + r'': upper + lower + digits + "-_.!~", + r'[-_]inv(erted)?$': lower + upper + digits + "-_.!~", +} +base(B67, r"^base[-_]?67(|[-_]inv(?:erted)?)$") +main67 = main(67) + + +B128 = {r'': "".join(chr(i) for i in range(128))} +base(B128, r"^base[-_]?128$", padding_char="=") +main128 = main(128, None, False, wrap=False) + + +# generic base encodings, to be added after all others as they have the precedence +base_generic() + diff --git a/codext/binary/__init__.py b/src/codext/binary/__init__.py old mode 100755 new mode 100644 similarity index 95% rename from codext/binary/__init__.py rename to src/codext/binary/__init__.py index ea0005b..2b97568 --- a/codext/binary/__init__.py +++ b/src/codext/binary/__init__.py @@ -1,8 +1,8 @@ -# -*- coding: UTF-8 -*- -from .baudot import * -from .bcd import * -from .excess3 import * -from .gray import * -from .manchester import * -from .rotate import * - +# -*- coding: UTF-8 -*- +from .baudot import * +from .bcd import * +from .excess3 import * +from .gray import * +from .manchester import * +from .rotate import * + diff --git a/codext/binary/baudot.py 
b/src/codext/binary/baudot.py old mode 100755 new mode 100644 similarity index 84% rename from codext/binary/baudot.py rename to src/codext/binary/baudot.py index a57e1ea..ba43397 --- a/codext/binary/baudot.py +++ b/src/codext/binary/baudot.py @@ -1,295 +1,281 @@ -# -*- coding: UTF-8 -*- -"""Baudot Codec - baudot content conversion to HTML. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_us"] -if PY3: - __CODES.extend(["ita2_meteo", "mtk2", "murray", "uk"]) -__guess__ = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]] -__examples1__ = { - 'enc(baudot-BAD_ALPHABET)': None, - 'enc(baudot_ccitt2_lsb)': {'TEST 1234': "00001100001010000001001001101111101110011000001010"}, - 'enc(baudot-ita1)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, - 'enc(baudot_ita2_msb)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, - 'enc(baudot-ita2-us)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, - 'enc(baudot)': {'\x01\x02': None}, - 'enc(baudot_ccitt1-lsb)': {'TEST ': None}, -} -__examples2__ = { - 'enc(baudot_spaced-BAD_ALPHABET)': None, - 'enc(baudot-spaced_ccitt2_lsb)': {'TEST 1234': "00001 10000 10100 00001 00100 11011 11101 11001 10000 01010"}, - 'enc(baudot_spaced-ita1)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, - 'enc(baudot-spaced_ita2_msb)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, - 'enc(baudot_spaced-ita2-us)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, -} -__examples3__ = { - 'enc(baudot_tape-BAD_ALPHABET)': None, - 'enc(baudot_tape-ita1)': { - 'TEST 1234': "***.**\n* *. *\n .* \n* *. \n* *. *\n* . \n * . \n . *\n .* \n *. \n *. 
*", - }, - 'dec(baudot-tape_ita2)': {'BAD_HEADER\n .* \n': None}, - 'dec(baudot-tape_ita2-us)': {'***.**\nBAD_TAPE\n': None}, - 'dec(baudot_tape-ccitt1_lsb)': {'***.**\n .* \n* . *\n* . \n': None}, -} -if PY3: - __examples1__.update({ - 'enc(baudot_ccitt1_lsb)': {'TEST1234': "101010001010001101010100000100000100000100101"}, - 'enc(baudot-fr)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, - }) - __examples2__.update({ - 'enc(baudot-spaced_ccitt1_lsb)': {'TEST1234': "10101 00010 10001 10101 01000 00100 00010 00001 00101"}, - 'enc(baudot_spaced-fr)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, - }) - - -PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us" + (r"|meteo" if PY3 else r"") + r")" + \ - (r"|mtk2|murray|uk" if PY3 else r"") + r"|us_tty)(?:[-_](?:lsb|msb))?)?$" -# reserved character -RES_CHR = "\xff" - -# sources: -# - http://rabbit.eng.miami.edu/info/baudot.html -# - https://en.wikipedia.org/wiki/Baudot_code -# - https://fr.qwe.wiki/wiki/Baudot_code -# all alphabets consider MSB by default -# CCITT-1 original Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) -CCITT1 = [ - "00001", "00010", - "\x00\xff\xff\xffA-JKEXGM/ZHLYSBRUTCQIWFNOVDP", - "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff£5'0+" if PY3 else \ - "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff$5'0+", -] -# CCITT-2 revised Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) -CCITT2 = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", -] -# Original Baudot (French/European ; sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -BAUDOT = EU = FR = [ - "10000", "01000", - "\x00AEÉYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP" if PY3 else "\x00AEeYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP", - "\x0012&34°5 67h89f0\xff.,:;!?'\x7f()=-/\u2116%" if PY3 else "\x0012&34o5 
67h89f0\xff.,:;!?'\x7f()=-/\xff%", -] -# International Telegraphic Alphabet 1 (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA1 = [ - "10000", "01000", - "\x00AE\rYUIO\xffJGHBCFD \xffXZSTWV\x7fKMLRQNP", - "\x0012\r34\xff5 67+89\xff0\xff\n,:.\xff?'\x7f()=-/\xff%", -] -# International Telegraphic Alphabet 2 (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA2 = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "\x003\n- '87\r\x054\x07,!:(5+)2$6019?&\xff./=\xff", -] -# International Telegraphic Alphabet 2 - US TTY (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA2_US = US_TTY = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", -] -# International Telegraphic Alphabet 2 - Meteo version (source: https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - ITA2_METEO = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "-3\n\u2191 \x0787\r\u21974\u2199\u29b7\u2192\u25ef\u21905+\u21962\u21936019\u2295\u2198\xff./\u29b6\xff", - ] -# Russian MTK-2 alphabet (source: https://fr.qwe.wiki/wiki/Baudot_code) -if PY3: - MTK2 = [ - "11111", "11011", - "\x00Е\n\xff СИУ\r\xffРЙНФЦКТЗЛВХЫПЯОБГ\xffМЬЖ\xff", - "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", - ] -# Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - MURRAY = [ - "00100", "11011", - " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*" if PY3 else \ - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,$)*", - ] -# English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - UK = [ - "10000", "01000", - "\x00AE/YUIO\xffJGHBCFD 
-XZSTWV\x7fKMLRQNP", - "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+" if PY3 else \ - "\x0012\xff34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/$+", - ] - - -def _bits_from_tape(tape, trans={'*': "1", ' ': "0"}): - """ Converts a tape-like string with the given translation for ones and zeros to a series of bits. """ - bits = "" - trans_rev = {v: k for k, v in trans.items()} - for i, line in enumerate(tape.splitlines()): - if i == 0: - if line != trans_rev['1'] * 3 + "." + trans_rev['1'] * 2: - raise ValueError("Bad tape header '{}'".format(line)) - else: - line = line[:3] + line[4:] - if len(line) != 5: - raise ValueError("Bad tape line '{}'".format(line)) - bits += "".join(trans.get(c, "") for c in line) - return bits - - -def _bits_to_tape(bits, trans={'1': "*", '0': " "}): - """ Converts a series of bits to a tape-like string with the given translation for ones and zeros. """ - tape = [trans['1'] * 3 + "." + trans['1'] * 2] - for i in range(0, len(bits), 5): - group = "".join(trans[b] for b in bits[i:i+5]) - tape.append(group[:3] + "." + group[3:]) - return "\n".join(tape) - - -def _check_alphabet(alphabet): - """ Checks the length of letters and figures (must be 32 chars). """ - for chars in alphabet: - l = len(chars) - if l != 32: - raise ValueError("Bad length of alphabet (%d instead of 32)" % l) - - -def _handle_alphabet(alphabet): - """ Gets the given alphabet name and transforms it to its dictionary with letters and figures. 
""" - alphabet = (alphabet or "baudot").lower().replace("-", "_").strip("_") - if "_lsb" in alphabet: - alphabet = alphabet.replace("_lsb", "") - func = lambda x: x[::-1] - else: - alphabet = alphabet.replace("_msb", "") - func = lambda x: x - _ = globals()[alphabet.upper()] - st, a = _[:2], _[2:] - _check_alphabet(a) - alphabet = {n: {ch: bin(i)[2:].zfill(5) for i, ch in enumerate(alph) if ch != RES_CHR} for n, alph in \ - zip(["letters", "figures"], a)} - return alphabet, {'letters': st[0], 'figures': st[1]}, func - - -def baudot_encode(alphabet=None, spaced=False, tape=False): - ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") - alphabet, states, func = _handle_alphabet(alphabet) - def encode(text, errors="strict"): - text = text.upper() - s, l, state, seen_states = "", len(b(text)), None, [] - for i, c in enumerate(text): - # if the state is undefined yet, find the relevant alphabet - if state is None: - bits= None - for st in states.keys(): - try: - bits = func(alphabet[st][c]) - state = st - if st not in seen_states: - seen_states.append(st) - break - except KeyError: - pass - if bits is None: - bits = handle_error(ename, errors, "?", 5)(c, i) - s += bits - # otherwise, handle state change (when the current alphabet does not contain the character to encode but the - # other alphabet does - else: - try: - s += func(alphabet[state][c]) - continue - except KeyError: - state = list(set(states.keys()) - {state})[0] - try: - s += func(states[state]) + func(alphabet[state][c]) - if state not in seen_states: - seen_states.append(state) - except KeyError as e: - state = list(set(states.keys()) - {state})[0] # reset the state - s += handle_error(ename, errors, "?", 5)(c, i) - # by default, if no state is specified, the encoded string is handled as letters ; so if figures are used only, - # it is necessary to include the groups of bits for figures at the beginning of the encoded string - s = (states['figures'] if seen_states == ["figures"] else "") 
+ s - if spaced: - s = " ".join(s[i:i+5] for i in range(0, len(s), 5)) - elif tape: - s = _bits_to_tape(s) - return s, l - return encode - - -def baudot_decode(alphabet=None, spaced=False, tape=False): - ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") - alphabet, states, func = _handle_alphabet(alphabet) - alphabet = {st: {v: k for k, v in alph.items()} for st, alph in alphabet.items()} - states = {v: k for k, v in states.items()} - def decode(text, errors="strict"): - s, l = "", len(b(text)) - if spaced: - text = text.replace(" ", "") - elif tape: - text = _bits_from_tape(text) - # infer the starting state by searching for the first encountered groups of bits indicating a valid state ; - # by default, we assume letters - state = "letters" - for i in range(0, len(text), 5): - bits = func(text[i:i+5]) - # the following code handles a possible ambiguity ; e.g. when letters have a group of bits matching - # a state change - if bits in states.keys(): - error = False - # so, when we see the bits of a state, we parse previous groups in order to determine if they are valid - # groups in the corresponding state, that is, if no error occurs ; if an error occurs, then it is a - # valid state change and not simply a character, and we can set it as the starting state - for j in range(i-5, 0, -5): - try: - alphabet[states[bits]][text[j:j+5]] - except KeyError: - error = True - break - if error: - state = list(set(states.values()) - {states[bits]})[0] - break - # now parse the input text - for i in range(0, len(text), 5): - bits = func(text[i:i+5]) - try: - s += alphabet[state][bits] - except KeyError: - if bits in states.keys() and states[bits] != state: - state = states[bits] - else: - s += handle_error(ename, errors, decode=True, item="group")(bits, i//5) - return s, l - return decode - - -add("baudot", baudot_encode, baudot_decode, PATTERN % r"", examples=__examples1__, guess=[x % "" for x in __guess__], - entropy=1., printables_rate=1.) 
- - -baudot_spaced_encode = lambda a: baudot_encode(a, spaced=True) -baudot_spaced_decode = lambda a: baudot_decode(a, spaced=True) -add("baudot-spaced", baudot_spaced_encode, baudot_spaced_decode, PATTERN % r"[-_]spaced", examples=__examples2__, - guess=[x % "-spaced" for x in __guess__], entropy=1.48, printables_rate=1.) - - -baudot_tape_encode = lambda a: baudot_encode(a, tape=True) -baudot_tape_decode = lambda a: baudot_decode(a, tape=True) -add("baudot-tape", baudot_tape_encode, baudot_tape_decode, PATTERN % r"[-_]tape", examples=__examples3__, - guess=[x % "-tape" for x in __guess__], entropy=1.86, printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""Baudot Codec - baudot content conversion to HTML. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_meteo", "ita2_us", "mtk2", "murray", "uk"] +__guess__ = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]] +__examples1__ = { + 'enc(baudot-BAD_ALPHABET)': None, + 'enc(baudot_ccitt2_lsb)': {'TEST 1234': "00001100001010000001001001101111101110011000001010"}, + 'enc(baudot-ita1)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, + 'enc(baudot_ita2_msb)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, + 'enc(baudot-ita2-us)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, + 'enc(baudot)': {'\x01\x02': None}, + 'enc(baudot_ccitt1-lsb)': {'TEST ': None}, + 'enc(baudot_ccitt1_lsb)': {'TEST1234': "101010001010001101010100000100000100000100101"}, + 'enc(baudot-fr)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, +} +__examples2__ = { + 'enc(baudot_spaced-BAD_ALPHABET)': None, + 'enc(baudot-spaced_ccitt2_lsb)': {'TEST 1234': "00001 10000 10100 00001 00100 11011 11101 11001 10000 01010"}, + 
'enc(baudot_spaced-ita1)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, + 'enc(baudot-spaced_ita2_msb)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, + 'enc(baudot_spaced-ita2-us)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, + 'enc(baudot-spaced_ccitt1_lsb)': {'TEST1234': "10101 00010 10001 10101 01000 00100 00010 00001 00101"}, + 'enc(baudot_spaced-fr)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, +} +__examples3__ = { + 'enc(baudot_tape-BAD_ALPHABET)': None, + 'enc(baudot_tape-ita1)': { + 'TEST 1234': "***.**\n* *. *\n .* \n* *. \n* *. *\n* . \n * . \n . *\n .* \n *. \n *. *", + }, + 'dec(baudot-tape_ita2)': {'BAD_HEADER\n .* \n': None}, + 'dec(baudot-tape_ita2-us)': {'***.**\nBAD_TAPE\n': None}, + 'dec(baudot_tape-ccitt1_lsb)': {'***.**\n .* \n* . *\n* . \n': None}, +} + + +PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us|meteo)|mtk2|murray|uk|us_tty)" + \ + r"(?:[-_](?:lsb|msb))?)?$" +# reserved character +RES_CHR = "\xff" + +# sources: +# - http://rabbit.eng.miami.edu/info/baudot.html +# - https://en.wikipedia.org/wiki/Baudot_code +# - https://fr.qwe.wiki/wiki/Baudot_code +# all alphabets consider MSB by default +# CCITT-1 original Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) +CCITT1 = [ + "00001", "00010", + "\x00\xff\xff\xffA-JKEXGM/ZHLYSBRUTCQIWFNOVDP", + "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff£5'0+", +] +# CCITT-2 revised Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) +CCITT2 = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", +] +# Original Baudot (French/European ; sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +BAUDOT = EU = FR = [ + "10000", "01000", + "\x00AEÉYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP", + "\x0012&34°5 
67h89f0\xff.,:;!?'\x7f()=-/\u2116%", +] +# International Telegraphic Alphabet 1 (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA1 = [ + "10000", "01000", + "\x00AE\rYUIO\xffJGHBCFD \xffXZSTWV\x7fKMLRQNP", + "\x0012\r34\xff5 67+89\xff0\xff\n,:.\xff?'\x7f()=-/\xff%", +] +# International Telegraphic Alphabet 2 (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA2 = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- '87\r\x054\x07,!:(5+)2$6019?&\xff./=\xff", +] +# International Telegraphic Alphabet 2 - US TTY (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA2_US = US_TTY = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", +] +# International Telegraphic Alphabet 2 - Meteo version (source: https://en.wikipedia.org/wiki/Baudot_code) +ITA2_METEO = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "-3\n\u2191 \x0787\r\u21974\u2199\u29b7\u2192\u25ef\u21905+\u21962\u21936019\u2295\u2198\xff./\u29b6\xff", +] +# Russian MTK-2 alphabet (source: https://fr.qwe.wiki/wiki/Baudot_code) +MTK2 = [ + "11111", "11011", + "\x00Е\nA СИУ\r\xffРЙНФЦКТЗЛВХЫПЯОБГ\xffМЬЖ\xff", + "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", +] +# Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) +MURRAY = [ + "00100", "11011", + " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", + "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*", +] +# English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +UK = [ + "10000", "01000", + "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", + "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+", +] + + +def _bits_from_tape(tape, trans={'*': "1", ' 
': "0"}): + """ Converts a tape-like string with the given translation for ones and zeros to a series of bits. """ + bits = "" + trans_rev = {v: k for k, v in trans.items()} + for i, line in enumerate(tape.splitlines()): + if i == 0: + if line != trans_rev['1'] * 3 + "." + trans_rev['1'] * 2: + raise ValueError("Bad tape header '{}'".format(line)) + else: + line = line[:3] + line[4:] + if len(line) != 5: + raise ValueError("Bad tape line '{}'".format(line)) + bits += "".join(trans.get(c, "") for c in line) + return bits + + +def _bits_to_tape(bits, trans={'1': "*", '0': " "}): + """ Converts a series of bits to a tape-like string with the given translation for ones and zeros. """ + tape = [trans['1'] * 3 + "." + trans['1'] * 2] + for i in range(0, len(bits), 5): + group = "".join(trans[b] for b in bits[i:i+5]) + tape.append(group[:3] + "." + group[3:]) + return "\n".join(tape) + + +def _check_alphabet(alphabet): + """ Checks the length of letters and figures (must be 32 chars). """ + for chars in alphabet: + l = len(chars) + if l != 32: + raise ValueError("Bad length of alphabet (%d instead of 32)" % l) + + +def _handle_alphabet(alphabet): + """ Gets the given alphabet name and transforms it to its dictionary with letters and figures. 
""" + alphabet = (alphabet or "baudot").lower().replace("-", "_").strip("_") + if "_lsb" in alphabet: + alphabet = alphabet.replace("_lsb", "") + func = lambda x: x[::-1] + else: + alphabet = alphabet.replace("_msb", "") + func = lambda x: x + _ = globals()[alphabet.upper()] + st, a = _[:2], _[2:] + _check_alphabet(a) + alphabet = {n: {ch: bin(i)[2:].zfill(5) for i, ch in enumerate(alph) if ch != RES_CHR} for n, alph in \ + zip(["letters", "figures"], a)} + return alphabet, {'letters': st[0], 'figures': st[1]}, func + + +def baudot_encode(alphabet=None, spaced=False, tape=False): + ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") + alphabet, states, func = _handle_alphabet(alphabet) + def encode(text, errors="strict"): + text = text.upper() + s, l, state, seen_states = "", len(b(text)), None, [] + for i, c in enumerate(text): + # if the state is undefined yet, find the relevant alphabet + if state is None: + bits= None + for st in states.keys(): + try: + bits = func(alphabet[st][c]) + state = st + if st not in seen_states: + seen_states.append(st) + break + except KeyError: + pass + if bits is None: + bits = handle_error(ename, errors, "?", 5)(c, i) + s += bits + # otherwise, handle state change (when the current alphabet does not contain the character to encode but the + # other alphabet does + else: + try: + s += func(alphabet[state][c]) + continue + except KeyError: + state = list(set(states.keys()) - {state})[0] + try: + s += func(states[state]) + func(alphabet[state][c]) + if state not in seen_states: + seen_states.append(state) + except KeyError as e: + state = list(set(states.keys()) - {state})[0] # reset the state + s += handle_error(ename, errors, "?", 5)(c, i) + # by default, if no state is specified, the encoded string is handled as letters ; so if figures are used only, + # it is necessary to include the groups of bits for figures at the beginning of the encoded string + s = (states['figures'] if seen_states == ["figures"] else "") 
+ s + if spaced: + s = " ".join(s[i:i+5] for i in range(0, len(s), 5)) + elif tape: + s = _bits_to_tape(s) + return s, l + return encode + + +def baudot_decode(alphabet=None, spaced=False, tape=False): + ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") + alphabet, states, func = _handle_alphabet(alphabet) + alphabet = {st: {v: k for k, v in alph.items()} for st, alph in alphabet.items()} + states = {v: k for k, v in states.items()} + def decode(text, errors="strict"): + s, l = "", len(b(text)) + if spaced: + text = text.replace(" ", "") + elif tape: + text = _bits_from_tape(text) + # infer the starting state by searching for the first encountered groups of bits indicating a valid state ; + # by default, we assume letters + state = "letters" + for i in range(0, len(text), 5): + bits = func(text[i:i+5]) + # the following code handles a possible ambiguity ; e.g. when letters have a group of bits matching + # a state change + if bits in states.keys(): + error = False + # so, when we see the bits of a state, we parse previous groups in order to determine if they are valid + # groups in the corresponding state, that is, if no error occurs ; if an error occurs, then it is a + # valid state change and not simply a character, and we can set it as the starting state + for j in range(i-5, 0, -5): + try: + alphabet[states[bits]][text[j:j+5]] + except KeyError: + error = True + break + if error: + state = list(set(states.values()) - {states[bits]})[0] + break + # now parse the input text + for i in range(0, len(text), 5): + bits = func(text[i:i+5]) + try: + s += alphabet[state][bits] + except KeyError: + if bits in states.keys() and states[bits] != state: + state = states[bits] + else: + s += handle_error(ename, errors, decode=True, item="group")(bits, i//5) + return s, l + return decode + + +add("baudot", baudot_encode, baudot_decode, PATTERN % r"", examples=__examples1__, guess=[x % "" for x in __guess__], + entropy=1., printables_rate=1.) 
+ + +baudot_spaced_encode = lambda a: baudot_encode(a, spaced=True) +baudot_spaced_decode = lambda a: baudot_decode(a, spaced=True) +add("baudot-spaced", baudot_spaced_encode, baudot_spaced_decode, PATTERN % r"[-_]spaced", examples=__examples2__, + guess=[x % "-spaced" for x in __guess__], entropy=1.48, printables_rate=1.) + + +baudot_tape_encode = lambda a: baudot_encode(a, tape=True) +baudot_tape_decode = lambda a: baudot_decode(a, tape=True) +add("baudot-tape", baudot_tape_encode, baudot_tape_decode, PATTERN % r"[-_]tape", examples=__examples3__, + guess=[x % "-tape" for x in __guess__], entropy=1.86, printables_rate=1.) + diff --git a/codext/binary/bcd.py b/src/codext/binary/bcd.py old mode 100755 new mode 100644 similarity index 97% rename from codext/binary/bcd.py rename to src/codext/binary/bcd.py index 9f21147..a692f6b --- a/codext/binary/bcd.py +++ b/src/codext/binary/bcd.py @@ -1,80 +1,80 @@ -# -*- coding: UTF-8 -*- -"""BCD Codec - Binary Coded Decimal content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples1__ = { - 'enc(bcd|binary-coded-decimal|binary_coded_decimal)': { - 'This is a test!': "\x08A\x04\x10Q\x15\x03!\x05\x11P2\tp2\x11a\x01\x11Q\x16\x030", - }, - 'dec(binary-coded-decimal)': { - '\xaf': None, - '\xff': None, - '\x08A\x04\x10Q\x15\x03!\x05\x11P2\tp2\x11a\x01\x11Q\x16\x030': "This is a test!", - }, -} -__examples2__ = { - 'enc(bcd-ext0|bcd_extended_zeros)': { - 'This is a test': "\x00\x08\x04\x01\x00\x04\x01\x00\x05\x01\x01\x05\x00\x03\x02\x01\x00\x05\x01\x01\x05\x00" - "\x03\x02\x00\t\x07\x00\x03\x02\x01\x01\x06\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", - }, -} -__examples3__ = { - 'enc(bcd-ext1|bcd_extended_ones)': { - 'This is a test': "\xf0\xf8\xf4\xf1\xf0\xf4\xf1\xf0\xf5\xf1\xf1\xf5\xf0\xf3\xf2\xf1\xf0\xf5\xf1\xf1\xf5\xf0" - 
"\xf3\xf2\xf0\xf9\xf7\xf0\xf3\xf2\xf1\xf1\xf6\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", - }, -} - - -CODE = {str(i): bin(i)[2:].zfill(4) for i in range(10)} - - -def bcd_encode(prefix=""): - def encode(text, errors="strict"): - r, bits = "", prefix - for c in text: - for i in str(ord(c)).zfill(3): - bits += CODE[i] - if len(bits) == 8: - r += chr(int(bits, 2)) - bits = prefix - if len(bits) > 0: - r += chr(int(bits + "0000", 2)) - return r, len(b(text)) - return encode - - -def bcd_decode(prefix=""): - def decode(text, errors="strict"): - code = {v: k for k, v in CODE.items()} - r, d = "", "" - for i, c in enumerate(text): - bin_c = bin(ord(c))[2:].zfill(8) - for k in range(len(prefix), 8, 4): - hb = bin_c[k:k+4] - try: - d += code[hb] - except KeyError: - d += handle_error("bcd", errors, decode=True)(hb, i) - if len(d) == 3: - r += chr(int(d)) - d = "" - return r, len(b(text)) - return decode - - -add("bcd", bcd_encode(), bcd_decode(), pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)$", examples=__examples1__, - entropy=lambda e: .45739*e+2.63519, printables_rate=.2) -add("bcd-extended0", bcd_encode("0000"), bcd_decode("0000"), examples=__examples2__, entropy=lambda e: .13584*e+2.07486, - pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)[-_]ext(?:ended)?(?:[-_]?0|[-_]zeros?)$") -add("bcd-extended1", bcd_encode("1111"), bcd_decode("1111"), examples=__examples3__, entropy=lambda e: .13584*e+2.07486, - pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)[-_]ext(?:ended)?(?:[-_]?1|[-_]ones?)$") - +# -*- coding: UTF-8 -*- +"""BCD Codec - Binary Coded Decimal content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples1__ = { + 'enc(bcd|binary-coded-decimal|binary_coded_decimal)': { + 'This is a test!': "\x08A\x04\x10Q\x15\x03!\x05\x11P2\tp2\x11a\x01\x11Q\x16\x030", + }, + 'dec(binary-coded-decimal)': { + '\xaf': None, + '\xff': None, + '\x08A\x04\x10Q\x15\x03!\x05\x11P2\tp2\x11a\x01\x11Q\x16\x030': "This is a test!", + }, +} +__examples2__ = { + 'enc(bcd-ext0|bcd_extended_zeros)': { + 'This is a test': "\x00\x08\x04\x01\x00\x04\x01\x00\x05\x01\x01\x05\x00\x03\x02\x01\x00\x05\x01\x01\x05\x00" + "\x03\x02\x00\t\x07\x00\x03\x02\x01\x01\x06\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", + }, +} +__examples3__ = { + 'enc(bcd-ext1|bcd_extended_ones)': { + 'This is a test': "\xf0\xf8\xf4\xf1\xf0\xf4\xf1\xf0\xf5\xf1\xf1\xf5\xf0\xf3\xf2\xf1\xf0\xf5\xf1\xf1\xf5\xf0" + "\xf3\xf2\xf0\xf9\xf7\xf0\xf3\xf2\xf1\xf1\xf6\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", + }, +} + + +CODE = {str(i): bin(i)[2:].zfill(4) for i in range(10)} + + +def bcd_encode(prefix=""): + def encode(text, errors="strict"): + r, bits = "", prefix + for c in text: + for i in str(ord(c)).zfill(3): + bits += CODE[i] + if len(bits) == 8: + r += chr(int(bits, 2)) + bits = prefix + if len(bits) > 0: + r += chr(int(bits + "0000", 2)) + return r, len(b(text)) + return encode + + +def bcd_decode(prefix=""): + def decode(text, errors="strict"): + code = {v: k for k, v in CODE.items()} + r, d = "", "" + for i, c in enumerate(text): + bin_c = bin(ord(c))[2:].zfill(8) + for k in range(len(prefix), 8, 4): + hb = bin_c[k:k+4] + try: + d += code[hb] + except KeyError: + d += handle_error("bcd", errors, decode=True)(hb, i) + if len(d) == 3: + r += chr(int(d)) + d = "" + return r, len(b(text)) + return decode + + +add("bcd", bcd_encode(), bcd_decode(), pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)$", 
examples=__examples1__, + entropy=lambda e: .45739*e+2.63519, printables_rate=.2) +add("bcd-extended0", bcd_encode("0000"), bcd_decode("0000"), examples=__examples2__, entropy=lambda e: .13584*e+2.07486, + pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)[-_]ext(?:ended)?(?:[-_]?0|[-_]zeros?)$") +add("bcd-extended1", bcd_encode("1111"), bcd_decode("1111"), examples=__examples3__, entropy=lambda e: .13584*e+2.07486, + pattern=r"^(?:bcd|binary[-_]coded[-_]decimals?)[-_]ext(?:ended)?(?:[-_]?1|[-_]ones?)$") + diff --git a/codext/binary/excess3.py b/src/codext/binary/excess3.py old mode 100755 new mode 100644 similarity index 96% rename from codext/binary/excess3.py rename to src/codext/binary/excess3.py index 858bcb7..831cf41 --- a/codext/binary/excess3.py +++ b/src/codext/binary/excess3.py @@ -1,65 +1,65 @@ -# -*- coding: UTF-8 -*- -"""Excess-3 Codec - Excess-3 code (aka Stibitz code) content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(excess3|xs-3|stibitz)': { - 'This is a test!': ";t7C\x84H6T8D\x83e<\xa3eD\x944D\x84I6`", - 'This is another test ': ";t7C\x84H6T8D\x83e<\xa4CDDICt4DseD\x944D\x84I6P", - }, - 'dec(excess-3|xs3)': { - '\x00': None, - '\xff': None, - ';t7C\x84H6T8D\x83e<\xa3eD\x944D\x84I6`': "This is a test!", - ';t7C\x84H6T8D\x83e<\xa4CDDICt4DseD\x944D\x84I6P': "This is another test ", - }, -} - - -CODE = { - '0': "0011", '1': "0100", '2': "0101", '3': "0110", '4': "0111", - '5': "1000", '6': "1001", '7': "1010", '8': "1011", '9': "1100", -} - - -def excess3_encode(text, errors="strict"): - r, bits = "", "" - for c in text: - for i in str(ord(c)).zfill(3): - bits += CODE[i] - if len(bits) == 8: - r += chr(int(bits, 2)) - bits = "" - if len(bits) > 0: - r += chr(int(bits + "0000", 2)) - return r, len(b(text)) - - -def excess3_decode(text, 
errors="strict"): - code = {v: k for k, v in CODE.items()} - r, d = "", "" - for i, c in enumerate(text): - bin_c = bin(ord(c))[2:].zfill(8) - for k in range(0, 8, 4): - hb = bin_c[k:k+4] - try: - d += code[hb] - except KeyError: # (normal case) occurs when 0000 was used for padding - if i != len(text) - 1 or k != 4 or hb != "0000": - d += handle_error("excess3", errors, decode=True)(hb, i) - if len(d) == 3: - r += chr(int(d)) - d = "" - return r, len(b(text)) - - -add("excess3", excess3_encode, excess3_decode, pattern=r"^(?:excess\-?3|xs\-?3|stibitz)$", printables_rate=.45) - +# -*- coding: UTF-8 -*- +"""Excess-3 Codec - Excess-3 code (aka Stibitz code) content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(excess3|xs-3|stibitz)': { + 'This is a test!': ";t7C\x84H6T8D\x83e<\xa3eD\x944D\x84I6`", + 'This is another test ': ";t7C\x84H6T8D\x83e<\xa4CDDICt4DseD\x944D\x84I6P", + }, + 'dec(excess-3|xs3)': { + '\x00': None, + '\xff': None, + ';t7C\x84H6T8D\x83e<\xa3eD\x944D\x84I6`': "This is a test!", + ';t7C\x84H6T8D\x83e<\xa4CDDICt4DseD\x944D\x84I6P': "This is another test ", + }, +} + + +CODE = { + '0': "0011", '1': "0100", '2': "0101", '3': "0110", '4': "0111", + '5': "1000", '6': "1001", '7': "1010", '8': "1011", '9': "1100", +} + + +def excess3_encode(text, errors="strict"): + r, bits = "", "" + for c in text: + for i in str(ord(c)).zfill(3): + bits += CODE[i] + if len(bits) == 8: + r += chr(int(bits, 2)) + bits = "" + if len(bits) > 0: + r += chr(int(bits + "0000", 2)) + return r, len(b(text)) + + +def excess3_decode(text, errors="strict"): + code = {v: k for k, v in CODE.items()} + r, d = "", "" + for i, c in enumerate(text): + bin_c = bin(ord(c))[2:].zfill(8) + for k in range(0, 8, 4): + hb = bin_c[k:k+4] + try: + d += code[hb] + except KeyError: # (normal 
case) occurs when 0000 was used for padding + if i != len(text) - 1 or k != 4 or hb != "0000": + d += handle_error("excess3", errors, decode=True)(hb, i) + if len(d) == 3: + r += chr(int(d)) + d = "" + return r, len(b(text)) + + +add("excess3", excess3_encode, excess3_decode, pattern=r"^(?:excess\-?3|xs\-?3|stibitz)$", printables_rate=.45) + diff --git a/codext/binary/gray.py b/src/codext/binary/gray.py old mode 100755 new mode 100644 similarity index 95% rename from codext/binary/gray.py rename to src/codext/binary/gray.py index f1be17a..e32c979 --- a/codext/binary/gray.py +++ b/src/codext/binary/gray.py @@ -1,25 +1,25 @@ -# -*- coding: UTF-8 -*- -"""Gray Codec - gray code content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(gray|reflected-bin|reflected_binary)': { - 'this is a test': "N\\]J0]J0Q0NWJN", - 'THIS IS A TEST': "~lmz0mz0a0~gz~", - }, -} - - -ENCMAP = {chr(i): chr(i ^ (i >> 1)) for i in range(256)} - - -add_map("gray", ENCMAP, pattern=r"^(?:gray|reflected[-_]bin(?:ary)?)$", entropy=lambda e: e) - +# -*- coding: UTF-8 -*- +"""Gray Codec - gray code content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(gray|reflected-bin|reflected_binary)': { + 'this is a test': "N\\]J0]J0Q0NWJN", + 'THIS IS A TEST': "~lmz0mz0a0~gz~", + }, +} + + +ENCMAP = {chr(i): chr(i ^ (i >> 1)) for i in range(256)} + + +add_map("gray", ENCMAP, pattern=r"^(?:gray|reflected[-_]bin(?:ary)?)$", entropy=lambda e: e) + diff --git a/codext/binary/manchester.py b/src/codext/binary/manchester.py old mode 100755 new mode 100644 similarity index 97% rename from codext/binary/manchester.py rename to src/codext/binary/manchester.py index 32f3ac5..a50181d --- a/codext/binary/manchester.py +++ b/src/codext/binary/manchester.py @@ -1,50 +1,50 @@ -# -*- coding: UTF-8 -*- -"""Manchester Codec - Manchester content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples1__ = {'enc(manchester)': {'This is a test!': "fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV"}} -__examples2__ = { - 'enc(manchester-inverted|ethernet|ieee802.4)': { - 'This is a test!': "\x99\x9a\x96j\x96i\x95\xa5\xa6\xaa\x96i\x95\xa5\xa6\xaa\x96\xa9\xa6\xaa\x95\x9a\x96\x99" - "\x95\xa5\x95\x9a\xa6\xa9", - }, -} - - -def manchester_encode(clock): - def encode(text, errors="strict"): - r = "" - for c in text: - bin_c = bin(ord(c))[2:].zfill(8) - for i in range(0, 8, 4): - r += chr(int("".join(2*bit for bit in bin_c[i:i+4]), 2) ^ clock) - return r, len(b(text)) - return encode - - -def manchester_decode(clock): - def decode(text, errors="strict"): - r, bits = "", "" - for c in text: - bin_c = bin(ord(c) ^ clock)[2:].zfill(8) - bits += "".join(bin_c[i] for i in range(0, len(bin_c), 2)) - if len(bits) == 8: - r += chr(int(bits, 2)) 
- bits = "" - return r, len(b(text)) - return decode - - -add("manchester", manchester_encode(0x55), manchester_decode(0x55), examples=__examples1__, printables_rate=.25, - entropy=lambda e: .17616*e+2.56229) -add("manchester-inverted", manchester_encode(0xaa), manchester_decode(0xaa), examples=__examples2__, - pattern=r"^(?:manchester-inverted|ethernet|ieee802\.4)$", entropy=lambda e: .17616*e+2.56229) - +# -*- coding: UTF-8 -*- +"""Manchester Codec - Manchester content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples1__ = {'enc(manchester)': {'This is a test!': "fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV"}} +__examples2__ = { + 'enc(manchester-inverted|ethernet|ieee802.4)': { + 'This is a test!': "\x99\x9a\x96j\x96i\x95\xa5\xa6\xaa\x96i\x95\xa5\xa6\xaa\x96\xa9\xa6\xaa\x95\x9a\x96\x99" + "\x95\xa5\x95\x9a\xa6\xa9", + }, +} + + +def manchester_encode(clock): + def encode(text, errors="strict"): + r = "" + for c in text: + bin_c = bin(ord(c))[2:].zfill(8) + for i in range(0, 8, 4): + r += chr(int("".join(2*bit for bit in bin_c[i:i+4]), 2) ^ clock) + return r, len(b(text)) + return encode + + +def manchester_decode(clock): + def decode(text, errors="strict"): + r, bits = "", "" + for c in text: + bin_c = bin(ord(c) ^ clock)[2:].zfill(8) + bits += "".join(bin_c[i] for i in range(0, len(bin_c), 2)) + if len(bits) == 8: + r += chr(int(bits, 2)) + bits = "" + return r, len(b(text)) + return decode + + +add("manchester", manchester_encode(0x55), manchester_decode(0x55), examples=__examples1__, printables_rate=.25, + entropy=lambda e: .17616*e+2.56229) +add("manchester-inverted", manchester_encode(0xaa), manchester_decode(0xaa), examples=__examples2__, + pattern=r"^(?:manchester-inverted|ethernet|ieee802\.4)$", entropy=lambda e: .17616*e+2.56229) + diff --git 
a/src/codext/binary/rotate.py b/src/codext/binary/rotate.py new file mode 100644 index 0000000..fb0c697 --- /dev/null +++ b/src/codext/binary/rotate.py @@ -0,0 +1,51 @@ +# -*- coding: UTF-8 -*- +"""Rotate-Bits Codec - rotate-N-bits content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(rotate-0|rotate-8|rotate-left-8)': None, + 'enc(rotate1|rotate-right-1|rotate_1)': {'This is a test': "*4\xb4\xb9\x10\xb4\xb9\x10\xb0\x10:\xb2\xb9:"}, + 'enc(rotate-left-1|rotate_left_1)': {'This is a test': "¨ÐÒæ@Òæ@Â@èÊæè"}, +} +__guess__ = ["rotate-%d" % i for i in range(1, 8)] + ["rotate-left-%d" % i for i in range(1, 8)] + + +def _getn(i): + m = 1 + if str(i).startswith("left"): + i = i[4:].lstrip("-_") + m = -1 + return m * int(i) + + +def _rotaten(text, n=1): + r = "" + for c in ensure_str(text): + b = bin(ord(c))[2:].zfill(8) + r += chr(int(b[-n:] + b[:-n], 2)) + return r + + +def rotate_encode(i): + def encode(text, errors="strict"): + return _rotaten(text, _getn(i)), len(text) + return encode + + +def rotate_decode(i): + def decode(text, errors="strict"): + return _rotaten(text, -_getn(i)), len(text) + return decode + + +add("rotate", rotate_encode, rotate_decode, r"rotate(?:[-_]?bits)?[-_]?((?:(?:left|right)[-_]?)?[1-7])$", + transitive=True) + diff --git a/src/codext/checksums/__init__.py b/src/codext/checksums/__init__.py new file mode 100644 index 0000000..bb20356 --- /dev/null +++ b/src/codext/checksums/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +from .adler import * +from .crc import * +from .luhn import * + diff --git a/src/codext/checksums/adler.py b/src/codext/checksums/adler.py new file mode 100644 index 0000000..16163d8 --- /dev/null +++ b/src/codext/checksums/adler.py @@ -0,0 +1,17 @@ +# -*- coding: UTF-8 -*- +"""Adler Codecs - Adler32 checksum 
algorithm. + +This is a codec for computing checksums, for use with other codecs in encoding chains. + +This codec: +- transforms strings from str to str +- transforms strings from bytes to bytes +- transforms file content from str to bytes (write) +""" +from zlib import adler32 + +from ..__common__ import add, b + + +add("adler32", lambda data, error="strict": (adler32(b(data)) & 0xffffffff, len(data)), guess=None) + diff --git a/codext/hashing/checksums.py b/src/codext/checksums/crc.py similarity index 96% rename from codext/hashing/checksums.py rename to src/codext/checksums/crc.py index f94dd2e..dfea7ee 100644 --- a/codext/hashing/checksums.py +++ b/src/codext/checksums/crc.py @@ -1,16 +1,14 @@ # -*- coding: UTF-8 -*- -"""Case Codecs - string common checksums. +"""CRC Codecs - Cyclic Redundancy Check checksum algorithm. -These are codecs for hashing strings, for use with other codecs in encoding chains. +This is a codec for computing checksums, for use with other codecs in encoding chains. 
-These codecs: -- transform strings from str to str -- transform strings from bytes to bytes -- transform file content from str to bytes (write) +This codec: +- transforms strings from str to str +- transforms strings from bytes to bytes +- transforms file content from str to bytes (write) """ -from zlib import adler32 - -from ..__common__ import add, b +from ..__common__ import add CRC = { @@ -214,7 +212,7 @@ }, } -_pattern = lambda n="": r"^crc" + str(n) + r"(|[-_]?(?:%s))$" % "|".join(x for x in CRC[n].keys() if len(x) > 0) +_pattern = lambda n="": rf"^crc(?:[-_]?){n}(|[-_]?(?:{'|'.join(x for x in CRC[n].keys() if len(x) > 0)}))$" _rev_int = lambda i, l=None: int(bin(i)[2:].zfill(l or len(bin(i)[2:]))[::-1], 2) @@ -261,7 +259,6 @@ def _encode(data, error="strict"): return _crc -add("adler32", lambda data, error="strict": (adler32(b(data)) & 0xffffffff, len(data)), guess=None) add("crca", crc_checksum(), pattern=_pattern(), guess=None) for i in CRC.keys(): if isinstance(i, int): diff --git a/src/codext/checksums/luhn.py b/src/codext/checksums/luhn.py new file mode 100644 index 0000000..be19af6 --- /dev/null +++ b/src/codext/checksums/luhn.py @@ -0,0 +1,33 @@ +# -*- coding: UTF-8 -*- +"""Luhn Codec - Luhn Mod N checksum algorithm. + +This is a codec for computing checksums, for use with other codecs in encoding chains. 
+ +This codec: +- transforms strings from str to str +- transforms strings from bytes to bytes +- transforms file content from str to bytes (write) +""" +from ..__common__ import * + + +def luhn(n=""): + alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"[:(mod := n if isinstance(n, int) else 10)] + def encode(data, errors="strict"): + total, data = 0, "".join(c if c in alphabet else handle_error("luhn", errors, kind="character")(c, i, data) \ + for i, c in enumerate(data)) + if not (data := ensure_str(data).upper() if mod > 10 else ensure_str(data)): + return "", 0 + for i, c in enumerate(reversed(data)): + code = alphabet.index(c) + if i % 2 == 0: + d = code * 2 + code = d % mod + d // mod + total += code + check = (mod - total % mod) % mod + return alphabet[check], len(b(data)) + return encode + + +add("luhn", luhn, pattern=r"^luhn[-_]?(\d{1,2})?$", guess=None) + diff --git a/codext/common/__init__.py b/src/codext/common/__init__.py old mode 100755 new mode 100644 similarity index 94% rename from codext/common/__init__.py rename to src/codext/common/__init__.py index 403d991..3ca65e6 --- a/codext/common/__init__.py +++ b/src/codext/common/__init__.py @@ -1,7 +1,7 @@ -# -*- coding: UTF-8 -*- -from .a1z26 import * -from .cases import * -from .dummy import * -from .octal import * -from .ordinal import * - +# -*- coding: UTF-8 -*- +from .a1z26 import * +from .cases import * +from .dummy import * +from .octal import * +from .ordinal import * + diff --git a/codext/common/a1z26.py b/src/codext/common/a1z26.py old mode 100755 new mode 100644 similarity index 96% rename from codext/common/a1z26.py rename to src/codext/common/a1z26.py index cc4de7e..b184637 --- a/codext/common/a1z26.py +++ b/src/codext/common/a1z26.py @@ -1,60 +1,60 @@ -# -*- coding: UTF-8 -*- -"""A1Z26 Codec - A1Z26 content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from string import ascii_lowercase as lower - -from ..__common__ import * - - -SEP = "-_/|,;:*" - -__examples__ = { - 'enc(a1z26-BAD)': None, - 'dec(a1z26)': {'1-12-123': None}, - 'enc(a1z26)': {'test123': None, 'this is a test': "20-8-9-19 9-19 1 20-5-19-20"}, - 'enc(a1z26-/)': {'this is a test': "20/8/9/19 9/19 1 20/5/19/20"}, -} -__guess__ = ["a1z26", "a1z26_"] + ["a1z26-" + s for s in SEP[2:]] - - -def a1z26_encode(sep): - sep = sep[-1] if len(sep) > 0 else "-" - def encode(text, errors="strict"): - words = [] - for word in text.split(): - w = [] - for k, c in enumerate(word): - try: - w.append(str(lower.index(c.lower()) + 1)) - except ValueError: - w.append(handle_error("a1z26", errors)(c, k)) - words.append(sep.join(w).strip(sep)) - return " ".join(words), len(text) - return encode - - -def a1z26_decode(sep): - sep = sep[-1] if len(sep) > 0 else "-" - def decode(text, errors="strict"): - k, words = 0, [] - for word in text.split(): - w = "" - for i in word.split(sep): - k += 1 - try: - w += lower[int(i)-1] - except IndexError: - w += handle_error("a1z26", errors, decode=True)(str(i), k) - words.append(w) - return " ".join(words), len(text) - return decode - - -add("a1z26", a1z26_encode, a1z26_decode, pattern=r"^a1z26(|[-_]|[-_][/|,;:\*])$", printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""A1Z26 Codec - A1Z26 content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from string import ascii_lowercase as lower + +from ..__common__ import * + + +SEP = "-_/|,;:*" + +__examples__ = { + 'enc(a1z26-BAD)': None, + 'dec(a1z26)': {'1-12-123': None}, + 'enc(a1z26)': {'test123': None, 'this is a test': "20-8-9-19 9-19 1 20-5-19-20"}, + 'enc(a1z26-/)': {'this is a test': "20/8/9/19 9/19 1 20/5/19/20"}, +} +__guess__ = ["a1z26", "a1z26_"] + ["a1z26-" + s for s in SEP[2:]] + + +def a1z26_encode(sep): + sep = sep[-1] if len(sep) > 0 else "-" + def encode(text, errors="strict"): + words = [] + for word in text.split(): + w = [] + for k, c in enumerate(word): + try: + w.append(str(lower.index(c.lower()) + 1)) + except ValueError: + w.append(handle_error("a1z26", errors)(c, k)) + words.append(sep.join(w).strip(sep)) + return " ".join(words), len(text) + return encode + + +def a1z26_decode(sep): + sep = sep[-1] if len(sep) > 0 else "-" + def decode(text, errors="strict"): + k, words = 0, [] + for word in text.split(): + w = "" + for i in word.split(sep): + k += 1 + try: + w += lower[int(i)-1] + except IndexError: + w += handle_error("a1z26", errors, decode=True)(str(i), k) + words.append(w) + return " ".join(words), len(text) + return decode + + +add("a1z26", a1z26_encode, a1z26_decode, pattern=r"^a1z26(|[-_]|[-_][/|,;:\*])$", printables_rate=1.) 
+ diff --git a/codext/common/cases.py b/src/codext/common/cases.py similarity index 86% rename from codext/common/cases.py rename to src/codext/common/cases.py index 8aa87e4..2f91ada 100644 --- a/codext/common/cases.py +++ b/src/codext/common/cases.py @@ -27,11 +27,12 @@ add("lowercase", lowercase, uppercase, r"^lower(?:case)?$", penalty=.2) slugify = lambda i, e="strict", d="-": (re.sub(r"[^0-9a-z]+", d, i.lower()).strip(d), len(i)) -add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|kebab(?:[-_]?case)?)$") +add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|(?:dash|kebab)(?:[-_]?case)?)$") add("snakecase", lambda i, e="strict": slugify(i, e, "_"), None, r"^snake(?:[-_]?case)?$") +add("screamingsnakecase", lambda i, e="strict": slugify(i, e, "_").upper(), None, r"^screaming[-_]snake(?:[-_]?case)?$") swapcase = lambda i, e="strict": (i.swapcase(), len(i)) -add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$", penalty=.2) +add("swapcase", swapcase, swapcase, r"^(?:(?:flip|swap)(?:[-_]?case)?|invert(?:case)?)$", penalty=.2) title = lambda i, e="strict": (i.title(), len(i)) untitle = lambda i, e="strict": (" ".join(w[0].lower() + w[1:] if len(w) > 0 else "" for w in i.split()), len(i)) diff --git a/codext/common/dummy.py b/src/codext/common/dummy.py old mode 100755 new mode 100644 similarity index 77% rename from codext/common/dummy.py rename to src/codext/common/dummy.py index 7f4be19..e6d73cb --- a/codext/common/dummy.py +++ b/src/codext/common/dummy.py @@ -1,47 +1,57 @@ -# -*- coding: UTF-8 -*- -"""Dummy Codecs - simple string manipulations. - -These are dummy codecs for manipulating strings, for use with other codecs in encoding/decoding chains. 
- -These codecs: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import re - -from ..__common__ import * - - -def replace(pair, *args): - def code(input, errors="strict"): - return input.replace(pair[0], pair[1]), len(input) - return code -add("replace", replace, replace, r"^replace[-_]?((?!.*(.).*\2)..)$", guess=None) -# important note: ^ -# using "{2}" here instead will break the codec -# this is due to the fact the codext.__common__.generate_string_from_regex DOES NOT handle ASSERT_NOT (?!) and will -# faill to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo - - -def substitute(token, replacement): - def code(input, errors="strict"): - return input.replace(token, replacement), len(input) - return code -add("substitute", substitute, substitute, r"^substitute[-_]?(.*?)/(.*?)$", guess=None) - - -reverse = lambda i, e="strict": (i[::-1], len(i)) -add("reverse", reverse, reverse) - -_revl = lambda i, wd=False: "".join((" ".join(w[::-1] for w in l.split()) if wd else l[::-1]) \ - if not re.match(r"(\r?\n)", l) else l for l in re.split(r"(\r?\n)", i)) -line_reverse = lambda i, e="strict": (_revl(i), len(i)) -add("reverse-lines", line_reverse, line_reverse, r"^reverse[-_]lines$") -word_reverse = lambda i, e="strict": (_revl(i, True), len(i)) -add("reverse-words", word_reverse, word_reverse, r"^reverse[-_]words$") - -strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i)) -add("strip-spaces", strip_spaces, strip_spaces, guess=None) - +# -*- coding: UTF-8 -*- +"""Dummy Codecs - simple string manipulations. + +These are dummy codecs for manipulating strings, for use with other codecs in encoding/decoding chains. 
+ +These codecs: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import re + +from ..__common__ import * + + +def replace(pair, *args): + def code(input, errors="strict"): + return input.replace(pair[0], pair[1]), len(input) + return code +add("replace", replace, replace, r"^replace[-_]?((?!.*(.).*\2)..)$", guess=None) +# important note: ^ +# using "{2}" here instead will break the codec +# this is due to the fact the codext.__common__.generate_string_from_regex DOES NOT handle ASSERT_NOT (?!) and will +# fail to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo + + +def substitute(token, replacement): + def code(input, errors="strict"): + return input.replace(token, replacement), len(input) + return code +add("substitute", substitute, substitute, r"^substitute[-_]?(.*?)/(.*?)$", guess=None) + + +reverse = lambda i, e="strict": (i[::-1], len(i)) +add("reverse", reverse, reverse) + +_revl = lambda i, wd=False: "".join((" ".join(w[::-1] for w in l.split()) if wd else l[::-1]) \ + if not re.match(r"(\r?\n)", l) else l for l in re.split(r"(\r?\n)", i)) +line_reverse = lambda i, e="strict": (_revl(i), len(i)) +add("reverse-lines", line_reverse, line_reverse, r"^reverse[-_]lines$") +word_reverse = lambda i, e="strict": (_revl(i, True), len(i)) +add("reverse-words", word_reverse, word_reverse, r"^reverse[-_]words$") + +strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i)) +add("strip-spaces", strip_spaces, strip_spaces, guess=None) + +def tokenize(n): + tlen = int(n[8:].lstrip("-_")) + def code(input, errors="strict"): + l = len(input) + if tlen > l: + raise LookupError("unknown encoding: %s" % n) + return " ".join(input[i:i+tlen] for i in range(0, l, tlen)), l + return code +add("tokenize", tokenize, tokenize, r"^(tokenize[-_]?[1-9][0-9]*)$", guess=None) + diff --git 
a/codext/common/octal.py b/src/codext/common/octal.py old mode 100755 new mode 100644 similarity index 96% rename from codext/common/octal.py rename to src/codext/common/octal.py index 6559409..9165065 --- a/codext/common/octal.py +++ b/src/codext/common/octal.py @@ -1,31 +1,31 @@ -# -*- coding: UTF-8 -*- -"""Octal Codec - octal content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples1__ = { - 'enc(octal-spaced|octals_spaced)': {'this is a test': "164 150 151 163 40 151 163 40 141 40 164 145 163 164"}, -} -__examples2__ = { - 'enc(octal|octals)': {'this is a test': "164150151163040151163040141040164145163164"}, -} - - -oct2 = lambda i: oct(i).lstrip("0").replace("o", "") - -ENCMAP1 = {chr(i): oct2(i) for i in range(256)} -ENCMAP2 = {chr(i): oct2(i).zfill(3) for i in range(256)} - - -add_map("octal-spaced", ENCMAP1, sep=" ", pattern=r"^octals?[-_]spaced$", examples=__examples1__, - entropy=lambda e: .07258*e+2.3739, printables_rate=1.) -add_map("octal", ENCMAP2, pattern=r"^octals?$", examples=__examples2__, entropy=lambda e: .08803*e+2.19498, - printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""Octal Codec - octal content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples1__ = { + 'enc(octal-spaced|octals_spaced)': {'this is a test': "164 150 151 163 40 151 163 40 141 40 164 145 163 164"}, +} +__examples2__ = { + 'enc(octal|octals)': {'this is a test': "164150151163040151163040141040164145163164"}, +} + + +oct2 = lambda i: oct(i).lstrip("0").replace("o", "") + +ENCMAP1 = {chr(i): oct2(i) for i in range(256)} +ENCMAP2 = {chr(i): oct2(i).zfill(3) for i in range(256)} + + +add_map("octal-spaced", ENCMAP1, sep=" ", pattern=r"^octals?[-_]spaced$", examples=__examples1__, + entropy=lambda e: .07258*e+2.3739, printables_rate=1.) +add_map("octal", ENCMAP2, pattern=r"^octals?$", examples=__examples2__, entropy=lambda e: .08803*e+2.19498, + printables_rate=1.) + diff --git a/codext/common/ordinal.py b/src/codext/common/ordinal.py old mode 100755 new mode 100644 similarity index 96% rename from codext/common/ordinal.py rename to src/codext/common/ordinal.py index 4bf576b..6d4d227 --- a/codext/common/ordinal.py +++ b/src/codext/common/ordinal.py @@ -1,28 +1,28 @@ -# -*- coding: UTF-8 -*- -"""Ordinal Codec - ordinal content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples1__ = { - 'enc(ordinal-spaced|ordinals_spaced)': {'this is a test': "116 104 105 115 32 105 115 32 97 32 116 101 115 116"}, -} -__examples2__ = { - 'enc(ordinal|ordinals)': {'this is a test': "116104105115032105115032097032116101115116"}, -} - - -ENCMAP1 = {chr(i): str(i) for i in range(256)} -ENCMAP2 = {chr(i): str(i).zfill(3) for i in range(256)} - - -add_map("ordinal-spaced", ENCMAP1, sep=" ", pattern=r"^ordinals?[-_]spaced$", examples=__examples1__, entropy=3., - printables_rate=1.) -add_map("ordinal", ENCMAP2, pattern=r"^ordinals?$", examples=__examples2__, entropy=3., printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""Ordinal Codec - ordinal content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples1__ = { + 'enc(ordinal-spaced|ordinals_spaced)': {'this is a test': "116 104 105 115 32 105 115 32 97 32 116 101 115 116"}, +} +__examples2__ = { + 'enc(ordinal|ordinals)': {'this is a test': "116104105115032105115032097032116101115116"}, +} + + +ENCMAP1 = {chr(i): str(i) for i in range(256)} +ENCMAP2 = {chr(i): str(i).zfill(3) for i in range(256)} + + +add_map("ordinal-spaced", ENCMAP1, sep=" ", pattern=r"^ordinals?[-_]spaced$", examples=__examples1__, entropy=3., + printables_rate=1.) +add_map("ordinal", ENCMAP2, pattern=r"^ordinals?$", examples=__examples2__, entropy=3., printables_rate=1.) 
+ diff --git a/codext/compressions/__init__.py b/src/codext/compressions/__init__.py old mode 100755 new mode 100644 similarity index 95% rename from codext/compressions/__init__.py rename to src/codext/compressions/__init__.py index 606a1dc..eae3eee --- a/codext/compressions/__init__.py +++ b/src/codext/compressions/__init__.py @@ -1,12 +1,12 @@ -# -*- coding: UTF-8 -*- -from .gzipp import * -from .lz77 import * -from .lz78 import * -from .pkzip import * - - -for e in list_encodings("compression"): - ci = lookup(e, False) - ci.parameters['scoring']['entropy'] = 7.9 - ci.parameters['scoring']['expansion_factor'] = lambda f: f - +# -*- coding: UTF-8 -*- +from .gzipp import * +from .lz77 import * +from .lz78 import * +from .pkzip import * + + +for e in list_encodings("compression"): + ci = lookup(e, False) + ci.parameters['scoring']['entropy'] = 7.9 + ci.parameters['scoring']['expansion_factor'] = lambda f: f + diff --git a/codext/compressions/gzipp.py b/src/codext/compressions/gzipp.py old mode 100755 new mode 100644 similarity index 96% rename from codext/compressions/gzipp.py rename to src/codext/compressions/gzipp.py index 14e65bc..e162239 --- a/codext/compressions/gzipp.py +++ b/src/codext/compressions/gzipp.py @@ -1,44 +1,44 @@ -# -*- coding: UTF-8 -*- -"""Gzip Codec - gzip content compression. - -NB: Not an encoding properly speaking. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import zlib -from gzip import GzipFile - -from ..__common__ import * - - -__examples__ = {'enc-dec(gzip)': ["test", "This is a test", "@random{512,1024,2048}"]} - - -def gzip_compress(text, errors="strict"): - out = BytesIO() - with GzipFile(fileobj=out, mode="wb") as f: - f.write(b(text)) - return out.getvalue(), len(text) - - -def gzip_decompress(data, errors="strict"): - # then try decompressing considering the file signature - try: - with GzipFile(fileobj=BytesIO(b(data)), mode="rb") as f: - r = f.read() - except: - pass - # try decompressing without considering the file signature - try: - r = zlib.decompress(b(data), 16 + zlib.MAX_WBITS) - except: - return handle_error("gzip", errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) - return r, len(r) - - -add("gzip", gzip_compress, gzip_decompress) - +# -*- coding: UTF-8 -*- +"""Gzip Codec - gzip content compression. + +NB: Not an encoding properly speaking. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import zlib +from gzip import GzipFile + +from ..__common__ import * + + +__examples__ = {'enc-dec(gzip)': ["test", "This is a test", "@random{512,1024,2048}"]} + + +def gzip_compress(text, errors="strict"): + out = BytesIO() + with GzipFile(fileobj=out, mode="wb") as f: + f.write(b(text)) + return out.getvalue(), len(text) + + +def gzip_decompress(data, errors="strict"): + # then try decompressing considering the file signature + try: + with GzipFile(fileobj=BytesIO(b(data)), mode="rb") as f: + r = f.read() + except: + pass + # try decompressing without considering the file signature + try: + r = zlib.decompress(b(data), 16 + zlib.MAX_WBITS) + except: + return handle_error("gzip", errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) + return r, len(r) + + +add("gzip", gzip_compress, gzip_decompress) + diff --git a/codext/compressions/lz77.py b/src/codext/compressions/lz77.py similarity index 100% rename from codext/compressions/lz77.py rename to src/codext/compressions/lz77.py diff --git a/codext/compressions/lz78.py b/src/codext/compressions/lz78.py similarity index 100% rename from codext/compressions/lz78.py rename to src/codext/compressions/lz78.py diff --git a/src/codext/compressions/pkzip.py b/src/codext/compressions/pkzip.py new file mode 100644 index 0000000..35ec94e --- /dev/null +++ b/src/codext/compressions/pkzip.py @@ -0,0 +1,55 @@ +# -*- coding: UTF-8 -*- +"""Pkzip Codec - pkzip content compression. + +NB: Not an encoding properly speaking. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import zipfile + +from ..__common__ import * + + +_str = ["test", "This is a test", "@random{512,1024,2048}"] +__examples1__ = {'enc-dec(pkzip-deflate|deflate)': _str} +__examples2__ = {'enc-dec(pkzip_bz2|bzip2)': _str} +__examples3__ = {'enc-dec(pkzip-lzma|lzma)': _str} + + +NULL = { + 8: b"\x03\x00", + 12: b"BZh9\x17rE8P\x90\x00\x00\x00\x00", + 14: b"\t\x04\x05\x00]\x00\x00\x80\x00\x00\x83\xff\xfb\xff\xff\xc0\x00\x00\x00", +} + + +def pkzip_encode(compression_type): + def _encode(text, errors="strict"): + c = zipfile._get_compressor(compression_type) + return c.compress(b(text)) + c.flush(), len(text) + return _encode + + +def pkzip_decode(compression_type, name): + def _decode(data, errors="strict"): + d = zipfile._get_decompressor(compression_type) + r = d.decompress(b(data)) + if len(r) == 0 and b(data) != NULL[compression_type]: + return handle_error(name, errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) + return r, len(r) + return _decode + + +add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate", + examples=__examples1__, guess=["deflate"]) + +add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2", + examples=__examples2__, guess=["bz2"]) + +add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma", + examples=__examples3__, guess=["lzma"]) + diff --git a/codext/crypto/__init__.py b/src/codext/crypto/__init__.py old mode 100755 new mode 100644 similarity index 79% rename from codext/crypto/__init__.py rename to src/codext/crypto/__init__.py index 6928637..21da6d9 --- a/codext/crypto/__init__.py +++ b/src/codext/crypto/__init__.py @@ -1,12 +1,14 @@ -# -*- coding: UTF-8 -*- -from .affine import * -from .atbash import * -from .bacon import * 
-from .barbie import * -from .citrix import * -from .railfence import * -from .rot import * -from .scytale import * -from .shift import * -from .xor import * - +# -*- coding: UTF-8 -*- +from .affine import * +from .atbash import * +from .bacon import * +from .barbie import * +from .citrix import * +from .polybius import * +from .railfence import * +from .rot import * +from .scytale import * +from .shift import * +from .vigenere import * +from .xor import * + diff --git a/codext/crypto/affine.py b/src/codext/crypto/affine.py old mode 100755 new mode 100644 similarity index 96% rename from codext/crypto/affine.py rename to src/codext/crypto/affine.py index cc18818..59c9d34 --- a/codext/crypto/affine.py +++ b/src/codext/crypto/affine.py @@ -1,32 +1,32 @@ -# -*- coding: UTF-8 -*- -"""Affine Cipher Codec - affine content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: https://crypto.interactive-maths.com/affine-cipher.html -""" -from ..__common__ import * - - -__guess__ = [] - - -def encmap_factory(mask=None): - mask = mask or "?l?u?s-1,2" - mask, key = mask.split("-") - a, b = map(int, key.split(",")) - alphabet = get_alphabet_from_mask(mask) - encmap = {c: alphabet[(a * alphabet.index(c) + b) % len(alphabet)] for c in alphabet} - if len(set(encmap.keys())) != len(set(encmap.values())): - raise LookupError("Bad parameter for encoding 'affine': {}, {}".format(a, b)) - if ' ' not in encmap.keys(): - encmap[' '] = " " - return encmap - - -add_map("affine", encmap_factory, pattern=r"^affine(?:[-_]cipher)?(?:[-_](.+?\-\d+\,\d+))?$") - +# -*- coding: UTF-8 -*- +"""Affine Cipher Codec - affine content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: https://crypto.interactive-maths.com/affine-cipher.html +""" +from ..__common__ import * + + +__guess__ = [] + + +def encmap_factory(mask=None): + mask = mask or "?l?u?s-1,2" + mask, key = mask.split("-") + a, b = map(int, key.split(",")) + alphabet = get_alphabet_from_mask(mask) + encmap = {c: alphabet[(a * alphabet.index(c) + b) % len(alphabet)] for c in alphabet} + if len(set(encmap.keys())) != len(set(encmap.values())): + raise LookupError("Bad parameter for encoding 'affine': {}, {}".format(a, b)) + if ' ' not in encmap.keys(): + encmap[' '] = " " + return encmap + + +add_map("affine", encmap_factory, pattern=r"^affine(?:[-_]cipher)?(?:[-_](.+?\-\d+\,\d+))?$") + diff --git a/codext/crypto/atbash.py b/src/codext/crypto/atbash.py old mode 100755 new mode 100644 similarity index 96% rename from codext/crypto/atbash.py rename to src/codext/crypto/atbash.py index 5cb9f83..b6dbf16 --- a/codext/crypto/atbash.py +++ b/src/codext/crypto/atbash.py @@ -1,34 +1,34 @@ -# -*- coding: UTF-8 -*- -"""Atbash Cipher Codec - atbash-based content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: https://crypto.interactive-maths.com/atbash-cipher.html -""" -from ..__common__ import * - - -__guess__ = ["atbash"] - - -def encmap_factory(mask=None): - mask = mask or "?u?l" - # [...] 
enclosure causes the mask to be handled as a whole - if mask[0] == "[" and mask[-1] == "]": - alphabet = get_alphabet_from_mask(mask[1:-1]) - return {k: v for k, v in zip(alphabet, alphabet[::-1])} - # not enclosing the whole mask means that each group is to be considered separately - else: - m = {} - for group in re.findall(r"(\?.|[^?]+)", mask): - alphabet = get_alphabet_from_mask(group) - m.update({k: v for k, v in zip(alphabet, alphabet[::-1])}) - return m - - -add_map("atbash", encmap_factory, no_error=True, pattern=r"atbash(?:[-_]cipher)?(?:[-_](.+))?$") - +# -*- coding: UTF-8 -*- +"""Atbash Cipher Codec - atbash-based content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: https://crypto.interactive-maths.com/atbash-cipher.html +""" +from ..__common__ import * + + +__guess__ = ["atbash"] + + +def encmap_factory(mask=None): + mask = mask or "?u?l" + # [...] enclosure causes the mask to be handled as a whole + if mask[0] == "[" and mask[-1] == "]": + alphabet = get_alphabet_from_mask(mask[1:-1]) + return {k: v for k, v in zip(alphabet, alphabet[::-1])} + # not enclosing the whole mask means that each group is to be considered separately + else: + m = {} + for group in re.findall(r"(\?.|[^?]+)", mask): + alphabet = get_alphabet_from_mask(group) + m.update({k: v for k, v in zip(alphabet, alphabet[::-1])}) + return m + + +add_map("atbash", encmap_factory, no_error=True, pattern=r"atbash(?:[-_]cipher)?(?:[-_](.+))?$") + diff --git a/codext/crypto/bacon.py b/src/codext/crypto/bacon.py old mode 100755 new mode 100644 similarity index 97% rename from codext/crypto/bacon.py rename to src/codext/crypto/bacon.py index e7daf92..a7048ef --- a/codext/crypto/bacon.py +++ b/src/codext/crypto/bacon.py @@ -1,36 +1,36 @@ -# -*- coding: UTF-8 -*- -"""Bacon's Cipher Codec - bacon content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: https://en.wikipedia.org/wiki/Bacon%27s_cipher -""" -from ..__common__ import * - - -__examples__ = { - 'enc(bacon|bacon_cipher|baconian-cipher|bacon-ab|bacon_AB)': { - 'this is a test': "baabaaabbbabaaabaaab abaaabaaab aaaaa baabaaabaabaaabbaaba", - }, - 'enc(bacon-01|bacon_01)': { - 'this is a test': "10010001110100010001 0100010001 00000 10010001001000110010", - }, -} -__guess__ = {"bacon", "bacon-ba", "bacon-01", "bacon-10"} - - -ENCMAP = { - 'A': "aaaaa", 'B': "aaaab", 'C': "aaaba", 'D': "aaabb", 'E': "aabaa", 'F': "aabab", 'G': "aabba", 'H': "aabbb", - 'I': "abaaa", 'J': "abaaa", 'K': "abaab", 'L': "ababa", 'M': "ababb", 'N': "abbaa", 'O': "abbab", 'P': "abbba", - 'Q': "abbbb", 'R': "baaaa", 'S': "baaab", 'T': "baaba", 'U': "baabb", 'V': "baabb", 'W': "babaa", 'X': "babab", - 'Y': "babba", 'Z': "babbb", ' ': " ", -} - - -add_map("bacon", ENCMAP, ignore_case="both", pattern=r"bacon(?:(?:ian)?[-_]cipher)?([\-_].{2})?$", expansion_factor=5., - printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""Bacon's Cipher Codec - bacon content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: https://en.wikipedia.org/wiki/Bacon%27s_cipher +""" +from ..__common__ import * + + +__examples__ = { + 'enc(bacon|bacon_cipher|baconian-cipher|bacon-ab|bacon_AB)': { + 'this is a test': "baabaaabbbabaaabaaab abaaabaaab aaaaa baabaaabaabaaabbaaba", + }, + 'enc(bacon-01|bacon_01)': { + 'this is a test': "10010001110100010001 0100010001 00000 10010001001000110010", + }, +} +__guess__ = {"bacon", "bacon-ba", "bacon-01", "bacon-10"} + + +ENCMAP = { + 'A': "aaaaa", 'B': "aaaab", 'C': "aaaba", 'D': "aaabb", 'E': "aabaa", 'F': "aabab", 'G': "aabba", 'H': "aabbb", + 'I': "abaaa", 'J': "abaaa", 'K': "abaab", 'L': "ababa", 'M': "ababb", 'N': "abbaa", 'O': "abbab", 'P': "abbba", + 'Q': "abbbb", 'R': "baaaa", 'S': "baaab", 'T': "baaba", 'U': "baabb", 'V': "baabb", 'W': "babaa", 'X': "babab", + 'Y': "babba", 'Z': "babbb", ' ': " ", +} + + +add_map("bacon", ENCMAP, ignore_case="both", pattern=r"bacon(?:(?:ian)?[-_]cipher)?([\-_].{2})?$", expansion_factor=5., + printables_rate=1.) + diff --git a/codext/crypto/barbie.py b/src/codext/crypto/barbie.py old mode 100755 new mode 100644 similarity index 97% rename from codext/crypto/barbie.py rename to src/codext/crypto/barbie.py index 4e593b8..830119a --- a/codext/crypto/barbie.py +++ b/src/codext/crypto/barbie.py @@ -1,54 +1,54 @@ -# -*- coding: UTF-8 -*- -"""Barbie typewriter Codec - barbie content encoding. - -While Barbie typewriter is more a cipher, its very limited key size of 2 bits makes it easy to turn into four variants - of the same encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: http://www.cryptomuseum.com/crypto/mehano/barbie/ -""" -from ..__common__ import * - - -__examples__ = { - 'enc(barbie1)': {'\r': None}, - 'enc(barbie1|barbie_1|barbie-1)': {'this is a test': "hstf tf i hafh"}, - 'enc(barbie2|barbie_2|barbie-2)': {'this is a test': "sfhp hp t sips"}, - 'enc(barbie3|barbie_3|barbie-3)': {'this is a test': "fpsu su h ftuf"}, - 'enc(barbie4|barbie_4|barbie-4)': {'this is a test': "pufq fq s phqp"}, -} -__guess__ = ["barbie-%d" % i for i in range(1, 5)] - - -STD = [ - "abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ0123456 \n\t", - "icolapxstvybjeruknfhqg>FAUTCYOLVJDZINQKSEHG<.1PB5234067 \n\t", - "torbiudfhgzcvanqyepskxRC>GHAPNDQIUXSPNRKLG1XYCUDV ¢ £ § €", - "; d z w 8 9 - ¨ _ & m @ : \" * ( # W M § ^ , ¢ / ? ! ) % X \' R + € £ =", - "¢ l w ; 9 - ¨ § ) \" j ? , m # * @ . Z £ ! W + ^ / & ( : 1 _ S % = € \'", - "+ b ; ¢ - ¨ § £ ( m v / W j @ # ? M B € & . % ! ^ \" * , 2 ) E : \' = _", - "% c ¢ + ¨ § £ € * j g ^ . v ? @ / Z F = \" N : & ! m # W 3 ( T , _ \' )", -] -ENCMAP = [] -for i in range(4): - encmap = {} - for j, c in enumerate(STD[0]): - encmap[c] = STD[i+1][j] - spec = SPEC[i+1].split() - for j, c in enumerate(SPEC[0].split()): - encmap[c] = spec[j] - ENCMAP.append(encmap) - - -add_map("barbie", ENCMAP, pattern=r"^barbie[-_]?([1-4])$", printables_rate=lambda pr: .857 * pr) - +# -*- coding: UTF-8 -*- +"""Barbie typewriter Codec - barbie content encoding. + +While Barbie typewriter is more a cipher, its very limited key size of 2 bits makes it easy to turn into four variants + of the same encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: http://www.cryptomuseum.com/crypto/mehano/barbie/ +""" +from ..__common__ import * + + +__examples__ = { + 'enc(barbie1)': {'\r': None}, + 'enc(barbie1|barbie_1|barbie-1)': {'this is a test': "hstf tf i hafh"}, + 'enc(barbie2|barbie_2|barbie-2)': {'this is a test': "sfhp hp t sips"}, + 'enc(barbie3|barbie_3|barbie-3)': {'this is a test': "fpsu su h ftuf"}, + 'enc(barbie4|barbie_4|barbie-4)': {'this is a test': "pufq fq s phqp"}, +} +__guess__ = ["barbie-%d" % i for i in range(1, 5)] + + +STD = [ + "abcdefghijklmnopqrstuvABCDEFGHIJKLMNOPQRSTUVWXYZ0123456 \n\t", + "icolapxstvybjeruknfhqg>FAUTCYOLVJDZINQKSEHG<.1PB5234067 \n\t", + "torbiudfhgzcvanqyepskxRC>GHAPNDQIUXSPNRKLG1XYCUDV ¢ £ § €", + "; d z w 8 9 - ¨ _ & m @ : \" * ( # W M § ^ , ¢ / ? ! ) % X \' R + € £ =", + "¢ l w ; 9 - ¨ § ) \" j ? , m # * @ . Z £ ! W + ^ / & ( : 1 _ S % = € \'", + "+ b ; ¢ - ¨ § £ ( m v / W j @ # ? M B € & . % ! ^ \" * , 2 ) E : \' = _", + "% c ¢ + ¨ § £ € * j g ^ . v ? @ / Z F = \" N : & ! m # W 3 ( T , _ \' )", +] +ENCMAP = [] +for i in range(4): + encmap = {} + for j, c in enumerate(STD[0]): + encmap[c] = STD[i+1][j] + spec = SPEC[i+1].split() + for j, c in enumerate(SPEC[0].split()): + encmap[c] = spec[j] + ENCMAP.append(encmap) + + +add_map("barbie", ENCMAP, pattern=r"^barbie[-_]?([1-4])$", printables_rate=lambda pr: .857 * pr) + diff --git a/codext/crypto/citrix.py b/src/codext/crypto/citrix.py similarity index 96% rename from codext/crypto/citrix.py rename to src/codext/crypto/citrix.py index 43ac77b..c361eab 100644 --- a/codext/crypto/citrix.py +++ b/src/codext/crypto/citrix.py @@ -1,52 +1,52 @@ -# -*- coding: UTF-8 -*- -"""Citrix Codec - citrix password encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) - -Reference: https://crypto.interactive-maths.com/atbash-cipher.html -""" -from ..__common__ import * - - -__examples__ = { - 'enc(citrix-ctx0)': None, - 'enc(citrix|citrix-1|citrix_ctx1)': {'this is a test': "NBBMNAAGIDEPJJBMNIFNIMEMJKEL"}, -} -__guess__ = ["citrix-ctx1"] - - -_dec = lambda g: ((ord(g[0]) - 0x41) & 0xf) ^ ((((ord(g[1]) - 0x41) & 0xf) << 4) & 0xf0) -_enc = lambda o: chr(((o >> 4) & 0xf) + 0x41) + chr((o & 0xf) + 0x41) - - -def citrix_encode(t): - def encode(text, errors="strict"): - l = len(text) - r, x = "", 0 - for c in text: - x = ord(c) ^ 0xa5 ^ x - r += _enc(x) - return r, l - return encode - - -def citrix_decode(t): - def decode(text, errors="strict"): - l = len(text) - text = text[::-1] - r = "" - for i in range(0, l, 2): - x = 0 if i + 2 >= l else _dec(text[i+2:i+4]) - x ^= _dec(text[i:i+2]) ^ 0xa5 - r += chr(x) - return r[::-1], l - return decode - - -add("citrix", citrix_encode, citrix_decode, r"citrix(|[-_]?(?:ctx)?1)$", entropy=4., printables_rate=1., - expansion_factor=2.) - +# -*- coding: UTF-8 -*- +"""Citrix Codec - citrix password encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) + +Reference: https://crypto.interactive-maths.com/atbash-cipher.html +""" +from ..__common__ import * + + +__examples__ = { + 'enc(citrix-ctx0)': None, + 'enc(citrix|citrix-1|citrix_ctx1)': {'this is a test': "NBBMNAAGIDEPJJBMNIFNIMEMJKEL"}, +} +__guess__ = ["citrix-ctx1"] + + +_dec = lambda g: ((ord(g[0]) - 0x41) & 0xf) ^ ((((ord(g[1]) - 0x41) & 0xf) << 4) & 0xf0) +_enc = lambda o: chr(((o >> 4) & 0xf) + 0x41) + chr((o & 0xf) + 0x41) + + +def citrix_encode(t): + def encode(text, errors="strict"): + l = len(text) + r, x = "", 0 + for c in text: + x = ord(c) ^ 0xa5 ^ x + r += _enc(x) + return r, l + return encode + + +def citrix_decode(t): + def decode(text, errors="strict"): + l = len(text) + text = text[::-1] + r = "" + for i in range(0, l, 2): + x = 0 if i + 2 >= l else _dec(text[i+2:i+4]) + x ^= _dec(text[i:i+2]) ^ 0xa5 + r += chr(x) + return r[::-1], l + return decode + + +add("citrix", citrix_encode, citrix_decode, r"citrix(|[-_]?(?:ctx)?1)$", entropy=4., printables_rate=1., + expansion_factor=2.) + diff --git a/src/codext/crypto/polybius.py b/src/codext/crypto/polybius.py new file mode 100755 index 0000000..73fae76 --- /dev/null +++ b/src/codext/crypto/polybius.py @@ -0,0 +1,77 @@ +# -*- coding: UTF-8 -*- +"""Polybius Square Codec - polybius-square content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(polybius|polybius-square|polybius_square)': {'this is a test': "44232443 2443 11 44154344"}, + 'enc(polybius-ABCDEFGHIKLMNOPQRSTUVWXYZ)': {'this is a test': "44232443 2443 11 44154344"}, + 'dec(polybius)': {'44232443 2443 11 44154344': "THIS IS A TEST"}, +} +__guess__ = ["polybius"] + + +# Standard 5×5 Polybius square (I and J share the same cell): +# 1 2 3 4 5 +# 1 A B C D E +# 2 F G H I K +# 3 L M N O P +# 4 Q R S T U +# 5 V W X Y Z +_DEFAULT_ALPHABET = "ABCDEFGHIKLMNOPQRSTUVWXYZ" + + +def __make_maps(alphabet): + """ Build the encoding and decoding maps for the given 25-character alphabet. """ + alph = alphabet.upper() if alphabet else _DEFAULT_ALPHABET + if len(alph) != 25 or len(set(alph)) != 25: + raise LookupError("Polybius square requires exactly 25 distinct characters; " + f"got {len(alph)} character(s) with {len(set(alph))} unique: {alph}") + encmap = {alph[i]: str(i // 5 + 1) + str(i % 5 + 1) for i in range(25)} + decmap = {v: k for k, v in encmap.items()} + if 'J' not in encmap and 'I' in encmap: + encmap['J'] = encmap['I'] + encmap[' '] = ' ' + return encmap, decmap + + +def polybius_encode(alphabet=_DEFAULT_ALPHABET): + encmap, _ = __make_maps(alphabet) + def encode(text, errors="strict"): + _h = handle_error("polybius", errors) + r = "" + for pos, c in enumerate(ensure_str(text).upper()): + r += encmap[c] if c in encmap else _h(c, pos, r) + return r, len(text) + return encode + + +def polybius_decode(alphabet=_DEFAULT_ALPHABET): + _, decmap = __make_maps(alphabet) + def decode(text, errors="strict"): + _h = handle_error("polybius", errors, decode=True) + r, t, i = "", ensure_str(text), 0 + while i < len(t): + if t[i] == " ": + r += " " + i += 1 + elif i + 1 < len(t): + r += decmap.get(t[i:i+2]) or _h(t[i:i+2], 
i, r) + i += 2 + else: + r += _h(t[i], i, r) + i += 1 + return r, len(t) + return decode + + +add("polybius", polybius_encode, polybius_decode, r"^polybius(?:[-_]square)?(?:[-_]([A-Za-z]{25}))?$", + printables_rate=1., expansion_factor=(1.7, .3)) + diff --git a/codext/crypto/railfence.py b/src/codext/crypto/railfence.py similarity index 94% rename from codext/crypto/railfence.py rename to src/codext/crypto/railfence.py index 3d150c0..a25f27a 100644 --- a/codext/crypto/railfence.py +++ b/src/codext/crypto/railfence.py @@ -1,96 +1,96 @@ -# -*- coding: UTF-8 -*- -"""Rail Fence Cipher Codec - rail fence content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(rail_123|rail-2-123)': {'this is a test': None}, - 'enc(railfence|zigzag)': {'this is a test': "t ashsi etist"}, - 'enc(rail-5|zigzag_5)': {'this is a test': "tah istsiet s"}, - 'enc(rail_5-3|rail_5_3)': {'this is a test': "it sss etiath "}, - 'enc(rail-5-3-up|rail_5_3-up)': {'this is a test': "h tiats e ssit"}, - 'enc(rail-7-4|rail_7_4)': {'this is a test': "a stiet shsti"}, - 'dec(zigzag)': {'': ""}, -} -__guess__ = ["railfence-%d" % i for i in range(1, 11)] + ["railfence-%d-up" % i for i in range(1, 11)] - - -def __build(text, rails, offset, up): - l, rail = len(text), offset - # set the starting rail and direction - if up: - dr = -1 - rail = rails - offset - 1 - else: - dr = 1 - # create rails - f = [[None] * l for i in range(rails)] - # now zig-zag between rails - for x in range(l): - f[rail][x] = text[x] - if rail >= rails - 1: - dr = -1 - elif rail <= 0: - dr = 1 - rail += dr - return f - - -def __check(length, rails, offset): - if rails > length: - raise ParameterError("Bad parameter for encoding 'railfence': rails=%d (should be >%d)" % (rails, length)) - if offset > rails: - raise ParameterError("Bad 
parameter for encoding 'railfence': offset=%d (should be >%d)" % (offset, rails)) - - -def railfence_encode(rails, offset, up): - rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" - def encode(text, errors="strict"): - r, l = "", len(text) - __check(l, rails, offset) - f = __build(text, rails, offset, up) - for rail in range(rails): - for x in range(l): - if f[rail][x] is not None: - r += f[rail][x] - return r, l - return encode - - -def railfence_decode(rails, offset, up): - rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" - def decode(text, errors="strict"): - # this if block is particularly useful with Python2 ; see codecs.py at line 492 in comparison with codecs.py - # from Python3 at line 501: in Python2, a last block can be read while empty while in Python3 not - # as a consequence, in Python2, an error is triggered as an empty text cannot be decoded with Rail Fence with - # a rails parameter > 0 (see the __check(length, rails, offset)) function - if text == "": - return "", 0 - r, i, l = "", 0, len(text) - __check(l, rails, offset) - f = __build("." * len(text), rails, offset, up) - # put the characters in the right place - for rail in range(rails): - for x in range(l): - if f[rail][x] == ".": - f[rail][x] = text[i] - i += 1 - # read the characters in the right order - for x in range(l): - for rail in range(rails): - if f[rail][x] is not None: - r += f[rail][x] - return r, len(text) - return decode - - -add("railfence", railfence_encode, railfence_decode, - r"^(?:rail(?:[-_]?fence)?|zigzag)(?:[-_]([1-9]|[1-9]\d+)(?:[-_]([0-9]|[1-9]\d+))?(?:[-_](up))?)?$") - +# -*- coding: UTF-8 -*- +"""Rail Fence Cipher Codec - rail fence content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(rail_123|rail-2-123)': {'this is a test': None}, + 'enc(railfence|zigzag)': {'this is a test': "t ashsi etist"}, + 'enc(rail-5|zigzag_5)': {'this is a test': "tah istsiet s"}, + 'enc(rail_5-3|rail_5_3)': {'this is a test': "it sss etiath "}, + 'enc(rail-5-3-up|rail_5_3-up)': {'this is a test': "h tiats e ssit"}, + 'enc(rail-7-4|rail_7_4)': {'this is a test': "a stiet shsti"}, + 'dec(zigzag)': {'': ""}, +} +__guess__ = ["railfence-%d" % i for i in range(1, 11)] + ["railfence-%d-up" % i for i in range(1, 11)] + + +def __build(text, rails, offset, up): + l, rail = len(text), offset + # set the starting rail and direction + if up: + dr = -1 + rail = rails - offset - 1 + else: + dr = 1 + # create rails + f = [[None] * l for i in range(rails)] + # now zig-zag between rails + for x in range(l): + f[rail][x] = text[x] + if rail >= rails - 1: + dr = -1 + elif rail <= 0: + dr = 1 + rail += dr + return f + + +def __check(length, rails, offset): + if rails > length: + raise ParameterError("Bad parameter for encoding 'railfence': rails=%d (should be <= %d)" % (rails, length)) + if offset > rails: + raise ParameterError("Bad parameter for encoding 'railfence': offset=%d (should be <= %d)" % (offset, rails)) + + +def railfence_encode(rails, offset, up): + rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" + def encode(text, errors="strict"): + r, l = "", len(text) + __check(l, rails, offset) + f = __build(text, rails, offset, up) + for rail in range(rails): + for x in range(l): + if f[rail][x] is not None: + r += f[rail][x] + return r, l + return encode + + +def railfence_decode(rails, offset, up): + rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" + def decode(text, 
errors="strict"): + # this if block is particularly useful with Python2 ; see codecs.py at line 492 in comparison with codecs.py + # from Python3 at line 501: in Python2, a last block can be read while empty while in Python3 not + # as a consequence, in Python2, an error is triggered as an empty text cannot be decoded with Rail Fence with + # a rails parameter > 0 (see the __check(length, rails, offset)) function + if text == "": + return "", 0 + r, i, l = "", 0, len(text) + __check(l, rails, offset) + f = __build("." * len(text), rails, offset, up) + # put the characters in the right place + for rail in range(rails): + for x in range(l): + if f[rail][x] == ".": + f[rail][x] = text[i] + i += 1 + # read the characters in the right order + for x in range(l): + for rail in range(rails): + if f[rail][x] is not None: + r += f[rail][x] + return r, len(text) + return decode + + +add("railfence", railfence_encode, railfence_decode, + r"^(?:rail(?:[-_]?fence)?|zigzag)(?:[-_]([1-9]|[1-9]\d+)(?:[-_]([0-9]|[1-9]\d+))?(?:[-_](up))?)?$") + diff --git a/codext/crypto/rot.py b/src/codext/crypto/rot.py old mode 100755 new mode 100644 similarity index 97% rename from codext/crypto/rot.py rename to src/codext/crypto/rot.py index 3f696f4..88700bd --- a/codext/crypto/rot.py +++ b/src/codext/crypto/rot.py @@ -1,102 +1,102 @@ -# -*- coding: UTF-8 -*- -"""ROT Codec - rot-with-N-offset content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from string import ascii_lowercase as LC, ascii_uppercase as UC, digits as DIG - -from ..__common__ import * - - -__examples1__ = { - 'enc(rot0|rot--10|rot100)': None, - 'enc(rot1|rot-1|caesar_1)': {'this is a test': "uijt jt b uftu"}, - 'enc(rot3|caesar-3)': {'this is a test': "wklv lv d whvw"}, - 'enc(rot47)': {'this is a test': "E9:D :D 2 E6DE"}, -} -__examples2__ = { - 'enc(prot0|prot--10|prot100)': None, - 'enc(prot1|prog-caesar_1)': {'this is a test': "ujlw oz j eqfh"}, - 'enc(prot3|pcaesar-3)': {'this is a test': "wlny qb l gshj"}, -} -__examples3__ = { - 'enc(arot0|arot--10|arot100)': None, - 'enc(arot1|alt-caesar_1)': {'this is a test': "ugjr ht b udts"}, - 'enc(arot3|acaesar-3)': {'this is a test': "welp fv d wbvq"}, -} -__guess1__ = ["rot-%d" % i for i in range(1, 26)] + ["rot-47"] -__guess2__ = ["progressive-rot-%d" % i for i in range(1, 26)] + ["progressive-rot-n%d" % i for i in range(1, 26)] -__guess3__ = ["alternative-rot-%d" % i for i in range(1, 26) if i != 13] - - -ROT47 = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" - - -def _rotn(text, n=13, a=(LC, UC), alt=False, prog=False, neg=False): - r = "" - for i, c in enumerate(ensure_str(text)): - found = False - for l in a: - if c in l: - r += l[(l.index(c) + [1, -1][alt and i % 2 == 1] * n + ([1, -1][neg] * i if prog else 0)) % len(l)] - found = True - break - if not found: - r += c - return r - - -def arot_encode(i): - def encode(text, errors="strict"): - return _rotn(ensure_str(text), i, alt=True), len(text) - return encode - - -def arot_decode(i): - def decode(text, errors="strict"): - return _rotn(ensure_str(text), -i, alt=True), len(text) - return decode - - -def rot_encode(i): - def encode(text, errors="strict"): - t = ensure_str(text) - r = _rotn(t, 47, 
[ROT47]) if i == 47 else _rotn(t, i) - return r, len(r) - return encode - - -def rot_decode(i): - def decode(text, errors="strict"): - t = ensure_str(text) - r = _rotn(t, -47, [ROT47]) if i == 47 else _rotn(t, -i) - return r, len(r) - return decode - - -def prot_encode(n, i): - def encode(text, errors="strict"): - return _rotn(ensure_str(text), i, prog=True, neg=n == "n"), len(text) - return encode - - -def prot_decode(n, i): - def decode(text, errors="strict"): - return _rotn(ensure_str(text), -i, prog=True, neg=n != "n"), len(text) - return decode - - -# note: alternative-rot-13 is equivalent to rot-13, therefore excluded from the regex -add("alternative-rot", arot_encode, arot_decode, r"a(?:lt(?:ernative)?-)?(?:caesar|rot)[-_]?([1-9]|1[0-24-9]|2[0-5])$", - penalty=.2, entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples3__, - guess=__guess3__) -add("rot", rot_encode, rot_decode, r"(?:caesar|rot)[-_]?([1-9]|1[0-9]|2[0-5]|47)$", aliases=["caesar"], penalty=.2, - entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples1__, guess=__guess1__) -add("progressive-rot", prot_encode, prot_decode, r"p(?:rog(?:ressive)?-)?(?:caesar|rot)[-_]?(n?)([1-9]|1[0-9]|2[0-5])$", - penalty=.2, entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples2__, - guess=__guess2__) - +# -*- coding: UTF-8 -*- +"""ROT Codec - rot-with-N-offset content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from string import ascii_lowercase as LC, ascii_uppercase as UC, digits as DIG + +from ..__common__ import * + + +__examples1__ = { + 'enc(rot0|rot--10|rot100)': None, + 'enc(rot1|rot-1|caesar_1)': {'this is a test': "uijt jt b uftu"}, + 'enc(rot3|caesar-3)': {'this is a test': "wklv lv d whvw"}, + 'enc(rot47)': {'this is a test': "E9:D :D 2 E6DE"}, +} +__examples2__ = { + 'enc(prot0|prot--10|prot100)': None, + 'enc(prot1|prog-caesar_1)': {'this is a test': "ujlw oz j eqfh"}, + 'enc(prot3|pcaesar-3)': {'this is a test': "wlny qb l gshj"}, +} +__examples3__ = { + 'enc(arot0|arot--10|arot100)': None, + 'enc(arot1|alt-caesar_1)': {'this is a test': "ugjr ht b udts"}, + 'enc(arot3|acaesar-3)': {'this is a test': "welp fv d wbvq"}, +} +__guess1__ = ["rot-%d" % i for i in range(1, 26)] + ["rot-47"] +__guess2__ = ["progressive-rot-%d" % i for i in range(1, 26)] + ["progressive-rot-n%d" % i for i in range(1, 26)] +__guess3__ = ["alternative-rot-%d" % i for i in range(1, 26) if i != 13] + + +ROT47 = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" + + +def _rotn(text, n=13, a=(LC, UC), alt=False, prog=False, neg=False): + r = "" + for i, c in enumerate(ensure_str(text)): + found = False + for l in a: + if c in l: + r += l[(l.index(c) + [1, -1][alt and i % 2 == 1] * n + ([1, -1][neg] * i if prog else 0)) % len(l)] + found = True + break + if not found: + r += c + return r + + +def arot_encode(i): + def encode(text, errors="strict"): + return _rotn(ensure_str(text), i, alt=True), len(text) + return encode + + +def arot_decode(i): + def decode(text, errors="strict"): + return _rotn(ensure_str(text), -i, alt=True), len(text) + return decode + + +def rot_encode(i): + def encode(text, errors="strict"): + t = ensure_str(text) + r = _rotn(t, 47, 
[ROT47]) if i == 47 else _rotn(t, i) + return r, len(r) + return encode + + +def rot_decode(i): + def decode(text, errors="strict"): + t = ensure_str(text) + r = _rotn(t, -47, [ROT47]) if i == 47 else _rotn(t, -i) + return r, len(r) + return decode + + +def prot_encode(n, i): + def encode(text, errors="strict"): + return _rotn(ensure_str(text), i, prog=True, neg=n == "n"), len(text) + return encode + + +def prot_decode(n, i): + def decode(text, errors="strict"): + return _rotn(ensure_str(text), -i, prog=True, neg=n != "n"), len(text) + return decode + + +# note: alternative-rot-13 is equivalent to rot-13, therefore excluded from the regex +add("alternative-rot", arot_encode, arot_decode, r"a(?:lt(?:ernative)?-)?(?:caesar|rot)[-_]?([1-9]|1[0-24-9]|2[0-5])$", + penalty=.2, entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples3__, + guess=__guess3__) +add("rot", rot_encode, rot_decode, r"(?:caesar|rot)[-_]?([1-9]|1[0-9]|2[0-5]|47)$", aliases=["caesar"], penalty=.2, + entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples1__, guess=__guess1__) +add("progressive-rot", prot_encode, prot_decode, r"p(?:rog(?:ressive)?-)?(?:caesar|rot)[-_]?(n?)([1-9]|1[0-9]|2[0-5])$", + penalty=.2, entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples2__, + guess=__guess2__) + diff --git a/codext/crypto/scytale.py b/src/codext/crypto/scytale.py old mode 100755 new mode 100644 similarity index 96% rename from codext/crypto/scytale.py rename to src/codext/crypto/scytale.py index 7490241..286d51e --- a/codext/crypto/scytale.py +++ b/src/codext/crypto/scytale.py @@ -1,54 +1,54 @@ -# -*- coding: UTF-8 -*- -"""Scytale-N Codec - scytale content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from math import ceil - -from ..__common__ import * - - -__examples__ = { - 'enc(scytale0|scytale--10|scytale01)': None, - 'enc(scytale2|scytale-2|scytale_2)': {'this is a test': "ti satshsi et"}, - 'enc(scytale5|scytale-5|scytale_5)': {'this is a test': "tithsei ssat "}, -} -__guess__ = ["scytale-%d" % i for i in range(1, 10)] - - -PADDING_CHAR = "" - - -def scytale_encode(l): - def encode(text, errors="strict"): - s, n = "", int(ceil(len(text) / float(l))) - for x in range(l): - for y in range(n): - try: - s += text[y*l+x] - except IndexError: - s += PADDING_CHAR - return s, len(s) - return encode - - -def scytale_decode(l): - def decode(text, errors="strict"): - s, n = "", int(ceil(len(text) / float(l))) - pl = l * n - len(text) - for x in range(n): - for y in range(l): - if y >= l-pl and x == n-1: - continue - s += text[y*n+x-max(0,y-(l-pl))] - s = s.rstrip(PADDING_CHAR) - return s, len(s) - return decode - - -add("scytale", scytale_encode, scytale_decode, r"^scytale[-_]?([1-9]\d*)$") - +# -*- coding: UTF-8 -*- +"""Scytale-N Codec - scytale content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from math import ceil + +from ..__common__ import * + + +__examples__ = { + 'enc(scytale0|scytale--10|scytale01)': None, + 'enc(scytale2|scytale-2|scytale_2)': {'this is a test': "ti satshsi et"}, + 'enc(scytale5|scytale-5|scytale_5)': {'this is a test': "tithsei ssat "}, +} +__guess__ = ["scytale-%d" % i for i in range(1, 10)] + + +PADDING_CHAR = "" + + +def scytale_encode(l): + def encode(text, errors="strict"): + s, n = "", int(ceil(len(text) / float(l))) + for x in range(l): + for y in range(n): + try: + s += text[y*l+x] + except IndexError: + s += PADDING_CHAR + return s, len(s) + return encode + + +def scytale_decode(l): + def decode(text, errors="strict"): + s, n = "", int(ceil(len(text) / float(l))) + pl = l * n - len(text) + for x in range(n): + for y in range(l): + if y >= l-pl and x == n-1: + continue + s += text[y*n+x-max(0,y-(l-pl))] + s = s.rstrip(PADDING_CHAR) + return s, len(s) + return decode + + +add("scytale", scytale_encode, scytale_decode, r"^scytale[-_]?([1-9]\d*)$") + diff --git a/codext/crypto/shift.py b/src/codext/crypto/shift.py old mode 100755 new mode 100644 similarity index 96% rename from codext/crypto/shift.py rename to src/codext/crypto/shift.py index 599e60d..d1c432e --- a/codext/crypto/shift.py +++ b/src/codext/crypto/shift.py @@ -1,34 +1,34 @@ -# -*- coding: UTF-8 -*- -"""Shift Codec - Shift-ordinal-with-N content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(shift0|shift--10|shift256)': None, - 'enc(shift1|shift_1|shift-1)': {'this is a test': "uijt!jt!b!uftu"}, - 'enc(shift9|shift_9|shift-9)': {'this is a test': "}qr|)r|)j)}n|}"}, -} -__guess__ = ["shift-%d" % i for i in range(1, 256)] - - -def ord_shift_decode(i): - return ord_shift_encode(-int(i)) - - -def ord_shift_encode(i): - def encode(text, errors="strict"): - r = "".join(chr((ord(c) + int(i)) % 256) for c in text) - return r, len(r) - return encode - - -add("shift", ord_shift_encode, ord_shift_decode, r"shift[-_]?([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])$", - transitive=True) - +# -*- coding: UTF-8 -*- +"""Shift Codec - Shift-ordinal-with-N content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(shift0|shift--10|shift256)': None, + 'enc(shift1|shift_1|shift-1)': {'this is a test': "uijt!jt!b!uftu"}, + 'enc(shift9|shift_9|shift-9)': {'this is a test': "}qr|)r|)j)}n|}"}, +} +__guess__ = ["shift-%d" % i for i in range(1, 256)] + + +def ord_shift_decode(i): + return ord_shift_encode(-int(i)) + + +def ord_shift_encode(i): + def encode(text, errors="strict"): + r = "".join(chr((ord(c) + int(i)) % 256) for c in text) + return r, len(r) + return encode + + +add("shift", ord_shift_encode, ord_shift_decode, r"shift[-_]?([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])$", + transitive=True) + diff --git a/src/codext/crypto/vigenere.py b/src/codext/crypto/vigenere.py new file mode 100755 index 0000000..0143046 --- /dev/null +++ b/src/codext/crypto/vigenere.py @@ -0,0 +1,87 @@ +# -*- coding: UTF-8 -*- +"""Vigenere 
Cipher Codec - vigenere content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from string import ascii_lowercase as LC, ascii_uppercase as UC + +from ..__common__ import * + +__examples1__ = { + 'enc(autoclave)': None, + 'enc(autokey-queenly)': {'ATTACKATDAWN': 'QNXEPVYTWTWP'}, + 'enc-dec(autoclave-key)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!', 'Mixed Case 123'], +} +__examples2__ = { + 'enc(beaufort)': None, + 'enc(beaufort-lemon)': {'ATTACKATDAWN': 'LLTOLBETLNPR'}, + 'enc(beaufort-key)': {'hello': 'danzq'}, + 'enc(beaufort_key)': {'Hello World': 'Danzq Cwnnh'}, + 'enc-dec(beaufort-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], +} +__examples3__ = { + 'enc(trithemius-cipher)': {'this is a test': "tikv mx g ambd"}, + 'enc(trithemius)': {'HELLO': "HFNOS", '12345!@#$': "12345!@#$"}, + 'enc-dec(trithemius)': ["Hello, World!", "@random"], +} +__examples4__ = { + 'enc(vigenere)': None, + 'enc(vigenere-lemon)': {'ATTACKATDAWN': 'LXFOPVEFRNHR'}, + 'enc(vigenere-key)': {'hello': 'rijvs'}, + 'enc(vigenère_key)': {'Hello World': 'Rijvs Uyvjn'}, + 'enc-dec(vigenere-secret)': ['hello world', 'ATTACK AT DAWN', 'Test 1234!'], +} +__guess1__ = ["autoclave-key", "autoclave-password", "autoclave-secret"] +__guess2__ = ["beaufort-key", "beaufort-password", "beaufort-secret"] +__guess3__ = ["trithemius"] +__guess4__ = ["vigenere-key", "vigenere-password", "vigenere-secret"] + + +bchar = lambda c, k, i, d=False: (LC if (b := c in LC) else UC)[(ord(k[i % len(k)]) - ord('a') - \ + (ord(c) - ord("Aa"[b]))) % 26] +vchar = lambda c, k, i, d=False: (LC if (b := c in LC) else UC)[(ord(c) - ord("Aa"[b]) + \ + [1, -1][d] * (ord(k[i % len(k)]) - ord('a'))) % 26] + + +def __make(enc, char_func, key_stream=False): + def _code(decode=False): + def _wrapper(key): + def _subwrapper(text, errors="strict"): + if not (k := 
key.lower()) or not k.isalpha(): + raise LookupError(f"Bad parameter for encoding '{enc}': key must be a non-empty alphabetic string") + if key_stream and not decode: + k += "".join(c.lower() for c in ensure_str(text) if c in LC or c in UC) + result, i = [], 0 + if key_stream and decode: + k = list(k) + for c in ensure_str(text): + if c in LC or c in UC: + result.append(dc := char_func(c, k, i, decode)) + if key_stream and decode: + k.append(dc.lower()) + i += 1 + else: + result.append(c) + return (r := "".join(result)), len(r) + return _subwrapper + return _wrapper + return _code(), _code(True) + + +add("autoclave", *__make("autoclave", vchar, True), r"auto(?:clave|key)(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", + examples=__examples1__, guess=__guess1__, penalty=.1) + +add("beaufort", *__make("beaufort", bchar), r"beaufort(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", + examples=__examples2__, guess=__guess2__, penalty=.1) + +enc, dec = __make("trithemius", vchar) +add("trithemius", enc(k := "ABCDEFGHIJKLMNOPQRSTUVWXYZ"), dec(k), r"trithemius(?:[-_]cipher)?$", + examples=__examples3__, guess=__guess3__, penalty=.1) + +add("vigenere", *__make("vigenere", vchar), r"vigen[eè]re(?:[-_]cipher)?(?:[-_]([a-zA-Z]+))?$", + examples=__examples4__, guess=__guess4__, penalty=.1) + diff --git a/codext/crypto/xor.py b/src/codext/crypto/xor.py old mode 100755 new mode 100644 similarity index 96% rename from codext/crypto/xor.py rename to src/codext/crypto/xor.py index 61da6e9..cc77057 --- a/codext/crypto/xor.py +++ b/src/codext/crypto/xor.py @@ -1,35 +1,35 @@ -# -*- coding: UTF-8 -*- -"""XOR Codec - XOR-with-1-byte content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(xor0|xor--10|xor256|xor300)': None, - 'enc(xor3|xor-3|xor_3)': {'this is a test': "wkjp#jp#b#wfpw"}, - 'enc(xor3|xor-3|xor_3)': {'wkjp#jp#b#wfpw': "this is a test"}, - 'enc(xor6|xor-6|xor_6)': {'this is a test': "rnou&ou&g&rcur"}, -} -__guess__ = ["xor-%d" % i for i in range(1, 256)] - - -def _xorn(text, n=1): - return "".join(chr(ord(c) ^ (n % 256)) for c in text) - - -def xor_byte_encode(i): - def encode(text, errors="strict"): - r = _xorn(ensure_str(text), i) - return r, len(r) - return encode - - -add("xor", xor_byte_encode, xor_byte_encode, r"^xor[-_]?([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])$", - transitive=True) - +# -*- coding: UTF-8 -*- +"""XOR Codec - XOR-with-1-byte content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(xor0|xor--10|xor256|xor300)': None, + 'enc(xor3|xor-3|xor_3)': {'this is a test': "wkjp#jp#b#wfpw"}, + 'enc(xor3|xor-3|xor_3)': {'wkjp#jp#b#wfpw': "this is a test"}, + 'enc(xor6|xor-6|xor_6)': {'this is a test': "rnou&ou&g&rcur"}, +} +__guess__ = ["xor-%d" % i for i in range(1, 256)] + + +def _xorn(text, n=1): + return "".join(chr(ord(c) ^ (n % 256)) for c in text) + + +def xor_byte_encode(i): + def encode(text, errors="strict"): + r = _xorn(ensure_str(text), i) + return r, len(r) + return encode + + +add("xor", xor_byte_encode, xor_byte_encode, r"^xor[-_]?([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])$", + transitive=True) + diff --git a/codext/hashing/__init__.py b/src/codext/hashing/__init__.py old mode 100755 new mode 100644 similarity index 81% rename from 
codext/hashing/__init__.py rename to src/codext/hashing/__init__.py index 2aa13a0..21f0ef5 --- a/codext/hashing/__init__.py +++ b/src/codext/hashing/__init__.py @@ -1,8 +1,8 @@ -# -*- coding: UTF-8 -*- -from .blake import * -from .checksums import * -from .crypt import * -from .md import * -from .sha import * -from .shake import * - +# -*- coding: UTF-8 -*- +from .blake import * +from .crypt import * +from .md import * +from .mmh3 import * +from .sha import * +from .shake import * + diff --git a/src/codext/hashing/blake.py b/src/codext/hashing/blake.py new file mode 100644 index 0000000..e168819 --- /dev/null +++ b/src/codext/hashing/blake.py @@ -0,0 +1,25 @@ +# -*- coding: UTF-8 -*- +"""Blake2 Codecs - string hashing with blake. + +These are codecs for hashing strings, for use with other codecs in encoding chains. + +These codecs: +- transform strings from str to str +- transform strings from bytes to bytes +- transform file content from str to bytes (write) +""" +from ..__common__ import * + + +def blake_hash(c): + def _hash_transform(l): + l = (l or "64" if c == "b" else "32").lstrip("_-") + def _encode(data, error="strict"): + return getattr(hashlib, "blake2%s" % c)(b(data), digest_size=int(l)).hexdigest(), len(data) + return _encode + return _hash_transform + + +add("blake2b", blake_hash("b"), pattern=r"^blake2b(|[-_](?:[1-9]|[1-5]\d|6[0-4]))$", guess=None) +add("blake2s", blake_hash("s"), pattern=r"^blake2s(|[-_](?:[1-9]|[1-2]\d|3[0-2]))$", guess=None) + diff --git a/src/codext/hashing/crypt.py b/src/codext/hashing/crypt.py new file mode 100644 index 0000000..f83806f --- /dev/null +++ b/src/codext/hashing/crypt.py @@ -0,0 +1,36 @@ +# -*- coding: UTF-8 -*- +"""Crypt Hashing Codec - string hashing with Unix's Crypt. + +These are codecs for hashing strings, for use with other codecs in encoding chains. 
+ +These codecs: +- transform strings from str to str +- transform strings from bytes to bytes +- transform file content from str to bytes (write) +""" +from ..__common__ import add, ensure_str, UNIX + + +if UNIX: + try: + import crypt + except ImportError: + try: + import legacycrypt as crypt + except ImportError: + crypt = None + + if crypt is not None: + METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] + + def crypt_hash(method): + method = (method or "").lstrip("-_") or "blowfish" + if method not in METHODS: + raise NotImplementedError("method '%s' is not implemented" % method) + def _encode(input, error="strict"): + m = getattr(crypt, "METHOD_" + method.upper()) + return crypt.crypt(ensure_str(input), crypt.mksalt(m)), len(input) + return _encode + + add("crypt", crypt_hash, pattern=r"^crypt(|[-_](?:%s))$" % "|".join(METHODS), guess=None) + diff --git a/codext/hashing/md.py b/src/codext/hashing/md.py similarity index 91% rename from codext/hashing/md.py rename to src/codext/hashing/md.py index 181d85c..eae8e38 100644 --- a/codext/hashing/md.py +++ b/src/codext/hashing/md.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -"""Case Codecs - string hashing with Message Digest (MD). +"""MD Hashing Codecs - string hashing with Message Digest (MD). These are codecs for hashing strings, for use with other codecs in encoding chains. 
@@ -8,9 +8,7 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib - -from ..__common__ import add, b +from ..__common__ import * MD2_TABLE = [41, 46, 67, 201, 162, 216, 124, 1, 61, 54, 84, 161, 236, 240, 6, 19, 98, 167, 5, 243, 192, 199, 115, 140, @@ -55,6 +53,7 @@ def md2(data): add("md2", lambda s, error="strict": (md2(s), len(s)), guess=None) -add("md4", lambda s, error="strict": (hashlib.new("md4", b(s)).hexdigest(), len(s)), guess=None) add("md5", lambda s, error="strict": (hashlib.new("md5", b(s)).hexdigest(), len(s)), guess=None) +if "md4" in hashlib.algorithms_available: + add("md4", lambda s, error="strict": (hashlib.new("md4", b(s)).hexdigest(), len(s)), guess=None) diff --git a/src/codext/hashing/mmh3.py b/src/codext/hashing/mmh3.py new file mode 100644 index 0000000..8c26639 --- /dev/null +++ b/src/codext/hashing/mmh3.py @@ -0,0 +1,18 @@ +# -*- coding: UTF-8 -*- +"""MMH3 Codecs - string hashing with MurmurHash3. + +These are codecs for hashing strings, for use with other codecs in encoding chains. + +These codecs: +- transform strings from str to str +- transform strings from bytes to bytes +- transform file content from str to bytes (write) +""" +from ..__common__ import * + + +if "mmh3_32" in hashlib.algorithms_available: + add("mmh3_32", lambda s, error="strict": (hashlib.mmh3_32(b(s)).hexdigest(), len(s)), guess=None) +if "mmh3_128" in hashlib.algorithms_available: + add("mmh3_128", lambda s, error="strict": (hashlib.mmh3_128(b(s)).hexdigest(), len(s)), guess=None) + diff --git a/codext/hashing/sha.py b/src/codext/hashing/sha.py similarity index 52% rename from codext/hashing/sha.py rename to src/codext/hashing/sha.py index dd94002..044e159 100644 --- a/codext/hashing/sha.py +++ b/src/codext/hashing/sha.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -"""Case Codecs - string hashing with Secure Hash Algorithms. +"""SHA Hashing Codecs - string hashing with Secure Hash Algorithms. 
These are codecs for hashing strings, for use with other codecs in encoding chains. @@ -8,9 +8,7 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib - -from ..__common__ import add, b, PY3 +from ..__common__ import * add("sha1", lambda s, error="strict": (hashlib.sha1(b(s)).hexdigest(), len(s)), guess=None) @@ -18,15 +16,12 @@ add("sha256", lambda s, error="strict": (hashlib.sha256(b(s)).hexdigest(), len(s)), guess=None) add("sha384", lambda s, error="strict": (hashlib.sha384(b(s)).hexdigest(), len(s)), guess=None) add("sha512", lambda s, error="strict": (hashlib.sha512(b(s)).hexdigest(), len(s)), guess=None) - - -if PY3: - add("sha3_224", lambda s, error="strict": (hashlib.sha3_224(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]224$", - guess=None) - add("sha3_256", lambda s, error="strict": (hashlib.sha3_256(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]256$", - guess=None) - add("sha3_384", lambda s, error="strict": (hashlib.sha3_384(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]384$", - guess=None) - add("sha3_512", lambda s, error="strict": (hashlib.sha3_512(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]512$", - guess=None) +add("sha3_224", lambda s, error="strict": (hashlib.sha3_224(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]224$", + guess=None) +add("sha3_256", lambda s, error="strict": (hashlib.sha3_256(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]256$", + guess=None) +add("sha3_384", lambda s, error="strict": (hashlib.sha3_384(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]384$", + guess=None) +add("sha3_512", lambda s, error="strict": (hashlib.sha3_512(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]512$", + guess=None) diff --git a/src/codext/hashing/shake.py b/src/codext/hashing/shake.py new file mode 100644 index 0000000..2b04424 --- /dev/null +++ b/src/codext/hashing/shake.py @@ -0,0 +1,25 @@ +# -*- coding: UTF-8 -*- +"""Shake Hashing Codecs - string hashing with SHAKE. 
+ +These are codecs for hashing strings, for use with other codecs in encoding chains. + +These codecs: +- transform strings from str to str +- transform strings from bytes to bytes +- transform file content from str to bytes (write) +""" +from ..__common__ import * + + +def shake_hash(i): + def _hash_transform(l): + l = (l or str(i)).lstrip("_-") + def _encode(data, error="strict"): + return getattr(hashlib, "shake_%d" % i)(b(data)).hexdigest(int(l)), len(data) + return _encode + return _hash_transform + + +add("shake_128", shake_hash(128), pattern=r"^shake[-_]?128(|[-_][1-9]\d*)$", guess=None) +add("shake_256", shake_hash(256), pattern=r"^shake[-_]?256(|[-_][1-9]\d*)$", guess=None) + diff --git a/codext/languages/__init__.py b/src/codext/languages/__init__.py old mode 100755 new mode 100644 similarity index 95% rename from codext/languages/__init__.py rename to src/codext/languages/__init__.py index 196b8d3..8dbe999 --- a/codext/languages/__init__.py +++ b/src/codext/languages/__init__.py @@ -1,12 +1,12 @@ -# -*- coding: UTF-8 -*- -from .braille import * -from .galactic import * -from .ipsum import * -from .leetspeak import * -from .morse import * -from .navajo import * -from .radio import * -from .southpark import * -from .tap import * -from .tomtom import * - +# -*- coding: UTF-8 -*- +from .braille import * +from .galactic import * +from .ipsum import * +from .leetspeak import * +from .morse import * +from .navajo import * +from .radio import * +from .southpark import * +from .tap import * +from .tomtom import * + diff --git a/codext/languages/braille.py b/src/codext/languages/braille.py old mode 100755 new mode 100644 similarity index 93% rename from codext/languages/braille.py rename to src/codext/languages/braille.py index b28c56e..775399c --- a/codext/languages/braille.py +++ b/src/codext/languages/braille.py @@ -1,34 +1,33 @@ -# -*- coding: UTF-8 -*- -"""Braille Codec - braille content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(braille)': {'this is a test': "⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞"}, -} - - -ENCMAP = { - # digits - '0': '⠴', '1': '⠂', '2': '⠆', '3': '⠒', '4': '⠲', '5': '⠢', '6': '⠖', '7': '⠶', '8': '⠦', '9': '⠔', - # letters - 'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑', 'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚', 'k': '⠅', - 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕', 'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞', 'u': '⠥', 'v': '⠧', - 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵', - # punctuation - ' ': '⠀', '!': '⠮', '"': '⠐', '#': '⠼', '$': '⠫', '%': '⠩', '&': '⠯', ':': '⠱', ';': '⠰', '<': '⠣', '=': '⠿', - '>': '⠜', '?': '⠹', '@': '⠈', "'": '⠄', '(': '⠷', ')': '⠾', '*': '⠡', '+': '⠬', ',': '⠠', '-': '⠤', '.': '⠨', - '/': '⠌', '[': '⠪', '\\': '⠳', ']': '⠻', '^': '⠘', '_': '⠸', -} - - -if PY3: - add_map("braille", ENCMAP, ignore_case="encode") - +# -*- coding: UTF-8 -*- +"""Braille Codec - braille content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(braille)': {'this is a test': "⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞"}, +} + + +ENCMAP = { + # digits + '0': '⠴', '1': '⠂', '2': '⠆', '3': '⠒', '4': '⠲', '5': '⠢', '6': '⠖', '7': '⠶', '8': '⠦', '9': '⠔', + # letters + 'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑', 'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚', 'k': '⠅', + 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕', 'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞', 'u': '⠥', 'v': '⠧', + 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵', + # punctuation + ' ': '⠀', '!': '⠮', '"': '⠐', '#': '⠼', '$': '⠫', '%': '⠩', '&': '⠯', ':': '⠱', ';': '⠰', '<': '⠣', '=': '⠿', + '>': '⠜', '?': '⠹', '@': '⠈', "'": '⠄', '(': '⠷', ')': '⠾', '*': '⠡', '+': '⠬', ',': '⠠', '-': '⠤', '.': '⠨', + '/': '⠌', '[': '⠪', '\\': '⠳', ']': '⠻', '^': '⠘', '_': '⠸', +} + + +add_map("braille", ENCMAP, ignore_case="encode") + diff --git a/codext/languages/galactic.py b/src/codext/languages/galactic.py similarity index 86% rename from codext/languages/galactic.py rename to src/codext/languages/galactic.py index e77cb3a..26544b5 100644 --- a/codext/languages/galactic.py +++ b/src/codext/languages/galactic.py @@ -29,7 +29,6 @@ } -if PY3: - add_map("galactic", ENCMAP, ignore_case="encode", printables_rate=0., - pattern=r"^(?:galactic(?:[-_]alphabet)?|minecraft(?:[-_](?:enchantment|enchanting[-_]language))?)$") +add_map("galactic", ENCMAP, ignore_case="encode", printables_rate=0., + pattern=r"^(?:galactic(?:[-_]alphabet)?|minecraft(?:[-_](?:enchantment|enchanting[-_]language))?)$") diff --git a/codext/languages/ipsum.py b/src/codext/languages/ipsum.py old mode 100755 new mode 100644 similarity index 97% rename from codext/languages/ipsum.py rename to src/codext/languages/ipsum.py index 5a0fee7..a56c197 --- a/codext/languages/ipsum.py +++ 
b/src/codext/languages/ipsum.py @@ -1,97 +1,97 @@ -# -*- coding: UTF-8 -*- -"""Letters Codec - letter indices-related content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import random - -from ..__common__ import * - - -__examples__ = { - 'enc-dec(ipsum|lorem-ipsum)': ["This is a test !"], - 'enc(ipsum)': {'Bad test#': None}, -} - - -DICT = { - 'a': ['a', 'ac', 'accumsan', 'ad', 'adipiscing', 'aenean', 'aliquam', 'aliquet', 'amet', 'ante', 'aptent', 'arcu', - 'at', 'auctor', 'augue'], - 'b': ['babel', 'bibendum', 'blandit', 'bomba', 'botum', 'buxus'], - 'c': ['class', 'commodo', 'condimentum', 'congue', 'consectetur', 'consequat', 'conubia', 'convallis', 'cras', - 'cubilia', 'curabitur', 'curae', 'cursus'], - 'd': ['dapibus', 'diam', 'dictum', 'dictumst', 'dignissim', 'dis', 'dolor', 'donec', 'dui', 'duis'], - 'e': ['efficitur', 'egestas', 'eget', 'eleifend', 'elementum', 'elit', 'enim', 'erat', 'eros', 'est', 'et', 'etiam', - 'eu', 'euismod', 'ex'], - 'f': ['facilisis', 'fames', 'faucibus', 'felis', 'fermentum', 'feugiat', 'finibus', 'fringilla', 'fusce'], - 'g': ['gadus', 'galliarus', 'ganeo', 'garba', 'gemma', 'gener', 'genuine', 'gestus', 'gramma', 'gravida', 'grex', - 'gusto', 'guttur', 'gyro'], - 'h': ['habitant', 'habitasse', 'hac', 'haicu', 'halo', 'helleborum', 'hendrerit', 'hilarius', 'himenaeos', - 'horreum', 'hydrus', 'hystericus'], - 'i': ['iaculis', 'id', 'imperdiet', 'in', 'inceptos', 'integer', 'interdum', 'ipsum'], - 'j': ['jaccae', 'jacio', 'jecur', 'jocundiatas', 'jovis', 'juctim', 'juger', 'juno', 'jussum', 'justo'], - 'k': ['kal', 'kalatorium', 'kalium', 'kaput', 'kardo', 'kenia', 'koppa', 'kum'], - 'l': ['lacinia', 'lacus', 'laoreet', 'lectus', 'leo', 'libero', 'ligula', 'litora', 'lobortis', 'lorem', 'luctus'], - 'm': ['maecenas', 'magna', 'magnis', 'malesuada', 'massa', 'mattis', 'mauris', 
'maximus', 'metus', 'mi', 'molestie', - 'mollis', 'montes', 'morbi', 'mus'], - 'n': ['nam', 'nascetur', 'natoque', 'nec', 'neque', 'netus', 'nibh', 'nisi', 'nisl', 'non', 'nostra', 'nulla', - 'nullam', 'nunc'], - 'o': ['odio', 'orci', 'ornare'], - 'p': ['parturient', 'pellentesque', 'penatibus', 'per', 'pharetra', 'phasellus', 'placerat', 'platea', 'porta', - 'porttitor', 'posuere', 'potenti', 'praesent', 'pretium', 'primis', 'proin', 'pulvinar', 'purus'], - 'q': ['qua', 'quadrum', 'quam', 'quasi', 'quintum', 'quis', 'quisque', 'quo', 'quom', 'quota', 'qur'], - 'r': ['radicitus', 'radius', 'ratio', 'recidivus', 'rectio', 'rhoncus', 'ridiculus', 'risus', 'ros', 'rutrum'], - 's': ['sagittis', 'sapien', 'scelerisque', 'sed', 'sem', 'semper', 'senectus', 'sit', 'sociosqu', 'sodales', - 'sollicitudin', 'suscipit', 'suspendisse'], - 't': ['taciti', 'tellus', 'tempor', 'tempus', 'tincidunt', 'torquent', 'tortor', 'tristique', 'turpis'], - 'u': ['ullamcorper', 'ultrices', 'ultricies', 'urna', 'ut'], - 'v': ['varius', 'vehicula', 'vel', 'velit', 'venenatis', 'vestibulum', 'vitae', 'vivamus', 'volutpat', 'vulputate'], - 'w': ['wadiarus', 'warantus', 'warra', 'werumensium', 'wormicia'], - 'x': ['xandicus', 'xenon', 'xenium', 'xiphias', 'xvir', 'xylon', 'xysticus', 'xystus'], - 'y': ['yata', 'yatum', 'yatus', 'ypra'], - 'z': ['zamia', 'zelosus', 'zerum', 'zonatus', 'zymus'], -} -SCHARS = "0123456789.,:;!?+=-*/\\" - - -def ipsum_encode(text, errors="strict"): - s, strip = "", False - for i, c in enumerate(text): - try: - if c == " " or c in SCHARS: - s += c - strip = False - else: - w = random.choice(DICT[c.lower()]) - s += (w.capitalize() if c.isupper() else w) + " " - strip = True - except KeyError: - s += handle_error("ipsum", errors, " ")(c, i) - return s[:-1] if strip else s, len(text) - - -def ipsum_decode(text, errors="strict"): - s = "" - words = text.split(" ") - for i, w in enumerate(words[:-1] if words[-1] == "" else words): - if w.strip() == "": - s += " " - elif w 
in SCHARS: - s += w - else: - try: - if w.lower().strip(SCHARS) not in DICT[w[0].lower()]: - raise KeyError - s += w[:len(w)-len(w.lstrip(SCHARS))] + w.strip(SCHARS)[0] + w[len(w.rstrip(SCHARS)):len(w)] - except KeyError: - s += handle_error("ipsum", errors, decode=True, item="word")(w, i) - return s, len(text) - - -add("ipsum", ipsum_encode, ipsum_decode, pattern=r"^(?:lorem[-_]?)?ipsum$", printables_rate=1., - expansion_factor=(6., .5)) - +# -*- coding: UTF-8 -*- +"""Letters Codec - letter indices-related content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import random + +from ..__common__ import * + + +__examples__ = { + 'enc-dec(ipsum|lorem-ipsum)': ["This is a test !"], + 'enc(ipsum)': {'Bad test#': None}, +} + + +DICT = { + 'a': ['a', 'ac', 'accumsan', 'ad', 'adipiscing', 'aenean', 'aliquam', 'aliquet', 'amet', 'ante', 'aptent', 'arcu', + 'at', 'auctor', 'augue'], + 'b': ['babel', 'bibendum', 'blandit', 'bomba', 'botum', 'buxus'], + 'c': ['class', 'commodo', 'condimentum', 'congue', 'consectetur', 'consequat', 'conubia', 'convallis', 'cras', + 'cubilia', 'curabitur', 'curae', 'cursus'], + 'd': ['dapibus', 'diam', 'dictum', 'dictumst', 'dignissim', 'dis', 'dolor', 'donec', 'dui', 'duis'], + 'e': ['efficitur', 'egestas', 'eget', 'eleifend', 'elementum', 'elit', 'enim', 'erat', 'eros', 'est', 'et', 'etiam', + 'eu', 'euismod', 'ex'], + 'f': ['facilisis', 'fames', 'faucibus', 'felis', 'fermentum', 'feugiat', 'finibus', 'fringilla', 'fusce'], + 'g': ['gadus', 'galliarus', 'ganeo', 'garba', 'gemma', 'gener', 'genuine', 'gestus', 'gramma', 'gravida', 'grex', + 'gusto', 'guttur', 'gyro'], + 'h': ['habitant', 'habitasse', 'hac', 'haicu', 'halo', 'helleborum', 'hendrerit', 'hilarius', 'himenaeos', + 'horreum', 'hydrus', 'hystericus'], + 'i': ['iaculis', 'id', 'imperdiet', 'in', 'inceptos', 'integer', 'interdum', 
'ipsum'], + 'j': ['jaccae', 'jacio', 'jecur', 'jocundiatas', 'jovis', 'juctim', 'juger', 'juno', 'jussum', 'justo'], + 'k': ['kal', 'kalatorium', 'kalium', 'kaput', 'kardo', 'kenia', 'koppa', 'kum'], + 'l': ['lacinia', 'lacus', 'laoreet', 'lectus', 'leo', 'libero', 'ligula', 'litora', 'lobortis', 'lorem', 'luctus'], + 'm': ['maecenas', 'magna', 'magnis', 'malesuada', 'massa', 'mattis', 'mauris', 'maximus', 'metus', 'mi', 'molestie', + 'mollis', 'montes', 'morbi', 'mus'], + 'n': ['nam', 'nascetur', 'natoque', 'nec', 'neque', 'netus', 'nibh', 'nisi', 'nisl', 'non', 'nostra', 'nulla', + 'nullam', 'nunc'], + 'o': ['odio', 'orci', 'ornare'], + 'p': ['parturient', 'pellentesque', 'penatibus', 'per', 'pharetra', 'phasellus', 'placerat', 'platea', 'porta', + 'porttitor', 'posuere', 'potenti', 'praesent', 'pretium', 'primis', 'proin', 'pulvinar', 'purus'], + 'q': ['qua', 'quadrum', 'quam', 'quasi', 'quintum', 'quis', 'quisque', 'quo', 'quom', 'quota', 'qur'], + 'r': ['radicitus', 'radius', 'ratio', 'recidivus', 'rectio', 'rhoncus', 'ridiculus', 'risus', 'ros', 'rutrum'], + 's': ['sagittis', 'sapien', 'scelerisque', 'sed', 'sem', 'semper', 'senectus', 'sit', 'sociosqu', 'sodales', + 'sollicitudin', 'suscipit', 'suspendisse'], + 't': ['taciti', 'tellus', 'tempor', 'tempus', 'tincidunt', 'torquent', 'tortor', 'tristique', 'turpis'], + 'u': ['ullamcorper', 'ultrices', 'ultricies', 'urna', 'ut'], + 'v': ['varius', 'vehicula', 'vel', 'velit', 'venenatis', 'vestibulum', 'vitae', 'vivamus', 'volutpat', 'vulputate'], + 'w': ['wadiarus', 'warantus', 'warra', 'werumensium', 'wormicia'], + 'x': ['xandicus', 'xenon', 'xenium', 'xiphias', 'xvir', 'xylon', 'xysticus', 'xystus'], + 'y': ['yata', 'yatum', 'yatus', 'ypra'], + 'z': ['zamia', 'zelosus', 'zerum', 'zonatus', 'zymus'], +} +SCHARS = "0123456789.,:;!?+=-*/\\" + + +def ipsum_encode(text, errors="strict"): + s, strip = "", False + for i, c in enumerate(text): + try: + if c == " " or c in SCHARS: + s += c + strip = False + else: + w = 
random.choice(DICT[c.lower()]) + s += (w.capitalize() if c.isupper() else w) + " " + strip = True + except KeyError: + s += handle_error("ipsum", errors, " ")(c, i) + return s[:-1] if strip else s, len(text) + + +def ipsum_decode(text, errors="strict"): + s = "" + words = text.split(" ") + for i, w in enumerate(words[:-1] if words[-1] == "" else words): + if w.strip() == "": + s += " " + elif w in SCHARS: + s += w + else: + try: + if w.lower().strip(SCHARS) not in DICT[w[0].lower()]: + raise KeyError + s += w[:len(w)-len(w.lstrip(SCHARS))] + w.strip(SCHARS)[0] + w[len(w.rstrip(SCHARS)):len(w)] + except KeyError: + s += handle_error("ipsum", errors, decode=True, item="word")(w, i) + return s, len(text) + + +add("ipsum", ipsum_encode, ipsum_decode, pattern=r"^(?:lorem[-_]?)?ipsum$", printables_rate=1., + expansion_factor=(6., .5)) + diff --git a/codext/languages/leetspeak.py b/src/codext/languages/leetspeak.py old mode 100755 new mode 100644 similarity index 96% rename from codext/languages/leetspeak.py rename to src/codext/languages/leetspeak.py index 0628742..f3af876 --- a/codext/languages/leetspeak.py +++ b/src/codext/languages/leetspeak.py @@ -1,23 +1,23 @@ -# -*- coding: UTF-8 -*- -"""Leetspeak Codec - leetspeak content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(leet|1337|leetspeak)': {'this is a test': "7h15 15 4 7357"}, - 'dec(leet|1337|leetspeak)': {'7H15 15 4 7357': "THIS IS A TEST"}, -} - - -ENCMAP = {k: v for k, v in zip("aabeliostzg", "@4831105729")} - - -add_map("leet", ENCMAP, ignore_case="encode", no_error=True, pattern=r"(?:leet|1337|leetspeak)$", entropy=lambda e: e) - +# -*- coding: UTF-8 -*- +"""Leetspeak Codec - leetspeak content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(leet|1337|leetspeak)': {'this is a test': "7h15 15 4 7357"}, + 'dec(leet|1337|leetspeak)': {'7H15 15 4 7357': "THIS IS A TEST"}, +} + + +ENCMAP = {k: v for k, v in zip("aabeliostzg", "@4831105729")} + + +add_map("leet", ENCMAP, ignore_case="encode", no_error=True, pattern=r"(?:leet|1337|leetspeak)$", entropy=lambda e: e) + diff --git a/codext/languages/morse.py b/src/codext/languages/morse.py old mode 100755 new mode 100644 similarity index 97% rename from codext/languages/morse.py rename to src/codext/languages/morse.py index 10f9f14..6c21a09 --- a/codext/languages/morse.py +++ b/src/codext/languages/morse.py @@ -1,40 +1,40 @@ -# -*- coding: UTF-8 -*- -"""Morse Codec - morse content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(morse|morse/-.)': {'this is a test': "- .... .. ... / .. ... / .- / - . ... 
-"}, - 'enc(morse-/AB)': {'this is a test': "A BBBB BB BBB / BB BBB / BA / A B BBB A"}, - 'enc(morse-01)': {'this is a test': "0 1111 11 111 - 11 111 - 10 - 0 1 111 0"}, -} -__guess__ = ["morse", "morse/_.", "morse-/01", "morse-01", "morse-/ab", "morse-ab", "morse-/AB", "morse-AB"] - - -ENCMAP = { - # letters - 'a': ".-", 'b': "-...", 'c': "-.-.", 'd': "-..", 'e': ".", 'f': "..-.", 'g': "--.", 'h': "....", 'i': "..", - 'j': ".---", 'k': "-.-", 'l': ".-..", 'm': "--", 'n': "-.", 'o': "---", 'p': ".--.", 'q': "--.-", 'r': ".-.", - 's': "...", 't': "-", 'u': "..-", 'v': "...-", 'w': ".--", 'x': "-..-", 'y': "-.--", 'z': "--..", - # digits - '1': ".----", '2': "..---", '3': "...--", '4': "....-", '5': ".....", '6': "-....", '7': "--...", '8': "---..", - '9': "----.", '0': "-----", - # punctuation - ',': "--..--", '.': ".-.-.-", ':' : "---...", '?': "..--..", '/': "-..-.", '-': "-....-", '=' : "-...-", - '(': "-.--.", ')': "-.--.-", '@' : ".--.-.", '\'': ".----.", '_': "..--.-", '!': "-.-.--", '&': ".-...", - '"': ".-..-.", ';': "-.-.-.", '$': "...-..-", - # word separator - ' ' : "/", -} - - -add_map("morse", ENCMAP, "#", " ", ignore_case="encode", pattern=r"^morse([-_]?.{3})?$", printables_rate=1., - expansion_factor=(2.8, .6)) - +# -*- coding: UTF-8 -*- +"""Morse Codec - morse content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(morse|morse/-.)': {'this is a test': "- .... .. ... / .. ... / .- / - . ... 
-"}, + 'enc(morse-/AB)': {'this is a test': "A BBBB BB BBB / BB BBB / BA / A B BBB A"}, + 'enc(morse-01)': {'this is a test': "0 1111 11 111 - 11 111 - 10 - 0 1 111 0"}, +} +__guess__ = ["morse", "morse/_.", "morse-/01", "morse-01", "morse-/ab", "morse-ab", "morse-/AB", "morse-AB"] + + +ENCMAP = { + # letters + 'a': ".-", 'b': "-...", 'c': "-.-.", 'd': "-..", 'e': ".", 'f': "..-.", 'g': "--.", 'h': "....", 'i': "..", + 'j': ".---", 'k': "-.-", 'l': ".-..", 'm': "--", 'n': "-.", 'o': "---", 'p': ".--.", 'q': "--.-", 'r': ".-.", + 's': "...", 't': "-", 'u': "..-", 'v': "...-", 'w': ".--", 'x': "-..-", 'y': "-.--", 'z': "--..", + # digits + '1': ".----", '2': "..---", '3': "...--", '4': "....-", '5': ".....", '6': "-....", '7': "--...", '8': "---..", + '9': "----.", '0': "-----", + # punctuation + ',': "--..--", '.': ".-.-.-", ':' : "---...", '?': "..--..", '/': "-..-.", '-': "-....-", '=' : "-...-", + '(': "-.--.", ')': "-.--.-", '@' : ".--.-.", '\'': ".----.", '_': "..--.-", '!': "-.-.--", '&': ".-...", + '"': ".-..-.", ';': "-.-.-.", '$': "...-..-", + # word separator + ' ' : "/", +} + + +add_map("morse", ENCMAP, "#", " ", ignore_case="encode", pattern=r"^morse([-_]?.{3})?$", printables_rate=1., + expansion_factor=(2.8, .6)) + diff --git a/codext/languages/navajo.py b/src/codext/languages/navajo.py old mode 100755 new mode 100644 similarity index 98% rename from codext/languages/navajo.py rename to src/codext/languages/navajo.py index b895622..a46b35c --- a/codext/languages/navajo.py +++ b/src/codext/languages/navajo.py @@ -1,35 +1,35 @@ -# -*- coding: UTF-8 -*- -"""Navajo Codec - Navajo code content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = {'enc-dec(navajo)': ["this is a test", "THIS\nIS\nA\nTEST"]} - - -# source: https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html -ENCMAP = { - 'A': ["WOL-LA-CHEE", "BE-LA-SANA", "TSE-NILL"], 'B': ["NA-HASH-CHID", "SHUSH", "TOISH-JEH"], - 'C': ["MOASI", "TLA-GIN", "BA-GOSHI"], 'D': ["BE", "CHINDI", "LHA-CHA-EH"], 'E': ["AH-JAH", "DZEH", "AH-NAH"], - 'F': ["CHUO", "TSA-E-DONIN-EE", "MA-E"], 'G': ["AH-TAD", "KLIZZIE", "JEHA"], 'H': ["TSE-GAH", "CHA", "LIN"], - 'I': ["TKIN", "YEH-HES", "A-CHI"], 'J': ["TKELE-CHO-G", "AH-YA-TSINNE", "YIL-DOI"], - 'K': ["JAD-HO-LONI", "BA-AH-NE-DI-TININ", "KLIZZIE-YAZZIE"], 'L': ["DIBEH-YAZZIE", "AH-JAD", "NASH-DOIE-TSO"], - 'M': ["TSIN-TLITI", "BE-TAS-TNI", "NA-AS-TSO-SI"], 'N': ["TSAH", "A-CHIN"], - 'O': ["A-KHA", "TLO-CHIN", "NE-AHS-JAH"], 'P': ["CLA-GI-AIH", "BI-SO-DIH", "NE-ZHONI"], 'Q': "CA-YEILTH", - 'R': ["GAH", "DAH-NES-TSA", "AH-LOSZ"], 'S': ["DIBEH", "KLESH"], 'T': ["D-AH", "A-WOH", "THAN-ZIE"], - 'U': ["SHI-DA", "NO-DA-IH"], 'V': "A-KEH-DI-GLINI", 'W': "GLOE-IH", 'X': "AL-NA-AS-DZOH", 'Y': "TSAH-AS-ZIH", - 'Z': "BESH-DO-TLIZ", - ' ': "-", '\n': "\n", - '0': "0", '1': "1", '2': "2", '3': "3", '4': "4", '5': "5", '6': "6", '7': "7", '8': "8", '9': "9", -} - - -add_map("navajo", ENCMAP, ignore_case="both", sep=" ", pattern=r"^navajo$", printables_rate=1., - expansion_factor=(6.2, .8)) - +# -*- coding: UTF-8 -*- +"""Navajo Codec - Navajo code content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = {'enc-dec(navajo)': ["this is a test", "THIS\nIS\nA\nTEST"]} + + +# source: https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html +ENCMAP = { + 'A': ["WOL-LA-CHEE", "BE-LA-SANA", "TSE-NILL"], 'B': ["NA-HASH-CHID", "SHUSH", "TOISH-JEH"], + 'C': ["MOASI", "TLA-GIN", "BA-GOSHI"], 'D': ["BE", "CHINDI", "LHA-CHA-EH"], 'E': ["AH-JAH", "DZEH", "AH-NAH"], + 'F': ["CHUO", "TSA-E-DONIN-EE", "MA-E"], 'G': ["AH-TAD", "KLIZZIE", "JEHA"], 'H': ["TSE-GAH", "CHA", "LIN"], + 'I': ["TKIN", "YEH-HES", "A-CHI"], 'J': ["TKELE-CHO-G", "AH-YA-TSINNE", "YIL-DOI"], + 'K': ["JAD-HO-LONI", "BA-AH-NE-DI-TININ", "KLIZZIE-YAZZIE"], 'L': ["DIBEH-YAZZIE", "AH-JAD", "NASH-DOIE-TSO"], + 'M': ["TSIN-TLITI", "BE-TAS-TNI", "NA-AS-TSO-SI"], 'N': ["TSAH", "A-CHIN"], + 'O': ["A-KHA", "TLO-CHIN", "NE-AHS-JAH"], 'P': ["CLA-GI-AIH", "BI-SO-DIH", "NE-ZHONI"], 'Q': "CA-YEILTH", + 'R': ["GAH", "DAH-NES-TSA", "AH-LOSZ"], 'S': ["DIBEH", "KLESH"], 'T': ["D-AH", "A-WOH", "THAN-ZIE"], + 'U': ["SHI-DA", "NO-DA-IH"], 'V': "A-KEH-DI-GLINI", 'W': "GLOE-IH", 'X': "AL-NA-AS-DZOH", 'Y': "TSAH-AS-ZIH", + 'Z': "BESH-DO-TLIZ", + ' ': "-", '\n': "\n", + '0': "0", '1': "1", '2': "2", '3': "3", '4': "4", '5': "5", '6': "6", '7': "7", '8': "8", '9': "9", +} + + +add_map("navajo", ENCMAP, ignore_case="both", sep=" ", pattern=r"^navajo$", printables_rate=1., + expansion_factor=(6.2, .8)) + diff --git a/codext/languages/radio.py b/src/codext/languages/radio.py old mode 100755 new mode 100644 similarity index 97% rename from codext/languages/radio.py rename to src/codext/languages/radio.py index 304e44a..03e420e --- a/codext/languages/radio.py +++ b/src/codext/languages/radio.py @@ -1,29 +1,29 @@ -# -*- coding: UTF-8 -*- 
-"""Radio Codec - NATO/Military phonetic alphabet content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(radio|military-alphabet)': {'test': "Tango Echo Sierra Tango"}, - 'enc(nato-alphabet|radio-phonetic)': {'string': "Sierra Tango Romeo India November Golf"}, -} - - -ENCMAP = { - 'A': "Alpha", 'B': "Bravo", 'C': "Charlie", 'D': "Delta", 'E': "Echo", 'F': "Foxtrot", 'G': "Golf", 'H': "Hotel", - 'I': "India", 'J': "Juliett", 'K': "Kilo", 'L': "Lima", 'M': "Mike", 'N': "November", 'O': "Oscar", 'P': "Papa", - 'Q': "Quebec", 'R': "Romeo", 'S': "Sierra", 'T': "Tango", 'U': "Uniform", 'V': "Victor", 'W': "Whiskey", - 'X': "X-ray", 'Y': "Yankee", 'Z': "Zulu", ' ': "/", -} - - -add_map("radio", ENCMAP, sep=" ", ignore_case="both", printables_rate=1., expansion_factor=(5.5, .3), - pattern=r"^(?:military|nato|radio)(?:(?:[-_]phonetic)?(?:[-_]alphabet)?)?$") - +# -*- coding: UTF-8 -*- +"""Radio Codec - NATO/Military phonetic alphabet content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(radio|military-alphabet)': {'test': "Tango Echo Sierra Tango"}, + 'enc(nato-alphabet|radio-phonetic)': {'string': "Sierra Tango Romeo India November Golf"}, +} + + +ENCMAP = { + 'A': "Alpha", 'B': "Bravo", 'C': "Charlie", 'D': "Delta", 'E': "Echo", 'F': "Foxtrot", 'G': "Golf", 'H': "Hotel", + 'I': "India", 'J': "Juliett", 'K': "Kilo", 'L': "Lima", 'M': "Mike", 'N': "November", 'O': "Oscar", 'P': "Papa", + 'Q': "Quebec", 'R': "Romeo", 'S': "Sierra", 'T': "Tango", 'U': "Uniform", 'V': "Victor", 'W': "Whiskey", + 'X': "X-ray", 'Y': "Yankee", 'Z': "Zulu", ' ': "/", +} + + +add_map("radio", ENCMAP, sep=" ", ignore_case="both", printables_rate=1., expansion_factor=(5.5, .3), + pattern=r"^(?:military|nato|radio)(?:(?:[-_]phonetic)?(?:[-_]alphabet)?)?$") + diff --git a/codext/languages/southpark.py b/src/codext/languages/southpark.py old mode 100755 new mode 100644 similarity index 98% rename from codext/languages/southpark.py rename to src/codext/languages/southpark.py index 8abc18b..6fdbd93 --- a/codext/languages/southpark.py +++ b/src/codext/languages/southpark.py @@ -1,44 +1,44 @@ -# -*- coding: UTF-8 -*- -"""Southpark Codec - Kenny's language content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples1__ = { - 'enc_dec(kenny|southpark)': ["This is a Test"], - 'enc_dec(kenny_123456|southpark-ABCDEF)': ["This is a Test"], -} -__guess1__ = ["southpark", "southpark-123456", "southpark-abcdef", "southpark-ABCDEF"] -__examples2__ = { - 'enc(southpark-icase|kenny_icase)': {'this is a test': "FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP"}, - 'enc(southpark_icase-123)': {'this is a test': "123213211122111211122111222111123233122123"}, -} -__guess2__ = ["southpark-icase", "southpark-icase-123", "southpark-icase-abc", "southpark-icase-ABC"] - - -ENCMAP1 = { - 'a': "mmm", 'b': "mmp", 'c': "mmf", 'd': "mpm", 'e': "mpp", 'f': "mpf", 'g': "mfm", 'h': "mfp", 'i': "mff", - 'j': "pmm", 'k': "pmp", 'l': "pmf", 'm': "ppm", 'n': "ppp", 'o': "ppf", 'p': "pfm", 'q': "pfp", 'r': "pff", - 's': "fmm", 't': "fmp", 'u': "fmf", 'v': "fpm", 'w': "fpp", 'x': "fpf", 'y': "ffm", 'z': "ffp", - 'A': "Mmm", 'B': "Mmp", 'C': "Mmf", 'D': "Mpm", 'E': "Mpp", 'F': "Mpf", 'G': "Mfm", 'H': "Mfp", 'I': "Mff", - 'J': "Pmm", 'K': "Pmp", 'L': "Pmf", 'M': "Ppm", 'N': "Ppp", 'O': "Ppf", 'P': "Pfm", 'Q': "Pfp", 'R': "Pff", - 'S': "Fmm", 'T': "Fmp", 'U': "Fmf", 'V': "Fpm", 'W': "Fpp", 'X': "Fpf", 'Y': "Ffm", 'Z': "Ffp", - ' ': ["fff", "Fff"], -} -ENCMAP2 = { - 'a': "MMM", 'b': "MMP", 'c': "MMF", 'd': "MPM", 'e': "MPP", 'f': "MPF", 'g': "MFM", 'h': "MFP", 'i': "MFF", - 'j': "PMM", 'k': "PMP", 'l': "PMF", 'm': "PPM", 'n': "PPP", 'o': "PPF", 'p': "PFM", 'q': "PFP", 'r': "PFF", - 's': "FMM", 't': "FMP", 'u': "FMF", 'v': "FPM", 'w': "FPP", 'x': "FPF", 'y': "FFM", 'z': "FFP", ' ': "FFF", -} - - -add_map("southpark", ENCMAP1, pattern=r"^(?:kenny|southpark)([-_].{6})?$", examples=__examples1__, guess=__guess1__) -add_map("southpark-icase", ENCMAP2, ignore_case="both", 
pattern=r"^(?:kenny|southpark)[-_]icase([-_].{3})?$", - examples=__examples2__, guess=__guess2__, printables_rate=1., expansion_factor=3.) - +# -*- coding: UTF-8 -*- +"""Southpark Codec - Kenny's language content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples1__ = { + 'enc_dec(kenny|southpark)': ["This is a Test"], + 'enc_dec(kenny_123456|southpark-ABCDEF)': ["This is a Test"], +} +__guess1__ = ["southpark", "southpark-123456", "southpark-abcdef", "southpark-ABCDEF"] +__examples2__ = { + 'enc(southpark-icase|kenny_icase)': {'this is a test': "FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP"}, + 'enc(southpark_icase-123)': {'this is a test': "123213211122111211122111222111123233122123"}, +} +__guess2__ = ["southpark-icase", "southpark-icase-123", "southpark-icase-abc", "southpark-icase-ABC"] + + +ENCMAP1 = { + 'a': "mmm", 'b': "mmp", 'c': "mmf", 'd': "mpm", 'e': "mpp", 'f': "mpf", 'g': "mfm", 'h': "mfp", 'i': "mff", + 'j': "pmm", 'k': "pmp", 'l': "pmf", 'm': "ppm", 'n': "ppp", 'o': "ppf", 'p': "pfm", 'q': "pfp", 'r': "pff", + 's': "fmm", 't': "fmp", 'u': "fmf", 'v': "fpm", 'w': "fpp", 'x': "fpf", 'y': "ffm", 'z': "ffp", + 'A': "Mmm", 'B': "Mmp", 'C': "Mmf", 'D': "Mpm", 'E': "Mpp", 'F': "Mpf", 'G': "Mfm", 'H': "Mfp", 'I': "Mff", + 'J': "Pmm", 'K': "Pmp", 'L': "Pmf", 'M': "Ppm", 'N': "Ppp", 'O': "Ppf", 'P': "Pfm", 'Q': "Pfp", 'R': "Pff", + 'S': "Fmm", 'T': "Fmp", 'U': "Fmf", 'V': "Fpm", 'W': "Fpp", 'X': "Fpf", 'Y': "Ffm", 'Z': "Ffp", + ' ': ["fff", "Fff"], +} +ENCMAP2 = { + 'a': "MMM", 'b': "MMP", 'c': "MMF", 'd': "MPM", 'e': "MPP", 'f': "MPF", 'g': "MFM", 'h': "MFP", 'i': "MFF", + 'j': "PMM", 'k': "PMP", 'l': "PMF", 'm': "PPM", 'n': "PPP", 'o': "PPF", 'p': "PFM", 'q': "PFP", 'r': "PFF", + 's': "FMM", 't': "FMP", 'u': "FMF", 'v': "FPM", 'w': "FPP", 'x': "FPF", 'y': "FFM", 'z': 
"FFP", ' ': "FFF", +} + + +add_map("southpark", ENCMAP1, pattern=r"^(?:kenny|southpark)([-_].{6})?$", examples=__examples1__, guess=__guess1__) +add_map("southpark-icase", ENCMAP2, ignore_case="both", pattern=r"^(?:kenny|southpark)[-_]icase([-_].{3})?$", + examples=__examples2__, guess=__guess2__, printables_rate=1., expansion_factor=3.) + diff --git a/codext/languages/tap.py b/src/codext/languages/tap.py similarity index 86% rename from codext/languages/tap.py rename to src/codext/languages/tap.py index efd551d..ec7c15b 100644 --- a/codext/languages/tap.py +++ b/src/codext/languages/tap.py @@ -1,39 +1,38 @@ -# -*- coding: UTF-8 -*- -"""Tap code - Tap/knock code encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(tap|knock-code|tap_code)': {'this is a test' : ".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. ." - "⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ...."}, -} -__guess__ = ["tap", "tap-inv"] - - -def __build_encmap(a): - d, i = {}, 0 - for x in range(1,6): - for y in range(1,6): - d[a[i]] = x * "." + " " + y * "." - i += 1 - d['k'], d[' '] = d['c'], " " - return d - - - -ENCMAP = { - '': __build_encmap("abcdefghijlmnopqrstuvwxyz"), - 'inv': __build_encmap("abcdefghijlmnopqrstuvwxyz"[::-1]), -} - - -if PY3: - add_map("tap", ENCMAP, ignore_case="both", sep="⠀", pattern=r"^(?:tap|knock)(?:[-_]code)?(|inv)$") - +# -*- coding: UTF-8 -*- +"""Tap code - Tap/knock code encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(tap|knock-code|tap_code)': {'this is a test' : ".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. ." + "⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... 
...."}, +} +__guess__ = ["tap", "tap-inv"] + + +def __build_encmap(a): + d, i = {}, 0 + for x in range(1,6): + for y in range(1,6): + d[a[i]] = x * "." + " " + y * "." + i += 1 + d['k'], d[' '] = d['c'], " " + return d + + + +ENCMAP = { + '': __build_encmap("abcdefghijlmnopqrstuvwxyz"), + 'inv': __build_encmap("abcdefghijlmnopqrstuvwxyz"[::-1]), +} + + +add_map("tap", ENCMAP, ignore_case="both", sep="⠀", pattern=r"^(?:tap|knock)(?:[-_]code)?(|inv)$") + diff --git a/codext/languages/tomtom.py b/src/codext/languages/tomtom.py old mode 100755 new mode 100644 similarity index 97% rename from codext/languages/tomtom.py rename to src/codext/languages/tomtom.py index 34a3b46..403eda4 --- a/codext/languages/tomtom.py +++ b/src/codext/languages/tomtom.py @@ -1,35 +1,35 @@ -# -*- coding: UTF-8 -*- -"""Tom-Tom Codec - tom-tom content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc': { - 'this is a test': "\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\" - } -} -__guess__ = ["tom-tom", "tom-tom/_.", "tom-tom-/01", "tom-tom-01", "tom-tom-/ab", "tom-tom-ab", "tom-tom-/AB", - "tom-tom-AB"] - - -ENCMAP = { - # letters - 'A': "/", 'B': "//", 'C': "///", 'D': "////", 'E': "/\\", 'F': "//\\", 'G': "///\\", 'H': "/\\\\", 'I': "/\\\\\\", - 'J': "\\/", 'K': "\\\\/", 'L': "\\\\\\/", 'M': "\\//", 'N': "\\///", 'O': "/\\/", 'P': "//\\/", 'Q': "/\\\\/", - 'R': "/\\//", 'S': "\\/\\", 'T': "\\\\/\\", 'U': "\\//\\", 'V': "\\/\\\\", 'W': "//\\\\", 'X': "\\\\//", - 'Y': "\\/\\/", 'Z': "/\\/\\", - # word separator - ' ' : "|", -} - - -add_map("tom-tom", ENCMAP, ".", " ", ignore_case="both", pattern=r"^tom-?tom([-_]?.{3})?$", printables_rate=1., - expansion_factor=(3.8, .2)) - +# -*- coding: UTF-8 -*- +"""Tom-Tom Codec - tom-tom content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc': { + 'this is a test': "\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\" + } +} +__guess__ = ["tom-tom", "tom-tom/_.", "tom-tom-/01", "tom-tom-01", "tom-tom-/ab", "tom-tom-ab", "tom-tom-/AB", + "tom-tom-AB"] + + +ENCMAP = { + # letters + 'A': "/", 'B': "//", 'C': "///", 'D': "////", 'E': "/\\", 'F': "//\\", 'G': "///\\", 'H': "/\\\\", 'I': "/\\\\\\", + 'J': "\\/", 'K': "\\\\/", 'L': "\\\\\\/", 'M': "\\//", 'N': "\\///", 'O': "/\\/", 'P': "//\\/", 'Q': "/\\\\/", + 'R': "/\\//", 'S': "\\/\\", 'T': "\\\\/\\", 'U': "\\//\\", 'V': "\\/\\\\", 'W': "//\\\\", 'X': "\\\\//", + 'Y': "\\/\\/", 'Z': "/\\/\\", + # word separator + ' ' : "|", +} + + +add_map("tom-tom", ENCMAP, ".", " ", ignore_case="both", pattern=r"^tom-?tom([-_]?.{3})?$", printables_rate=1., + expansion_factor=(3.8, .2)) + diff --git a/codext/macros.json b/src/codext/macros.json similarity index 100% rename from codext/macros.json rename to src/codext/macros.json diff --git a/codext/others/__init__.py b/src/codext/others/__init__.py old mode 100755 new mode 100644 similarity index 79% rename from codext/others/__init__.py rename to src/codext/others/__init__.py index aa7ffa2..7342b8a --- a/codext/others/__init__.py +++ b/src/codext/others/__init__.py @@ -1,6 +1,7 @@ -# -*- coding: UTF-8 -*- -from .dna import * -from .letters import * -from .markdown import * -from .uuencode import * - +# -*- coding: UTF-8 -*- +from .dna import * +from .kbshift import * +from .letters import * +from .markdown import * +from .uuencode import * + diff --git a/codext/others/dna.py b/src/codext/others/dna.py old mode 100755 new mode 100644 similarity index 97% rename from codext/others/dna.py rename to src/codext/others/dna.py index 428edee..2757a6a --- 
a/codext/others/dna.py +++ b/src/codext/others/dna.py @@ -1,42 +1,42 @@ -# -*- coding: UTF-8 -*- -"""DNA Codec - dna content encoding. - -This implements the 8 methods of ATGC nucleotides following the rule of complementary pairing, according the literature4 - about coding and computing of DNA sequences. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(dna0|dna9)': None, - 'enc(dna1)': {'this is a test': "GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA"}, - 'enc(dna-2)': {'this is a test': "CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA"}, - 'enc(dna_3)': {'this is a test': "ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG"}, - 'enc(dna4)': {'this is a test': "AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC"}, - 'enc(dna-5)': {'this is a test': "TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG"}, - 'enc(dna_6)': {'this is a test': "TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC"}, - 'enc(dna7)': {'this is a test': "GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT"}, - 'enc(dna-8)': {'this is a test': "CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT"}, -} -__guess__ = ["dna%d" % i for i in range(1, 9)] - - -SEQUENCES = { - '00': "AAGCGCTT", - '11': "TTCGCGAA", - '01': "GCAATTGC", - '10': "CGTTAACG", -} -ENCMAP = [] -for i in range(8): - ENCMAP.append({k: v[i] for k, v in SEQUENCES.items()}) - - -add_map("dna", ENCMAP, intype="bin", pattern=r"dna[-_]?([1-8])$", entropy=2., printables_rate=1., expansion_factor=4.) - +# -*- coding: UTF-8 -*- +"""DNA Codec - dna content encoding. + +This implements the 8 methods of ATGC nucleotides following the rule of complementary pairing, according the literature4 + about coding and computing of DNA sequences. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(dna0|dna9)': None, + 'enc(dna1)': {'this is a test': "GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA"}, + 'enc(dna-2)': {'this is a test': "CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA"}, + 'enc(dna_3)': {'this is a test': "ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG"}, + 'enc(dna4)': {'this is a test': "AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC"}, + 'enc(dna-5)': {'this is a test': "TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG"}, + 'enc(dna_6)': {'this is a test': "TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC"}, + 'enc(dna7)': {'this is a test': "GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT"}, + 'enc(dna-8)': {'this is a test': "CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT"}, +} +__guess__ = ["dna%d" % i for i in range(1, 9)] + + +SEQUENCES = { + '00': "AAGCGCTT", + '11': "TTCGCGAA", + '01': "GCAATTGC", + '10': "CGTTAACG", +} +ENCMAP = [] +for i in range(8): + ENCMAP.append({k: v[i] for k, v in SEQUENCES.items()}) + + +add_map("dna", ENCMAP, intype="bin", pattern=r"dna[-_]?([1-8])$", entropy=2., printables_rate=1., expansion_factor=4.) + diff --git a/src/codext/others/kbshift.py b/src/codext/others/kbshift.py new file mode 100644 index 0000000..60b3bf0 --- /dev/null +++ b/src/codext/others/kbshift.py @@ -0,0 +1,66 @@ +# -*- coding: UTF-8 -*- +"""Keyboard-Shift Codec - keyboard line shifting content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +LAYOUTS = { + 'ansi': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;'\nzxcvbnm<>\n,./", + 'azerty': "azertyuiop\nqsdfghjklm\nwxcvbn", + 'azerty-be': "³1234567890°_\n²&é\"'(§è!çà)-\n|@#^{}\nazertyuiop$\n€[]\n¨*\nqsdfghjklm%£\nùµ\n´`\n>wxcvbn?./+\n<,;:=\n\\~", + 'azerty-fr': "1234567890°+\n²&é\"'(-è_çà)=\n~#{[|`\\^@]}\nazertyuiop¨£\nqsdfghjklm%µ\nù*\n>wxcvbn?./§\n<,;:!", + 'dvorak': "~!@#$%^&*(){}\n`1234567890[]\n\"<>pyfgcrl?+|\n',./=\\\naoeuidhtns_\n-\n:qjkxbmwvz\n;", + 'qwerty': "qwertyuiop\nasdfghjkl\nzxcvbnm", + 'qwerty-us': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;,\nzxcvbnm<>?\n./", +} +__per_len = {} +for k, s in LAYOUTS.items(): + i = max(map(len, s.split("\n"))) + __per_len.setdefault(i, []) + __per_len[i].append(k) + + +__examples__ = {"enc-dec(kbshift_%s_%d)" % (kb, n): ["@irandom{256,512}"] for n in range(10) for kb in LAYOUTS.keys()} +__guess__ = [] +for mlen, kbs in __per_len.items(): + for k in kbs: + __guess__.extend(["kbshift-%s-%d" % (k, i+1) for i in range(mlen)]) + + +def _kbshift(text, keyboard="azerty", n=1, decode=False): + r = "" + for c in text: + nc = None + for l in LAYOUTS[keyboard].splitlines(): + if c.lower() in l: + nc = l[(l.index(c.lower()) + [-1, 1][decode] * n) % len(l)] + break + r += c if nc is None else nc + return r + + +def kbshift_encode(scheme): + kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() + def encode(text, errors="strict"): + r = _kbshift(ensure_str(text), kb, int(shift)) + return r, len(r) + return encode + + +def kbshift_decode(scheme): + kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() + def decode(text, errors="strict"): + r = _kbshift(ensure_str(text), kb, int(shift), True) + return r, len(r) + return 
decode + + +add("kbshift", kbshift_encode, kbshift_decode, entropy=lambda e: e,printables_rate=lambda pr: pr, transitive=True, + pattern=r"^kbshift(?:|[-_]((?:az|qw)erty[-_]?[1-9]|(?:ansi|azerty-(?:be|fr)|dvorak|qwerty-us)[-_]?(?:[1-9]|1[0-2])))$") + diff --git a/codext/others/letters.py b/src/codext/others/letters.py old mode 100755 new mode 100644 similarity index 97% rename from codext/others/letters.py rename to src/codext/others/letters.py index e27ae96..57fa26a --- a/codext/others/letters.py +++ b/src/codext/others/letters.py @@ -1,91 +1,91 @@ -# -*- coding: UTF-8 -*- -"""Letters Codec - letter indices-related content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from string import ascii_uppercase - -from ..__common__ import * - - -__examples__ = { - 'enc(consonant-index|consonants_indices)': { - 'This is a test': "166I15I15A16E1516", - '\x00': None, - '\xff': None, - }, - 'dec(consonant-index|consonants_indices)': { - '166I15I15A16E1516': "THISISATEST", - '\x00': None, - '\xff': None, - }, - 'enc(vowel-index|vowels_indices)': {'This is a test': "TH3S3S1T2ST"}, - 'dec(vowel-index|vowels_indices)': {'TH3S3S1T2ST': "THISISATEST"}, - 'enc(consonant-vowel_indices)': {'This is a test': "C16C6V3C15V3C15V1C16V2C15C16"}, - 'dec(consonants_vowels-index)': {'C16C6V3C15V3C15V1C16V2C15C16': "THISISATEST"}, -} -__guess__ = ["consonant-index", "vowel-index", "consonants_vowels-index"] - - -VOWELS = "AEIOUY" - - -def __get_encmap(letters): - if re.match(r"^consonants?$", letters): - encmap = {c: str(i+1) for i, c in enumerate(sorted(set(ascii_uppercase) - set(VOWELS)))} - for c in VOWELS: - encmap[c] = c - elif re.match(r"^vowels?$", letters): - encmap = {c: c for c in ascii_uppercase} - for i, c in enumerate(VOWELS): - encmap[c] = str(i+1) - elif re.match(r"^consonants?[-_]vowels?$", letters): - encmap = {c: "C" + str(i+1) 
for i, c in enumerate(sorted(set(ascii_uppercase) - set(VOWELS)))} - for i, c in enumerate(VOWELS): - encmap[c] = "V" + str(i+1) - for c in " ": - encmap[c] = "" - return encmap - - -def letters_encode(letters): - encmap = __get_encmap(letters) - def encode(text, errors="strict"): - s = "" - for i, c in enumerate(text.upper()): - try: - s += encmap[c] - except KeyError: - s += handle_error("letter-indices", errors)(c, i) - return "".join(encmap.get(c.upper(), c) for c in text), len(text) - return encode - - -def letters_decode(letters): - decmap = {v: k for k, v in __get_encmap(letters).items()} - maxlen = max(len(x) for x in decmap.keys()) - def decode(text, errors="strict"): - s, i = "", 0 - while i < len(text): - err = True - for j in range(maxlen, 0, -1): - try: - s += decmap[text[i:i+j]] - i += j - err = False - break - except (IndexError, KeyError): - pass - if err: - s += handle_error("letter-indices", errors, decode=True)(text[i], i) - return s, len(text) - return decode - - -add("letter-indices", letters_encode, letters_decode, printables_rate=1., expansion_factor=None, - pattern=r"^(consonants?|vowels?|consonants?[-_]vowels?)[-_]ind(?:ex|ices)$") - +# -*- coding: UTF-8 -*- +"""Letters Codec - letter indices-related content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from string import ascii_uppercase + +from ..__common__ import * + + +__examples__ = { + 'enc(consonant-index|consonants_indices)': { + 'This is a test': "166I15I15A16E1516", + '\x00': None, + '\xff': None, + }, + 'dec(consonant-index|consonants_indices)': { + '166I15I15A16E1516': "THISISATEST", + '\x00': None, + '\xff': None, + }, + 'enc(vowel-index|vowels_indices)': {'This is a test': "TH3S3S1T2ST"}, + 'dec(vowel-index|vowels_indices)': {'TH3S3S1T2ST': "THISISATEST"}, + 'enc(consonant-vowel_indices)': {'This is a test': "C16C6V3C15V3C15V1C16V2C15C16"}, + 'dec(consonants_vowels-index)': {'C16C6V3C15V3C15V1C16V2C15C16': "THISISATEST"}, +} +__guess__ = ["consonant-index", "vowel-index", "consonants_vowels-index"] + + +VOWELS = "AEIOUY" + + +def __get_encmap(letters): + if re.match(r"^consonants?$", letters): + encmap = {c: str(i+1) for i, c in enumerate(sorted(set(ascii_uppercase) - set(VOWELS)))} + for c in VOWELS: + encmap[c] = c + elif re.match(r"^vowels?$", letters): + encmap = {c: c for c in ascii_uppercase} + for i, c in enumerate(VOWELS): + encmap[c] = str(i+1) + elif re.match(r"^consonants?[-_]vowels?$", letters): + encmap = {c: "C" + str(i+1) for i, c in enumerate(sorted(set(ascii_uppercase) - set(VOWELS)))} + for i, c in enumerate(VOWELS): + encmap[c] = "V" + str(i+1) + for c in " ": + encmap[c] = "" + return encmap + + +def letters_encode(letters): + encmap = __get_encmap(letters) + def encode(text, errors="strict"): + s = "" + for i, c in enumerate(text.upper()): + try: + s += encmap[c] + except KeyError: + s += handle_error("letter-indices", errors)(c, i) + return "".join(encmap.get(c.upper(), c) for c in text), len(text) + return encode + + +def letters_decode(letters): + decmap = {v: k for k, v in __get_encmap(letters).items()} + maxlen = max(len(x) for x in 
decmap.keys()) + def decode(text, errors="strict"): + s, i = "", 0 + while i < len(text): + err = True + for j in range(maxlen, 0, -1): + try: + s += decmap[text[i:i+j]] + i += j + err = False + break + except (IndexError, KeyError): + pass + if err: + s += handle_error("letter-indices", errors, decode=True)(text[i], i) + return s, len(text) + return decode + + +add("letter-indices", letters_encode, letters_decode, printables_rate=1., expansion_factor=None, + pattern=r"^(consonants?|vowels?|consonants?[-_]vowels?)[-_]ind(?:ex|ices)$") + diff --git a/codext/others/markdown.py b/src/codext/others/markdown.py old mode 100755 new mode 100644 similarity index 96% rename from codext/others/markdown.py rename to src/codext/others/markdown.py index b3d300f..548ee2b --- a/codext/others/markdown.py +++ b/src/codext/others/markdown.py @@ -1,22 +1,22 @@ -# -*- coding: UTF-8 -*- -"""Markdown Codec - markdown content conversion to HTML. - -This codec: -- encodes strings from str to str -- encodes strings from bytes to bytes -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__guess__ = [] - - -try: - from markdown2 import markdown as md2html - # note: the group is NOT captured so that the pattern is only used to match the name of the codec and not to - # dynamically bind to a parametrizable encode function - add("markdown", lambda md, error="strict": (md2html(md), len(md)), pattern=r"^(?:markdown|Markdown|md)$") -except ImportError: - pass - +# -*- coding: UTF-8 -*- +"""Markdown Codec - markdown content conversion to HTML. 
+ +This codec: +- encodes strings from str to str +- encodes strings from bytes to bytes +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__guess__ = [] + + +try: + from markdown2 import markdown as md2html + # note: the group is NOT captured so that the pattern is only used to match the name of the codec and not to + # dynamically bind to a parametrizable encode function + add("markdown", lambda md, error="strict": (md2html(md), len(md)), pattern=r"^(?:markdown|Markdown|md)$") +except ImportError: + pass + diff --git a/codext/others/uuencode.py b/src/codext/others/uuencode.py similarity index 95% rename from codext/others/uuencode.py rename to src/codext/others/uuencode.py index a2f2fb6..f1ecfc3 100644 --- a/codext/others/uuencode.py +++ b/src/codext/others/uuencode.py @@ -17,7 +17,7 @@ 'dec(uu-encode)': {'.=&AI 1: - s += handle_error("whitespace_after_before", errors, decode=True, item="line")(l, i) - after = len(l) - len(l.rstrip(" ")) - before = len(l) - len(l.lstrip(" ")) - c = l[before] - s += chr(eval(eq)) - return s, len(text) - return decode - - -op = r"[+-](?:\d+(?:\.\d+)?[*/])?" -add("whitespace_after_before", wsba_encode, wsba_decode, guess=__guess2__, entropy=1., printables_rate=1., penalty=.1, - expansion_factor=(22., 3.), pattern=r"whitespace("+op+r"before"+op+r"after|"+op+r"after"+op+r"before)$") - +# -*- coding: UTF-8 -*- +"""Whitespace Codec - whitespace/tabs content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import random +import re +from string import printable + +from ..__common__ import * + + +__examples1__ = { + 'enc(whitespace|whitespaces)': {'test': "\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t"}, + 'enc(whitespace-inv|whitespace_inverted)': {'test': " \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t "}, +} +__guess1__ = ["whitespace", "whitespace-inv"] +__guess2__ = ["whitespace+after-before", "whitespace-after+before"] + + +ENCMAP = {r'': {'0': "\t", '1': " "}, r'[-_]inv(erted)?': {'0': " ", '1': "\t"}} +add_map("whitespace", ENCMAP, intype="bin", pattern=r"^whitespaces?([-_]inv(?:erted)?)?$", examples=__examples1__, + guess=__guess1__, entropy=1., printables_rate=1., expansion_factor=8.) + + +def wsba_encode(p): + eq = "ord(c)" + p + def encode(text, errors="strict"): + r = [] + for i, c in enumerate(text): + if ord(c) < min(ord(c) for c in printable[:-6]): + r.append(handle_error("whitespace" + p, errors, repl_char="\x00")(c, i)) + continue + enc = "\x00" + offset = random.randint(-10,10) + while enc not in printable[:-6]: + after = random.randint(0, 20) + before = random.randint(0, 20) + enc = chr(eval(eq) % 256) + r.append(" " * before + enc + " " * after) + s = "\n".join(r) + return s, len(s) + return encode + + +def wsba_decode(p): + eq = "ord(c)" + "".join({'-':"+",'+':"-"}.get(c, c) for c in p) + def decode(text, errors="strict"): + s = "" + for i, l in enumerate(text.split("\n")): + ll = len(l.strip()) + if ll == 0: + continue + if ll > 1: + s += handle_error("whitespace_after_before", errors, decode=True, item="line")(l, i) + after = len(l) - len(l.rstrip(" ")) + before = len(l) - len(l.lstrip(" ")) + c = l[before] + s += chr(eval(eq)) + return s, len(text) + return decode + + +op = r"[+-](?:\d+(?:\.\d+)?[*/])?" 
+add("whitespace_after_before", wsba_encode, wsba_decode, guess=__guess2__, entropy=1., printables_rate=1., penalty=.1, + expansion_factor=(22., 3.), pattern=r"whitespace("+op+r"before"+op+r"after|"+op+r"after"+op+r"before)$") + diff --git a/codext/web/__init__.py b/src/codext/web/__init__.py old mode 100755 new mode 100644 similarity index 94% rename from codext/web/__init__.py rename to src/codext/web/__init__.py index b29367a..566b441 --- a/codext/web/__init__.py +++ b/src/codext/web/__init__.py @@ -1,4 +1,4 @@ -# -*- coding: UTF-8 -*- -from .html import * -from .url import * - +# -*- coding: UTF-8 -*- +from .html import * +from .url import * + diff --git a/codext/web/html.py b/src/codext/web/html.py old mode 100755 new mode 100644 similarity index 98% rename from codext/web/html.py rename to src/codext/web/html.py index 0a128b3..6354946 --- a/codext/web/html.py +++ b/src/codext/web/html.py @@ -1,292 +1,288 @@ -# -*- coding: UTF-8 -*- -"""HTML entity Codec - html entity content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import re -from six import unichr - -from ..__common__ import * - - -__examples__ = { - 'enc(html_entities|html-entity)': {'': "<This is a test>"}, - 'dec(html|html_entity)': {'&DoesNotExist;': None}, - 'dec(html_entities|html-entity)': { - '<This is a test>': "", - '<This is a test>': "", - }, -} -if PY3: - __examples__['enc(html)'] = {'\u1234': "&1234;"} - - -# source: https://dev.w3.org/html5/html-author/charref -ENCMAP = { - '\t': " ", '\n': " ", '!': "!", '"': """, '#': "#", '$': "$", '%': "%", - '&': "&", '\'': "'", '(': "(", ')': ")", '*': "*", '+': "+", ',': ",", - '.': ".", '/': "/", ':': ":", ';': ";", '<': "<", '=': "=", '>': ">", - '?': "?", '@': "@", '[': "[", '\\': "\", ']': "]", '^': "^", '_': "_", - '`': "`", '{': "{", '|': "|", '}': "}", '¡': "¡", '¢': "¢", - '£': "£", '¤': "¤", '¥': "¥", '¦': "¦", '§': "§", '¨': "¨", '©': "©", - 'ª': "ª", '«': "«", '¬': "¬", '­': "­", '®': "®", '¯': "¯", '°': "°", - '±': "±", '²': "²", '³': "³", '´': "´", 'µ': "µ", '¶': "¶", '·': "·", - '¸': "¸", '¹': "¹", 'º': "º", '»': "»", '¼': "¼", '½': "½", '¾': "¾", - '¿': "¿", 'À': "À", 'Á': "Á", 'Â': "Â", 'Ã': "Ã", 'Ä': "Ä", 'Å': "Å", - 'Æ': "Æ", 'Ç': "Ç", 'È': "È", 'É': "É", 'Ê': "Ê", 'Ë': "Ë", 'Ì': "Ì", - 'Í': "Í", 'Î': "Î", 'Ï': "Ï", 'Ð': "Ð", 'Ñ': "Ñ", 'Ò': "Ò", 'Ó': "Ó", - 'Ô': "Ô", 'Õ': "Õ", 'Ö': "Ö", '×': "×", 'Ø': "Ø", 'Ù': "Ù", 'Ú': "Ú", - 'Û': "Û", 'Ü': "Ü", 'Ý': "Ý", 'Þ': "Þ", 'ß': "ß", 'à': "à", 'á': "á", - 'â': "â", 'ã': "ã", 'ä': "ä", 'å': "å", 'æ': "æ", 'ç': "ç", 'è': "è", - 'é': "é", 'ê': "ê", 'ë': "ë", 'ì': "ì", 'í': "í", 'î': "î", 'ï': "ï", - 'ð': "ð", 'ñ': "ñ", 'ò': "ò", 'ó': "ó", 'ô': "ô", 'õ': "õ", 'ö': "ö", - '÷': "÷", 'ø': "ø", 'ù': "ù", 'ú': "ú", 'û': "û", 'ü': "ü", 'ý': "ý", - 'þ': "þ", 'ÿ': "ÿ", 'Ā': "Ā", 'ā': "ā", 'Ă': "Ă", 'ă': "ă", 'Ą': "Ą", - 'ą': "ą", 'Ć': 
"Ć", 'ć': "ć", 'Ĉ': "Ĉ", 'ĉ': "ĉ", 'Ċ': "Ċ", 'ċ': "ċ", - 'Č': "Č", 'č': "č", 'Ď': "Ď", 'ď': "ď", 'Đ': "Đ", 'đ': "đ", - 'Ē': "Ē", 'ē': "ē", 'Ė': "Ė", 'ė': "ė", 'Ę': "Ę", 'ę': "ę", 'Ě': "Ě", - 'ě': "ě", 'Ĝ': "Ĝ", 'ĝ': "ĝ", 'Ğ': "Ğ", 'ğ': "ğ", 'Ġ': "Ġ", 'ġ': "ġ", - 'Ģ': "Ģ", 'Ĥ': "Ĥ", 'ĥ': "ĥ", 'Ħ': "Ħ", 'ħ': "ħ", 'Ĩ': "Ĩ", - 'ĩ': "ĩ", 'Ī': "Ī", 'ī': "ī", 'Į': "Į", 'į': "į", 'İ': "İ", 'ı': "ı", - 'IJ': "IJ", 'ij': "ij", 'Ĵ': "Ĵ", 'ĵ': "ĵ", 'Ķ': "Ķ", 'ķ': "ķ", 'ĸ': "ĸ", - 'Ĺ': "Ĺ", 'ĺ': "ĺ", 'Ļ': "Ļ", 'ļ': "ļ", 'Ľ': "Ľ", 'ľ': "ľ", - 'Ŀ': "Ŀ", 'ŀ': "ŀ", 'Ł': "Ł", 'ł': "ł", 'Ń': "Ń", 'ń': "ń", - 'Ņ': "Ņ", 'ņ': "ņ", 'Ň': "Ň", 'ň': "ň", 'ʼn': "ʼn", 'Ŋ': "Ŋ", 'ŋ': "ŋ", - 'Ō': "Ō", 'ō': "ō", 'Ő': "Ő", 'ő': "ő", 'Œ': "Œ", 'œ': "œ", 'Ŕ': "Ŕ", - 'ŕ': "ŕ", 'Ŗ': "Ŗ", 'ŗ': "ŗ", 'Ř': "Ř", 'ř': "ř", 'Ś': "Ś", - 'ś': "ś", 'Ŝ': "Ŝ", 'ŝ': "ŝ", 'Ş': "Ş", 'ş': "ş", 'Š': "Š", - 'š': "š", 'Ţ': "Ţ", 'ţ': "ţ", 'Ť': "Ť", 'ť': "ť", 'Ŧ': "Ŧ", - 'ŧ': "ŧ", 'Ũ': "Ũ", 'ũ': "ũ", 'Ū': "Ū", 'ū': "ū", 'Ŭ': "Ŭ", - 'ŭ': "ŭ", 'Ů': "Ů", 'ů': "ů", 'Ű': "Ű", 'ű': "ű", 'Ų': "Ų", 'ų': "ų", - 'Ŵ': "Ŵ", 'ŵ': "ŵ", 'Ŷ': "Ŷ", 'ŷ': "ŷ", 'Ÿ': "Ÿ", 'Ź': "Ź", 'ź': "ź", - 'Ż': "Ż", 'ż': "ż", 'Ž': "Ž", 'ž': "ž", 'ƒ': "ƒ", 'Ƶ': "Ƶ", 'ǵ': "ǵ", - 'ȷ': "ȷ", 'ˆ': "ˆ", 'ˇ': "ˇ", '˘': "˘", '˙': "˙", '˚': "˚", '˛': "˛", - '˜': "˜", '˝': "˝", '̑': "̑", '̲': "_", 'Α': "Α", 'Β': "Β", - 'Γ': "Γ", 'Δ': "Δ", 'Ε': "Ε", 'Ζ': "Ζ", 'Η': "Η", 'Θ': "Θ", 'Ι': "Ι", - 'Κ': "Κ", 'Λ': "Λ", 'Μ': "Μ", 'Ν': "Ν", 'Ξ': "Ξ", 'Ο': "Ο", 'Π': "Π", - 'Ρ': "Ρ", 'Σ': "Σ", 'Τ': "Τ", 'Υ': "Υ", 'Φ': "Φ", 'Χ': "Χ", 'Ψ': "Ψ", - 'Ω': "Ω", 'α': "α", 'β': "β", 'γ': "γ", 'δ': "δ", 'ε': "ϵ", 'ζ': "ζ", - 'η': "η", 'θ': "θ", 'ι': "ι", 'κ': "κ", 'λ': "λ", 'μ': "μ", 'ν': "ν", - 'ξ': "ξ", 'ο': "ο", 'π': "π", 'ρ': "ρ", 'ς': "ς", 'σ': "σ", 'τ': "τ", - 'υ': "υ", 'φ': "φ", 'χ': "χ", 'ψ': "ψ", 'ω': "ω", 'ϑ': "ϑ", 'ϒ': "ϒ", - 'ϕ': "ϕ", 'ϖ': "ϖ", 'Ϝ': "Ϝ", 'ϝ': "ϝ", 'ϰ': "ϰ", 'ϱ': "ϱ", - 'ϵ': "ε", '϶': "϶", 'Ё': "Ё", 'Ђ': "Ђ", 'Ѓ': "Ѓ", 'Є': "Є", 'Ѕ': "Ѕ", - 'І': "І", 'Ї': "Ї", 
'Ј': "Ј", 'Љ': "Љ", 'Њ': "Њ", 'Ћ': "Ћ", 'Ќ': "Ќ", - 'Ў': "Ў", 'Џ': "Џ", 'А': "А", 'Б': "Б", 'В': "В", 'Г': "Г", 'Д': "Д", 'Е': "Е", - 'Ж': "Ж", 'З': "З", 'И': "И", 'Й': "Й", 'К': "К", 'Л': "Л", 'М': "М", 'Н': "Н", - 'О': "О", 'П': "П", 'Р': "Р", 'С': "С", 'Т': "Т", 'У': "У", 'Ф': "Ф", 'Х': "Х", - 'Ц': "Ц", 'Ч': "Ч", 'Ш': "Ш", 'Щ': "Щ", 'Ъ': "Ъ", 'Ы': "Ы", 'Ь': "Ь", - 'Э': "Э", 'Ю': "Ю", 'Я': "Я", 'а': "а", 'б': "б", 'в': "в", 'г': "г", 'д': "д", - 'е': "е", 'ж': "ж", 'з': "з", 'и': "и", 'й': "й", 'к': "к", 'л': "л", 'м': "м", - 'н': "н", 'о': "о", 'п': "п", 'р': "р", 'с': "с", 'т': "т", 'у': "у", 'ф': "ф", - 'х': "х", 'ц': "ц", 'ч': "ч", 'ш': "ш", 'щ': "щ", 'ъ': "ъ", 'ы': "ы", - 'ь': "ь", 'э': "э", 'ю': "ю", 'я': "я", 'ё': "ё", 'ђ': "ђ", 'ѓ': "ѓ", - 'є': "є", 'ѕ': "ѕ", 'і': "і", 'ї': "ї", 'ј': "ј", 'љ': "љ", 'њ': "њ", - 'ћ': "ћ", 'ќ': "ќ", 'ў': "ў", 'џ': "џ", '\u2002': " ", '\u2003': " ", - '\u2004': " ", '\u2005': " ", '\u2007': " ", '\u2008': " ", '\u2009': " ", - '\u200a': " ", '​\u200b': "​", '\u200c': "‌", '\u200d': "‍", '\u200e': "‎", - '\u200f': "‏", '‐': "‐", '–': "–", '—': "—", - '―': "―", '‖': "‖", '‘': "‘", '’': "’", '‚': "‚", '“': "“", '”': "”", - '„': "„", '†': "†", '‡': "‡", '•': "•", '‥': "‥", '…': "…", '‰': "‰", - '‱': "‱", '′': "′", '″': "″", '‴': "‴", '‵': "‵", '‹': "‹", - '›': "›", '‾': "‾", '⁁': "⁁", '⁃': "⁃", '⁄': "⁄", '⁏': "⁏", '⁗': "⁗", - '\u205f': " ", '⁠': "⁠", '⁡': "⁡", '⁢': "⁢", '⁣': "⁣", - '€': "€", '⃛': "⃛", '⃜': "⃜", 'ℂ': "ℂ", '℅': "℅", 'ℊ': "ℊ", 'ℋ': "ℋ", - 'ℌ': "ℌ", 'ℍ': "ℍ", 'ℎ': "ℎ", 'ℏ': "ℏ", 'ℐ': "ℐ", 'ℑ': "ℑ", - 'ℒ': "ℒ", 'ℓ': "ℓ", 'ℕ': "ℕ", '№': "№", '℗': "℗", '℘': "℘", 'ℙ': "ℙ", - 'ℚ': "ℚ", 'ℛ': "ℛ", 'ℜ': "ℜ", 'ℝ': "ℝ", '℞': "℞", '™': "™", 'ℤ': "ℤ", - 'Ω': "Ω", '℧': "℧", 'ℨ': "ℨ", '℩': "℩", 'Å': "Å", 'ℬ': "ℬ", 'ℭ': "ℭ", - 'ℯ': "ℯ", 'ℰ': "ℰ", 'ℱ': "ℱ", 'ℳ': "ℳ", 'ℴ': "ℴ", 'ℵ': "ℵ", 'ℶ': "ℶ", - 'ℷ': "ℷ", 'ℸ': "ℸ", 'ⅅ': "ⅅ", 'ⅆ': "ⅆ", 'ⅇ': "ⅇ", - 'ⅈ': "ⅈ", '⅓': "⅓", '⅔': "⅔", '⅕': "⅕", '⅖': "⅖", '⅗': "⅗", - '⅘': "⅘", '⅙': "⅙", 
'⅚': "⅚", '⅛': "⅛", '⅜': "⅜", '⅝': "⅝", - '⅞': "⅞", '←': "←", '↑': "↑", '→': "→", '↓': "↓", '↔': "↔", '↕': "↕", - '↖': "↖", '↗': "↗", '↘': "↘", '↙': "↙", '↚': "↚", '↛': "↛", '↝': "↝", - '↞': "↞", '↟': "↟", '↠': "↠", '↡': "↡", '↢': "↢", '↣': "↣", - '↤': "↤", '↥': "↥", '↦': "↦", '↧': "↧", '↩': "↩", '↪': "↪", - '↫': "↫", '↬': "↬", '↭': "↭", '↮': "↮", '↰': "↰", '↱': "↱", '↲': "↲", - '↳': "↳", '↵': "↵", '↶': "↶", '↷': "↷", '↺': "↺", '↻': "↻", '↼': "↼", - '↽': "↽", '↾': "↾", '↿': "↿", '⇀': "⇀", '⇁': "⇁", '⇂': "⇂", '⇃': "⇃", - '⇄': "⇄", '⇅': "⇅", '⇆': "⇆", '⇇': "⇇", '⇈': "⇈", '⇉': "⇉", '⇊': "⇊", - '⇋': "⇋", '⇌': "⇌", '⇍': "⇍", '⇎': "⇎", '⇏': "⇏", '⇐': "⇐", '⇑': "⇑", - '⇒': "⇒", '⇓': "⇓", '⇔': "⇔", '⇕': "⇕", '⇖': "⇖", '⇗': "⇗", '⇘': "⇘", - '⇙': "⇙", '⇚': "⇚", '⇛': "⇛", '⇝': "⇝", '⇤': "⇤", '⇥': "⇥", '⇵': "⇵", - '⇽': "⇽", '⇾': "⇾", '⇿': "⇿", '∀': "∀", '∁': "∁", '∂': "∂", '∃': "∃", - '∄': "∄", '∅': "∅", '∇': "∇", '∈': "∈", '∉': "∉", '∋': "∋", '∌': "∌", - '∏': "∏", '∐': "∐", '∑': "∑", '−': "−", '∓': "∓", '∔': "∔", '∖': "∖", - '∗': "∗", '∘': "∘", '√': "√", '∝': "∝", '∞': "∞", '∟': "∟", '∠': "∠", - '∡': "∡", '∢': "∢", '∣': "∣", '∤': "∤", '∥': "∥", '∦': "∦", '∧': "∧", - '∨': "∨", '∩': "∩", '∪': "∪", '∫': "∫", '∬': "∬", '∭': "∭", '∮': "∮", - '∯': "∯", '∰': "∰", '∱': "∱", '∲': "∲", '∳': "∳", '∴': "∴", - '∵': "∵", '∶': "∶", '∷': "∷", '∸': "∸", '∺': "∺", '∻': "∻", '∼': "∼", - '∽': "∽", '∾': "∾", '∿': "∿", '≀': "≀", '≁': "≁", '≂': "≂", '≃': "≃", - '≄': "≄", '≅': "≅", '≆': "≆", '≇': "≇", '≈': "≈", '≉': "≉", '≊': "≊", - '≋': "≋", '≌': "≌", '≍': "≍", '≎': "≎", '≏': "≏", '≐': "≐", '≑': "≑", - '≒': "≒", '≓': "≓", '≔': "≔", '≕': "≕", '≖': "≖", '≗': "≗", '≙': "≙", - '≚': "≚", '≜': "≜", '≟': "≟", '≠': "≠", '≡': "≡", '≢': "≢", '≤': "≤", - '≥': "≥", '≦': "≦", '≧': "≧", '≨': "≨", '≩': "≩", '≪': "≪", '≫': "≫", '≬': "≬", - '≭': "≭", '≮': "≮", '≯': "≯", '≰': "≰", '≱': "≱", '≲': "≲", '≳': "≳", - '≴': "≴", '≵': "≵", '≶': "≶", '≷': "≷", '≸': "≸", '≹': "≹", '≺': "≺", '≻': "≻", - '≼': "≼", '≽': "≽", 
'≾': "≾", '≿': "≿", '⊀': "⊀", '⊁': "⊁", '⊂': "⊂", - '⊃': "⊃", '⊄': "⊄", '⊅': "⊅", '⊆': "⊆", '⊇': "⊇", '⊈': "⊈", '⊉': "⊉", - '⊊': "⊊", '⊋': "⊋", '⊍': "⊍", '⊎': "⊎", '⊏': "⊏", '⊐': "⊐", '⊑': "⊑", - '⊒': "⊒", '⊓': "⊓", '⊔': "⊔", '⊕': "⊕", '⊖': "⊖", '⊗': "⊗", '⊘': "⊘", - '⊙': "⊙", '⊚': "⊚", '⊛': "⊛", '⊝': "⊝", '⊞': "⊞", '⊟': "⊟", '⊠': "⊠", - '⊡': "⊡", '⊢': "⊢", '⊣': "⊣", '⊤': "⊤", '⊥': "⊥", '⊧': "⊧", '⊨': "⊨", - '⊩': "⊩", '⊪': "⊪", '⊫': "⊫", '⊬': "⊬", '⊭': "⊭", '⊮': "⊮", - '⊯': "⊯", '⊰': "⊰", '⊲': "⊲", '⊳': "⊳", '⊴': "⊴", '⊵': "⊵", '⊶': "⊶", - '⊷': "⊷", '⊸': "⊸", '⊹': "⊹", '⊺': "⊺", '⊻': "⊻", '⊽': "⊽", - '⊾': "⊾", '⊿': "⊿", '⋀': "⋀", '⋁': "⋁", '⋂': "⋂", '⋃': "⋃", '⋄': "⋄", - '⋅': "⋅", '⋆': "⋆", '⋇': "⋇", '⋈': "⋈", '⋉': "⋉", '⋊': "⋊", - '⋋': "⋋", '⋌': "⋌", '⋍': "⋍", '⋎': "⋎", '⋏': "⋏", '⋐': "⋐", '⋑': "⋑", - '⋒': "⋒", '⋓': "⋓", '⋔': "⋔", '⋕': "⋕", '⋖': "⋖", '⋗': "⋗", '⋘': "⋘", '⋙': "⋙", - '⋚': "⋚", '⋛': "⋛", '⋞': "⋞", '⋟': "⋟", '⋠': "⋠", '⋡': "⋡", '⋢': "⋢", - '⋣': "⋣", '⋦': "⋦", '⋧': "⋧", '⋨': "⋨", '⋩': "⋩", '⋪': "⋪", '⋫': "⋫", - '⋬': "⋬", '⋭': "⋭", '⋮': "⋮", '⋯': "⋯", '⋰': "⋰", '⋱': "⋱", '⋲': "⋲", - '⋳': "⋳", '⋴': "⋴", '⋵': "⋵", '⋶': "⋶", '⋷': "⋷", '⋹': "⋹", - '⋺': "⋺", '⋻': "⋻", '⋼': "⋼", '⋽': "⋽", '⋾': "⋾", '⌅': "⌅", '⌆': "⌆", - '⌈': "⌈", '⌉': "⌉", '⌊': "⌊", '⌋': "⌋", '⌌': "⌌", '⌍': "⌍", - '⌎': "⌎", '⌏': "⌏", '⌐': "⌐", '⌒': "⌒", '⌓': "⌓", '⌕': "⌕", - '⌖': "⌖", '⌜': "⌜", '⌝': "⌝", '⌞': "⌞", '⌟': "⌟", '⌢': "⌢", - '⌣': "⌣", '⌭': "⌭", '⌮': "⌮", '⌶': "⌶", '⌽': "⌽", '⌿': "⌿", - '⍼': "⍼", '⎰': "⎰", '⎱': "⎱", '⎴': "⎴", '⎵': "⎵", '⎶': "⎶", - '⏜': "⏜", '⏝': "⏝", '⏞': "⏞", '⏟': "⏟", '⏢': "⏢", - '⏧': "⏧", '␣': "␣", 'Ⓢ': "Ⓢ", '─': "─", '│': "│", '┌': "┌", '┐': "┐", - '└': "└", '┘': "┘", '├': "├", '┤': "┤", '┬': "┬", '┴': "┴", '┼': "┼", - '═': "═", '║': "║", '╒': "╒", '╓': "╓", '╔': "╔", '╕': "╕", '╖': "╖", - '╗': "╗", '╘': "╘", '╙': "╙", '╚': "╚", '╛': "╛", '╜': "╜", '╝': "╝", - '╞': "╞", '╟': "╟", '╠': "╠", '╡': "╡", '╢': "╢", '╣': "╣", '╤': "╤", - '╥': "╥", '╦': "╦", '╧': "╧", 
'╨': "╨", '╩': "╩", '╪': "╪", '╫': "╫", - '╬': "╬", '▀': "▀", '▄': "▄", '█': "█", '░': "░", '▒': "▒", '▓': "▓", - '□': "□", '▪': "▪", '▫': "▫", '▭': "▭", '▮': "▮", '▱': "▱", - '△': "△", '▴': "▴", '▵': "▵", '▸': "▸", '▹': "▹", '▽': "▽", '▾': "▾", - '▿': "▿", '◂': "◂", '◃': "◃", '◊': "◊", '○': "○", '◬': "◬", '◯': "◯", - '◸': "◸", '◹': "◹", '◺': "◺", '◻': "◻", '◼': "◼", - '★': "★", '☆': "☆", '☎': "☎", '♀': "♀", '♂': "♂", '♠': "♠", '♣': "♣", - '♥': "♥", '♦': "♦", '♪': "♪", '♭': "♭", '♮': "♮", '♯': "♯", '✓': "✓", - '✗': "✗", '✠': "✠", '✶': "✶", '❘': "❘", '❲': "❲", '❳': "❳", - '⟦': "⟦", '⟧': "⟧", '⟨': "⟨", '⟩': "⟩", '⟪': "⟪", '⟫': "⟫", '⟬': "⟬", - '⟭': "⟭", '⟵': "⟵", '⟶': "⟶", '⟷': "⟷", '⟸': "⟸", '⟹': "⟹", '⟺': "⟺", - '⟼': "⟼", '⟿': "⟿", '⤂': "⤂", '⤃': "⤃", '⤄': "⤄", '⤅': "⤅", '⤌': "⤌", - '⤍': "⤍", '⤎': "⤎", '⤏': "⤏", '⤐': "⤐", '⤑': "⤑", '⤒': "⤒", - '⤓': "⤓", '⤖': "⤖", '⤙': "⤙", '⤚': "⤚", '⤛': "⤛", '⤜': "⤜", - '⤝': "⤝", '⤞': "⤞", '⤟': "⤟", '⤠': "⤠", '⤣': "⤣", '⤤': "⤤", - '⤥': "⤥", '⤦': "⤦", '⤧': "⤧", '⤨': "⤨", '⤩': "⤩", '⤪': "⤪", - '⤳': "⤳", '⤵': "⤵", '⤶': "⤶", '⤷': "⤷", '⤸': "⤸", '⤹': "⤹", - '⤼': "⤼", '⤽': "⤽", '⥅': "⥅", '⥈': "⥈", '⥉': "⥉", '⥊': "⥊", - '⥋': "⥋", '⥎': "⥎", '⥏': "⥏", '⥐': "⥐", - '⥑': "⥑", '⥒': "⥒", '⥓': "⥓", '⥔': "⥔", - '⥕': "⥕", '⥖': "⥖", '⥗': "⥗", '⥘': "⥘", - '⥙': "⥙", '⥚': "⥚", '⥛': "⥛", '⥜': "⥜", - '⥝': "⥝", '⥞': "⥞", '⥟': "⥟", '⥠': "⥠", - '⥡': "⥡", '⥢': "⥢", '⥣': "⥣", '⥤': "⥤", '⥥': "⥥", '⥦': "⥦", - '⥧': "⥧", '⥨': "⥨", '⥩': "⥩", '⥪': "⥪", '⥫': "⥫", '⥬': "⥬", - '⥭': "⥭", '⥮': "⥮", '⥯': "⥯", '⥰': "⥰", '⥱': "⥱", '⥲': "⥲", - '⥳': "⥳", '⥴': "⥴", '⥵': "⥵", '⥶': "⥶", '⥸': "⥸", '⥹': "⥹", - '⥻': "⥻", '⥼': "⥼", '⥽': "⥽", '⥾': "⥾", '⥿': "⥿", '⦅': "⦅", - '⦆': "⦆", '⦋': "⦋", '⦌': "⦌", '⦍': "⦍", '⦎': "⦎", '⦏': "⦏", - '⦐': "⦐", '⦑': "⦑", '⦒': "⦒", '⦓': "⦓", '⦔': "⦔", '⦕': "⦕", - '⦖': "⦖", '⦚': "⦚", '⦜': "⦜", '⦝': "⦝", '⦤': "⦤", '⦥': "⦥", - '⦦': "⦦", '⦧': "⦧", '⦨': "⦨", '⦩': "⦩", '⦪': "⦪", '⦫': "⦫", - '⦬': "⦬", '⦭': "⦭", '⦮': "⦮", '⦯': "⦯", '⦰': "⦰", '⦱': "⦱", - 
'⦲': "⦲", '⦳': "⦳", '⦴': "⦴", '⦵': "⦵", '⦶': "⦶", '⦷': "⦷", - '⦹': "⦹", '⦻': "⦻", '⦼': "⦼", '⦾': "⦾", '⦿': "⦿", '⧀': "⧀", '⧁': "⧁", - '⧂': "⧂", '⧃': "⧃", '⧄': "⧄", '⧅': "⧅", '⧉': "⧉", '⧍': "⧍", '⧎': "⧎", - '⧏': "⧏", '⧐': "⧐", '⧚': "∽̱", '⧜': "⧜", '⧝': "⧝", - '⧞': "⧞", '⧣': "⧣", '⧤': "⧤", '⧥': "⧥", '⧫': "⧫", '⧴': "⧴", - '⧶': "⧶", '⨀': "⨀", '⨁': "⨁", '⨂': "⨂", '⨄': "⨄", '⨆': "⨆", '⨌': "⨌", - '⨍': "⨍", '⨐': "⨐", '⨑': "⨑", '⨒': "⨒", '⨓': "⨓", '⨔': "⨔", - '⨕': "⨕", '⨖': "⨖", '⨗': "⨗", '⨢': "⨢", '⨣': "⨣", '⨤': "⨤", - '⨥': "⨥", '⨦': "⨦", '⨧': "⨧", '⨩': "⨩", '⨪': "⨪", '⨭': "⨭", - '⨮': "⨮", '⨯': "⨯", '⨰': "⨰", '⨱': "⨱", '⨳': "⨳", '⨴': "⨴", - '⨵': "⨵", '⨶': "⨶", '⨷': "⨷", '⨸': "⨸", '⨹': "⨹", '⨺': "⨺", - '⨻': "⨻", '⨼': "⨼", '⨿': "⨿", '⩀': "⩀", '⩂': "⩂", '⩃': "⩃", '⩄': "⩄", - '⩅': "⩅", '⩆': "⩆", '⩇': "⩇", '⩈': "⩈", '⩉': "⩉", '⩊': "⩊", - '⩋': "⩋", '⩌': "⩌", '⩍': "⩍", '⩐': "⩐", '⩓': "⩓", '⩔': "⩔", '⩕': "⩕", - '⩖': "⩖", '⩗': "⩗", '⩘': "⩘", '⩚': "⩚", '⩛': "⩛", '⩜': "⩜", '⩝': "⩝", - '⩟': "⩟", '⩦': "⩦", '⩪': "⩪", '⩭': "⩭", '⩮': "⩮", '⩯': "⩯", '⩰': "⩰", - '⩱': "⩱", '⩲': "⩲", '⩳': "⩳", '⩴': "⩴", '⩵': "⩵", '⩷': "⩷", '⩸': "⩸", - '⩹': "⩹", '⩺': "⩺", '⩻': "⩻", '⩼': "⩼", '⩽': "⩽", '⩾': "⩾", '⩿': "⩿", - '⪀': "⪀", '⪁': "⪁", '⪂': "⪂", '⪃': "⪃", '⪄': "⪄", '⪅': "⪅", - '⪆': "⪆", '⪇': "⪇", '⪈': "⪈", '⪉': "⪉", '⪊': "⪊", '⪋': "⪋", '⪌': "⪌", '⪍': "⪍", - '⪎': "⪎", '⪏': "⪏", '⪐': "⪐", '⪑': "⪑", '⪒': "⪒", '⪓': "⪓", '⪔': "⪔", - '⪕': "⪕", '⪖': "⪖", '⪗': "⪗", '⪘': "⪘", '⪙': "⪙", '⪚': "⪚", '⪝': "⪝", - '⪞': "⪞", '⪟': "⪟", '⪠': "⪠", '⪡': "⪡", '⪢': "⪢", '⪤': "⪤", - '⪥': "⪥", '⪦': "⪦", '⪧': "⪧", '⪨': "⪨", '⪩': "⪩", '⪪': "⪪", '⪫': "⪫", - '⪬': "⪬", '⪭': "⪭", '⪮': "⪮", '⪯': "⪯", '⪰': "⪰", '⪳': "⪳", '⪴': "⪴", - '⪵': "⪵", '⪶': "⪶", '⪷': "⪷", '⪸': "⪸", '⪹': "⪹", '⪺': "⪺", '⪻': "⪻", - '⪼': "⪼", '⪽': "⪽", '⪾': "⪾", '⪿': "⪿", '⫀': "⫀", '⫁': "⫁", - '⫂': "⫂", '⫃': "⫃", '⫄': "⫄", '⫅': "⫅", '⫆': "⫆", '⫇': "⫇", - '⫈': "⫈", '⫋': "⫋", '⫌': "⫌", '⫏': "⫏", '⫐': "⫐", '⫑': "⫑", '⫒': "⫒", - '⫓': "⫓", '⫔': "⫔", '⫕': "⫕", '⫖': "⫖", 
'⫗': "⫗", '⫘': "⫘", - '⫙': "⫙", '⫚': "⫚", '⫛': "⫛", '⫤': "⫤", '⫦': "⫦", '⫧': "⫧", '⫨': "⫨", - '⫩': "⫩", '⫫': "⫫", '⫬': "⫬", '⫭': "⫭", '⫮': "⫮", '⫯': "⫯", '⫰': "⫰", - '⫱': "⫱", '⫲': "⫲", '⫳': "⫳", '⫽': "⫽", 'ff': "ff", 'fi': "fi", 'fl': "fl", - 'ffi': "ffi", 'ffl': "ffl", '𝒜': "𝒜", '𝒞': "𝒞", '𝒟': "𝒟", '𝒢': "𝒢", '𝒥': "𝒥", - '𝒦': "𝒦", '𝒩': "𝒩", '𝒪': "𝒪", '𝒫': "𝒫", '𝒬': "𝒬", '𝒮': "𝒮", '𝒯': "𝒯", - '𝒰': "𝒰", '𝒱': "𝒱", '𝒲': "𝒲", '𝒳': "𝒳", '𝒴': "𝒴", '𝒵': "𝒵", '𝒶': "𝒶", - '𝒷': "𝒷", '𝒸': "𝒸", '𝒹': "𝒹", '𝒻': "𝒻", '𝒽': "𝒽", '𝒾': "𝒾", '𝒿': "𝒿", - '𝓀': "𝓀", '𝓁': "𝓁", '𝓂': "𝓂", '𝓃': "𝓃", '𝓅': "𝓅", '𝓆': "𝓆", '𝓇': "𝓇", - '𝓈': "𝓈", '𝓉': "𝓉", '𝓊': "𝓊", '𝓋': "𝓋", '𝓌': "𝓌", '𝓍': "𝓍", '𝓎': "𝓎", - '𝓏': "𝓏", '𝔄': "𝔄", '𝔅': "𝔅", '𝔇': "𝔇", '𝔈': "𝔈", '𝔉': "𝔉", '𝔊': "𝔊", '𝔍': "𝔍", - '𝔎': "𝔎", '𝔏': "𝔏", '𝔐': "𝔐", '𝔑': "𝔑", '𝔒': "𝔒", '𝔓': "𝔓", '𝔔': "𝔔", '𝔖': "𝔖", - '𝔗': "𝔗", '𝔘': "𝔘", '𝔙': "𝔙", '𝔚': "𝔚", '𝔛': "𝔛", '𝔜': "𝔜", '𝔞': "𝔞", '𝔟': "𝔟", - '𝔠': "𝔠", '𝔡': "𝔡", '𝔢': "𝔢", '𝔣': "𝔣", '𝔤': "𝔤", '𝔥': "𝔥", '𝔦': "𝔦", '𝔧': "𝔧", - '𝔨': "𝔨", '𝔩': "𝔩", '𝔪': "𝔪", '𝔫': "𝔫", '𝔬': "𝔬", '𝔭': "𝔭", '𝔮': "𝔮", '𝔯': "𝔯", - '𝔰': "𝔰", '𝔱': "𝔱", '𝔲': "𝔲", '𝔳': "𝔳", '𝔴': "𝔴", '𝔵': "𝔵", '𝔶': "𝔶", '𝔷': "𝔷", - '𝔸': "𝔸", '𝔹': "𝔹", '𝔻': "𝔻", '𝔼': "𝔼", '𝔽': "𝔽", '𝔾': "𝔾", '𝕀': "𝕀", - '𝕁': "𝕁", '𝕂': "𝕂", '𝕃': "𝕃", '𝕄': "𝕄", '𝕆': "𝕆", '𝕊': "𝕊", '𝕋': "𝕋", - '𝕌': "𝕌", '𝕍': "𝕍", '𝕎': "𝕎", '𝕏': "𝕏", '𝕐': "𝕐", '𝕒': "𝕒", '𝕓': "𝕓", - '𝕔': "𝕔", '𝕕': "𝕕", '𝕖': "𝕖", '𝕗': "𝕗", '𝕘': "𝕘", '𝕙': "𝕙", '𝕚': "𝕚", - '𝕛': "𝕛", '𝕜': "𝕜", '𝕝': "𝕝", '𝕞': "𝕞", '𝕟': "𝕟", '𝕠': "𝕠", '𝕡': "𝕡", - '𝕢': "𝕢", '𝕣': "𝕣", '𝕤': "𝕤", '𝕥': "𝕥", '𝕦': "𝕦", '𝕧': "𝕧", '𝕨': "𝕨", - '𝕩': "𝕩", '𝕪': "𝕪", '𝕫': "𝕫", -} -DECMAP = {v: k for k, v in ENCMAP.items()} - - -class HtmlEntityDecodeError(ValueError): - pass - - -def htmlentity_encode(text, errors="strict"): - s = "" - for c in text: - try: - s += ENCMAP[c] - except KeyError: - i = ord(c) - s += "&" + hex(i)[2:].zfill(0) + ";" if i > 0xff else c - return s, len(text) - - -def htmlentity_decode(text, 
errors="strict"): - s = "" - i = 0 - while i < len(text): - m = re.match(r"&(?:(?:[A-Za-z][A-Za-z0-9]{1,6}){1,4}|[0-9]{4});", text[i:i+30]) - if m: - entity = m.group() - c = unichr(int(entity[1:5], 16)) if entity[1:5].isdigit() and len(entity) == 6 else \ - " " if entity == " " else None - if c: - s += c - else: - try: - s += DECMAP[entity] - except KeyError: - s += handle_error("html-entity", errors, HtmlEntityDecodeError, decode=True)(text[i], i) - i += len(entity) - else: - s += text[i] - i += 1 - return s, len(text) - - -add("html", htmlentity_encode, htmlentity_decode, r"^html(?:[-_]?entit(?:y|ies))?$", - extra_exceptions=["HtmlEntityDecodeError"]) - +# -*- coding: UTF-8 -*- +"""HTML entity Codec - html entity content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(html_entities|html-entity)': {'': "<This is a test>"}, + 'enc(html)': {'\u1234': "&1234;"}, + 'dec(html|html_entity)': {'&DoesNotExist;': None}, + 'dec(html_entities|html-entity)': { + '<This is a test>': "", + '<This is a test>': "", + }, +} + + +# source: https://dev.w3.org/html5/html-author/charref +ENCMAP = { + '\t': " ", '\n': " ", '!': "!", '"': """, '#': "#", '$': "$", '%': "%", + '&': "&", '\'': "'", '(': "(", ')': ")", '*': "*", '+': "+", ',': ",", + '.': ".", '/': "/", ':': ":", ';': ";", '<': "<", '=': "=", '>': ">", + '?': "?", '@': "@", '[': "[", '\\': "\", ']': "]", '^': "^", '_': "_", + '`': "`", '{': "{", '|': "|", '}': "}", '¡': "¡", '¢': "¢", + '£': "£", '¤': "¤", '¥': "¥", '¦': "¦", '§': "§", '¨': "¨", '©': "©", + 'ª': "ª", '«': "«", '¬': "¬", '­': "­", '®': "®", '¯': "¯", '°': "°", + '±': "±", '²': "²", '³': "³", '´': "´", 'µ': "µ", '¶': "¶", '·': "·", + '¸': "¸", '¹': "¹", 'º': "º", '»': "»", '¼': "¼", '½': "½", '¾': "¾", + '¿': "¿", 'À': "À", 'Á': "Á", 'Â': "Â", 'Ã': "Ã", 
'Ä': "Ä", 'Å': "Å", + 'Æ': "Æ", 'Ç': "Ç", 'È': "È", 'É': "É", 'Ê': "Ê", 'Ë': "Ë", 'Ì': "Ì", + 'Í': "Í", 'Î': "Î", 'Ï': "Ï", 'Ð': "Ð", 'Ñ': "Ñ", 'Ò': "Ò", 'Ó': "Ó", + 'Ô': "Ô", 'Õ': "Õ", 'Ö': "Ö", '×': "×", 'Ø': "Ø", 'Ù': "Ù", 'Ú': "Ú", + 'Û': "Û", 'Ü': "Ü", 'Ý': "Ý", 'Þ': "Þ", 'ß': "ß", 'à': "à", 'á': "á", + 'â': "â", 'ã': "ã", 'ä': "ä", 'å': "å", 'æ': "æ", 'ç': "ç", 'è': "è", + 'é': "é", 'ê': "ê", 'ë': "ë", 'ì': "ì", 'í': "í", 'î': "î", 'ï': "ï", + 'ð': "ð", 'ñ': "ñ", 'ò': "ò", 'ó': "ó", 'ô': "ô", 'õ': "õ", 'ö': "ö", + '÷': "÷", 'ø': "ø", 'ù': "ù", 'ú': "ú", 'û': "û", 'ü': "ü", 'ý': "ý", + 'þ': "þ", 'ÿ': "ÿ", 'Ā': "Ā", 'ā': "ā", 'Ă': "Ă", 'ă': "ă", 'Ą': "Ą", + 'ą': "ą", 'Ć': "Ć", 'ć': "ć", 'Ĉ': "Ĉ", 'ĉ': "ĉ", 'Ċ': "Ċ", 'ċ': "ċ", + 'Č': "Č", 'č': "č", 'Ď': "Ď", 'ď': "ď", 'Đ': "Đ", 'đ': "đ", + 'Ē': "Ē", 'ē': "ē", 'Ė': "Ė", 'ė': "ė", 'Ę': "Ę", 'ę': "ę", 'Ě': "Ě", + 'ě': "ě", 'Ĝ': "Ĝ", 'ĝ': "ĝ", 'Ğ': "Ğ", 'ğ': "ğ", 'Ġ': "Ġ", 'ġ': "ġ", + 'Ģ': "Ģ", 'Ĥ': "Ĥ", 'ĥ': "ĥ", 'Ħ': "Ħ", 'ħ': "ħ", 'Ĩ': "Ĩ", + 'ĩ': "ĩ", 'Ī': "Ī", 'ī': "ī", 'Į': "Į", 'į': "į", 'İ': "İ", 'ı': "ı", + 'IJ': "IJ", 'ij': "ij", 'Ĵ': "Ĵ", 'ĵ': "ĵ", 'Ķ': "Ķ", 'ķ': "ķ", 'ĸ': "ĸ", + 'Ĺ': "Ĺ", 'ĺ': "ĺ", 'Ļ': "Ļ", 'ļ': "ļ", 'Ľ': "Ľ", 'ľ': "ľ", + 'Ŀ': "Ŀ", 'ŀ': "ŀ", 'Ł': "Ł", 'ł': "ł", 'Ń': "Ń", 'ń': "ń", + 'Ņ': "Ņ", 'ņ': "ņ", 'Ň': "Ň", 'ň': "ň", 'ʼn': "ʼn", 'Ŋ': "Ŋ", 'ŋ': "ŋ", + 'Ō': "Ō", 'ō': "ō", 'Ő': "Ő", 'ő': "ő", 'Œ': "Œ", 'œ': "œ", 'Ŕ': "Ŕ", + 'ŕ': "ŕ", 'Ŗ': "Ŗ", 'ŗ': "ŗ", 'Ř': "Ř", 'ř': "ř", 'Ś': "Ś", + 'ś': "ś", 'Ŝ': "Ŝ", 'ŝ': "ŝ", 'Ş': "Ş", 'ş': "ş", 'Š': "Š", + 'š': "š", 'Ţ': "Ţ", 'ţ': "ţ", 'Ť': "Ť", 'ť': "ť", 'Ŧ': "Ŧ", + 'ŧ': "ŧ", 'Ũ': "Ũ", 'ũ': "ũ", 'Ū': "Ū", 'ū': "ū", 'Ŭ': "Ŭ", + 'ŭ': "ŭ", 'Ů': "Ů", 'ů': "ů", 'Ű': "Ű", 'ű': "ű", 'Ų': "Ų", 'ų': "ų", + 'Ŵ': "Ŵ", 'ŵ': "ŵ", 'Ŷ': "Ŷ", 'ŷ': "ŷ", 'Ÿ': "Ÿ", 'Ź': "Ź", 'ź': "ź", + 'Ż': "Ż", 'ż': "ż", 'Ž': "Ž", 'ž': "ž", 'ƒ': "ƒ", 'Ƶ': "Ƶ", 'ǵ': "ǵ", + 'ȷ': "ȷ", 'ˆ': "ˆ", 'ˇ': "ˇ", '˘': "˘", '˙': "˙", '˚': "˚", '˛': "˛", + '˜': "˜", '˝': "˝", '̑': "̑", '̲': 
"_", 'Α': "Α", 'Β': "Β", + 'Γ': "Γ", 'Δ': "Δ", 'Ε': "Ε", 'Ζ': "Ζ", 'Η': "Η", 'Θ': "Θ", 'Ι': "Ι", + 'Κ': "Κ", 'Λ': "Λ", 'Μ': "Μ", 'Ν': "Ν", 'Ξ': "Ξ", 'Ο': "Ο", 'Π': "Π", + 'Ρ': "Ρ", 'Σ': "Σ", 'Τ': "Τ", 'Υ': "Υ", 'Φ': "Φ", 'Χ': "Χ", 'Ψ': "Ψ", + 'Ω': "Ω", 'α': "α", 'β': "β", 'γ': "γ", 'δ': "δ", 'ε': "ϵ", 'ζ': "ζ", + 'η': "η", 'θ': "θ", 'ι': "ι", 'κ': "κ", 'λ': "λ", 'μ': "μ", 'ν': "ν", + 'ξ': "ξ", 'ο': "ο", 'π': "π", 'ρ': "ρ", 'ς': "ς", 'σ': "σ", 'τ': "τ", + 'υ': "υ", 'φ': "φ", 'χ': "χ", 'ψ': "ψ", 'ω': "ω", 'ϑ': "ϑ", 'ϒ': "ϒ", + 'ϕ': "ϕ", 'ϖ': "ϖ", 'Ϝ': "Ϝ", 'ϝ': "ϝ", 'ϰ': "ϰ", 'ϱ': "ϱ", + 'ϵ': "ε", '϶': "϶", 'Ё': "Ё", 'Ђ': "Ђ", 'Ѓ': "Ѓ", 'Є': "Є", 'Ѕ': "Ѕ", + 'І': "І", 'Ї': "Ї", 'Ј': "Ј", 'Љ': "Љ", 'Њ': "Њ", 'Ћ': "Ћ", 'Ќ': "Ќ", + 'Ў': "Ў", 'Џ': "Џ", 'А': "А", 'Б': "Б", 'В': "В", 'Г': "Г", 'Д': "Д", 'Е': "Е", + 'Ж': "Ж", 'З': "З", 'И': "И", 'Й': "Й", 'К': "К", 'Л': "Л", 'М': "М", 'Н': "Н", + 'О': "О", 'П': "П", 'Р': "Р", 'С': "С", 'Т': "Т", 'У': "У", 'Ф': "Ф", 'Х': "Х", + 'Ц': "Ц", 'Ч': "Ч", 'Ш': "Ш", 'Щ': "Щ", 'Ъ': "Ъ", 'Ы': "Ы", 'Ь': "Ь", + 'Э': "Э", 'Ю': "Ю", 'Я': "Я", 'а': "а", 'б': "б", 'в': "в", 'г': "г", 'д': "д", + 'е': "е", 'ж': "ж", 'з': "з", 'и': "и", 'й': "й", 'к': "к", 'л': "л", 'м': "м", + 'н': "н", 'о': "о", 'п': "п", 'р': "р", 'с': "с", 'т': "т", 'у': "у", 'ф': "ф", + 'х': "х", 'ц': "ц", 'ч': "ч", 'ш': "ш", 'щ': "щ", 'ъ': "ъ", 'ы': "ы", + 'ь': "ь", 'э': "э", 'ю': "ю", 'я': "я", 'ё': "ё", 'ђ': "ђ", 'ѓ': "ѓ", + 'є': "є", 'ѕ': "ѕ", 'і': "і", 'ї': "ї", 'ј': "ј", 'љ': "љ", 'њ': "њ", + 'ћ': "ћ", 'ќ': "ќ", 'ў': "ў", 'џ': "џ", '\u2002': " ", '\u2003': " ", + '\u2004': " ", '\u2005': " ", '\u2007': " ", '\u2008': " ", '\u2009': " ", + '\u200a': " ", '​\u200b': "​", '\u200c': "‌", '\u200d': "‍", '\u200e': "‎", + '\u200f': "‏", '‐': "‐", '–': "–", '—': "—", + '―': "―", '‖': "‖", '‘': "‘", '’': "’", '‚': "‚", '“': "“", '”': "”", + '„': "„", '†': "†", '‡': "‡", '•': "•", '‥': "‥", '…': "…", '‰': "‰", + '‱': "‱", '′': "′", '″': "″", '‴': "‴", '‵': "‵", '‹': "‹", + 
'›': "›", '‾': "‾", '⁁': "⁁", '⁃': "⁃", '⁄': "⁄", '⁏': "⁏", '⁗': "⁗", + '\u205f': " ", '⁠': "⁠", '⁡': "⁡", '⁢': "⁢", '⁣': "⁣", + '€': "€", '⃛': "⃛", '⃜': "⃜", 'ℂ': "ℂ", '℅': "℅", 'ℊ': "ℊ", 'ℋ': "ℋ", + 'ℌ': "ℌ", 'ℍ': "ℍ", 'ℎ': "ℎ", 'ℏ': "ℏ", 'ℐ': "ℐ", 'ℑ': "ℑ", + 'ℒ': "ℒ", 'ℓ': "ℓ", 'ℕ': "ℕ", '№': "№", '℗': "℗", '℘': "℘", 'ℙ': "ℙ", + 'ℚ': "ℚ", 'ℛ': "ℛ", 'ℜ': "ℜ", 'ℝ': "ℝ", '℞': "℞", '™': "™", 'ℤ': "ℤ", + 'Ω': "Ω", '℧': "℧", 'ℨ': "ℨ", '℩': "℩", 'Å': "Å", 'ℬ': "ℬ", 'ℭ': "ℭ", + 'ℯ': "ℯ", 'ℰ': "ℰ", 'ℱ': "ℱ", 'ℳ': "ℳ", 'ℴ': "ℴ", 'ℵ': "ℵ", 'ℶ': "ℶ", + 'ℷ': "ℷ", 'ℸ': "ℸ", 'ⅅ': "ⅅ", 'ⅆ': "ⅆ", 'ⅇ': "ⅇ", + 'ⅈ': "ⅈ", '⅓': "⅓", '⅔': "⅔", '⅕': "⅕", '⅖': "⅖", '⅗': "⅗", + '⅘': "⅘", '⅙': "⅙", '⅚': "⅚", '⅛': "⅛", '⅜': "⅜", '⅝': "⅝", + '⅞': "⅞", '←': "←", '↑': "↑", '→': "→", '↓': "↓", '↔': "↔", '↕': "↕", + '↖': "↖", '↗': "↗", '↘': "↘", '↙': "↙", '↚': "↚", '↛': "↛", '↝': "↝", + '↞': "↞", '↟': "↟", '↠': "↠", '↡': "↡", '↢': "↢", '↣': "↣", + '↤': "↤", '↥': "↥", '↦': "↦", '↧': "↧", '↩': "↩", '↪': "↪", + '↫': "↫", '↬': "↬", '↭': "↭", '↮': "↮", '↰': "↰", '↱': "↱", '↲': "↲", + '↳': "↳", '↵': "↵", '↶': "↶", '↷': "↷", '↺': "↺", '↻': "↻", '↼': "↼", + '↽': "↽", '↾': "↾", '↿': "↿", '⇀': "⇀", '⇁': "⇁", '⇂': "⇂", '⇃': "⇃", + '⇄': "⇄", '⇅': "⇅", '⇆': "⇆", '⇇': "⇇", '⇈': "⇈", '⇉': "⇉", '⇊': "⇊", + '⇋': "⇋", '⇌': "⇌", '⇍': "⇍", '⇎': "⇎", '⇏': "⇏", '⇐': "⇐", '⇑': "⇑", + '⇒': "⇒", '⇓': "⇓", '⇔': "⇔", '⇕': "⇕", '⇖': "⇖", '⇗': "⇗", '⇘': "⇘", + '⇙': "⇙", '⇚': "⇚", '⇛': "⇛", '⇝': "⇝", '⇤': "⇤", '⇥': "⇥", '⇵': "⇵", + '⇽': "⇽", '⇾': "⇾", '⇿': "⇿", '∀': "∀", '∁': "∁", '∂': "∂", '∃': "∃", + '∄': "∄", '∅': "∅", '∇': "∇", '∈': "∈", '∉': "∉", '∋': "∋", '∌': "∌", + '∏': "∏", '∐': "∐", '∑': "∑", '−': "−", '∓': "∓", '∔': "∔", '∖': "∖", + '∗': "∗", '∘': "∘", '√': "√", '∝': "∝", '∞': "∞", '∟': "∟", '∠': "∠", + '∡': "∡", '∢': "∢", '∣': "∣", '∤': "∤", '∥': "∥", '∦': "∦", '∧': "∧", + '∨': "∨", '∩': "∩", '∪': "∪", '∫': "∫", '∬': "∬", '∭': "∭", '∮': "∮", + '∯': "∯", '∰': "∰", '∱': "∱", '∲': "∲", '∳': "∳", '∴': "∴", + '∵': 
"∵", '∶': "∶", '∷': "∷", '∸': "∸", '∺': "∺", '∻': "∻", '∼': "∼", + '∽': "∽", '∾': "∾", '∿': "∿", '≀': "≀", '≁': "≁", '≂': "≂", '≃': "≃", + '≄': "≄", '≅': "≅", '≆': "≆", '≇': "≇", '≈': "≈", '≉': "≉", '≊': "≊", + '≋': "≋", '≌': "≌", '≍': "≍", '≎': "≎", '≏': "≏", '≐': "≐", '≑': "≑", + '≒': "≒", '≓': "≓", '≔': "≔", '≕': "≕", '≖': "≖", '≗': "≗", '≙': "≙", + '≚': "≚", '≜': "≜", '≟': "≟", '≠': "≠", '≡': "≡", '≢': "≢", '≤': "≤", + '≥': "≥", '≦': "≦", '≧': "≧", '≨': "≨", '≩': "≩", '≪': "≪", '≫': "≫", '≬': "≬", + '≭': "≭", '≮': "≮", '≯': "≯", '≰': "≰", '≱': "≱", '≲': "≲", '≳': "≳", + '≴': "≴", '≵': "≵", '≶': "≶", '≷': "≷", '≸': "≸", '≹': "≹", '≺': "≺", '≻': "≻", + '≼': "≼", '≽': "≽", '≾': "≾", '≿': "≿", '⊀': "⊀", '⊁': "⊁", '⊂': "⊂", + '⊃': "⊃", '⊄': "⊄", '⊅': "⊅", '⊆': "⊆", '⊇': "⊇", '⊈': "⊈", '⊉': "⊉", + '⊊': "⊊", '⊋': "⊋", '⊍': "⊍", '⊎': "⊎", '⊏': "⊏", '⊐': "⊐", '⊑': "⊑", + '⊒': "⊒", '⊓': "⊓", '⊔': "⊔", '⊕': "⊕", '⊖': "⊖", '⊗': "⊗", '⊘': "⊘", + '⊙': "⊙", '⊚': "⊚", '⊛': "⊛", '⊝': "⊝", '⊞': "⊞", '⊟': "⊟", '⊠': "⊠", + '⊡': "⊡", '⊢': "⊢", '⊣': "⊣", '⊤': "⊤", '⊥': "⊥", '⊧': "⊧", '⊨': "⊨", + '⊩': "⊩", '⊪': "⊪", '⊫': "⊫", '⊬': "⊬", '⊭': "⊭", '⊮': "⊮", + '⊯': "⊯", '⊰': "⊰", '⊲': "⊲", '⊳': "⊳", '⊴': "⊴", '⊵': "⊵", '⊶': "⊶", + '⊷': "⊷", '⊸': "⊸", '⊹': "⊹", '⊺': "⊺", '⊻': "⊻", '⊽': "⊽", + '⊾': "⊾", '⊿': "⊿", '⋀': "⋀", '⋁': "⋁", '⋂': "⋂", '⋃': "⋃", '⋄': "⋄", + '⋅': "⋅", '⋆': "⋆", '⋇': "⋇", '⋈': "⋈", '⋉': "⋉", '⋊': "⋊", + '⋋': "⋋", '⋌': "⋌", '⋍': "⋍", '⋎': "⋎", '⋏': "⋏", '⋐': "⋐", '⋑': "⋑", + '⋒': "⋒", '⋓': "⋓", '⋔': "⋔", '⋕': "⋕", '⋖': "⋖", '⋗': "⋗", '⋘': "⋘", '⋙': "⋙", + '⋚': "⋚", '⋛': "⋛", '⋞': "⋞", '⋟': "⋟", '⋠': "⋠", '⋡': "⋡", '⋢': "⋢", + '⋣': "⋣", '⋦': "⋦", '⋧': "⋧", '⋨': "⋨", '⋩': "⋩", '⋪': "⋪", '⋫': "⋫", + '⋬': "⋬", '⋭': "⋭", '⋮': "⋮", '⋯': "⋯", '⋰': "⋰", '⋱': "⋱", '⋲': "⋲", + '⋳': "⋳", '⋴': "⋴", '⋵': "⋵", '⋶': "⋶", '⋷': "⋷", '⋹': "⋹", + '⋺': "⋺", '⋻': "⋻", '⋼': "⋼", '⋽': "⋽", '⋾': "⋾", '⌅': "⌅", '⌆': "⌆", + '⌈': "⌈", '⌉': "⌉", '⌊': "⌊", '⌋': "⌋", '⌌': "⌌", '⌍': "⌍", + '⌎': 
"⌎", '⌏': "⌏", '⌐': "⌐", '⌒': "⌒", '⌓': "⌓", '⌕': "⌕", + '⌖': "⌖", '⌜': "⌜", '⌝': "⌝", '⌞': "⌞", '⌟': "⌟", '⌢': "⌢", + '⌣': "⌣", '⌭': "⌭", '⌮': "⌮", '⌶': "⌶", '⌽': "⌽", '⌿': "⌿", + '⍼': "⍼", '⎰': "⎰", '⎱': "⎱", '⎴': "⎴", '⎵': "⎵", '⎶': "⎶", + '⏜': "⏜", '⏝': "⏝", '⏞': "⏞", '⏟': "⏟", '⏢': "⏢", + '⏧': "⏧", '␣': "␣", 'Ⓢ': "Ⓢ", '─': "─", '│': "│", '┌': "┌", '┐': "┐", + '└': "└", '┘': "┘", '├': "├", '┤': "┤", '┬': "┬", '┴': "┴", '┼': "┼", + '═': "═", '║': "║", '╒': "╒", '╓': "╓", '╔': "╔", '╕': "╕", '╖': "╖", + '╗': "╗", '╘': "╘", '╙': "╙", '╚': "╚", '╛': "╛", '╜': "╜", '╝': "╝", + '╞': "╞", '╟': "╟", '╠': "╠", '╡': "╡", '╢': "╢", '╣': "╣", '╤': "╤", + '╥': "╥", '╦': "╦", '╧': "╧", '╨': "╨", '╩': "╩", '╪': "╪", '╫': "╫", + '╬': "╬", '▀': "▀", '▄': "▄", '█': "█", '░': "░", '▒': "▒", '▓': "▓", + '□': "□", '▪': "▪", '▫': "▫", '▭': "▭", '▮': "▮", '▱': "▱", + '△': "△", '▴': "▴", '▵': "▵", '▸': "▸", '▹': "▹", '▽': "▽", '▾': "▾", + '▿': "▿", '◂': "◂", '◃': "◃", '◊': "◊", '○': "○", '◬': "◬", '◯': "◯", + '◸': "◸", '◹': "◹", '◺': "◺", '◻': "◻", '◼': "◼", + '★': "★", '☆': "☆", '☎': "☎", '♀': "♀", '♂': "♂", '♠': "♠", '♣': "♣", + '♥': "♥", '♦': "♦", '♪': "♪", '♭': "♭", '♮': "♮", '♯': "♯", '✓': "✓", + '✗': "✗", '✠': "✠", '✶': "✶", '❘': "❘", '❲': "❲", '❳': "❳", + '⟦': "⟦", '⟧': "⟧", '⟨': "⟨", '⟩': "⟩", '⟪': "⟪", '⟫': "⟫", '⟬': "⟬", + '⟭': "⟭", '⟵': "⟵", '⟶': "⟶", '⟷': "⟷", '⟸': "⟸", '⟹': "⟹", '⟺': "⟺", + '⟼': "⟼", '⟿': "⟿", '⤂': "⤂", '⤃': "⤃", '⤄': "⤄", '⤅': "⤅", '⤌': "⤌", + '⤍': "⤍", '⤎': "⤎", '⤏': "⤏", '⤐': "⤐", '⤑': "⤑", '⤒': "⤒", + '⤓': "⤓", '⤖': "⤖", '⤙': "⤙", '⤚': "⤚", '⤛': "⤛", '⤜': "⤜", + '⤝': "⤝", '⤞': "⤞", '⤟': "⤟", '⤠': "⤠", '⤣': "⤣", '⤤': "⤤", + '⤥': "⤥", '⤦': "⤦", '⤧': "⤧", '⤨': "⤨", '⤩': "⤩", '⤪': "⤪", + '⤳': "⤳", '⤵': "⤵", '⤶': "⤶", '⤷': "⤷", '⤸': "⤸", '⤹': "⤹", + '⤼': "⤼", '⤽': "⤽", '⥅': "⥅", '⥈': "⥈", '⥉': "⥉", '⥊': "⥊", + '⥋': "⥋", '⥎': "⥎", '⥏': "⥏", '⥐': "⥐", + '⥑': "⥑", '⥒': "⥒", '⥓': "⥓", '⥔': "⥔", + '⥕': "⥕", '⥖': "⥖", '⥗': "⥗", '⥘': "⥘", + '⥙': "⥙", '⥚': "⥚", 
'⥛': "⥛", '⥜': "⥜", + '⥝': "⥝", '⥞': "⥞", '⥟': "⥟", '⥠': "⥠", + '⥡': "⥡", '⥢': "⥢", '⥣': "⥣", '⥤': "⥤", '⥥': "⥥", '⥦': "⥦", + '⥧': "⥧", '⥨': "⥨", '⥩': "⥩", '⥪': "⥪", '⥫': "⥫", '⥬': "⥬", + '⥭': "⥭", '⥮': "⥮", '⥯': "⥯", '⥰': "⥰", '⥱': "⥱", '⥲': "⥲", + '⥳': "⥳", '⥴': "⥴", '⥵': "⥵", '⥶': "⥶", '⥸': "⥸", '⥹': "⥹", + '⥻': "⥻", '⥼': "⥼", '⥽': "⥽", '⥾': "⥾", '⥿': "⥿", '⦅': "⦅", + '⦆': "⦆", '⦋': "⦋", '⦌': "⦌", '⦍': "⦍", '⦎': "⦎", '⦏': "⦏", + '⦐': "⦐", '⦑': "⦑", '⦒': "⦒", '⦓': "⦓", '⦔': "⦔", '⦕': "⦕", + '⦖': "⦖", '⦚': "⦚", '⦜': "⦜", '⦝': "⦝", '⦤': "⦤", '⦥': "⦥", + '⦦': "⦦", '⦧': "⦧", '⦨': "⦨", '⦩': "⦩", '⦪': "⦪", '⦫': "⦫", + '⦬': "⦬", '⦭': "⦭", '⦮': "⦮", '⦯': "⦯", '⦰': "⦰", '⦱': "⦱", + '⦲': "⦲", '⦳': "⦳", '⦴': "⦴", '⦵': "⦵", '⦶': "⦶", '⦷': "⦷", + '⦹': "⦹", '⦻': "⦻", '⦼': "⦼", '⦾': "⦾", '⦿': "⦿", '⧀': "⧀", '⧁': "⧁", + '⧂': "⧂", '⧃': "⧃", '⧄': "⧄", '⧅': "⧅", '⧉': "⧉", '⧍': "⧍", '⧎': "⧎", + '⧏': "⧏", '⧐': "⧐", '⧚': "∽̱", '⧜': "⧜", '⧝': "⧝", + '⧞': "⧞", '⧣': "⧣", '⧤': "⧤", '⧥': "⧥", '⧫': "⧫", '⧴': "⧴", + '⧶': "⧶", '⨀': "⨀", '⨁': "⨁", '⨂': "⨂", '⨄': "⨄", '⨆': "⨆", '⨌': "⨌", + '⨍': "⨍", '⨐': "⨐", '⨑': "⨑", '⨒': "⨒", '⨓': "⨓", '⨔': "⨔", + '⨕': "⨕", '⨖': "⨖", '⨗': "⨗", '⨢': "⨢", '⨣': "⨣", '⨤': "⨤", + '⨥': "⨥", '⨦': "⨦", '⨧': "⨧", '⨩': "⨩", '⨪': "⨪", '⨭': "⨭", + '⨮': "⨮", '⨯': "⨯", '⨰': "⨰", '⨱': "⨱", '⨳': "⨳", '⨴': "⨴", + '⨵': "⨵", '⨶': "⨶", '⨷': "⨷", '⨸': "⨸", '⨹': "⨹", '⨺': "⨺", + '⨻': "⨻", '⨼': "⨼", '⨿': "⨿", '⩀': "⩀", '⩂': "⩂", '⩃': "⩃", '⩄': "⩄", + '⩅': "⩅", '⩆': "⩆", '⩇': "⩇", '⩈': "⩈", '⩉': "⩉", '⩊': "⩊", + '⩋': "⩋", '⩌': "⩌", '⩍': "⩍", '⩐': "⩐", '⩓': "⩓", '⩔': "⩔", '⩕': "⩕", + '⩖': "⩖", '⩗': "⩗", '⩘': "⩘", '⩚': "⩚", '⩛': "⩛", '⩜': "⩜", '⩝': "⩝", + '⩟': "⩟", '⩦': "⩦", '⩪': "⩪", '⩭': "⩭", '⩮': "⩮", '⩯': "⩯", '⩰': "⩰", + '⩱': "⩱", '⩲': "⩲", '⩳': "⩳", '⩴': "⩴", '⩵': "⩵", '⩷': "⩷", '⩸': "⩸", + '⩹': "⩹", '⩺': "⩺", '⩻': "⩻", '⩼': "⩼", '⩽': "⩽", '⩾': "⩾", '⩿': "⩿", + '⪀': "⪀", '⪁': "⪁", '⪂': "⪂", '⪃': "⪃", '⪄': "⪄", '⪅': "⪅", + '⪆': "⪆", '⪇': "⪇", '⪈': "⪈", '⪉': "⪉", '⪊': "⪊", '⪋': 
"⪋", '⪌': "⪌", '⪍': "⪍", + '⪎': "⪎", '⪏': "⪏", '⪐': "⪐", '⪑': "⪑", '⪒': "⪒", '⪓': "⪓", '⪔': "⪔", + '⪕': "⪕", '⪖': "⪖", '⪗': "⪗", '⪘': "⪘", '⪙': "⪙", '⪚': "⪚", '⪝': "⪝", + '⪞': "⪞", '⪟': "⪟", '⪠': "⪠", '⪡': "⪡", '⪢': "⪢", '⪤': "⪤", + '⪥': "⪥", '⪦': "⪦", '⪧': "⪧", '⪨': "⪨", '⪩': "⪩", '⪪': "⪪", '⪫': "⪫", + '⪬': "⪬", '⪭': "⪭", '⪮': "⪮", '⪯': "⪯", '⪰': "⪰", '⪳': "⪳", '⪴': "⪴", + '⪵': "⪵", '⪶': "⪶", '⪷': "⪷", '⪸': "⪸", '⪹': "⪹", '⪺': "⪺", '⪻': "⪻", + '⪼': "⪼", '⪽': "⪽", '⪾': "⪾", '⪿': "⪿", '⫀': "⫀", '⫁': "⫁", + '⫂': "⫂", '⫃': "⫃", '⫄': "⫄", '⫅': "⫅", '⫆': "⫆", '⫇': "⫇", + '⫈': "⫈", '⫋': "⫋", '⫌': "⫌", '⫏': "⫏", '⫐': "⫐", '⫑': "⫑", '⫒': "⫒", + '⫓': "⫓", '⫔': "⫔", '⫕': "⫕", '⫖': "⫖", '⫗': "⫗", '⫘': "⫘", + '⫙': "⫙", '⫚': "⫚", '⫛': "⫛", '⫤': "⫤", '⫦': "⫦", '⫧': "⫧", '⫨': "⫨", + '⫩': "⫩", '⫫': "⫫", '⫬': "⫬", '⫭': "⫭", '⫮': "⫮", '⫯': "⫯", '⫰': "⫰", + '⫱': "⫱", '⫲': "⫲", '⫳': "⫳", '⫽': "⫽", 'ff': "ff", 'fi': "fi", 'fl': "fl", + 'ffi': "ffi", 'ffl': "ffl", '𝒜': "𝒜", '𝒞': "𝒞", '𝒟': "𝒟", '𝒢': "𝒢", '𝒥': "𝒥", + '𝒦': "𝒦", '𝒩': "𝒩", '𝒪': "𝒪", '𝒫': "𝒫", '𝒬': "𝒬", '𝒮': "𝒮", '𝒯': "𝒯", + '𝒰': "𝒰", '𝒱': "𝒱", '𝒲': "𝒲", '𝒳': "𝒳", '𝒴': "𝒴", '𝒵': "𝒵", '𝒶': "𝒶", + '𝒷': "𝒷", '𝒸': "𝒸", '𝒹': "𝒹", '𝒻': "𝒻", '𝒽': "𝒽", '𝒾': "𝒾", '𝒿': "𝒿", + '𝓀': "𝓀", '𝓁': "𝓁", '𝓂': "𝓂", '𝓃': "𝓃", '𝓅': "𝓅", '𝓆': "𝓆", '𝓇': "𝓇", + '𝓈': "𝓈", '𝓉': "𝓉", '𝓊': "𝓊", '𝓋': "𝓋", '𝓌': "𝓌", '𝓍': "𝓍", '𝓎': "𝓎", + '𝓏': "𝓏", '𝔄': "𝔄", '𝔅': "𝔅", '𝔇': "𝔇", '𝔈': "𝔈", '𝔉': "𝔉", '𝔊': "𝔊", '𝔍': "𝔍", + '𝔎': "𝔎", '𝔏': "𝔏", '𝔐': "𝔐", '𝔑': "𝔑", '𝔒': "𝔒", '𝔓': "𝔓", '𝔔': "𝔔", '𝔖': "𝔖", + '𝔗': "𝔗", '𝔘': "𝔘", '𝔙': "𝔙", '𝔚': "𝔚", '𝔛': "𝔛", '𝔜': "𝔜", '𝔞': "𝔞", '𝔟': "𝔟", + '𝔠': "𝔠", '𝔡': "𝔡", '𝔢': "𝔢", '𝔣': "𝔣", '𝔤': "𝔤", '𝔥': "𝔥", '𝔦': "𝔦", '𝔧': "𝔧", + '𝔨': "𝔨", '𝔩': "𝔩", '𝔪': "𝔪", '𝔫': "𝔫", '𝔬': "𝔬", '𝔭': "𝔭", '𝔮': "𝔮", '𝔯': "𝔯", + '𝔰': "𝔰", '𝔱': "𝔱", '𝔲': "𝔲", '𝔳': "𝔳", '𝔴': "𝔴", '𝔵': "𝔵", '𝔶': "𝔶", '𝔷': "𝔷", + '𝔸': "𝔸", '𝔹': "𝔹", '𝔻': "𝔻", '𝔼': "𝔼", '𝔽': "𝔽", '𝔾': "𝔾", '𝕀': "𝕀", + '𝕁': "𝕁", '𝕂': "𝕂", '𝕃': "𝕃", '𝕄': "𝕄", '𝕆': "𝕆", '𝕊': "𝕊", '𝕋': 
"𝕋", + '𝕌': "𝕌", '𝕍': "𝕍", '𝕎': "𝕎", '𝕏': "𝕏", '𝕐': "𝕐", '𝕒': "𝕒", '𝕓': "𝕓", + '𝕔': "𝕔", '𝕕': "𝕕", '𝕖': "𝕖", '𝕗': "𝕗", '𝕘': "𝕘", '𝕙': "𝕙", '𝕚': "𝕚", + '𝕛': "𝕛", '𝕜': "𝕜", '𝕝': "𝕝", '𝕞': "𝕞", '𝕟': "𝕟", '𝕠': "𝕠", '𝕡': "𝕡", + '𝕢': "𝕢", '𝕣': "𝕣", '𝕤': "𝕤", '𝕥': "𝕥", '𝕦': "𝕦", '𝕧': "𝕧", '𝕨': "𝕨", + '𝕩': "𝕩", '𝕪': "𝕪", '𝕫': "𝕫", +} +DECMAP = {v: k for k, v in ENCMAP.items()} + + +class HtmlEntityDecodeError(ValueError): + pass + + +def htmlentity_encode(text, errors="strict"): + s = "" + for c in text: + try: + s += ENCMAP[c] + except KeyError: + i = ord(c) + s += "&" + hex(i)[2:].zfill(0) + ";" if i > 0xff else c + return s, len(text) + + +def htmlentity_decode(text, errors="strict"): + s = "" + i = 0 + while i < len(text): + m = re.match(r"&(?:(?:[A-Za-z][A-Za-z0-9]{1,6}){1,4}|[0-9]{4});", text[i:i+30]) + if m: + entity = m.group() + c = chr(int(entity[1:5], 16)) if entity[1:5].isdigit() and len(entity) == 6 else \ + " " if entity == " " else None + if c: + s += c + else: + try: + s += DECMAP[entity] + except KeyError: + s += handle_error("html-entity", errors, HtmlEntityDecodeError, decode=True)(text[i], i) + i += len(entity) + else: + s += text[i] + i += 1 + return s, len(text) + + +add("html", htmlentity_encode, htmlentity_decode, r"^html(?:[-_]?entit(?:y|ies))?$", + extra_exceptions=["HtmlEntityDecodeError"]) + diff --git a/codext/web/url.py b/src/codext/web/url.py old mode 100755 new mode 100644 similarity index 96% rename from codext/web/url.py rename to src/codext/web/url.py index 24035a2..3abff09 --- a/codext/web/url.py +++ b/src/codext/web/url.py @@ -1,29 +1,29 @@ -# -*- coding: UTF-8 -*- -"""URL Codec - urlencode content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(url|urlencode)': {'?=this/is-a_test/../': "%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F"}, - 'dec(url|urlencode)': {'test/test%2etxt': "test/test.txt", 'test%2ftest.txt': "test/test.txt"} -} - - -SAFE = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-" -ENCMAP = {} -for i in range(256): - c = chr(i) - if c not in SAFE: - ENCMAP[c] = "%{:02X}".format(i) - - -add_map("url", ENCMAP, ignore_case="decode", no_error=True, pattern=r"^url(?:encode)?$", printables_rate=1., - expansion_factor=(1.2, .2)) - +# -*- coding: UTF-8 -*- +"""URL Codec - urlencode content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(url|urlencode)': {'?=this/is-a_test/../': "%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F"}, + 'dec(url|urlencode)': {'test/test%2etxt': "test/test.txt", 'test%2ftest.txt': "test/test.txt"} +} + + +SAFE = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-" +ENCMAP = {} +for i in range(256): + c = chr(i) + if c not in SAFE: + ENCMAP[c] = "%{:02X}".format(i) + + +add_map("url", ENCMAP, ignore_case="decode", no_error=True, pattern=r"^url(?:encode)?$", printables_rate=1., + expansion_factor=(1.2, .2)) + diff --git a/tests/test_base.py b/tests/test_base.py index 7b3dae0..a37d1a6 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,236 +1,235 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Base codecs tests. 
- -""" -import os -import sys -from unittest import TestCase - -from codext.__common__ import * -from codext.base._base import _generate_charset -from codext.base.baseN import base, main2, main32, main64url - - -class TestCodecsBase(TestCase): - def setUp(self): - global STR - STR = "this is a test" - - def test_new_base_codec(self): - for i in [0, 1, 256]: - self.assertRaises(ValueError, _generate_charset, i) - b10 = lambda *a: "0123456789" - base(b10, "base10") - B10 = "2361031878030638688519054699098996" - self.assertEqual(codecs.encode(STR, "base10"), B10) - self.assertEqual(codecs.encode(b(STR), "base10"), b(B10)) - self.assertEqual(codecs.decode(B10, "base10"), STR) - self.assertEqual(codecs.decode(b(B10), "base10"), b(STR)) - self.assertRaises(ValueError, base, 1, "test") - b11 = "0123456789a" - base(b11, "base11") - B11 = "113342054335735319526632a26972419" - self.assertEqual(codecs.encode(STR, "base11"), B11) - self.assertEqual(codecs.decode(B11, "base11"), STR) - self.assertRaises(ValueError, base, object(), "test") - self.assertIsNone(base({'': "01234"}, r"^base5(test)?$")) - self.assertIsNotNone(codecs.encode(STR, "base5test")) - self.assertRaises(ValueError, base, {'': "01234"}, "base5-test", pow2=True) - self.assertEqual("", codecs.decode("", "base5test")) - - def test_codec_base1(self): - C = "A" - for i in range(3): - self.assertIsNotNone(codecs.encode(i * C, "base1")) - self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") - self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05") - - def test_codec_base2(self): - STR = "test" - B2 = "01110100011001010111001101110100" - self.assertEqual(codecs.encode(STR, "base2"), B2) - self.assertEqual(codecs.encode(b(STR), "base2"), b(B2)) - self.assertEqual(codecs.decode(B2, "base2"), STR) - self.assertEqual(codecs.decode(b(B2), "base2"), b(STR)) - B2 = "10001011100110101000110010001011" - self.assertEqual(codecs.encode(STR, "base2-inv"), B2) - self.assertEqual(codecs.decode(B2, "base2-inv"), STR) - B2 
= "abbbabaaabbaabababbbaabbabbbabaa" - self.assertEqual(codecs.encode(STR, "base2-ab"), B2) - self.assertEqual(codecs.decode(B2, "base2-ab"), STR) - B2 = "CDDDCDCCCDDCCDCDCDDDCCDDCDDDCDCC" - self.assertEqual(codecs.encode(STR, "base2-CD"), B2) - self.assertEqual(codecs.decode(B2, "base2-CD"), STR) - B2 = "34443433344334343444334434443433" - self.assertEqual(codecs.encode(STR, "base2-34"), B2) - self.assertEqual(codecs.decode(B2, "base2-34"), STR) - - def test_codec_base3(self): - STR = "test" - B3 = "23112113223321323322" - self.assertEqual(codecs.encode(STR, "base3"), B3) - self.assertEqual(codecs.encode(b(STR), "base3"), b(B3)) - self.assertEqual(codecs.decode(B3, "base3"), STR) - self.assertEqual(codecs.decode(b(B3), "base3"), b(STR)) - B3 = "21332331221123121122" - self.assertEqual(codecs.encode(STR, "base3-inv"), B3) - self.assertEqual(codecs.decode(B3, "base3-inv"), STR) - B3 = "bcaabaacbbccbacbccbb" - self.assertEqual(codecs.encode(STR, "base3-abc"), B3) - self.assertEqual(codecs.decode(B3, "base3-abc"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base3-ab") - self.assertRaises(LookupError, codecs.encode, "test", "base3-abcd") - - def test_codec_base4(self): - STR = "test" - B4 = "2421232224142421" - self.assertEqual(codecs.encode(STR, "base4"), B4) - self.assertEqual(codecs.encode(b(STR), "base4"), b(B4)) - self.assertEqual(codecs.decode(B4, "base4"), STR) - self.assertEqual(codecs.decode(b(B4), "base4"), b(STR)) - B4 = "3134323331413134" - self.assertEqual(codecs.encode(STR, "base4-inv"), B4) - self.assertEqual(codecs.decode(B4, "base4-inv"), STR) - B4 = "bdbabcbbbdadbdba" - self.assertEqual(codecs.encode(STR, "base4-abcd"), B4) - self.assertEqual(codecs.decode(B4, "base4-abcd"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base4-abc") - self.assertRaises(LookupError, codecs.encode, "test", "base4-abcde") - - def test_codec_base8(self): - STR = "test" - B8 = "dfagcfgddfa=====" - self.assertEqual(codecs.encode(STR, 
"base8"), B8) - self.assertEqual(codecs.encode(b(STR), "base8"), b(B8)) - self.assertEqual(codecs.decode(B8, "base8"), STR) - self.assertEqual(codecs.decode(b(B8), "base8"), b(STR)) - B8 = "echbfcbeech=====" - self.assertEqual(codecs.encode(STR, "base8-inv"), B8) - self.assertEqual(codecs.decode(B8, "base8-inv"), STR) - B8 = "35062563350=====" - self.assertEqual(codecs.encode(STR, "base8-01234567"), B8) - self.assertEqual(codecs.decode(B8, "base8-01234567"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base8-0123456") - self.assertRaises(LookupError, codecs.encode, "test", "base8-012345678") - - def test_codec_base16(self): - B16 = "7468697320697320612074657374" - self.assertEqual(codecs.encode(STR, "base16"), B16) - self.assertEqual(codecs.encode(b(STR), "base16"), b(B16)) - self.assertEqual(codecs.decode(B16, "base16"), STR) - self.assertEqual(codecs.decode(b(B16), "base16"), b(STR)) - B16 += "?" - self.assertRaises(ValueError, codecs.decode, B16, "base16") - self.assertEqual(codecs.decode(B16, "base16", "ignore"), STR) - self.assertEqual(codecs.decode(B16, "base16", "replace"), STR + "\x00") - self.assertRaises(ValueError, codecs.decode, B16, "base16", "BAD") - STR2 = "=:;" - B16_1 = "3d3a3b" - B16_2 = "3D3A3B" - B16_3 = "3D3a3B" # mixed case: should fail - self.assertEqual(codecs.encode(STR2, "hex"), B16_2) - self.assertEqual(codecs.decode(B16_1, "hex"), STR2) - self.assertEqual(codecs.decode(B16_2, "hex"), STR2) - self.assertRaises(ValueError, codecs.decode, B16_3, "hex") - - def test_codec_base32(self): - for b32, enc in zip(["ORUGS4ZANFZSAYJAORSXG5A=", "qtwg1h3ypf31yajyqt1zg7y=", "EHK6ISP0D5PI0O90EHIN6T0=", - "fjn6kwt0e5tk0s90fjkr6x0=", "EHM6JWS0D5SJ0R90EHJQ6X0="], - ["base32", "zbase32", "base32-hex", "geohash", "crockford"]): - self.assertEqual(codecs.encode(STR, enc), b32) - self.assertEqual(codecs.encode(b(STR), enc), b(b32)) - self.assertEqual(codecs.decode(b32, enc), STR) - self.assertEqual(codecs.decode(b(b32), enc), b(STR)) - 
self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc) - self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc, "BAD") - - def test_codec_base36(self): - B36 = "4WMHTK6UZL044O91NKCEB8" - self.assertEqual(codecs.encode(STR, "base36"), B36) - self.assertEqual(codecs.encode(b(STR), "base36"), b(B36)) - self.assertEqual(codecs.decode(B36, "base36"), STR) - self.assertEqual(codecs.decode(b(B36), "base36"), b(STR)) - B36 = "E6WR3UG49VAEEYJBXUMOLI" - self.assertEqual(codecs.encode(STR, "base36-inv"), B36) - self.assertEqual(codecs.decode(B36, "base36-inv"), STR) - self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36-inv") - self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36", "BAD") - self.assertEqual(codecs.decode(B36 + "?", "base36-inv", "ignore"), STR) - - def test_codec_base58(self): - B58 = "jo91waLQA1NNeBmZKUF" - self.assertEqual(codecs.encode(STR, "base58"), B58) - self.assertEqual(codecs.encode(b(STR), "base58"), b(B58)) - self.assertEqual(codecs.decode(B58, "base58"), STR) - self.assertEqual(codecs.decode(b(B58), "base58"), b(STR)) - B58 = "jo9rA2LQwr44eBmZK7E" - self.assertEqual(codecs.encode(STR, "base58-ripple"), B58) - self.assertEqual(codecs.decode(B58, "base58-rp"), STR) - B58 = "JN91Wzkpa1nnDbLyjtf" - self.assertEqual(codecs.encode(STR, "base58-flickr"), B58) - self.assertEqual(codecs.encode(STR, "base58-shorturl"), B58) - self.assertEqual(codecs.decode(B58, "base58-fl"), STR) - self.assertEqual(codecs.encode(STR, "base58-short-url"), B58) - self.assertEqual(codecs.encode(STR, "base58-url"), B58) - - def test_codec_base62(self): - for b62, enc in zip(["CsoB4HQ5gmgMyCenF7E", "M2yLERaFqwqW8MoxPHO"], ["base62", "base62-inv"]): - self.assertEqual(codecs.encode(STR, enc), b62) - self.assertEqual(codecs.encode(b(STR), enc), b(b62)) - self.assertEqual(codecs.decode(b62, enc), STR) - self.assertEqual(codecs.decode(b(b62), enc), b(STR)) - - def test_codec_base64(self): - for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", 
"T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): - self.assertEqual(codecs.encode(STR, enc), b64) - self.assertEqual(codecs.encode(b(STR), enc), b(b64)) - self.assertEqual(codecs.decode(b64, enc), STR) - self.assertEqual(codecs.decode(b(b64), enc), b(STR)) - - def test_codec_base91(self): - for b91, enc in zip([",X,<:WRT%yxth90oZB", ",N,<:MHJ%onjXzqeP1", "Jx&[jv4S3Wg>,71@Jk", "yJy^\\IDFsdc?Tof:L#"], - ["base91", "base91-inv", "base91-alt", "base91-alt-inv"]): - self.assertEqual(codecs.encode(STR, enc), b91) - self.assertEqual(codecs.encode(b(STR), enc), b(b91)) - self.assertEqual(codecs.decode(b91, enc), STR) - self.assertEqual(codecs.decode(b(b91), enc), b(STR)) - self.assertIsNotNone(codecs.encode("\x00\x00", "base91")) - self.assertIsNotNone(codecs.decode("abc", "base91")) - self.assertIsNotNone(codecs.decode("AD", "base91")) - self.assertRaises(ValueError, codecs.decode, "\xff", "base91") - self.assertRaises(ValueError, codecs.decode, "a\xff", "base91") - self.assertIsNotNone(codecs.encode("\x00\x00", "base91-alt")) - - def test_codec_base100(self): - if PY3: - B100 = "\U0001f46b\U0001f45f\U0001f460\U0001f46a\U0001f417\U0001f460\U0001f46a\U0001f417\U0001f458" \ - "\U0001f417\U0001f46b\U0001f45c\U0001f46a\U0001f46b" - self.assertEqual(codecs.encode(STR, "base100"), B100) - self.assertEqual(codecs.encode(b(STR), "base100"), b(B100)) - self.assertEqual(codecs.decode(B100, "base100"), STR) - self.assertEqual(codecs.decode(b(B100), "base100"), b(STR)) - self.assertRaises(ValueError, codecs.decode, b(B100)[1:], "base100") - - def test_codec_base_generic(self): - for n in range(2, 255): - bn = "base{}_generic".format(n) - self.assertEqual(codecs.decode(codecs.encode(STR, bn), bn), STR) - self.assertRaises(LookupError, codecs.decode, "test", "base0-generic") - self.assertRaises(LookupError, codecs.decode, "test", "base1-generic") - self.assertRaises(LookupError, codecs.decode, "test", "base256-generic") - - def test_base_main(self): - tmp = sys.argv[:] - tfile = 
"test-base-main.txt" - with open(tfile, 'w') as f: - f.write("This is a long test string for the sake of causing line wrapping based on default parameters.") - for swap_arg in [[], ["-s"]]: - sys.argv = [tmp[0], tfile] + swap_arg - for m in main32, main64url: - self.assertEqual(m(), 0) - sys.argv = [tmp[0], tfile, "-d"] + swap_arg - self.assertEqual(main2(), 1) - os.remove(tfile) - sys.argv[:] = tmp - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Base codecs tests. + +""" +import sys +from unittest import TestCase + +from codext.__common__ import * +from codext.base._base import _generate_charset +from codext.base.baseN import base, main2, main32, main64url + + +class TestCodecsBase(TestCase): + def setUp(self): + global STR + STR = "this is a test" + + def test_new_base_codec(self): + for i in [0, 1, 256]: + self.assertRaises(ValueError, _generate_charset, i) + b10 = lambda *a: "0123456789" + base(b10, "base10") + B10 = "2361031878030638688519054699098996" + self.assertEqual(codecs.encode(STR, "base10"), B10) + self.assertEqual(codecs.encode(b(STR), "base10"), b(B10)) + self.assertEqual(codecs.decode(B10, "base10"), STR) + self.assertEqual(codecs.decode(b(B10), "base10"), b(STR)) + self.assertRaises(ValueError, base, 1, "test") + b11 = "0123456789a" + base(b11, "base11") + B11 = "113342054335735319526632a26972419" + self.assertEqual(codecs.encode(STR, "base11"), B11) + self.assertEqual(codecs.decode(B11, "base11"), STR) + self.assertRaises(ValueError, base, object(), "test") + self.assertIsNone(base({'': "01234"}, r"^base5(test)?$")) + self.assertIsNotNone(codecs.encode(STR, "base5test")) + self.assertRaises(ValueError, base, {'': "01234"}, "base5-test", pow2=True) + self.assertEqual("", codecs.decode("", "base5test")) + + def test_codec_base1(self): + C = "A" + for i in range(3): + self.assertIsNotNone(codecs.encode(i * C, "base1")) + self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") + self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05") + + 
def test_codec_base2(self): + STR = "test" + B2 = "01110100011001010111001101110100" + self.assertEqual(codecs.encode(STR, "base2"), B2) + self.assertEqual(codecs.encode(b(STR), "base2"), b(B2)) + self.assertEqual(codecs.decode(B2, "base2"), STR) + self.assertEqual(codecs.decode(b(B2), "base2"), b(STR)) + B2 = "10001011100110101000110010001011" + self.assertEqual(codecs.encode(STR, "base2-inv"), B2) + self.assertEqual(codecs.decode(B2, "base2-inv"), STR) + B2 = "abbbabaaabbaabababbbaabbabbbabaa" + self.assertEqual(codecs.encode(STR, "base2-ab"), B2) + self.assertEqual(codecs.decode(B2, "base2-ab"), STR) + B2 = "CDDDCDCCCDDCCDCDCDDDCCDDCDDDCDCC" + self.assertEqual(codecs.encode(STR, "base2-CD"), B2) + self.assertEqual(codecs.decode(B2, "base2-CD"), STR) + B2 = "34443433344334343444334434443433" + self.assertEqual(codecs.encode(STR, "base2-34"), B2) + self.assertEqual(codecs.decode(B2, "base2-34"), STR) + + def test_codec_base3(self): + STR = "test" + B3 = "23112113223321323322" + self.assertEqual(codecs.encode(STR, "base3"), B3) + self.assertEqual(codecs.encode(b(STR), "base3"), b(B3)) + self.assertEqual(codecs.decode(B3, "base3"), STR) + self.assertEqual(codecs.decode(b(B3), "base3"), b(STR)) + B3 = "21332331221123121122" + self.assertEqual(codecs.encode(STR, "base3-inv"), B3) + self.assertEqual(codecs.decode(B3, "base3-inv"), STR) + B3 = "bcaabaacbbccbacbccbb" + self.assertEqual(codecs.encode(STR, "base3-abc"), B3) + self.assertEqual(codecs.decode(B3, "base3-abc"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base3-ab") + self.assertRaises(LookupError, codecs.encode, "test", "base3-abcd") + + def test_codec_base4(self): + STR = "test" + B4 = "2421232224142421" + self.assertEqual(codecs.encode(STR, "base4"), B4) + self.assertEqual(codecs.encode(b(STR), "base4"), b(B4)) + self.assertEqual(codecs.decode(B4, "base4"), STR) + self.assertEqual(codecs.decode(b(B4), "base4"), b(STR)) + B4 = "3134323331413134" + self.assertEqual(codecs.encode(STR, 
"base4-inv"), B4) + self.assertEqual(codecs.decode(B4, "base4-inv"), STR) + B4 = "bdbabcbbbdadbdba" + self.assertEqual(codecs.encode(STR, "base4-abcd"), B4) + self.assertEqual(codecs.decode(B4, "base4-abcd"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base4-abc") + self.assertRaises(LookupError, codecs.encode, "test", "base4-abcde") + + def test_codec_base8(self): + STR = "test" + B8 = "dfagcfgddfa=====" + self.assertEqual(codecs.encode(STR, "base8"), B8) + self.assertEqual(codecs.encode(b(STR), "base8"), b(B8)) + self.assertEqual(codecs.decode(B8, "base8"), STR) + self.assertEqual(codecs.decode(b(B8), "base8"), b(STR)) + B8 = "echbfcbeech=====" + self.assertEqual(codecs.encode(STR, "base8-inv"), B8) + self.assertEqual(codecs.decode(B8, "base8-inv"), STR) + B8 = "35062563350=====" + self.assertEqual(codecs.encode(STR, "base8-01234567"), B8) + self.assertEqual(codecs.decode(B8, "base8-01234567"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base8-0123456") + self.assertRaises(LookupError, codecs.encode, "test", "base8-012345678") + + def test_codec_base16(self): + B16 = "7468697320697320612074657374" + self.assertEqual(codecs.encode(STR, "base16"), B16) + self.assertEqual(codecs.encode(b(STR), "base16"), b(B16)) + self.assertEqual(codecs.decode(B16, "base16"), STR) + self.assertEqual(codecs.decode(b(B16), "base16"), b(STR)) + B16 += "?" 
+ self.assertRaises(ValueError, codecs.decode, B16, "base16") + self.assertEqual(codecs.decode(B16, "base16", "ignore"), STR) + self.assertEqual(codecs.decode(B16, "base16", "replace"), STR + "\x00") + self.assertRaises(ValueError, codecs.decode, B16, "base16", "BAD") + STR2 = "=:;" + B16_1 = "3d3a3b" + B16_2 = "3D3A3B" + B16_3 = "3D3a3B" # mixed case: should fail + self.assertEqual(codecs.encode(STR2, "hex"), B16_2) + self.assertEqual(codecs.decode(B16_1, "hex"), STR2) + self.assertEqual(codecs.decode(B16_2, "hex"), STR2) + self.assertRaises(ValueError, codecs.decode, B16_3, "hex") + + def test_codec_base32(self): + for b32, enc in zip(["ORUGS4ZANFZSAYJAORSXG5A=", "qtwg1h3ypf31yajyqt1zg7y=", "EHK6ISP0D5PI0O90EHIN6T0=", + "fjn6kwt0e5tk0s90fjkr6x0=", "EHM6JWS0D5SJ0R90EHJQ6X0="], + ["base32", "zbase32", "base32-hex", "geohash", "crockford"]): + self.assertEqual(codecs.encode(STR, enc), b32) + self.assertEqual(codecs.encode(b(STR), enc), b(b32)) + self.assertEqual(codecs.decode(b32, enc), STR) + self.assertEqual(codecs.decode(b(b32), enc), b(STR)) + self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc) + self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc, "BAD") + + def test_codec_base36(self): + B36 = "4WMHTK6UZL044O91NKCEB8" + self.assertEqual(codecs.encode(STR, "base36"), B36) + self.assertEqual(codecs.encode(b(STR), "base36"), b(B36)) + self.assertEqual(codecs.decode(B36, "base36"), STR) + self.assertEqual(codecs.decode(b(B36), "base36"), b(STR)) + B36 = "E6WR3UG49VAEEYJBXUMOLI" + self.assertEqual(codecs.encode(STR, "base36-inv"), B36) + self.assertEqual(codecs.decode(B36, "base36-inv"), STR) + self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36-inv") + self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36", "BAD") + self.assertEqual(codecs.decode(B36 + "?", "base36-inv", "ignore"), STR) + + def test_codec_base58(self): + B58 = "jo91waLQA1NNeBmZKUF" + self.assertEqual(codecs.encode(STR, "base58"), B58) + 
self.assertEqual(codecs.encode(b(STR), "base58"), b(B58)) + self.assertEqual(codecs.decode(B58, "base58"), STR) + self.assertEqual(codecs.decode(b(B58), "base58"), b(STR)) + B58 = "jo9rA2LQwr44eBmZK7E" + self.assertEqual(codecs.encode(STR, "base58-ripple"), B58) + self.assertEqual(codecs.decode(B58, "base58-rp"), STR) + B58 = "JN91Wzkpa1nnDbLyjtf" + self.assertEqual(codecs.encode(STR, "base58-flickr"), B58) + self.assertEqual(codecs.encode(STR, "base58-shorturl"), B58) + self.assertEqual(codecs.decode(B58, "base58-fl"), STR) + self.assertEqual(codecs.encode(STR, "base58-short-url"), B58) + self.assertEqual(codecs.encode(STR, "base58-url"), B58) + + def test_codec_base62(self): + for b62, enc in zip(["CsoB4HQ5gmgMyCenF7E", "M2yLERaFqwqW8MoxPHO"], ["base62", "base62-inv"]): + self.assertEqual(codecs.encode(STR, enc), b62) + self.assertEqual(codecs.encode(b(STR), enc), b(b62)) + self.assertEqual(codecs.decode(b62, enc), STR) + self.assertEqual(codecs.decode(b(b62), enc), b(STR)) + + def test_codec_base64(self): + for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): + self.assertEqual(codecs.encode(STR, enc), b64) + self.assertEqual(codecs.encode(b(STR), enc), b(b64)) + self.assertEqual(codecs.decode(b64, enc), STR) + self.assertEqual(codecs.decode(b(b64), enc), b(STR)) + + def test_codec_base91(self): + for b91, enc in zip([",X,<:WRT%yxth90oZB", ",N,<:MHJ%onjXzqeP1", "Jx&[jv4S3Wg>,71@Jk", "yJy^\\IDFsdc?Tof:L#"], + ["base91", "base91-inv", "base91-alt", "base91-alt-inv"]): + self.assertEqual(codecs.encode(STR, enc), b91) + self.assertEqual(codecs.encode(b(STR), enc), b(b91)) + self.assertEqual(codecs.decode(b91, enc), STR) + self.assertEqual(codecs.decode(b(b91), enc), b(STR)) + self.assertIsNotNone(codecs.encode("\x00\x00", "base91")) + self.assertIsNotNone(codecs.decode("abc", "base91")) + self.assertIsNotNone(codecs.decode("AD", "base91")) + self.assertRaises(ValueError, codecs.decode, "\xff", "base91") + 
self.assertRaises(ValueError, codecs.decode, "a\xff", "base91") + self.assertIsNotNone(codecs.encode("\x00\x00", "base91-alt")) + + def test_codec_base100(self): + B100 = "\U0001f46b\U0001f45f\U0001f460\U0001f46a\U0001f417\U0001f460\U0001f46a\U0001f417\U0001f458\U0001f417" \ + "\U0001f46b\U0001f45c\U0001f46a\U0001f46b" + self.assertEqual(codecs.encode(STR, "base100"), B100) + self.assertEqual(codecs.encode(b(STR), "base100"), b(B100)) + self.assertEqual(codecs.decode(B100, "base100"), STR) + self.assertEqual(codecs.decode(b(B100), "base100"), b(STR)) + self.assertRaises(ValueError, codecs.decode, b(B100)[1:], "base100") + self.assertIsNotNone(codecs.decode(b(B100) + b"\n", "base100", "ignore")) + + def test_codec_base_generic(self): + for n in range(2, 255): + bn = "base{}_generic".format(n) + self.assertEqual(codecs.decode(codecs.encode(STR, bn), bn), STR) + self.assertRaises(LookupError, codecs.decode, "test", "base0-generic") + self.assertRaises(LookupError, codecs.decode, "test", "base1-generic") + self.assertRaises(LookupError, codecs.decode, "test", "base256-generic") + + def test_base_main(self): + tmp = sys.argv[:] + tfile = "test-base-main.txt" + with open(tfile, 'w') as f: + f.write("This is a long test string for the sake of causing line wrapping based on default parameters.") + for swap_arg in [[], ["-s"]]: + sys.argv = [tmp[0], tfile] + swap_arg + for m in main32, main64url: + self.assertEqual(m(), 0) + sys.argv = [tmp[0], tfile, "-d"] + swap_arg + self.assertEqual(main2(), 1) + os.remove(tfile) + sys.argv[:] = tmp + diff --git a/tests/test_common.py b/tests/test_common.py index 8bbf410..407997c 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,256 +1,237 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Codecs added assets' tests. 
- -""" -import codecs -import codext -import json -import random -import sys -from codext.__common__ import CODECS_OVERWRITTEN, PERS_MACROS, PERS_MACROS_FILE -from six import b, binary_type, text_type -from unittest import TestCase - - -PY3 = sys.version[0] == "3" - - -def dummy_encode(input, errors="strict"): - return input, len(input) - - -def dummy_decode(input, errors="strict"): - return input, len(input) - - -def dummy_errored_decode(useless): - raise AttributeError - def decode(input, errors="strict"): - return input, len(input) - return decode - - -def ensure_str(s, encoding='utf-8', errors='strict'): - """ Similar to six.ensure_str. Adapted here to avoid messing up with six version errors. """ - if not PY3 and isinstance(s, text_type): - return s.encode(encoding, errors) - elif PY3 and isinstance(s, binary_type): - try: - return s.decode(encoding, errors) - except: - return s.decode("latin-1") - return s - - -def getregentry(encoding): - if encoding == "dummy3": - return codecs.CodecInfo(name="dummy3", encode=dummy_encode, decode=dummy_decode) - - -class TestCommon(TestCase): - def setUp(self): - codext.reset() - - def test_add_codec(self): - self.assertRaises(ValueError, codext.add, "test") - self.assertRaises(ValueError, codext.add, "test", "BAD") - self.assertRaises(ValueError, codext.add, "test", lambda: None, "BAD") - self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) - self.assertEqual(codext.encode("test", "dummy"), "test") - ci = codext.lookup("dummy") - for k in ["add_to_codecs", "category", "examples", "name", "pattern", "text"]: - self.assertIn(k, ci.parameters.keys()) - self.assertIsNotNone(codext.add("dummy_errored", None, dummy_errored_decode, r"dummy_errored(\d+)$")) - self.assertRaises(AttributeError, codext.lookup, "dummy_errored1") - - def test_add_map_codec(self): - ENCMAP = [{'a': "A", 'b': "B", 'c': "C"}, {'d': "D", 'e': "E", 'f': "F"}, {'g': "G", 'h': "H", 'i': "I"}] - self.assertIsNotNone(codext.add_map("dummy2", 
ENCMAP, pattern=r"^dummy2(?:[-_]?(\d))?$")) - self.assertRaises(ValueError, codext.add_map, "dummy2", "BAD_ENCMAP") - self.assertEqual(codext.encode("abc", "dummy2"), "ABC") - self.assertEqual(codext.encode("abc", "dummy2-1"), "ABC") - self.assertEqual(codext.encode("def", "dummy2-2"), "DEF") - self.assertEqual(codext.encode("ghi", "dummy2-3"), "GHI") - self.assertRaises(LookupError, codext.encode, "test", "dummy2-4") - ENCMAP = {'': {'a': "A", 'b': "B"}, r'bad': {'a': "B", 'b': "A"}} - self.assertIsNotNone(codext.add_map("dummy3", ENCMAP, pattern=r"^dummy3([-_]inverted)?$")) - self.assertRaises(LookupError, codext.encode, "test", "dummy3_inverted") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, ignore_case="BAD") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, intype="BAD") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, outype="BAD") - ci = codext.lookup("dummy2") - for k in ["category", "encmap", "ignore_case", "intype", "no_error", "outype", "repl_char", "sep", "text"]: - self.assertIn(k, ci.parameters.keys()) - - def test_list_codecs(self): - self.assertTrue(len(codext.list()) > 0) - self.assertTrue(len(codext.list("other")) > 0) - self.assertTrue(len(codext.list("native")) > 0) - self.assertTrue(len(codext.list("non-native")) > 0) - self.assertTrue(len(codext.list("native", "non-native", "crypto", "base")) > 0) - self.assertTrue(len(codext.list("native", "language", "crypto")) > 0) - self.assertTrue(len(codext.list("~crypto")) > 0) - self.assertEqual(set(codext.list("~native")), set(codext.list("non-native"))) - self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native"))) - self.assertRaises(ValueError, codext.list, "BAD_CATEGORY") - self.assertTrue(codext.is_native("base64_codec")) - self.assertFalse(codext.is_native("base64")) - - def test_remove_codec(self): - self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) - self.assertEqual(codext.encode("test", 
"dummy"), "test") - self.assertIsNone(codext.remove("dummy")) - self.assertRaises(LookupError, codext.encode, "test", "dummy") - # special case, when adding a new codec also to the native codecs registry, then it won't be possible to remove - # it afterwards - self.assertIsNotNone(codecs.add("dummy2", dummy_encode, dummy_decode)) - self.assertEqual(codecs.encode("test", "dummy2"), "test") - self.assertIsNone(codecs.remove("dummy2")) - self.assertEqual(codecs.encode("test", "dummy2"), "test") - self.assertIsNone(codecs.register(getregentry)) - self.assertEqual(codecs.encode("test", "dummy3"), "test") - self.assertIsNone(codecs.remove("dummy3")) - self.assertEqual(codecs.encode("test", "dummy3"), "test") - - def test_clear_codecs(self): - self.assertIsNotNone(codecs.encode("test", "morse")) - self.assertIsNone(codecs.clear()) - self.assertRaises(LookupError, codecs.encode, "test", "morse") - - def test_reset_codecs(self): - self.assertIsNone(codext.reset()) - self.assertIsNotNone(codext.encode("test", "morse")) - self.assertRaises(LookupError, codext.encode, "test", "dummy") - self.assertTrue(len(CODECS_OVERWRITTEN) > 0) - self.assertIsNotNone(str(CODECS_OVERWRITTEN[0])) - - def test_search_codecs(self): - self.assertIsNotNone(codext.search("morse")) - self.assertIsNotNone(codext.search("geohash")) - self.assertIsNotNone(codext.examples("morse")) - self.assertIsNotNone(codext.examples("cp")) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[ab]{1,3}"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=ab)cd"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=-)\w+"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"([^\s])\1"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^\\]"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^a]"))) - - def test_encode_multiple_rounds(self): - if PY3: - self.assertRaises(TypeError, codext.encode, b"test", "utf-8[2]") - 
s = "test" - for i in range(3): - s = codext.encode(s, "morse") - self.assertEqual(s, codext.encode("test", "morse[3]")) - self.assertIsNotNone(codext.encode("test", "base64[10]")) - - def test_guess_decode(self): - self.assertIsNone(codext.stopfunc._reload_lang()) - self.assertIsNotNone(codext.stopfunc._validate("flag")) - _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None - codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) - self.assertIn("test-codec", codext.list_encodings("test")) - self.assertEqual(codext.decode("TEST=", "test"), "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, - scoring_heuristic=False).items())[0][1], "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"], - max_depth=2).items())[0][1], "TEST") - STR = "This is a test" - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1))) - self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "a test", found=["base62"]))) - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True, - exclude=["base100"]))) - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"]))) - self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0) - self.assertIn("F1@9", _l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, max_depth=1, stop=False, - show=True))) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", - exclude=("base64", "base64-url"))), 0) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", - scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0) - 
self.assertRaises(ValueError, codext.guess, STR, max_depth=0) - self.assertRaises(ValueError, codext.guess, STR, exclude=42) - for c in ["base", "language", "native", "stegano"]: - e = codext.list(c) - random.shuffle(e) - for ename in e[:10]: - for encoding in codext.lookup(ename).parameters.get('guess', [ename])[:10]: - try: - enc = codext.encode(STR, encoding) - except (NotImplementedError, ValueError): - continue - except TypeError: - enc = codext.encode(b(STR), encoding) - if codext.decode(enc, encoding) == STR: - continue - for found_encodings, found_dec in codext.guess(enc, "a test", 0, 1, [c], - scoring_heuristic=True, debug=True).items(): - self.assertEqual(ensure_str(STR).lower(), ensure_str(found_dec).lower()) - if c != "base": - # do not check for base as the guessed encoding name can be different, e.g.: - # actual: base2 - # guessed: base2-generic - if "-icase" in encoding: - self.assertEqual(encoding.lower(), found_encodings[0].lower()) - else: - self.assertEqual(encoding, found_encodings[0]) - txt = "".join(chr(i) for i in range(256)) - b64 = codext.encode(txt, "base64") - self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base"))) - self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") - - def test_rank_input(self): - codext.remove("test_codec") - self.assertRaises(LookupError, codext.encode, "TEST", "test") - codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.) - STR = "This is a test string !" 
- ENC = codext.encode(STR, "base64") - self.assertTrue(len(codext.rank(ENC)) > 20) - self.assertEqual(len(codext.rank(ENC, limit=20)), 20) - self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"]) - self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) - self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) - self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) - self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST") - - def test_handle_macros(self): - MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2" - STR = "this is a test" - ENC = "H4sIAMrbkmEC/0txzyhIrnQC4QxPj6CcZONAWwAMIDOIFAAAAA==" - codext.remove(MACRO) - l = codext.list_macros() - self.assertTrue(len(l) > 0) - cm = codext.lookup("example-macro") - self.assertIsNotNone(cm) - self.assertRaises(LookupError, codext.lookup, "example-macro", False) - self.assertRaises(ValueError, codext.add_macro, "example-macro", "base64") - self.assertRaises(ValueError, codext.add_macro, "base64", "base91") - self.assertIsNotNone(repr(cm)) - self.assertTrue(hasattr(cm, "parameters")) - self.assertRaises(LookupError, codext.lookup, MACRO) - self.assertIsNone(codext.add_macro(MACRO, "base64", "gzip", "base64")) - self.assertIn(MACRO, codext.list_macros()) - self.assertIsNotNone(codext.encode(STR, MACRO)) - self.assertEqual(codext.decode(ENC, MACRO), STR) - # insert a bad entry for the list of encodings in the JSON file - PERS_MACROS[MACRO] = "not a list or tuple..." 
- with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f) - codext.reset() - self.assertRaises(ValueError, codext.lookup, MACRO) - self.assertIsNone(codext.remove(MACRO)) - self.assertRaises(LookupError, codext.lookup, MACRO) - self.assertNotIn(MACRO, codext.list_macros()) - self.assertIsNone(codext.remove("THIS-MACRO-DOES-NOT-EXIST")) - self.assertIsNone(codext.remove("VALID-MACRO")) - self.assertIsNone(codext.add_macro("VALID-MACRO", "gzip", "base64")) - self.assertIsNone(codext.remove("VALID-MACRO")) - if PY3: - self.assertIsNone(codext.add_macro("VALID-MACRO", "lzma", "base64")) - self.assertIsNone(codext.remove("VALID-MACRO")) - self.assertRaises(ValueError, codext.add_macro, "SHALL-FAIL", "base26", "sms", "letter-indices") - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Codecs added assets' tests. + +""" +import codext +import json +import random +import sys +from codext.__common__ import * +from codext.__common__ import CODECS_OVERWRITTEN, PERS_MACROS, PERS_MACROS_FILE +from unittest import TestCase + + +def dummy_encode(input, errors="strict"): + return input, len(input) + + +def dummy_decode(input, errors="strict"): + return input, len(input) + + +def dummy_errored_decode(useless): + raise AttributeError + def decode(input, errors="strict"): + return input, len(input) + return decode + + +def getregentry(encoding): + if encoding == "dummy3": + return codecs.CodecInfo(name="dummy3", encode=dummy_encode, decode=dummy_decode) + + +class TestCommon(TestCase): + def setUp(self): + codext.reset() + + def test_add_codec(self): + self.assertRaises(ValueError, codext.add, "test") + self.assertRaises(ValueError, codext.add, "test", "BAD") + self.assertRaises(ValueError, codext.add, "test", lambda: None, "BAD") + self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertEqual(codext.encode("test", "dummy"), "test") + ci = codext.lookup("dummy") + for k in ["add_to_codecs", "category", "examples", "name", "pattern", "text"]: + 
self.assertIn(k, ci.parameters.keys()) + self.assertIsNotNone(codext.add("dummy_errored", None, dummy_errored_decode, r"dummy_errored(\d+)$")) + self.assertRaises(AttributeError, codext.lookup, "dummy_errored1") + + def test_add_map_codec(self): + ENCMAP = [{'a': "A", 'b': "B", 'c': "C"}, {'d': "D", 'e': "E", 'f': "F"}, {'g': "G", 'h': "H", 'i': "I"}] + self.assertIsNotNone(codext.add_map("dummy2", ENCMAP, pattern=r"^dummy2(?:[-_]?(\d))?$")) + self.assertRaises(ValueError, codext.add_map, "dummy2", "BAD_ENCMAP") + self.assertEqual(codext.encode("abc", "dummy2"), "ABC") + self.assertEqual(codext.encode("abc", "dummy2-1"), "ABC") + self.assertEqual(codext.encode("def", "dummy2-2"), "DEF") + self.assertEqual(codext.encode("ghi", "dummy2-3"), "GHI") + self.assertRaises(LookupError, codext.encode, "test", "dummy2-4") + ENCMAP = {'': {'a': "A", 'b': "B"}, r'bad': {'a': "B", 'b': "A"}} + self.assertIsNotNone(codext.add_map("dummy3", ENCMAP, pattern=r"^dummy3([-_]inverted)?$")) + self.assertRaises(LookupError, codext.encode, "test", "dummy3_inverted") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, ignore_case="BAD") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, intype="BAD") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, outype="BAD") + ci = codext.lookup("dummy2") + for k in ["category", "encmap", "ignore_case", "intype", "no_error", "outype", "repl_char", "sep", "text"]: + self.assertIn(k, ci.parameters.keys()) + + def test_list_codecs(self): + self.assertTrue(len(codext.list()) > 0) + self.assertTrue(len(codext.list("other")) > 0) + self.assertTrue(len(codext.list("native")) > 0) + self.assertTrue(len(codext.list("non-native")) > 0) + self.assertTrue(len(codext.list("native", "non-native", "crypto", "base")) > 0) + self.assertTrue(len(codext.list("native", "language", "crypto")) > 0) + self.assertTrue(len(codext.list("~crypto")) > 0) + self.assertEqual(set(codext.list("~native")), set(codext.list("non-native"))) 
+ self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native"))) + self.assertRaises(ValueError, codext.list, "BAD_CATEGORY") + self.assertTrue(codext.is_native("base64_codec")) + self.assertFalse(codext.is_native("base64")) + + def test_remove_codec(self): + self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertEqual(codext.encode("test", "dummy"), "test") + self.assertIsNone(codext.remove("dummy")) + self.assertRaises(LookupError, codext.encode, "test", "dummy") + # special case, when adding a new codec also to the native codecs registry, then it won't be possible to remove + # it afterwards + self.assertIsNotNone(codecs.add("dummy2", dummy_encode, dummy_decode)) + self.assertEqual(codecs.encode("test", "dummy2"), "test") + self.assertIsNone(codecs.remove("dummy2")) + self.assertEqual(codecs.encode("test", "dummy2"), "test") + self.assertIsNone(codecs.register(getregentry)) + self.assertEqual(codecs.encode("test", "dummy3"), "test") + self.assertIsNone(codecs.remove("dummy3")) + self.assertEqual(codecs.encode("test", "dummy3"), "test") + + def test_clear_codecs(self): + self.assertIsNotNone(codecs.encode("test", "morse")) + self.assertIsNone(codecs.clear()) + self.assertRaises(LookupError, codecs.encode, "test", "morse") + + def test_reset_codecs(self): + self.assertIsNone(codext.reset()) + self.assertIsNotNone(codext.encode("test", "morse")) + self.assertRaises(LookupError, codext.encode, "test", "dummy") + self.assertTrue(len(CODECS_OVERWRITTEN) > 0) + self.assertIsNotNone(str(CODECS_OVERWRITTEN[0])) + + def test_search_codecs(self): + self.assertIsNotNone(codext.search("morse")) + self.assertIsNotNone(codext.search("geohash")) + self.assertIsNotNone(codext.examples("morse")) + self.assertIsNotNone(codext.examples("cp")) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[ab]{1,3}"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=ab)cd"))) + 
self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=-)\w+"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"([^\s])\1"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^\\]"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^a]"))) + + def test_encode_multiple_rounds(self): + s = "test" + for i in range(3): + s = codext.encode(s, "morse") + self.assertEqual(s, codext.encode("test", "morse[3]")) + self.assertIsNotNone(codext.encode("test", "base64[10]")) + + def test_guess_decode(self): + self.assertIsNone(codext.stopfunc._reload_lang()) + self.assertIsNotNone(codext.stopfunc._validate("flag")) + _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None + codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), + "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + self.assertIn("test-codec", codext.list_encodings("test")) + self.assertEqual(codext.decode("TEST=", "test"), "TEST") + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, + scoring_heuristic=False).items())[0][1], "TEST") + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"], + max_depth=2).items())[0][1], "TEST") + STR = "This is a test" + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1))) + self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "a test", found=["base62"]))) + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True, + exclude=["base100"]))) + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"]))) + self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0) + self.assertIn("F1@9", _l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, 
max_depth=1, stop=False, + show=True))) + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", + exclude=("base64", "base64-url"))), 0) + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", + scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0) + self.assertRaises(ValueError, codext.guess, STR, max_depth=0) + self.assertRaises(ValueError, codext.guess, STR, exclude=42) + for c in ["base", "language", "native", "stegano"]: + e = codext.list(c) + random.shuffle(e) + for ename in e[:10]: + for encoding in codext.lookup(ename).parameters.get('guess', [ename])[:10]: + try: + enc = codext.encode(STR, encoding) + except (NotImplementedError, ValueError): + continue + except TypeError: + enc = codext.encode(b(STR), encoding) + if codext.decode(enc, encoding) == STR: + continue + for found_encodings, found_dec in codext.guess(enc, "a test", 0, 1, [c], + scoring_heuristic=True, debug=True).items(): + self.assertEqual(ensure_str(STR).lower(), ensure_str(found_dec).lower()) + if c != "base": + # do not check for base as the guessed encoding name can be different, e.g.: + # actual: base2 + # guessed: base2-generic + if "-icase" in encoding: + self.assertEqual(encoding.lower(), found_encodings[0].lower()) + else: + self.assertEqual(encoding, found_encodings[0]) + txt = "".join(chr(i) for i in range(256)) + b64 = codext.encode(txt, "base64") + self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base"))) + self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") + + def test_rank_input(self): + codext.remove("test_codec") + self.assertRaises(LookupError, codext.encode, "TEST", "test") + codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), + "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.) + STR = "This is a test string !" 
+ ENC = codext.encode(STR, "base64") + self.assertTrue(len(codext.rank(ENC)) > 20) + self.assertEqual(len(codext.rank(ENC, limit=20)), 20) + self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"]) + self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) + self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) + self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) + self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST") + + def test_handle_macros(self): + MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2" + STR = "this is a test" + ENC = "H4sIAMrbkmEC/0txzyhIrnQC4QxPj6CcZONAWwAMIDOIFAAAAA==" + codext.remove(MACRO) + l = codext.list_macros() + self.assertTrue(len(l) > 0) + cm = codext.lookup("example-macro") + self.assertIsNotNone(cm) + self.assertRaises(LookupError, codext.lookup, "example-macro", False) + self.assertRaises(ValueError, codext.add_macro, "example-macro", "base64") + self.assertRaises(ValueError, codext.add_macro, "base64", "base91") + self.assertIsNotNone(repr(cm)) + self.assertTrue(hasattr(cm, "parameters")) + self.assertRaises(LookupError, codext.lookup, MACRO) + self.assertIsNone(codext.add_macro(MACRO, "base64", "gzip", "base64")) + self.assertIn(MACRO, codext.list_macros()) + self.assertIsNotNone(codext.encode(STR, MACRO)) + self.assertEqual(codext.decode(ENC, MACRO), STR) + # insert a bad entry for the list of encodings in the JSON file + PERS_MACROS[MACRO] = "not a list or tuple..." 
+ with open(PERS_MACROS_FILE, 'w') as f: + json.dump(PERS_MACROS, f) + codext.reset() + self.assertRaises(ValueError, codext.lookup, MACRO) + self.assertIsNone(codext.remove(MACRO)) + self.assertRaises(LookupError, codext.lookup, MACRO) + self.assertNotIn(MACRO, codext.list_macros()) + self.assertIsNone(codext.remove("THIS-MACRO-DOES-NOT-EXIST")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertIsNone(codext.add_macro("VALID-MACRO", "gzip", "base64")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertIsNone(codext.add_macro("VALID-MACRO", "lzma", "base64")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertRaises(ValueError, codext.add_macro, "SHALL-FAIL", "base26", "sms", "letter-indices") + diff --git a/tests/test_generated.py b/tests/test_generated.py index 6b89129..57b7b4e 100644 --- a/tests/test_generated.py +++ b/tests/test_generated.py @@ -1,133 +1,157 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Automatically generated codec tests. - -""" -import os -import re -from itertools import chain -from random import randint -from string import printable -from unittest import TestCase - -from codext.__common__ import * - - -def make_test(**params): - """ Test factory function for auto-creating tests for encodings having __examples__ defined. """ - def _template(self): - tfile = "test-codec-%s.txt" % params['name'] - icase = params.get('ignore_case') - icdec = lambda s: s.lower() if icase in ["decode", "both"] else s - icenc = lambda s: s.lower() if icase in ["encode", "both"] else s - # first, define if only encode is used ; if so, decoding must occur right after encode tests, otherwise just - # execute the defined decode tests - dec = True - for k in params['examples'].keys(): - if k.startswith("dec"): - dec = False - # now execute tests relying on the given examples - for k, examples in params['examples'].items(): - # multiple encoding names can be given, e.g. 
'enc(morse|morse-AB|...)' - m = re.match(r"(?:dec|enc|enc-dec)\((.*?)(?:\|(.*?))*\)", k) - if m: - f1 = getattr(codecs, ["decode", "encode"][k.startswith("enc")]) - f2 = getattr(codecs, ["encode", "decode"][k.startswith("enc")]) - for ename in m.groups(): - if ename is None: - continue - # erroneous encoding name test - if examples is None: - self.assertRaises(LookupError, f1, "test", ename) - continue - # unhandled character error tests - encmap = params.get('encmap') - if encmap and params['intype'] not in ["bin", "ord"] and not params['no_error']: - if not isinstance(encmap, list): - encmap = [encmap] - for em in encmap: - if k.startswith("dec"): - em = {v: k for k, v in em.items()} - # find one handled character and one unhandled - c1, c2 = None, None - p = list(map(ord, printable)) - for i in chain(p, set(range(256)) - set(p)): - if chr(i) in em.keys(): - c1 = chr(i) - break - for i in chain(set(range(256)) - set(p), p): - if chr(i) not in em.keys(): - c2 = chr(i) - break - # now check that it raises the right error or not given the selected errors handling - if c1 and c2: - sep = params['sep'][0] if len(params['sep']) > 0 else "" - self.assertRaises(ValueError, f1, c2, ename) - self.assertRaises(ValueError, f1, c2, ename, "BAD_ERRORS") - if not k.startswith("enc-dec"): - self.assertEqual(f1(c1 + c2, ename, "ignore"), f1(c1, ename)) - self.assertEqual(f1(c1 + c2, ename, "leave"), f1(c1, ename) + sep + c2) - self.assertEqual(f1(c1 + c2, ename, "replace"), f1(c1, ename) + sep + \ - params.get('repl_minlen', 1) * params['repl_char']) - # examples validation tests - if k.startswith("enc-dec") and isinstance(examples, list): - for e in examples[:]: - rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e) - if rd: - examples.remove(e) - for n in (rd.group(1) or "512").split(","): - examples.append("".join(chr(randint(0, 255)) for i in range(int(n)))) - for s in [""] + examples: - self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) - 
self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) - # file tests - with codecs.open(tfile, 'wb', encoding=ename) as f: - f.write(b(s)) - with codecs.open(tfile, 'rb', encoding=ename) as f: - s2 = f.read() if PY3 else f.read().rstrip("\x00") - self.assertEqual(b(icdec(s2)), b(icdec(s))) - os.remove(tfile) - else: - for s1, s2 in examples.items(): - # willingly erroneous tests - if s2 is None: - self.assertRaises((ValueError, NotImplementedError), f1, s1, ename) - continue - # raw text tests - self.assertEqual(icenc(f1(s1, ename)), icenc(s2)) - self.assertEqual(b(icenc(f1(s1, ename))), b(icenc(s2))) - self.assertIsNotNone(f1(s1, ename, "replace")) - self.assertIsNotNone(f1(s1, ename, "ignore")) - if dec: - self.assertEqual(icdec(f2(s2, ename)), icdec(s1)) - self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s1))) - self.assertIsNotNone(f2(s2, ename, "replace")) - self.assertIsNotNone(f2(s2, ename, "ignore")) - if k.startswith("enc"): - # file tests - with codecs.open(tfile, 'wb', encoding=ename) as f: - f.write(b(s1)) - with codecs.open(tfile, 'rb', encoding=ename) as f: - s = f.read() - if not PY3 and re.search("[^\x00]\x00$", s): - s = s[:-1] - self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s))) - os.remove(tfile) - return _template - - -class GeneratedTestCase(TestCase): - pass - - -for encoding in list_encodings(): - try: - ci = lookup(encoding) - except LookupError: - continue - # only consider codecs with __examples__ defined in their globals for dynamic tests generation - if ci.parameters.get('examples') is not None: - f = make_test(**ci.parameters) - f.__name__ = n = "test_" + encoding.replace("-", "_") - setattr(GeneratedTestCase, n, f) - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Automatically generated codec tests. 
+ +""" +from itertools import chain +from random import randint +from string import printable +from unittest import TestCase + +from codext.__common__ import * + + +def make_test(**params): + """ Test factory function for auto-creating tests for encodings having __examples__ defined. """ + def _template(self): + tfile = "test-codec-%s.txt" % params['name'] + icase = params.get('ignore_case') + icdec = lambda s: s.lower() if icase in ["decode", "both"] else s + icenc = lambda s: s.lower() if icase in ["encode", "both"] else s + # first, define if only encode is used ; if so, decoding must occur right after encode tests, otherwise just + # execute the defined decode tests + dec = True + for k in params['examples'].keys(): + if k.startswith("dec"): + dec = False + # now execute tests relying on the given examples + for k, examples in params['examples'].items(): + # multiple encoding names can be given, e.g. 'enc(morse|morse-AB|...)' + m = re.match(r"(?:dec|enc|enc-dec)\((.*?)(?:\|(.*?))*\)(\*)?", k) + if m: + f1 = getattr(codecs, ["decode", "encode"][k.startswith("enc")]) + f2 = getattr(codecs, ["encode", "decode"][k.startswith("enc")]) + for ename in m.groups(): + if ename == "*": + # ignore mode only + continue + if ename is None: + continue + # buggy generated encoding names + try: + lookup(ename) + except LookupError: + continue + # erroneous encoding name test + if examples is None: + self.assertRaises(LookupError, f1, "test", ename) + continue + # unhandled character error tests + encmap = params.get('encmap') + if encmap and params['intype'] not in ["bin", "ord"] and not params['no_error']: + if not isinstance(encmap, list): + encmap = [encmap] + for em in encmap: + if k.startswith("dec"): + em = {v: k for k, v in em.items()} + # find one handled character and one unhandled + c1, c2 = None, None + p = list(map(ord, printable)) + for i in chain(p, set(range(256)) - set(p)): + if chr(i) in em.keys(): + c1 = chr(i) + break + for i in chain(set(range(256)) - 
set(p), p): + if chr(i) not in em.keys(): + c2 = chr(i) + break + # now check that it raises the right error or not given the selected errors handling + if c1 and c2: + sep = params['sep'][0] if len(params['sep']) > 0 else "" + self.assertRaises(ValueError, f1, c2, ename) + self.assertRaises(ValueError, f1, c2, ename, "BAD_ERRORS") + if not k.startswith("enc-dec"): + self.assertEqual(f1(c1 + c2, ename, "ignore"), f1(c1, ename)) + self.assertEqual(f1(c1 + c2, ename, "leave"), f1(c1, ename) + sep + c2) + self.assertEqual(f1(c1 + c2, ename, "replace"), f1(c1, ename) + sep + \ + params.get('repl_minlen', 1) * params['repl_char']) + # examples validation tests + incr_f1 = codecs.getincrementalencoder(ename)().encode + incr_f2 = codecs.getincrementaldecoder(ename)().decode + # - "enc-dec" tests (uses a list of values that shall remain the same after encoding and decoding, + # no matter what the encoded value is + if k.startswith("enc-dec") and isinstance(examples, list): + for e in examples[:]: + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + if rd: + examples.remove(e) + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + examples.append(s.lower() if rd.group(1) else s) + for s in [""] + examples: + self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) + self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) + # important note: with respect to the original design, + # IncrementalEncoder(...).encode(...) gives bytes + # IncrementalDecoder(...).encode(...) 
gives str + self.assertEqual(icdec(incr_f2(icenc(incr_f1(s, ename)), ename)), icdec(s)) + self.assertEqual(icdec(incr_f2(icenc(incr_f1(b(s), ename)), ename)), icdec(s)) + # file tests + with codecs.open(tfile, 'wb', encoding=ename) as f: + f.write(b(s)) + with codecs.open(tfile, 'rb', encoding=ename) as f: + s2 = f.read() + self.assertEqual(b(icdec(s2)), b(icdec(s))) + os.remove(tfile) + # - "enc" and "dec" tests (uses a dictionary with the value to be encoded and the expected encoded + # value) + else: + for s1, s2 in examples.items(): + # willingly erroneous tests + if s2 is None: + self.assertRaises((ValueError, NotImplementedError), f1, s1, ename) + continue + # raw text tests + self.assertEqual(icenc(f1(s1, ename)), icenc(s2)) + self.assertEqual(b(icenc(f1(s1, ename))), b(icenc(s2))) + # important note: with respect to the original design, + # IncrementalEncoder(...).encode(...) gives bytes + #self.assertEqual(icenc(incr_f1(s1, ename)), b(icenc(s2))) + #self.assertEqual(icenc(incr_f1(b(s1), ename)), b(icenc(s2))) + self.assertIsNotNone(f1(s1, ename, "replace")) + self.assertIsNotNone(f1(s1, ename, "ignore")) + if dec: + self.assertEqual(icdec(f2(s2, ename)), icdec(s1)) + self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s1))) + # important note: with respect to the original design, + # IncrementalDecoder(...).encode(...) 
gives str + #self.assertEqual(icdec(incr_f2(s2, ename)), icdec(s1)) + #self.assertEqual(icdec(incr_f2(b(s2), ename)), icdec(s1)) + self.assertIsNotNone(f2(s2, ename, "replace")) + self.assertIsNotNone(f2(s2, ename, "ignore")) + if k.startswith("enc"): + # file tests + with codecs.open(tfile, 'wb', encoding=ename) as f: + f.write(b(s1)) + with codecs.open(tfile, 'rb', encoding=ename) as f: + s = f.read() + self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s))) + os.remove(tfile) + return _template + + +class GeneratedTestCase(TestCase): + pass + + +for encoding in list_encodings(): + try: + ci = lookup(encoding) + except LookupError: + continue + # only consider codecs with __examples__ defined in their globals for dynamic tests generation + if ci.parameters.get('examples') is not None: + f = make_test(**ci.parameters) + f.__name__ = n = "test_" + encoding.replace("-", "_") + setattr(GeneratedTestCase, n, f) + diff --git a/tests/test_manual.py b/tests/test_manual.py index 4211df7..e443f75 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -1,169 +1,185 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Manual codec tests. 
- -""" -import os -import random -from six import binary_type, string_types -from unittest import TestCase - -from codext.__common__ import * -from codext.binary.baudot import _check_alphabet -from codext.hashing.checksums import CRC - - -class ComplementaryTestCase(TestCase): - def test_codec_baudot(self): - self.assertRaises(ValueError, _check_alphabet, ["BAD_ALPHABET"]) - - def test_codec_dna(self): - self.assertEqual(codecs.decode("ABC", "dna-1", errors="ignore"), "\x02") - self.assertEqual(codecs.decode("ABC", "dna-2", errors="replace"), "[00??01]") - - def test_codec_morse(self): - self.assertRaises(LookupError, codecs.encode, "test", "morse-AAB") - - def test_codec_sms(self): - self.assertEqual(codecs.decode("A-B-222-3-4-5", "sms", "leave"), "ABcdgj") - - -class ManualTestCase(TestCase): - def test_codec_affine(self): - STR = "this is a test" - AFF1 = "vjkubkubcbvguv" - self.assertRaises(LookupError, codecs.encode, STR, "affine-BAD") - self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u-BAD") - # uses by default an alphabet with lowercase, uppercase, whitespace and parameters a=1 and b=2 - self.assertEqual(codecs.encode(STR, "affine"), codecs.encode(STR, "affine-?l?u?s-1,2")) - self.assertEqual(codecs.encode(STR, "affine"), AFF1) - self.assertEqual(codecs.encode(b(STR), "affine"), b(AFF1)) - self.assertEqual(codecs.decode(AFF1, "affine"), STR) - self.assertEqual(codecs.decode(b(AFF1), "affine"), b(STR)) - AFF2 = "ORWJdWJdidOCJO" - self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-5,8"), AFF2) - self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-5,8"), b(AFF2)) - self.assertEqual(codecs.decode(AFF2, "affine-?l?u?d?s-5,8"), STR) - self.assertEqual(codecs.decode(b(AFF2), "affine-?l?u?d?s-5,8"), b(STR)) - AFF3 = "QsuOcuOcecQmOQ" - self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-2,4"), AFF3) - self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-2,4"), b(AFF3)) - self.assertEqual(codecs.decode(AFF3, "affine-?l?u?d?s-2,4"), STR) - 
self.assertEqual(codecs.decode(b(AFF3), "affine-?l?u?d?s-2,4"), b(STR)) - self.assertRaises(ValueError, codecs.decode, ".BAD.", "affine-?l?u?d?s-2,4") - self.assertIsNotNone(codecs.encode("TEST", "affine_?u-1,2")) - # example of parameters that cause mapping collisions - self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u?d?s-6,8") - - def test_codec_atbash(self): - STR = "This is a test" - ATB1 = "Gsrh rh z gvhg" - self.assertIsNotNone(codecs.encode("test", "atbash-whatevers")) - # uses by default an alphabet with lowercase and uppercase - self.assertEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-?l?u")) - self.assertNotEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-[?l?u]")) - self.assertEqual(codecs.encode(STR, "atbash_cipher"), ATB1) - self.assertEqual(codecs.encode(b(STR), "atbash-cipher"), b(ATB1)) - self.assertEqual(codecs.decode(ATB1, "atbash"), STR) - self.assertEqual(codecs.decode(b(ATB1), "atbash"), b(STR)) - ATB2 = "N^]/a]/a a.{/." - self.assertEqual(codecs.encode(STR, "atbash-[?l?u?p?s]"), ATB2) - self.assertEqual(codecs.encode(b(STR), "atbash_cipher-[?l?u?p?s]"), b(ATB2)) - self.assertEqual(codecs.decode(ATB2, "atbash-[?l?u?p?s]"), STR) - self.assertEqual(codecs.decode(b(ATB2), "atbash_cipher-[?l?u?p?s]"), b(STR)) - - def test_codec_case_related_manips(self): - STR = "This is a test" - self.assertEqual(codecs.encode(STR, "lower"), "this is a test") - self.assertEqual(codecs.encode(b(STR), "uppercase"), b("THIS IS A TEST")) - self.assertEqual(codecs.encode(STR, "capitalize"), "This is a test") - self.assertEqual(codecs.decode(b(STR), "capitalize"), b("this is a test")) - self.assertEqual(codecs.encode(STR, "title"), "This Is A Test") - self.assertEqual(codecs.decode(b(STR), "title"), b("this is a test")) - self.assertEqual(codecs.encode(b(STR), "swapcase"), b("tHIS IS A TEST")) - self.assertEqual(codecs.encode(b(STR), "camelcase"), b("thisIsATest")) - self.assertEqual(codecs.encode(b(STR), "kebabcase"), 
b("this-is-a-test")) - self.assertEqual(codecs.encode(b(STR), "pascalcase"), b("ThisIsATest")) - self.assertEqual(codecs.encode(b(STR), "slugify"), b("this-is-a-test")) - self.assertEqual(codecs.encode(b(STR), "snakecase"), b("this_is_a_test")) - self.assertRaises(NotImplementedError, codecs.decode, STR, "camel") - self.assertRaises(NotImplementedError, codecs.decode, STR, "pascal") - self.assertRaises(NotImplementedError, codecs.decode, STR, "slug") - self.assertRaises(NotImplementedError, codecs.decode, STR, "snake") - - def test_codec_dummy_str_manips(self): - STR = "this is a test" - self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht") - self.assertEqual(codecs.decode(STR, "reverse_words"), "siht si a tset") - self.assertEqual(codecs.decode(STR.split()[0], "reverse"), codecs.decode(STR.split()[0], "reverse-words")) - self.assertEqual(codecs.encode(STR, "replace-i1"), STR.replace("i", "1")) - self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) - self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) - self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) - - def test_codec_hash_functions(self): - STR = b"This is a test string!" 
- for h in ["adler32", "md2", "md4", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - if PY3: - self.assertEqual(len(codecs.encode(STR, "blake2b_64")), 128) - self.assertRaises(LookupError, codecs.encode, STR, "blake2b_0") - self.assertRaises(LookupError, codecs.encode, STR, "blake2b-65") - self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2b") - self.assertEqual(len(codecs.encode(STR, "blake2s_32")), 64) - self.assertRaises(LookupError, codecs.encode, STR, "blake2s_0") - self.assertRaises(LookupError, codecs.encode, STR, "blake2s-33") - self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2s") - self.assertIsNotNone(codecs.encode(STR, "shake128")) - self.assertRaises(LookupError, codecs.encode, STR, "shake128_0") - self.assertRaises(NotImplementedError, codecs.decode, STR, "shake128") - self.assertIsNotNone(codecs.encode(STR, "shake256")) - self.assertRaises(LookupError, codecs.encode, STR, "shake256-0") - self.assertRaises(NotImplementedError, codecs.decode, STR, "shake256") - for h in ["sha3_224", "sha3_256", "sha3_384", "sha3_512"]: - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - if UNIX: - import crypt - METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] - for m in METHODS: - h = "crypt-" + m - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - # CRC checks - STR = "123456789" - for n, variants in CRC.items(): - for name, params in variants.items(): - enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-") - print(enc) - self.assertEqual(codecs.encode(STR, enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5]) - - def test_codec_markdown(self): - HTM = "

Test title

\n\n

Test paragraph

\n" - MD = "# Test title\n\nTest paragraph" - TFILE = "test-codec-markdown.html" - self.assertTrue(isinstance(codecs.encode(MD, "markdown"), string_types)) - self.assertTrue(not PY3 or isinstance(codecs.encode(b(MD), "markdown"), binary_type)) - self.assertEqual(codecs.encode(MD, "markdown"), HTM) - self.assertRaises(NotImplementedError, codecs.decode, MD, "markdown") - with codecs.open(TFILE, 'w', encoding="markdown") as f: - f.write(b(MD)) - with codecs.open(TFILE) as f: - s = f.read() - self.assertEqual(HTM, ensure_str(s)) - os.remove(TFILE) - - def test_codec_whitespace_after_before(self): - STR = "test" - for i in range(100): - c = "whitespace{}{}*after{}{}*before".format("-+"[random.randint(0, 1)], random.randint(1, 3), - "-+"[random.randint(0, 1)], random.randint(1, 3)) - self.assertEqual(codecs.decode("\n" + codecs.encode(STR, c) + "\n", c), STR) - # in this special case, the whitespaces between words cannot be encoded because: - # - ord(" ") == 32 - # - the next minimal value in the printable characters excluding the latest 6 is ord("!") == 33 - # and therefore ord(" ")-random(0,20)-random(0,20) will never fall into the valid ordinals ! - self.assertRaises(ValueError, codecs.encode, "this is a test", "whitespace-after-before") - self.assertIn("\x00", codecs.encode("this is a test", "whitespace-after-before", "replace")) - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Manual codec tests. 
+ +""" +import os +import random +from unittest import TestCase + +from codext.__common__ import * + + +class ComplementaryTestCase(TestCase): + def test_codec_baudot(self): + from codext.binary.baudot import _check_alphabet + self.assertRaises(ValueError, _check_alphabet, ["BAD_ALPHABET"]) + + def test_codec_dna(self): + self.assertEqual(codecs.decode("ABC", "dna-1", errors="ignore"), "\x02") + self.assertEqual(codecs.decode("ABC", "dna-2", errors="replace"), "[00??01]") + + def test_codec_morse(self): + self.assertRaises(LookupError, codecs.encode, "test", "morse-AAB") + + def test_codec_polybius(self): + from codext.crypto.polybius import polybius_encode, polybius_decode + self.assertRaises(LookupError, polybius_encode, "ABC") + self.assertRaises(ValueError, polybius_decode(), "BAD_") + self.assertRaises(ValueError, polybius_decode(), "441543441") + self.assertEqual(codecs.decode("441543445", "polybius", "ignore"), "TEST") + + def test_codec_sms(self): + self.assertEqual(codecs.decode("A-B-222-3-4-5", "sms", "leave"), "ABcdgj") + + +class ManualTestCase(TestCase): + def test_codec_affine(self): + STR = "this is a test" + AFF1 = "vjkubkubcbvguv" + self.assertRaises(LookupError, codecs.encode, STR, "affine-BAD") + self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u-BAD") + # uses by default an alphabet with lowercase, uppercase, whitespace and parameters a=1 and b=2 + self.assertEqual(codecs.encode(STR, "affine"), codecs.encode(STR, "affine-?l?u?s-1,2")) + self.assertEqual(codecs.encode(STR, "affine"), AFF1) + self.assertEqual(codecs.encode(b(STR), "affine"), b(AFF1)) + self.assertEqual(codecs.decode(AFF1, "affine"), STR) + self.assertEqual(codecs.decode(b(AFF1), "affine"), b(STR)) + AFF2 = "ORWJdWJdidOCJO" + self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-5,8"), AFF2) + self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-5,8"), b(AFF2)) + self.assertEqual(codecs.decode(AFF2, "affine-?l?u?d?s-5,8"), STR) + 
self.assertEqual(codecs.decode(b(AFF2), "affine-?l?u?d?s-5,8"), b(STR)) + AFF3 = "QsuOcuOcecQmOQ" + self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-2,4"), AFF3) + self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-2,4"), b(AFF3)) + self.assertEqual(codecs.decode(AFF3, "affine-?l?u?d?s-2,4"), STR) + self.assertEqual(codecs.decode(b(AFF3), "affine-?l?u?d?s-2,4"), b(STR)) + self.assertRaises(ValueError, codecs.decode, ".BAD.", "affine-?l?u?d?s-2,4") + self.assertIsNotNone(codecs.encode("TEST", "affine_?u-1,2")) + # example of parameters that cause mapping collisions + self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u?d?s-6,8") + + def test_codec_atbash(self): + STR = "This is a test" + ATB1 = "Gsrh rh z gvhg" + self.assertIsNotNone(codecs.encode("test", "atbash-whatevers")) + # uses by default an alphabet with lowercase and uppercase + self.assertEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-?l?u")) + self.assertNotEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-[?l?u]")) + self.assertEqual(codecs.encode(STR, "atbash_cipher"), ATB1) + self.assertEqual(codecs.encode(b(STR), "atbash-cipher"), b(ATB1)) + self.assertEqual(codecs.decode(ATB1, "atbash"), STR) + self.assertEqual(codecs.decode(b(ATB1), "atbash"), b(STR)) + ATB2 = "N^]/a]/a a.{/." 
+ self.assertEqual(codecs.encode(STR, "atbash-[?l?u?p?s]"), ATB2) + self.assertEqual(codecs.encode(b(STR), "atbash_cipher-[?l?u?p?s]"), b(ATB2)) + self.assertEqual(codecs.decode(ATB2, "atbash-[?l?u?p?s]"), STR) + self.assertEqual(codecs.decode(b(ATB2), "atbash_cipher-[?l?u?p?s]"), b(STR)) + + def test_codec_case_related_manips(self): + STR = "This is a test" + self.assertEqual(codecs.encode(STR, "lower"), "this is a test") + self.assertEqual(codecs.encode(b(STR), "uppercase"), b("THIS IS A TEST")) + self.assertEqual(codecs.encode(STR, "capitalize"), "This is a test") + self.assertEqual(codecs.decode(b(STR), "capitalize"), b("this is a test")) + self.assertEqual(codecs.encode(STR, "title"), "This Is A Test") + self.assertEqual(codecs.decode(b(STR), "title"), b("this is a test")) + self.assertEqual(codecs.encode(b(STR), "swapcase"), b("tHIS IS A TEST")) + self.assertEqual(codecs.encode(b(STR), "camelcase"), b("thisIsATest")) + self.assertEqual(codecs.encode(b(STR), "kebabcase"), b("this-is-a-test")) + self.assertEqual(codecs.encode(b(STR), "pascalcase"), b("ThisIsATest")) + self.assertEqual(codecs.encode(b(STR), "slugify"), b("this-is-a-test")) + self.assertEqual(codecs.encode(b(STR), "snakecase"), b("this_is_a_test")) + self.assertRaises(NotImplementedError, codecs.decode, STR, "camel") + self.assertRaises(NotImplementedError, codecs.decode, STR, "pascal") + self.assertRaises(NotImplementedError, codecs.decode, STR, "slug") + self.assertRaises(NotImplementedError, codecs.decode, STR, "snake") + + def test_codec_checksum_functions(self): + from codext.checksums.crc import CRC + for n, variants in CRC.items(): + for name, params in variants.items(): + enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-") + self.assertEqual(codecs.encode("123456789", enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5]) + from codext.checksums.luhn import luhn + for s, r in [("", ""), ("0", "0"), ("1", "8"), ("7992739871", "3")]: + 
self.assertEqual(codecs.encode(s, "luhn"), r) + self.assertEqual(codecs.encode("-", "luhn", errors="ignore"), "") + + def test_codec_dummy_str_manips(self): + STR = "this is a test" + self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht") + self.assertEqual(codecs.decode(STR, "reverse_words"), "siht si a tset") + self.assertEqual(codecs.decode(STR.split()[0], "reverse"), codecs.decode(STR.split()[0], "reverse-words")) + self.assertEqual(codecs.encode(STR, "replace-i1"), STR.replace("i", "1")) + self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) + self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) + self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) + self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is i s a te st") + self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") + + def test_codec_hash_functions(self): + STR = b"This is a test string!" + for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + self.assertEqual(len(codecs.encode(STR, "blake2b_64")), 128) + self.assertRaises(LookupError, codecs.encode, STR, "blake2b_0") + self.assertRaises(LookupError, codecs.encode, STR, "blake2b-65") + self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2b") + self.assertEqual(len(codecs.encode(STR, "blake2s_32")), 64) + self.assertRaises(LookupError, codecs.encode, STR, "blake2s_0") + self.assertRaises(LookupError, codecs.encode, STR, "blake2s-33") + self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2s") + self.assertIsNotNone(codecs.encode(STR, "shake128")) + self.assertRaises(LookupError, codecs.encode, STR, "shake128_0") + self.assertRaises(NotImplementedError, codecs.decode, STR, "shake128") + self.assertIsNotNone(codecs.encode(STR, "shake256")) + 
self.assertRaises(LookupError, codecs.encode, STR, "shake256-0") + self.assertRaises(NotImplementedError, codecs.decode, STR, "shake256") + for h in ["sha3_224", "sha3_256", "sha3_384", "sha3_512"]: + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + if UNIX: + try: + import crypt + except ImportError: + try: + import legacycrypt as crypt + except ImportError: + crypt = None + METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] \ + if crypt is not None else [] + for m in METHODS: + h = "crypt-" + m + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + + def test_codec_markdown(self): + HTM = "

Test title

\n\n

Test paragraph

\n" + MD = "# Test title\n\nTest paragraph" + TFILE = "test-codec-markdown.html" + self.assertTrue(isinstance(codecs.encode(MD, "markdown"), str)) + self.assertEqual(codecs.encode(MD, "markdown"), HTM) + self.assertRaises(NotImplementedError, codecs.decode, MD, "markdown") + with codecs.open(TFILE, 'w', encoding="markdown") as f: + f.write(b(MD)) + with codecs.open(TFILE) as f: + s = f.read() + self.assertEqual(HTM, ensure_str(s)) + os.remove(TFILE) + + def test_codec_whitespace_after_before(self): + STR = "test" + for i in range(100): + c = "whitespace{}{}*after{}{}*before".format("-+"[random.randint(0, 1)], random.randint(1, 3), + "-+"[random.randint(0, 1)], random.randint(1, 3)) + self.assertEqual(codecs.decode("\n" + codecs.encode(STR, c) + "\n", c), STR) + # in this special case, the whitespaces between words cannot be encoded because: + # - ord(" ") == 32 + # - the next minimal value in the printable characters excluding the latest 6 is ord("!") == 33 + # and therefore ord(" ")-random(0,20)-random(0,20) will never fall into the valid ordinals ! + self.assertRaises(ValueError, codecs.encode, "this is a test", "whitespace-after-before") + self.assertIn("\x00", codecs.encode("this is a test", "whitespace-after-before", "replace")) +