diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..37e11a4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,50 @@ +name: CI + +# Run on push only for ci/staging +# Otherwise it may trigger concurrently `push & pull_request` on PRs. +on: + push: + branches: + - ci + - staging + +jobs: + build: + name: Python ${{ matrix.python }} + runs-on: ubuntu-latest + strategy: + matrix: + python: + - "3.10" + - "3.11" + - "3.12" + - "3.13" + - "3.14" + - "pypy3.11" + + steps: + - uses: actions/checkout@v4 + - name: setup python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e . + pip install coveralls --upgrade + - name: Run flake8 + run: | + pip install flake8 --upgrade + flake8 --exclude=build --ignore=E501,F403,F401,E241,E225,E128 . + - name: Run pycodestyle + run: | + pip install pycodestyle --upgrade + pycodestyle --ignore=E128,E261,E225,E501,W605 slugify test.py setup.py + - name: Run test + run: | + coverage run --source=slugify test.py + - name: Coveralls + run: coveralls --service=github + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml new file mode 100644 index 0000000..55119d6 --- /dev/null +++ b/.github/workflows/dev.yml @@ -0,0 +1,50 @@ +name: DEV + +# Run on push only for dev/sandbox +# Otherwise it may trigger concurrently `push & pull_request` on PRs. +on: + push: + branches: + - sandbox + - dev + +jobs: + build: + name: Python ${{ matrix.python }} + runs-on: ubuntu-latest + strategy: + matrix: + python: + - "3.10" + - "3.11" + - "3.12" + - "3.13" + - "3.14" + - "pypy3.11" + + steps: + - uses: actions/checkout@v4 + - name: setup python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e . 
+ pip install coveralls --upgrade + - name: Run flake8 + run: | + pip install flake8 --upgrade + flake8 --exclude=build --ignore=E501,F403,F401,E241,E225,E128 . + - name: Run pycodestyle + run: | + pip install pycodestyle --upgrade + pycodestyle --ignore=E128,E261,E225,E501,W605 slugify test.py setup.py + - name: Run test + run: | + coverage run --source=slugify test.py + - name: Coveralls + run: coveralls --service=github + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..ba50793 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,34 @@ +name: Main + +on: + pull_request: null + push: + branches: + - master + +jobs: + build: + name: Linux + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + - name: setup python + uses: actions/setup-python@v6 + with: + python-version: | + pypy3.11 + 3.10 + 3.11 + 3.12 + 3.13 + 3.14 + - name: Install dependencies + run: | + python -m pip install coveralls tox tox-uv + - name: Run test + run: | + tox + - name: Coveralls + run: coveralls --service=github + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 685386f..0000000 --- a/.travis.yml +++ /dev/null @@ -1,25 +0,0 @@ -sudo: false -language: python - -python: - - "2.7" - - "3.4" - - "3.5" - - "3.6" - - pypy - -install: - - pip install pip -U - - pip install -e . 
- - pip install pycodestyle - - pip install coveralls - - pip install https://github.com/un33k/pyflakes/tarball/master - -before_script: - - "bash pycodestyle.sh" - - if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then pyflakes -x W slugify; fi - -script: coverage run --source=slugify test.py - -after_success: - coveralls diff --git a/.vscode/settings.json b/.vscode/settings.json index 32531ea..ecfbb80 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,5 @@ { "python.linting.pylintEnabled": false, - "restructuredtext.confPath": "", - "python.pythonPath": "/usr/local/opt/python/bin/python3.6" -} \ No newline at end of file + "python.pythonPath": "/usr/bin/python3", + "cSpell.words": ["Neekman", "shch", "xlate"] +} diff --git a/CHANGELOG.md b/CHANGELOG.md index e2eed12..537460e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,155 +1,259 @@ +## Unreleased + +- Support Python 3.14. +- Drop support for Python 3.9 and lower. +- Use tox for local test runs and in CI. +- Test the project against both `unidecode` and `text_unidecode`. +- Fix type annotation issues identified by mypy. +- Run CI against pull requests. +- Fix package build warnings. + +## 8.0.4 + +- Properly handle uppercase special characters (@mib1185 - thx) + +## 8.0.3 + +- Drop compatibility for unsupported Python Version (@Viicos - thx) +- Fix pattern types. + +## 8.0.2 + +- Normalize text before converting to unicode. 
(@chuckyblack - thx) + +## 8.0.1 + +- Added license notice to readme (@C-nit - thx) + +## 8.0.0 + +- By default, prefer unidecode if installed (@enkidulan - thx) + +## 7.0.0 + +- Drop python 3.6, add python 3.11 (@hugovk - thx) + +## 6.1.2 + +- Reintroduce the cli options + +## 6.1.1 + +- Remove type hinting (temporarily) + +## 6.1.0 + +- Add `allow_unicode` flag to allow unicode characters in the slug + +## 6.0.1 + +- Rework regex_pattern to mean the opposite (disallowed chars instead of allowed) +- Thanks to @yyyyyyyan for the initial PR followed by the final PR by @mrezzamoradi + +## 6.0.0 + +- Enable github action +- Remove tox, as we run the test on github action, the end users can refer to those test + +## 5.0.2 + +- Enable twine publish + +## 5.0.1 + +- Drop support for python 2.7, 3.5 & tox, clean up + +## 5.0.0 + +- Add support for Py 3.9 - added tox (@jon-betts - Thx) +- Drop support for python 2.7, 3.5 & friends + +## 4.0.1 + +- Add support for Py 3.8 +- Last version with `official` python 2.7 and <= 3.5 support + +## 4.0.0 + +- Drop support from 2.6, & < 3.4.5 + +## 3.0.6 + +- Fixed encoding in special.py + +## 3.0.5 + +- Add test for pre-translation (e.g German Umlaut) +- Add special char supports (optional Use) + +## 3.0.4 + +- Now supporting text-unidecode>=1.3 +- Now supporting Unidecode>=1.1.1 + +## 3.0.3 + +- Remove unicode chars from file + +## 3.0.2 + +- Add official support of Py 3.7 + +## 3.0.1 + +- Add test.py to manifest + +## 3.0.0 + +- Upgrade Unidecode +- Promote text-unidecode as the primary decoding package +- Add Unidecode as an optional extra. "pip install python-slugify[unidecode]" + ## 2.0.1 - - Add replacements option e.g. [['|', 'or'], ['%', 'percent'], ['-', '_']] (@andriyor) + +- Add replacements option e.g. 
[['|', 'or'], ['%', 'percent'], ['-', '_']] (@andriyor) ## 2.0.0 - - Fix alternative dependency installation + +- Fix alternative dependency installation ## 1.2.6 - - Add support for case sensitive slugs (@s-m-e) + +- Add support for case sensitive slugs (@s-m-e) ## 1.2.5 - - Add support for using text-unidecode (@bolkedebruin) - - Switch to pycodestyle instead of pep8 + +- Add support for using text-unidecode (@bolkedebruin) +- Switch to pycodestyle instead of pep8 ## 1.2.4 - - Remove build artifacts during packaging - - Simplify the setup.py file (@reece) + +- Remove build artifacts during packaging +- Simplify the setup.py file (@reece) ## 1.2.3 - - Republish - possible corrupt 1.2.2 build + +- Republish - possible corrupt 1.2.2 build ## 1.2.2 - - Add `regex_pattern` option. (@vrbaskiz) - - Add Python 3.6 support + +- Add `regex_pattern` option. (@vrbaskiz) +- Add Python 3.6 support ## 1.2.1 - - Including certain files (e.g. license.md) in sdists via MANIFEST.in (@proinsias) - - Relax licensing by moving from BSD to MIT - - Add Python 3.5 support - - Add more tests + +- Including certain files (e.g. license.md) in sdists via MANIFEST.in (@proinsias) +- Relax licensing by moving from BSD to MIT +- Add Python 3.5 support +- Add more tests ## 1.2.0 Backward incompatible change: (@fabiocaccamo) - - In version < 1.2.0 all single quotes ( ' ) were removed, and - moving forward, >= 1.2.0, they will be replaced with ( - ). - Example: - < 1.2.0 -- ('C\'est déjà l\'été.' -> "cest-deja-lete") - >= 1.2.0 -- ('C\'est déjà l\'été.' -> "c-est-deja-l-ete") +- In version < 1.2.0 all single quotes ( ' ) were removed, and + moving forward, >= 1.2.0, they will be replaced with ( - ). + Example: + < 1.2.0 -- ('C\'est déjà l\'été.' -> "cest-deja-lete") + >= 1.2.0 -- ('C\'est déjà l\'été.'
-> "c-est-deja-l-ete") ## 1.1.4 Bugfix: - - Add more test cases, dropped `official` support for python 3.2 - +- Add more test cases, dropped `official` support for python 3.2 ## 1.1.3 Bugfix: - - Handle unichar in python 3.x - +- Handle unichar in python 3.x ## 1.1.2 Enhancement: - - Ability to remove `stopwords` from string - +- Ability to remove `stopwords` from string ## 1.0.2 Enhancement: - - A new PyPI release - +- A new PyPI release ## 1.0.1 Enhancement: - - Promoting to production grade - +- Promoting to production grade ## 0.1.1 Enhancement: - - Added option to save word order - - Removed 2to3 dependency - - Added more tests - +- Added option to save word order +- Removed 2to3 dependency +- Added more tests ## 0.1.0 Enhancement: - - Added more test - - Added test for python 3.4 - +- Added more test +- Added test for python 3.4 ## 0.0.9 Enhancement: - - Enable console_scripts - +- Enable console_scripts ## 0.0.8 Enhancement: - - Move logic out of __init__.py - - Added console_scripts (@ekamil) - - Updated pep8.sh - - Added pypy support - +- Move logic out of `__init__.py` +- Added console_scripts (@ekamil) +- Updated pep8.sh +- Added pypy support ## 0.0.7 Enhancement: - - Handle encoding in setup file - - Update ReadME, ChangeLog, License files - +- Handle encoding in setup file +- Update ReadME, ChangeLog, License files ## 0.0.6 Enhancement: - - Update for smart_truncate - +- Update for smart_truncate ## 0.0.5 Features: - - Added Python 3.2 and 3.3 support (work by: arthurdarcet@github) - +- Added Python 3.2 and 3.3 support (work by: arthurdarcet@github) ## 0.0.4 Features: - - Added option to choose non-dash separators (request by: danilodimoia@github) - +- Added option to choose non-dash separators (request by: danilodimoia@github) ## 0.0.3 Features: - - Added the ability to truncate slugs (request by: juanriaza@github) - +- Added the ability to truncate slugs (request by: juanriaza@github) ## 0.0.2 Enhancement: - - Incremental update - +- Incremental update
## 0.0.1 - - Initial version +- Initial version diff --git a/MANIFEST.in b/MANIFEST.in index 067e13a..373701c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,3 @@ -include CHANGELOG.md include LICENSE include README.md +include CHANGELOG.md diff --git a/README.md b/README.md index bc4d9e8..e5123f1 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ -Python Slugify -==================== +# Python Slugify **A Python slugify application that handles unicode**. @@ -7,142 +6,220 @@ Python Slugify [![version-image]][version-link] [![coverage-image]][coverage-link] -Overview -==================== +# Overview **Best attempt** to create slugs from unicode strings while keeping it **DRY**. -Notice -==================== +# Notice + +This module, by default installs and uses [text-unidecode](https://github.com/kmike/text-unidecode) _(GPL & Perl Artistic)_ for its decoding needs. + +However, there is an alternative decoding package called [Unidecode](https://github.com/avian2/unidecode) _(GPL)_. It can be installed as `python-slugify[unidecode]` for those who prefer it. `Unidecode` is believed to be more [advanced](https://github.com/un33k/python-slugify/wiki/Python-Slugify-Wiki#notes-on-unidecode). + +### `Official` Support Matrix + +| Python | Slugify | +| -------------- | ------------------ | +| `>= 2.7 < 3.6` | `< 5.0.0` | +| `>= 3.6 < 3.7` | `>= 5.0.0 < 7.0.0` | +| `>= 3.7` | `>= 7.0.0` | + +# How to install + + pip install python-slugify + + # OR + + pip install python-slugify[unidecode] + +# Options + +```python +def slugify( + text: str, + entities: bool = True, + decimal: bool = True, + hexadecimal: bool = True, + max_length: int = 0, + word_boundary: bool = False, + separator: str = DEFAULT_SEPARATOR, + save_order: bool = False, + stopwords: Iterable[str] = (), + regex_pattern: str | None = None, + lowercase: bool = True, + replacements: Iterable[Iterable[str]] = (), + allow_unicode: bool = False, +) -> str: + """ + Make a slug from the given text. 
+ :param text (str): initial text + :param entities (bool): converts html entities to unicode (foo &amp; bar -> foo-bar) + :param decimal (bool): converts html decimal to unicode (&#381; -> Ž -> z) + :param hexadecimal (bool): converts html hexadecimal to unicode (&#x17D; -> Ž -> z) + :param max_length (int): output string length + :param word_boundary (bool): truncates to end of full words (length may be shorter than max_length) + :param save_order (bool): when set, does not include shorter subsequent words even if they fit + :param separator (str): separator between words + :param stopwords (iterable): words to discount + :param regex_pattern (str): regex pattern for disallowed characters + :param lowercase (bool): activate case sensitivity by setting it to False + :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']] + :param allow_unicode (bool): allow unicode characters + :return (str): slugify text + """ +``` + +# How to use + +```python +from slugify import slugify + +txt = "This is a test ---" +r = slugify(txt) +self.assertEqual(r, "this-is-a-test") + +txt = '影師嗎' +r = slugify(txt) +self.assertEqual(r, "ying-shi-ma") + +txt = '影師嗎' +r = slugify(txt, allow_unicode=True) +self.assertEqual(r, "影師嗎") + +txt = 'C\'est déjà l\'été.' +r = slugify(txt) +self.assertEqual(r, "c-est-deja-l-ete") + +txt = 'Nín hǎo.
Wǒ shì zhōng guó rén' +r = slugify(txt) +self.assertEqual(r, "nin-hao-wo-shi-zhong-guo-ren") + +txt = 'Компьютер' +r = slugify(txt) +self.assertEqual(r, "kompiuter") + +txt = 'jaja---lol-méméméoo--a' +r = slugify(txt, max_length=9) +self.assertEqual(r, "jaja-lol") + +txt = 'jaja---lol-méméméoo--a' +r = slugify(txt, max_length=15, word_boundary=True) +self.assertEqual(r, "jaja-lol-a") + +txt = 'jaja---lol-méméméoo--a' +r = slugify(txt, max_length=20, word_boundary=True, separator=".") +self.assertEqual(r, "jaja.lol.mememeoo.a") + +txt = 'one two three four' +r = slugify(txt, max_length=12, word_boundary=True, save_order=False) +self.assertEqual(r, "one-two-four") + +txt = 'one two three four' +r = slugify(txt, max_length=12, word_boundary=True, save_order=True) +self.assertEqual(r, "one-two") + +txt = 'the quick brown fox jumps over the lazy dog' +r = slugify(txt, stopwords=['the']) +self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog') + +txt = 'the quick brown fox jumps over the lazy dog in a hurry' +r = slugify(txt, stopwords=['the', 'in', 'a', 'hurry']) +self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog') + +txt = 'thIs Has a stopword Stopword' +r = slugify(txt, stopwords=['Stopword'], lowercase=False) +self.assertEqual(r, 'thIs-Has-a-stopword') + +txt = "___This is a test___" +regex_pattern = r'[^-a-z0-9_]+' +r = slugify(txt, regex_pattern=regex_pattern) +self.assertEqual(r, "___this-is-a-test___") + +txt = "___This is a test___" +regex_pattern = r'[^-a-z0-9_]+' +r = slugify(txt, separator='_', regex_pattern=regex_pattern) +self.assertNotEqual(r, "_this_is_a_test_") + +txt = '10 | 20 %' +r = slugify(txt, replacements=[['|', 'or'], ['%', 'percent']]) +self.assertEqual(r, "10-or-20-percent") + +txt = 'ÜBER Über German Umlaut' +r = slugify(txt, replacements=[['Ü', 'UE'], ['ü', 'ue']]) +self.assertEqual(r, "ueber-ueber-german-umlaut") + +txt = 'i love 🦄' +r = slugify(txt, allow_unicode=True) +self.assertEqual(r, "i-love") + +txt = 'i love 🦄' +r = 
slugify(txt, allow_unicode=True, regex_pattern=r'[^🦄]+') +self.assertEqual(r, "🦄") + +``` -By default, this modules installs and uses [Unidecode](https://github.com/avian2/unidecode) *(GPL)* for its decoding needs. However if you wish to use [text-unidecode](https://github.com/kmike/text-unidecode) *(GPL & Perl Artistic)* instead, please ensure it is installed prior to `python-slugify` installation. - -In cases where both `Unidecode` and `text-unidecode` are installed, `Unidecode` is used as the default decoding module. - - -How to install -==================== - - 1. easy_install python-slugify - 2. pip install python-slugify - 3. git clone http://github.com/un33k/python-slugify - a. cd python-slugify - b. python setup.py install - 4. wget https://github.com/un33k/python-slugify/zipball/master - a. unzip the downloaded file - b. cd python-slugify-* - c. python setup.py install - - -How to use -==================== - - ```python - from slugify import slugify - - txt = "This is a test ---" - r = slugify(txt) - self.assertEqual(r, "this-is-a-test") - - txt = '影師嗎' - r = slugify(txt) - self.assertEqual(r, "ying-shi-ma") - - txt = 'C\'est déjà l\'été.' - r = slugify(txt) - self.assertEqual(r, "c-est-deja-l-ete") - - txt = 'Nín hǎo. 
Wǒ shì zhōng guó rén' - r = slugify(txt) - self.assertEqual(r, "nin-hao-wo-shi-zhong-guo-ren") - - txt = 'Компьютер' - r = slugify(txt) - self.assertEqual(r, "kompiuter") - - txt = 'jaja---lol-méméméoo--a' - r = slugify(txt, max_length=9) - self.assertEqual(r, "jaja-lol") - - txt = 'jaja---lol-méméméoo--a' - r = slugify(txt, max_length=15, word_boundary=True) - self.assertEqual(r, "jaja-lol-a") - - txt = 'jaja---lol-méméméoo--a' - r = slugify(txt, max_length=20, word_boundary=True, separator=".") - self.assertEqual(r, "jaja.lol.mememeoo.a") - - txt = 'one two three four five' - r = slugify(txt, max_length=13, word_boundary=True, save_order=True) - self.assertEqual(r, "one-two-three") +For more examples, have a look at the [test.py](test.py) file. - txt = 'the quick brown fox jumps over the lazy dog' - r = slugify(txt, stopwords=['the']) - self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog') +# Command Line Options - txt = 'the quick brown fox jumps over the lazy dog in a hurry' - r = slugify(txt, stopwords=['the', 'in', 'a', 'hurry']) - self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog') +With the package, a command line tool called `slugify` is also installed. - txt = 'thIs Has a stopword Stopword' - r = slugify(txt, stopwords=['Stopword'], lowercase=False) - self.assertEqual(r, 'thIs-Has-a-stopword') +It allows convenient command line access to all the features the `slugify` function supports. Call it with `-h` for help. 
- txt = "___This is a test___" - regex_pattern = r'[^-a-z0-9_]+' - r = slugify(txt, regex_pattern=regex_pattern) - self.assertEqual(r, "___this-is-a-test___") +The command can take its input directly on the command line or from STDIN (when the `--stdin` flag is passed): - txt = "___This is a test___" - regex_pattern = r'[^-a-z0-9_]+' - r = slugify(txt, separator='_', regex_pattern=regex_pattern) - self.assertNotEqual(r, "_this_is_a_test_") +``` +$ echo "Taking input from STDIN" | slugify --stdin +taking-input-from-stdin +``` - txt = '10 | 20 %' - r = slugify(txt, replacements=[['|', 'or'], ['%', 'percent']]) - self.assertEqual(r, "10-or-20-percent") +``` +$ slugify taking input from the command line +taking-input-from-the-command-line +``` - ``` - -For more examples, have a look at the [test.py](test.py) file. +Please note that when a multi-valued option such as `--stopwords` or `--replacements` is passed, you need to use `--` as separator before you start with the input: +``` +$ slugify --stopwords the in a hurry -- the quick brown fox jumps over the lazy dog in a hurry +quick-brown-fox-jumps-over-lazy-dog +``` -Running the tests -==================== +# Running the tests To run the tests against the current environment: python test.py +# Contribution -License -==================== +Please read the ([wiki](https://github.com/un33k/python-slugify/wiki/Python-Slugify-Wiki)) page prior to raising any PRs. + +# License Released under a ([MIT](LICENSE)) license. +### Notes on GPL dependencies +Though the dependencies may be GPL licensed, `python-slugify` itself is not considered a derivative work and will remain under the MIT license. +If you wish to avoid installation of any GPL licensed packages, please note that the default dependency `text-unidecode` explicitly lets you choose to use the [Artistic License](https://opensource.org/license/artistic-perl-1-0-2/) instead. Use without concern. 
+ +# Version -Version -==================== X.Y.Z Version `MAJOR` version -- when you make incompatible API changes, `MINOR` version -- when you add functionality in a backwards-compatible manner, and `PATCH` version -- when you make backwards-compatible bug fixes. -[status-image]: https://secure.travis-ci.org/un33k/python-slugify.png?branch=master -[status-link]: http://travis-ci.org/un33k/python-slugify?branch=master - +[status-image]: https://github.com/un33k/python-slugify/actions/workflows/main.yml/badge.svg +[status-link]: https://github.com/un33k/python-slugify/actions/workflows/main.yml [version-image]: https://img.shields.io/pypi/v/python-slugify.svg [version-link]: https://pypi.python.org/pypi/python-slugify - [coverage-image]: https://coveralls.io/repos/un33k/python-slugify/badge.svg [coverage-link]: https://coveralls.io/r/un33k/python-slugify - [download-image]: https://img.shields.io/pypi/dm/python-slugify.svg [download-link]: https://pypi.python.org/pypi/python-slugify +# Sponsors -Sponsors -==================== - -[![Surge](https://www.surgeforward.com/wp-content/themes/understrap-master/images/logo.png)](https://github.com/surgeforward) +[Neekware Inc.](http://neekware.com) diff --git a/dev.requirements.txt b/dev.requirements.txt new file mode 100644 index 0000000..5f94d7b --- /dev/null +++ b/dev.requirements.txt @@ -0,0 +1,3 @@ +pycodestyle==2.8.0 +twine==3.4.1 +flake8==4.0.1 \ No newline at end of file diff --git a/pycodestyle.sh b/format.sh similarity index 100% rename from pycodestyle.sh rename to format.sh diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1c02bfe --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,46 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + + +# coverage +# -------- + +[tool.coverage.run] +relative_files = true +parallel = true +branch = true +source = [ + "slugify", + "test", +] + +[tool.coverage.paths] +source = [ + "src", + "*/site-packages", +] +
+[tool.coverage.report] +skip_covered = true +fail_under = 97 + + +# mypy +# ---- + +[tool.mypy] +packages = "slugify" +strict = true +sqlite_cache = true + + +# pytest +# ------ + +[tool.pytest.ini_options] +testpaths = ["test.py"] +addopts = "--color=yes" +filterwarnings = [ + "error", +] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 656daab..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -Unidecode>=0.04.16 diff --git a/requirements_alt.txt b/requirements_alt.txt deleted file mode 100644 index 980e50a..0000000 --- a/requirements_alt.txt +++ /dev/null @@ -1 +0,0 @@ -text-unidecode>=1.2 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 3c6e79c..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[bdist_wheel] -universal=1 diff --git a/setup.py b/setup.py index ab59194..32f44dd 100755 --- a/setup.py +++ b/setup.py @@ -1,75 +1,84 @@ #!/usr/bin/env python - -# -*- coding: utf-8 -*- -from setuptools import setup, find_packages -import re +# Learn more: https://github.com/un33k/setup.py import os import sys -import codecs -install_requires = [] -try: - import text_unidecode -except ImportError: - install_requires.append('Unidecode>=0.04.16') +from shutil import rmtree +from setuptools import setup + -name = 'python-slugify' package = 'slugify' -description = 'A Python Slugify application that handles Unicode' -url = 'https://github.com/un33k/python-slugify' -author = 'Val Neekman' -author_email = 'info@neekware.com' -license = 'MIT' +python_requires = ">=3.10" +here = os.path.abspath(os.path.dirname(__file__)) -classifiers = [ - 'Development Status :: 5 - Production/Stable', - 'Intended Audience :: Developers', - 'Topic :: Software Development :: Build Tools', - 'License :: OSI Approved :: MIT License', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.6', - 'Programming Language :: Python :: 2.7', - 'Programming 
Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', -] +install_requires = ['text-unidecode>=1.3'] +extras_requires = {'unidecode': ['Unidecode>=1.1.1']} +about = {} +with open(os.path.join(here, package, '__version__.py'), 'r', encoding='utf-8') as f: + exec(f.read(), about) -def get_version(package): - """ - Return package version as listed in `__version__` in `init.py`. - """ - init_py = codecs.open(os.path.join(package, '__init__.py'), encoding='utf-8').read() - return re.search("^__version__ = ['\"]([^'\"]+)['\"]", init_py, re.MULTILINE).group(1) +with open('README.md', 'r', encoding='utf-8') as f: + readme = f.read() -if sys.argv[-1] == 'build': - os.system("python setup.py sdist bdist_wheel") +def status(s): + print('\033[1m{0}\033[0m'.format(s)) + +# 'setup.py publish' shortcut. if sys.argv[-1] == 'publish': - os.system("python setup.py sdist upload") - args = {'version': get_version(package)} - print("You probably want to also tag the version now:") - print(" git tag -a %(version)s -m 'version %(version)s' && git push --tags" % args) - sys.exit() + try: + status('Removing previous builds…') + rmtree(os.path.join(here, 'dist')) + except OSError: + pass -EXCLUDE_FROM_PACKAGES = [] + status('Building Source and Wheel (universal) distribution…') + os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable)) + + status('Uploading the package to PyPI via Twine…') + os.system('twine upload dist/*') + + status('Pushing git tags…') + os.system('git tag v{0}'.format(about['__version__'])) + os.system('git push --tags') + sys.exit() setup( - name=name, - version=get_version(package), - url=url, - license=license, - description=description, - long_description=description, - author=author, - author_email=author_email, - 
packages=find_packages(exclude=EXCLUDE_FROM_PACKAGES), + name=about['__title__'], + version=about['__version__'], + description=about['__description__'], + long_description=readme, + long_description_content_type='text/markdown', + author=about['__author__'], + author_email=about['__author_email__'], + url=about['__url__'], + license=about['__license__'], + packages=[package], + package_data={ + '': ['LICENSE'], + 'slugify': ['py.typed'], + }, + package_dir={'slugify': 'slugify'}, + include_package_data=True, + python_requires=python_requires, install_requires=install_requires, - classifiers=classifiers, - entry_points={'console_scripts': ['slugify=slugify.slugify:main']}, + extras_require=extras_requires, + zip_safe=False, + cmdclass={}, + project_urls={}, + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'Natural Language :: English', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', + 'Programming Language :: Python :: 3.14', + ], + entry_points={'console_scripts': ['slugify=slugify.__main__:main']}, ) diff --git a/slugify/__init__.py b/slugify/__init__.py index 7358b99..6d3279f 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -1,6 +1,10 @@ +from .special import * from .slugify import * - - -__author__ = 'Val Neekman @ Neekware Inc. 
[@vneekman]' -__description__ = 'A Python slugify application that also handles Unicode' -__version__ = '2.0.1' +from .__version__ import __title__ +from .__version__ import __author__ +from .__version__ import __author_email__ +from .__version__ import __description__ +from .__version__ import __url__ +from .__version__ import __license__ +from .__version__ import __copyright__ +from .__version__ import __version__ diff --git a/slugify/__main__.py b/slugify/__main__.py new file mode 100644 index 0000000..4e6b3d9 --- /dev/null +++ b/slugify/__main__.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +import argparse +import sys +from typing import Any + +from .slugify import slugify, DEFAULT_SEPARATOR + + +def parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Slug string") + + input_group = parser.add_argument_group(description="Input") + input_group.add_argument("input_string", nargs='*', + help='Text to slugify') + input_group.add_argument("--stdin", action='store_true', + help="Take the text from STDIN") + + parser.add_argument("--no-entities", action='store_false', dest='entities', default=True, + help="Do not convert HTML entities to unicode") + parser.add_argument("--no-decimal", action='store_false', dest='decimal', default=True, + help="Do not convert HTML decimal to unicode") + parser.add_argument("--no-hexadecimal", action='store_false', dest='hexadecimal', default=True, + help="Do not convert HTML hexadecimal to unicode") + parser.add_argument("--max-length", type=int, default=0, + help="Output string length, 0 for no limit") + parser.add_argument("--word-boundary", action='store_true', default=False, + help="Truncate to complete word even if length ends up shorter than --max-length") + parser.add_argument("--save-order", action='store_true', default=False, + help="When set and --max-length > 0 return whole words in the initial order") + parser.add_argument("--separator", type=str,
default=DEFAULT_SEPARATOR, + help="Separator between words. By default " + DEFAULT_SEPARATOR) + parser.add_argument("--stopwords", nargs='+', + help="Words to discount") + parser.add_argument("--regex-pattern", + help="Python regex pattern for disallowed characters") + parser.add_argument("--no-lowercase", action='store_false', dest='lowercase', default=True, + help="Activate case sensitivity") + parser.add_argument("--replacements", nargs='+', + help="""Additional replacement rules e.g. "|->or", "%%->percent".""") + parser.add_argument("--allow-unicode", action='store_true', default=False, + help="Allow unicode characters") + + args = parser.parse_args(argv[1:]) + + if args.input_string and args.stdin: + parser.error("Input strings and --stdin cannot work together") + + if args.replacements: + def split_check(repl: str) -> list[str]: + SEP = '->' + if SEP not in repl: + parser.error("Replacements must be of the form: ORIGINAL{SEP}REPLACED".format(SEP=SEP)) + return repl.split(SEP, 1) + args.replacements = [split_check(repl) for repl in args.replacements] + + if args.input_string: + args.input_string = " ".join(args.input_string) + elif args.stdin: + args.input_string = sys.stdin.read() + + if not args.input_string: + args.input_string = '' + + return args + + +def slugify_params(args: argparse.Namespace) -> dict[str, Any]: + """ Build the slugify() keyword arguments from the parsed command line arguments """ + return dict( + text=args.input_string, + entities=args.entities, + decimal=args.decimal, + hexadecimal=args.hexadecimal, + max_length=args.max_length, + word_boundary=args.word_boundary, + save_order=args.save_order, + separator=args.separator, + stopwords=args.stopwords, + regex_pattern=args.regex_pattern, + lowercase=args.lowercase, + replacements=args.replacements, + allow_unicode=args.allow_unicode + ) + + +def main(argv: list[str] | None = None) -> None: + """ Run this program """ + if argv is None: + argv = sys.argv + args = parse_args(argv) + params = slugify_params(args) + try: + print(slugify(**params)) + except KeyboardInterrupt: + sys.exit(-1) + + +if __name__ ==
'__main__': + main() diff --git a/slugify/__version__.py b/slugify/__version__.py new file mode 100644 index 0000000..a9cd778 --- /dev/null +++ b/slugify/__version__.py @@ -0,0 +1,8 @@ +__title__ = 'python-slugify' +__author__ = 'Val Neekman' +__author_email__ = 'info@neekware.com' +__description__ = 'A Python slugify application that also handles Unicode' +__url__ = 'https://github.com/un33k/python-slugify' +__license__ = 'SPDX-License-Identifier: MIT' +__copyright__ = 'Copyright 2022 Val Neekman @ Neekware Inc.' +__version__ = '8.0.4' diff --git a/slugify/py.typed b/slugify/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/slugify/slugify.py b/slugify/slugify.py index 59e9672..9b5f27f 100644 --- a/slugify/slugify.py +++ b/slugify/slugify.py @@ -1,22 +1,14 @@ +from __future__ import annotations + import re import unicodedata -import types -import sys - -try: - from htmlentitydefs import name2codepoint - _unicode = unicode - _unicode_type = types.UnicodeType -except ImportError: - from html.entities import name2codepoint - _unicode = str - _unicode_type = str - unichr = chr +from collections.abc import Iterable +from html.entities import name2codepoint try: import unidecode except ImportError: - import text_unidecode as unidecode + import text_unidecode as unidecode # type: ignore[import-untyped, no-redef] __all__ = ['slugify', 'smart_truncate'] @@ -25,14 +17,20 @@ DECIMAL_PATTERN = re.compile(r'&#(\d+);') HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);') QUOTE_PATTERN = re.compile(r'[\']+') -ALLOWED_CHARS_PATTERN = re.compile(r'[^-a-z0-9]+') -ALLOWED_CHARS_PATTERN_WITH_UPPERCASE = re.compile(r'[^-a-zA-Z0-9]+') +DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+') +DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+') DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}') NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)') DEFAULT_SEPARATOR = '-' -def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False): +def 
smart_truncate( + string: str, + max_length: int = 0, + word_boundary: bool = False, + separator: str = " ", + save_order: bool = False, +) -> str: """ Truncate a string. :param string (str): string for modification @@ -62,35 +60,48 @@ def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', sav if word: next_len = len(truncated) + len(word) if next_len < max_length: - truncated += '{0}{1}'.format(word, separator) + truncated += '{}{}'.format(word, separator) elif next_len == max_length: - truncated += '{0}'.format(word) + truncated += '{}'.format(word) break else: if save_order: break - if not truncated: # pragma: no cover + if not truncated: truncated = string[:max_length] return truncated.strip(separator) -def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False, - separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True, - replacements=()): +def slugify( + text: str, + entities: bool = True, + decimal: bool = True, + hexadecimal: bool = True, + max_length: int = 0, + word_boundary: bool = False, + separator: str = DEFAULT_SEPARATOR, + save_order: bool = False, + stopwords: Iterable[str] = (), + regex_pattern: re.Pattern[str] | str | None = None, + lowercase: bool = True, + replacements: Iterable[Iterable[str]] = (), + allow_unicode: bool = False, +) -> str: """ Make a slug from the given text. 
:param text (str): initial text - :param entities (bool): - :param decimal (bool): - :param hexadecimal (bool): + :param entities (bool): converts html entities to unicode + :param decimal (bool): converts html decimal to unicode + :param hexadecimal (bool): converts html hexadecimal to unicode :param max_length (int): output string length - :param word_boundary (bool): - :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order + :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length + :param save_order (bool): when set, does not include shorter subsequent words even if they fit :param separator (str): separator between words :param stopwords (iterable): words to discount - :param regex_pattern (str): regex pattern for allowed characters + :param regex_pattern (str): regex pattern for disallowed characters :param lowercase (bool): activate case sensitivity by setting it to False :param replacements (iterable): list of replacement rules e.g. 
[['|', 'or'], ['%', 'percent']] + :param allow_unicode (bool): allow unicode characters :return (str): """ @@ -100,41 +111,46 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w text = text.replace(old, new) # ensure text is unicode - if not isinstance(text, _unicode_type): - text = _unicode(text, 'utf-8', 'ignore') + if not isinstance(text, str): + text = str(text, 'utf-8', 'ignore') # replace quotes with dashes - pre-process text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text) - # decode unicode - text = unidecode.unidecode(text) + # normalize text, convert to unicode if required + if allow_unicode: + text = unicodedata.normalize('NFKC', text) + else: + text = unicodedata.normalize('NFKD', text) + text = unidecode.unidecode(text) # ensure text is still in unicode - if not isinstance(text, _unicode_type): - text = _unicode(text, 'utf-8', 'ignore') + if not isinstance(text, str): + text = str(text, 'utf-8', 'ignore') # character entity reference if entities: - text = CHAR_ENTITY_PATTERN.sub(lambda m: unichr(name2codepoint[m.group(1)]), text) + text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text) # decimal character reference if decimal: try: - text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text) + text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text) except Exception: pass # hexadecimal character reference if hexadecimal: try: - text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text) + text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text) except Exception: pass - # translate - text = unicodedata.normalize('NFKD', text) - if sys.version_info < (3,): - text = text.encode('ascii', 'ignore') + # re normalize text + if allow_unicode: + text = unicodedata.normalize('NFKC', text) + else: + text = unicodedata.normalize('NFKD', text) # make the text lowercase (optional) if lowercase: @@ -147,10 +163,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, 
max_length=0, w text = NUMBERS_PATTERN.sub('', text) # replace all other unwanted characters - if lowercase: - pattern = regex_pattern or ALLOWED_CHARS_PATTERN + if allow_unicode: + pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN else: - pattern = regex_pattern or ALLOWED_CHARS_PATTERN_WITH_UPPERCASE + pattern = regex_pattern or DISALLOWED_CHARS_PATTERN + text = re.sub(pattern, DEFAULT_SEPARATOR, text) # remove redundant @@ -178,11 +195,3 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w text = text.replace(DEFAULT_SEPARATOR, separator) return text - - -def main(): # pragma: no cover - if len(sys.argv) < 2: - print("Usage %s TEXT TO SLUGIFY" % sys.argv[0]) - else: - text = ' '.join(sys.argv[1:]) - print(slugify(text)) diff --git a/slugify/special.py b/slugify/special.py new file mode 100644 index 0000000..918cb2a --- /dev/null +++ b/slugify/special.py @@ -0,0 +1,47 @@ +from __future__ import annotations + + +def add_uppercase_char(char_list: list[tuple[str, str]]) -> list[tuple[str, str]]: + """ Given a replacement char list, this adds uppercase chars to the list """ + + for item in char_list: + char, xlate = item + upper_dict = char.upper(), xlate.capitalize() + if upper_dict not in char_list and char != upper_dict[0]: + char_list.insert(0, upper_dict) + return char_list + + +# Language specific pre translations +# Source awesome-slugify + +_CYRILLIC = [ # package defaults: + (u'ё', u'e'), # io / yo + (u'я', u'ya'), # ia + (u'х', u'h'), # kh + (u'у', u'y'), # u + (u'щ', u'sch'), # sch + (u'ю', u'u'), # iu / yu +] +CYRILLIC = add_uppercase_char(_CYRILLIC) + +_GERMAN = [ # package defaults: + (u'ä', u'ae'), # a + (u'ö', u'oe'), # o + (u'ü', u'ue'), # u +] +GERMAN = add_uppercase_char(_GERMAN) + +_GREEK = [ # package defaults: + (u'χ', u'ch'), # kh + (u'Ξ', u'X'), # Ks + (u'ϒ', u'Y'), # U + (u'υ', u'y'), # u + (u'ύ', u'y'), + (u'ϋ', u'y'), + (u'ΰ', u'y'), +] +GREEK = add_uppercase_char(_GREEK) + +# Pre translations 
+PRE_TRANSLATIONS = CYRILLIC + GERMAN + GREEK diff --git a/tea.yaml b/tea.yaml new file mode 100644 index 0000000..dda3df9 --- /dev/null +++ b/tea.yaml @@ -0,0 +1,7 @@ +# https://tea.xyz/what-is-this-file +--- +version: 1.0.0 +codeOwners: + - '0xaC8Bb28685BD43FD784DC902E132829c6C6DafA2' +quorum: 1 + diff --git a/test.py b/test.py index 78b1956..fcec4b6 100644 --- a/test.py +++ b/test.py @@ -1,11 +1,16 @@ # -*- coding: utf-8 -*- - +import io +import sys import unittest +from contextlib import contextmanager + +from slugify import PRE_TRANSLATIONS from slugify import slugify from slugify import smart_truncate +from slugify.__main__ import slugify_params, parse_args -class TestSlugification(unittest.TestCase): +class TestSlugify(unittest.TestCase): def test_extraneous_seperators(self): @@ -32,6 +37,10 @@ def test_phonetic_conversion_of_eastern_scripts(self): self.assertEqual(r, "ying-shi-ma") def test_accented_text(self): + txt = '𝐚́́𝕒́àáâäãąā' + r = slugify(txt) + self.assertEqual(r, "aaaaaaaaa") + txt = 'C\'est déjà l\'été.' 
r = slugify(txt) self.assertEqual(r, "c-est-deja-l-ete") @@ -142,11 +151,36 @@ def test_stopwords_with_different_separator(self): r = slugify(txt, stopwords=['the'], separator=' ') self.assertEqual(r, 'quick brown fox jumps over lazy dog') - def test_html_entities(self): + def test_html_entities_on(self): txt = 'foo & bar' r = slugify(txt) self.assertEqual(r, 'foo-bar') + def test_html_entities_off(self): + txt = 'foo & bar' + r = slugify(txt, entities=False) + self.assertEqual(r, 'foo-amp-bar') + + def test_html_decimal_on(self): + txt = 'Ž' + r = slugify(txt, decimal=True) + self.assertEqual(r, 'z') + + def test_html_decimal_off(self): + txt = 'Ž' + r = slugify(txt, entities=False, decimal=False) + self.assertEqual(r, '381') + + def test_html_hexadecimal_on(self): + txt = 'Ž' + r = slugify(txt, hexadecimal=True) + self.assertEqual(r, 'z') + + def test_html_hexadecimal_off(self): + txt = 'Ž' + r = slugify(txt, hexadecimal=False) + self.assertEqual(r, 'x17d') + def test_starts_with_number(self): txt = '10 amazing secrets' r = slugify(txt) @@ -198,18 +232,300 @@ def test_replacements(self): r = slugify(txt, replacements=[['♥', 'amour'], ['🦄', 'licorne']]) self.assertEqual(r, "i-amour-licorne") + def test_replacements_german_umlaut_custom(self): + txt = 'ÜBER Über German Umlaut' + r = slugify(txt, replacements=[['Ü', 'UE'], ['ü', 'ue']]) + self.assertEqual(r, "ueber-ueber-german-umlaut") -class TestUtils(unittest.TestCase): + def test_pre_translation(self): + self.assertEqual(PRE_TRANSLATIONS, [('Ю', 'U'), ('Щ', 'Sch'), ('У', 'Y'), ('Х', 'H'), ('Я', 'Ya'), ('Ё', 'E'), ('ё', 'e'), ('я', 'ya'), ('х', 'h'), ('у', 'y'), ('щ', 'sch'), ('ю', 'u'), ('Ü', 'Ue'), ('Ö', 'Oe'), ('Ä', 'Ae'), ('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('Ϋ́', 'Y'), ('Ϋ', 'Y'), ('Ύ', 'Y'), ('Υ', 'Y'), ('Χ', 'Ch'), ('χ', 'ch'), ('Ξ', 'X'), ('ϒ', 'Y'), ('υ', 'y'), ('ύ', 'y'), ('ϋ', 'y'), ('ΰ', 'y')]) - def test_smart_truncate_no_max_length(self): - txt = '1,000 reasons you are #1' - r = 
smart_truncate(txt) + +class TestSlugifyUnicode(unittest.TestCase): + def test_extraneous_seperators(self): + + txt = "This is a test ---" + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "this-is-a-test") + + txt = "___This is a test ---" + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "this-is-a-test") + + txt = "___This is a test___" + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "this-is-a-test") + + def test_non_word_characters(self): + txt = "This -- is a ## test ---" + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "this-is-a-test") + + def test_phonetic_conversion_of_eastern_scripts(self): + txt = '影師嗎' + r = slugify(txt, allow_unicode=True) self.assertEqual(r, txt) - def test_smart_truncate_no_seperator(self): + def test_accented_text(self): + txt = 'C\'est déjà l\'été.' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "c-est-déjà-l-été") + + txt = 'Nín hǎo. Wǒ shì zhōng guó rén' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "nín-hǎo-wǒ-shì-zhōng-guó-rén") + + def test_accented_text_with_non_word_characters(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "jaja-lol-méméméoo-a") + + def test_cyrillic_text(self): + txt = 'Компьютер' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "компьютер") + + def test_max_length(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=9) + self.assertEqual(r, "jaja-lol") + + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=15) + self.assertEqual(r, "jaja-lol-mémémé") + + def test_max_length_cutoff_not_required(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=50) + self.assertEqual(r, "jaja-lol-méméméoo-a") + + def test_word_boundary(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=15, word_boundary=True) + self.assertEqual(r, 
"jaja-lol-a") + + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=17, word_boundary=True) + self.assertEqual(r, "jaja-lol-méméméoo") + + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=18, word_boundary=True) + self.assertEqual(r, "jaja-lol-méméméoo") + + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=19, word_boundary=True) + self.assertEqual(r, "jaja-lol-méméméoo-a") + + def test_custom_separator(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=20, word_boundary=True, separator=".") + self.assertEqual(r, "jaja.lol.méméméoo.a") + + def test_multi_character_separator(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=20, word_boundary=True, separator="ZZZZZZ") + self.assertEqual(r, "jajaZZZZZZlolZZZZZZméméméooZZZZZZa") + + def test_save_order(self): + txt = 'one two three four five' + r = slugify(txt, allow_unicode=True, max_length=13, word_boundary=True, save_order=True) + self.assertEqual(r, "one-two-three") + + txt = 'one two three four five' + r = slugify(txt, allow_unicode=True, max_length=13, word_boundary=True, save_order=False) + self.assertEqual(r, "one-two-three") + + txt = 'one two three four five' + r = slugify(txt, allow_unicode=True, max_length=12, word_boundary=True, save_order=False) + self.assertEqual(r, "one-two-four") + + txt = 'one two three four five' + r = slugify(txt, allow_unicode=True, max_length=12, word_boundary=True, save_order=True) + self.assertEqual(r, "one-two") + + def test_save_order_rtl(self): + """For right-to-left unicode languages""" + txt = 'دو سه چهار پنج' + r = slugify(txt, allow_unicode=True, max_length=10, word_boundary=True, save_order=True) + self.assertEqual(r, "دو-سه-چهار") + + txt = 'دو سه چهار پنج' + r = slugify(txt, allow_unicode=True, max_length=10, word_boundary=True, save_order=False) + self.assertEqual(r, "دو-سه-چهار") + + txt = 'دو سه 
چهار پنج' + r = slugify(txt, allow_unicode=True, max_length=9, word_boundary=True, save_order=False) + self.assertEqual(r, "دو-سه-پنج") + + txt = 'دو سه چهار پنج' + r = slugify(txt, allow_unicode=True, max_length=9, word_boundary=True, save_order=True) + self.assertEqual(r, "دو-سه") + + def test_stopword_removal(self): + txt = 'this has a stopword' + r = slugify(txt, allow_unicode=True, stopwords=['stopword']) + self.assertEqual(r, 'this-has-a') + + txt = 'this has a Öländ' + r = slugify(txt, allow_unicode=True, stopwords=['Öländ']) + self.assertEqual(r, 'this-has-a') + + def test_stopword_removal_casesensitive(self): + txt = 'thIs Has a stopword Stopword' + r = slugify(txt, allow_unicode=True, stopwords=['Stopword'], lowercase=False) + self.assertEqual(r, 'thIs-Has-a-stopword') + + txt = 'thIs Has a öländ Öländ' + r = slugify(txt, allow_unicode=True, stopwords=['Öländ'], lowercase=False) + self.assertEqual(r, 'thIs-Has-a-öländ') + + def test_multiple_stopword_occurances(self): + txt = 'the quick brown fox jumps over the lazy dog' + r = slugify(txt, allow_unicode=True, stopwords=['the']) + self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog') + + def test_differently_cased_stopword_match(self): + txt = 'Foo A FOO B foo C' + r = slugify(txt, allow_unicode=True, stopwords=['foo']) + self.assertEqual(r, 'a-b-c') + + txt = 'Foo A FOO B foo C' + r = slugify(txt, allow_unicode=True, stopwords=['FOO']) + self.assertEqual(r, 'a-b-c') + + def test_multiple_stopwords(self): + txt = 'the quick brown fox jumps over the lazy dog in a hurry' + r = slugify(txt, allow_unicode=True, stopwords=['the', 'in', 'a', 'hurry']) + self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog') + + def test_stopwords_with_different_separator(self): + txt = 'the quick brown fox jumps over the lazy dog' + r = slugify(txt, allow_unicode=True, stopwords=['the'], separator=' ') + self.assertEqual(r, 'quick brown fox jumps over lazy dog') + + def test_html_entities_on(self): + txt = 'foo & bar' + 
r = slugify(txt, allow_unicode=True) + self.assertEqual(r, 'foo-bar') + + def test_html_entities_off(self): + txt = 'foo & bår' + r = slugify(txt, allow_unicode=True, entities=False) + self.assertEqual(r, 'foo-amp-bår') + + def test_html_decimal_on(self): + txt = 'Ž' + r = slugify(txt, allow_unicode=True, decimal=True) + self.assertEqual(r, 'ž') + + def test_html_decimal_off(self): + txt = 'Ž' + r = slugify(txt, allow_unicode=True, entities=False, decimal=False) + self.assertEqual(r, '381') + + def test_html_hexadecimal_on(self): + txt = 'Ž' + r = slugify(txt, allow_unicode=True, hexadecimal=True) + self.assertEqual(r, 'ž') + + def test_html_hexadecimal_off(self): + txt = 'Ž' + r = slugify(txt, allow_unicode=True, hexadecimal=False) + self.assertEqual(r, 'x17d') + + def test_starts_with_number(self): + txt = '10 amazing secrets' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, '10-amazing-secrets') + + def test_contains_numbers(self): + txt = 'buildings with 1000 windows' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, 'buildings-with-1000-windows') + + def test_ends_with_number(self): + txt = 'recipe number 3' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, 'recipe-number-3') + + def test_numbers_only(self): + txt = '404' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, '404') + + def test_numbers_and_symbols(self): txt = '1,000 reasons you are #1' - r = smart_truncate(txt, max_length=100, separator='_') - self.assertEqual(r, txt) + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, '1000-reasons-you-are-1') + + txt = '۱,۰۰۰ reasons you are #۱' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, '۱۰۰۰-reasons-you-are-۱') + + def test_regex_pattern_keep_underscore(self): + """allowing unicode should not overrule the passed regex_pattern""" + txt = "___This is a test___" + regex_pattern = r'[^-a-z0-9_]+' + r = slugify(txt, allow_unicode=True, regex_pattern=regex_pattern) + self.assertEqual(r, 
"___this-is-a-test___") + + def test_regex_pattern_keep_underscore_with_underscore_as_separator(self): + """ + The regex_pattern turns the power to the caller. + Hence, the caller must ensure that a custom separator doesn't clash + with the regex_pattern. + """ + txt = "___This is a test___" + regex_pattern = r'[^-a-z0-9_]+' + r = slugify(txt, allow_unicode=True, separator='_', regex_pattern=regex_pattern) + self.assertNotEqual(r, "_this_is_a_test_") + + def test_replacements(self): + txt = '10 | 20 %' + r = slugify(txt, allow_unicode=True, replacements=[['|', 'or'], ['%', 'percent']]) + self.assertEqual(r, "10-or-20-percent") + + txt = 'I ♥ 🦄' + r = slugify(txt, allow_unicode=True, replacements=[['♥', 'amour'], ['🦄', 'licorne']]) + self.assertEqual(r, "i-amour-licorne") + + txt = 'I ♥ 🦄' + r = slugify(txt, allow_unicode=True, replacements=[['♥', 'სიყვარული'], ['🦄', 'licorne']]) + self.assertEqual(r, "i-სიყვარული-licorne") + + def test_replacements_german_umlaut_custom(self): + txt = 'ÜBER Über German Umlaut' + r = slugify(txt, allow_unicode=True, replacements=[['Ü', 'UE'], ['ü', 'ue']]) + self.assertEqual(r, "ueber-ueber-german-umlaut") + + def test_emojis(self): + """ + allowing unicode shouldn't allow emojis, even in replacements. + the only exception is when it is allowed by the regex_pattern. 
regex_pattern overrules all + """ + txt = 'i love 🦄' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "i-love") + + txt = 'i love 🦄' + r = slugify(txt, allow_unicode=True, decimal=True) + self.assertEqual(r, "i-love") + + txt = 'i love 🦄' + r = slugify(txt, allow_unicode=True, hexadecimal=True) + self.assertEqual(r, "i-love") + + txt = 'i love 🦄' + r = slugify(txt, allow_unicode=True, entities=True) + self.assertEqual(r, "i-love") + + txt = 'i love you' + r = slugify(txt, allow_unicode=True, replacements=[['you', '🦄']]) + self.assertEqual(r, "i-love") + + txt = 'i love 🦄' + r = slugify(txt, allow_unicode=True, regex_pattern=r'[^🦄]+') + self.assertEqual(r, "🦄") class TestUtils(unittest.TestCase): @@ -225,5 +541,117 @@ def test_smart_truncate_no_seperator(self): self.assertEqual(r, txt) -if __name__ == '__main__': +PY3 = sys.version_info.major == 3 + + +@contextmanager +def captured_stderr(): + backup = sys.stderr + sys.stderr = io.StringIO() if PY3 else io.BytesIO() + try: + yield sys.stderr + finally: + sys.stderr = backup + + +@contextmanager +def loaded_stdin(contents): + backup = sys.stdin + sys.stdin = io.StringIO(contents) if PY3 else io.BytesIO(contents) + try: + yield sys.stdin + finally: + sys.stdin = backup + + +class TestCommandParams(unittest.TestCase): + DEFAULTS = { + 'entities': True, + 'decimal': True, + 'hexadecimal': True, + 'max_length': 0, + 'word_boundary': False, + 'save_order': False, + 'separator': '-', + 'stopwords': None, + 'lowercase': True, + 'replacements': None + } + + def get_params_from_cli(self, *argv): + args = parse_args([None] + list(argv)) + return slugify_params(args) + + def make_params(self, **values): + return dict(self.DEFAULTS, **values) + + def assertParamsMatch(self, expected, checked): + reduced_checked = {} + for key in expected.keys(): + reduced_checked[key] = checked[key] + self.assertEqual(expected, reduced_checked) + + def test_defaults(self): + params = self.get_params_from_cli() + 
self.assertParamsMatch(self.DEFAULTS, params) + + def test_negative_flags(self): + params = self.get_params_from_cli('--no-entities', '--no-decimal', '--no-hexadecimal', '--no-lowercase') + expected = self.make_params(entities=False, decimal=False, hexadecimal=False, lowercase=False) + self.assertFalse(expected['lowercase']) + self.assertFalse(expected['word_boundary']) + self.assertParamsMatch(expected, params) + + def test_affirmative_flags(self): + params = self.get_params_from_cli('--word-boundary', '--save-order') + expected = self.make_params(word_boundary=True, save_order=True) + self.assertParamsMatch(expected, params) + + def test_valued_arguments(self): + params = self.get_params_from_cli('--stopwords', 'abba', 'beatles', '--max-length', '98', '--separator', '+') + expected = self.make_params(stopwords=['abba', 'beatles'], max_length=98, separator='+') + self.assertParamsMatch(expected, params) + + def test_replacements_right(self): + params = self.get_params_from_cli('--replacements', 'A->B', 'C->D') + expected = self.make_params(replacements=[['A', 'B'], ['C', 'D']]) + self.assertParamsMatch(expected, params) + + def test_replacements_wrong(self): + with self.assertRaises(SystemExit) as err, captured_stderr() as cse: + self.get_params_from_cli('--replacements', 'A--B') + self.assertEqual(err.exception.code, 2) + self.assertIn("Replacements must be of the form: ORIGINAL->REPLACED", cse.getvalue()) + + def test_text_in_cli(self): + params = self.get_params_from_cli('Cool Text') + expected = self.make_params(text='Cool Text') + self.assertParamsMatch(expected, params) + + def test_text_in_cli_multi(self): + params = self.get_params_from_cli('Cool', 'Text') + expected = self.make_params(text='Cool Text') + self.assertParamsMatch(expected, params) + + def test_text_in_stdin(self): + with loaded_stdin("Cool Stdin"): + params = self.get_params_from_cli('--stdin') + expected = self.make_params(text='Cool Stdin') + self.assertParamsMatch(expected, params) + + 
def test_two_text_sources_fails(self): + with self.assertRaises(SystemExit) as err, captured_stderr() as cse: + self.get_params_from_cli('--stdin', 'Text') + self.assertEqual(err.exception.code, 2) + self.assertIn("Input strings and --stdin cannot work together", cse.getvalue()) + + def test_multivalued_options_with_text(self): + text = "the quick brown fox jumps over the lazy dog in a hurry" + cli_args = "--stopwords the in a hurry -- {}".format(text).split() + params = self.get_params_from_cli(*cli_args) + self.assertEqual(params['text'], text) + self.assertEqual(params['stopwords'], ['the', 'in', 'a', 'hurry']) + + +if __name__ == '__main__': # pragma: nocover unittest.main() diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..0c16f5e --- /dev/null +++ b/tox.ini @@ -0,0 +1,69 @@ +[tox] +env_list = + coverage-erase + py{3.10, 3.11, 3.12, 3.13, 3.14}-{unidecode, text_unidecode} + pypy{3.11}-{unidecode, text_unidecode} + coverage-report + coverage-html + mypy + pycodestyle + +[testenv] +depends = + py{3.10, 3.11, 3.12, 3.13, 3.14}-{unidecode, text_unidecode}: coverage-erase + pypy{3.11}-{unidecode, text_unidecode}: coverage-erase +deps = + coverage[toml] + pytest + unidecode: pip + unidecode: unidecode +commands_pre: + # If testing unidecode, ensure text_unidecode is unavailable. 
+ unidecode: pip uninstall --yes text_unidecode +commands = + coverage run -m pytest test.py + +[testenv:coverage_base] +deps = + coverage[toml] + +[testenv:coverage-erase] +base = coverage_base +commands = + coverage erase + +[testenv:coverage-report] +base = coverage_base +depends = + py{3.10, 3.11, 3.12, 3.13, 3.14}-{unidecode, text_unidecode} + pypy{3.11}-{unidecode, text_unidecode} +commands_pre = + - coverage combine +commands = + coverage report + +[testenv:coverage-html] +base = coverage_base +depends = + coverage-report +commands = + coverage html --fail-under=0 + +[testenv:mypy] +deps = + mypy + unidecode +commands = + mypy + +[testenv:pycodestyle] +deps = + pycodestyle +commands = + pycodestyle --ignore=E128,E261,E225,E501,W605 slugify test.py setup.py + +[testenv:flake8] +deps = + flake8 +commands = + flake8 --ignore=E501,F403,F401,E241,E225,E128 slugify/ setup.py test.py