# ---------------------------------------------------------------------------
# Reconstructed from a whitespace-mangled `git format-patch` email.
# Original patch metadata (preserved verbatim):
#   From c643ee3925d8c7842a58058e0f8492c7aae02b4a Mon Sep 17 00:00:00 2001
#   From: "Vincent.Y.K.JIANG"
#   Date: Wed, 1 Apr 2026 15:09:39 +0800
#   Subject: [PATCH] Add slugify_with_map function and corresponding tests
#
#   This commit introduces the `slugify_with_map` function to enhance slug
#   generation with detailed transformation steps. It includes support for
#   replacements, stopwords, and unicode handling. Additionally,
#   comprehensive unit tests have been added to ensure compatibility and
#   correctness of the new functionality, comparing results with the
#   existing `slugify` function.
#
# The patch touches two files; their additions are rendered below as
# conventional Python.  Names such as QUOTE_PATTERN, DEFAULT_SEPARATOR,
# CHAR_ENTITY_PATTERN, DECIMAL_PATTERN, HEX_PATTERN, NUMBERS_PATTERN,
# DISALLOWED_CHARS_PATTERN, DISALLOWED_UNICODE_CHARS_PATTERN,
# DUPLICATE_DASH_PATTERN, smart_truncate, unidecode, slugify and
# name2codepoint are defined elsewhere in slugify/slugify.py / test.py.
#
# Other changes the patch makes that are context-dependent and therefore
# recorded here as notes rather than executable code:
#   * slugify/slugify.py: __all__ becomes
#       ['slugify', 'slugify_with_map', 'smart_truncate']
#   * test.py: `from slugify import slugify` becomes
#       `from slugify import slugify, slugify_with_map`
# ---------------------------------------------------------------------------

# === additions to slugify/slugify.py =======================================

from typing import NamedTuple


class SlugifyResult(NamedTuple):
    """Return value of slugify_with_map: the final slug plus the ordered
    list of transformation steps that produced it."""
    slug: str
    steps: list[dict[str, str]]


def slugify_with_map(
    text: str,
    entities: bool = True,
    decimal: bool = True,
    hexadecimal: bool = True,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = DEFAULT_SEPARATOR,
    save_order: bool = False,
    stopwords: Iterable[str] = (),
    regex_pattern: re.Pattern[str] | str | None = None,
    lowercase: bool = True,
    replacements: Iterable[Iterable[str]] = (),
    allow_unicode: bool = False,
) -> SlugifyResult:
    """
    Make a slug from the given text and return transformation steps.
    :param text (str): initial text
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): when set, does not include shorter subsequent words even if they fit
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for disallowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :return (SlugifyResult): slug and transformation steps
    """
    steps: list[dict[str, str]] = []
    original = text

    def _record(step_name: str, action: str, before: str, after: str) -> None:
        # A step is recorded only when it actually changed the text, so
        # `steps` is a sparse trace of effective transformations.
        if before != after:
            steps.append({
                'step': step_name,
                'action': action,
                'before': before,
                'after': after
            })

    current = original

    # user-supplied replacement rules, applied before any normalization
    if replacements:
        before = current
        for old, new in replacements:
            current = current.replace(old, new)
        _record('replacements', 'replace', before, current)

    # legacy ensure-unicode guard (a str can never take this branch on
    # Python 3; kept to mirror slugify's own step order exactly)
    if not isinstance(current, str):
        before = current
        current = str(current, 'utf-8', 'ignore')
        _record('ensure_unicode', 'convert', before, current)

    # replace quote-like characters with the default separator
    before = current
    current = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, current)
    _record('quotes', 'replace', before, current)

    # normalize; transliterate to ASCII unless unicode output is allowed
    before = current
    if allow_unicode:
        current = unicodedata.normalize('NFKC', current)
    else:
        current = unicodedata.normalize('NFKD', current)
        current = unidecode.unidecode(current)
    _record('normalize', 'transliterate', before, current)

    if not isinstance(current, str):
        before = current
        current = str(current, 'utf-8', 'ignore')
        _record('ensure_unicode', 'convert', before, current)

    # decode named html entities (&amp; etc.)
    if entities:
        before = current
        current = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), current)
        _record('entities', 'decode', before, current)

    # decode decimal character references (&#NNN;); best-effort, mirroring
    # slugify's silent-swallow of malformed references
    if decimal:
        try:
            before = current
            current = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), current)
            _record('decimal', 'decode', before, current)
        except Exception:
            pass

    # decode hexadecimal character references (&#xNN;)
    if hexadecimal:
        try:
            before = current
            current = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), current)
            _record('hexadecimal', 'decode', before, current)
        except Exception:
            pass

    # re-normalize after entity decoding may have introduced new characters
    before = current
    if allow_unicode:
        current = unicodedata.normalize('NFKC', current)
    else:
        current = unicodedata.normalize('NFKD', current)
    _record('renormalize', 'normalize', before, current)

    if lowercase:
        before = current
        current = current.lower()
        _record('lowercase', 'transform', before, current)

    # strip remaining quote characters entirely
    before = current
    current = QUOTE_PATTERN.sub('', current)
    _record('remove_quotes', 'delete', before, current)

    # cleanup of number-adjacent separators
    before = current
    current = NUMBERS_PATTERN.sub('', current)
    _record('clean_numbers', 'clean', before, current)

    # replace disallowed characters; an explicit regex_pattern overrides the
    # default (note: a falsy pattern such as '' also falls back, as in slugify)
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

    before = current
    current = re.sub(pattern, DEFAULT_SEPARATOR, current)
    _record('clean_chars', 'clean', before, current)

    # collapse runs of separators and trim leading/trailing ones
    before = current
    current = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, current).strip(DEFAULT_SEPARATOR)
    _record('clean_duplicates', 'clean', before, current)

    # drop stopwords; case-insensitively when lowercase is active
    if stopwords:
        before = current
        if lowercase:
            stopwords_lower = [s.lower() for s in stopwords]
            words = [w for w in current.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
        else:
            words = [w for w in current.split(DEFAULT_SEPARATOR) if w not in stopwords]
        current = DEFAULT_SEPARATOR.join(words)
        _record('stopwords', 'delete', before, current)

    # second replacement pass, mirroring slugify (rules may target text that
    # only exists after normalization)
    if replacements:
        before = current
        for old, new in replacements:
            current = current.replace(old, new)
        _record('final_replacements', 'replace', before, current)

    if max_length > 0:
        before = current
        current = smart_truncate(current, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
        _record('truncate', 'truncate', before, current)

    # swap in the caller's separator last, after all internal processing
    if separator != DEFAULT_SEPARATOR:
        before = current
        current = current.replace(DEFAULT_SEPARATOR, separator)
        _record('separator', 'replace', before, current)

    return SlugifyResult(slug=current, steps=steps)


# === additions to test.py ==================================================

class TestSlugifyWithMap(unittest.TestCase):

    def test_slug_compatibility(self):
        # slugify_with_map(...).slug must match slugify(...) byte-for-byte
        test_cases = [
            ("This is a test ---", {}),
            ("影師嗎", {}),
            ("C'est déjà l'été.", {}),
            ("10 | 20 %", {'replacements': [['|', 'or'], ['%', 'percent']]}),
            ("this has a stopword", {'stopwords': ['stopword']}),
            ("foo & bar", {}),
            ("Foo A FOO B foo C", {'stopwords': ['foo']}),
            ("jaja---lol-méméméoo--a", {'max_length': 15, 'word_boundary': True}),
        ]
        for text, kwargs in test_cases:
            with self.subTest(text=text, kwargs=kwargs):
                self.assertEqual(
                    slugify_with_map(text, **kwargs).slug,
                    slugify(text, **kwargs)
                )

    def test_slug_compatibility_unicode(self):
        test_cases = [
            ("影師嗎", {'allow_unicode': True}),
            ("C'est déjà l'été.", {'allow_unicode': True}),
            ("Компьютер", {'allow_unicode': True}),
            ("this has a Öländ", {'allow_unicode': True, 'stopwords': ['Öländ']}),
            ("foo & bår", {'allow_unicode': True, 'entities': False}),
        ]
        for text, kwargs in test_cases:
            with self.subTest(text=text, kwargs=kwargs):
                self.assertEqual(
                    slugify_with_map(text, **kwargs).slug,
                    slugify(text, **kwargs)
                )

    def test_replacements_steps(self):
        result = slugify_with_map('10 | 20 %', replacements=[['|', 'or'], ['%', 'percent']])
        steps = [s['step'] for s in result.steps]
        self.assertIn('replacements', steps)
        replacement_step = next(s for s in result.steps if s['step'] == 'replacements')
        self.assertEqual(replacement_step['action'], 'replace')
        self.assertIn('|', replacement_step['before'])
        self.assertIn('%', replacement_step['before'])
        self.assertIn('or', replacement_step['after'])
        self.assertIn('percent', replacement_step['after'])

    def test_stopwords_steps(self):
        result = slugify_with_map('the quick brown fox', stopwords=['the'])
        steps = [s['step'] for s in result.steps]
        self.assertIn('stopwords', steps)
        stopword_step = next(s for s in result.steps if s['step'] == 'stopwords')
        self.assertEqual(stopword_step['action'], 'delete')
        self.assertIn('the', stopword_step['before'])
        self.assertNotIn('the', stopword_step['after'])

    def test_html_entities_steps(self):
        # NOTE(review): the mangled patch shows the input as 'foo & bar',
        # but the 'entities' step is only recorded when decoding changes the
        # text, so the original must have been 'foo &amp; bar' (the literal
        # entity was HTML-unescaped when the patch was mangled) — confirm
        # against the upstream commit.
        result = slugify_with_map('foo &amp; bar')
        steps = [s['step'] for s in result.steps]
        self.assertIn('entities', steps)
        entity_step = next(s for s in result.steps if s['step'] == 'entities')
        self.assertEqual(entity_step['action'], 'decode')
        self.assertIn('&', entity_step['before'])
        self.assertIn('&', entity_step['after'])

    def test_allow_unicode_false(self):
        result = slugify_with_map('影師嗎', allow_unicode=False)
        steps = [s['step'] for s in result.steps]
        self.assertIn('normalize', steps)
        normalize_step = next(s for s in result.steps if s['step'] == 'normalize')
        self.assertEqual(normalize_step['action'], 'transliterate')
        self.assertIn('影師嗎', normalize_step['before'])
        self.assertNotEqual(normalize_step['after'], normalize_step['before'])
        self.assertEqual(result.slug, slugify('影師嗎', allow_unicode=False))

    def test_allow_unicode_true(self):
        result = slugify_with_map('Hello 影師嗎 WORLD', allow_unicode=True)
        steps = [s['step'] for s in result.steps]
        self.assertIn('lowercase', steps)
        lowercase_step = next(s for s in result.steps if s['step'] == 'lowercase')
        self.assertIn('WORLD', lowercase_step['before'])
        self.assertIn('world', lowercase_step['after'])

    def test_lowercase_step(self):
        result = slugify_with_map('HELLO WORLD', lowercase=True)
        steps = [s['step'] for s in result.steps]
        self.assertIn('lowercase', steps)
        lowercase_step = next(s for s in result.steps if s['step'] == 'lowercase')
        self.assertEqual(lowercase_step['action'], 'transform')
        self.assertIn('HELLO', lowercase_step['before'])
        self.assertIn('hello', lowercase_step['after'])

    def test_clean_chars_step(self):
        result = slugify_with_map('This -- is a ## test ---')
        steps = [s['step'] for s in result.steps]
        self.assertIn('clean_chars', steps)
        clean_step = next(s for s in result.steps if s['step'] == 'clean_chars')
        self.assertEqual(clean_step['action'], 'clean')
        self.assertIn('##', clean_step['before'])


if __name__ == '__main__':  # pragma: nocover
    unittest.main()