diff --git a/slugify/slugify.py b/slugify/slugify.py
index 9b5f27f..98d2743 100644
--- a/slugify/slugify.py
+++ b/slugify/slugify.py
@@ -4,13 +4,24 @@
 import unicodedata
 from collections.abc import Iterable
 from html.entities import name2codepoint
+from typing import NamedTuple
 
 try:
     import unidecode
 except ImportError:
     import text_unidecode as unidecode  # type: ignore[import-untyped, no-redef]
 
-__all__ = ['slugify', 'smart_truncate']
+__all__ = ['SlugifyResult', 'slugify', 'slugify_with_map', 'smart_truncate']
+
+
+class SlugifyResult(NamedTuple):
+    """Slug produced by ``slugify_with_map`` together with the recorded steps."""
+
+    # the final slug
+    slug: str
+    # one dict per stage that changed the text, in execution order, with the
+    # keys 'step', 'action', 'before' and 'after'
+    steps: list[dict[str, str]]
 
 
 CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
@@ -195,3 +206,185 @@ def slugify(
         text = text.replace(DEFAULT_SEPARATOR, separator)
 
     return text
+
+
+def slugify_with_map(
+    text: str,
+    entities: bool = True,
+    decimal: bool = True,
+    hexadecimal: bool = True,
+    max_length: int = 0,
+    word_boundary: bool = False,
+    separator: str = DEFAULT_SEPARATOR,
+    save_order: bool = False,
+    stopwords: Iterable[str] = (),
+    regex_pattern: re.Pattern[str] | str | None = None,
+    lowercase: bool = True,
+    replacements: Iterable[Iterable[str]] = (),
+    allow_unicode: bool = False,
+) -> SlugifyResult:
+    """
+    Make a slug from the given text and return transformation steps.
+
+    Every stage that changes the text is recorded as a dict with the keys
+    ``step``, ``action``, ``before`` and ``after``; stages that leave the
+    text untouched are not recorded.
+
+    :param text (str): initial text
+    :param entities (bool): converts html entities to unicode
+    :param decimal (bool): converts html decimal to unicode
+    :param hexadecimal (bool): converts html hexadecimal to unicode
+    :param max_length (int): output string length
+    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
+    :param save_order (bool): when set, does not include shorter subsequent words even if they fit
+    :param separator (str): separator between words
+    :param stopwords (iterable): words to discount
+    :param regex_pattern (str): regex pattern for disallowed characters
+    :param lowercase (bool): activate case sensitivity by setting it to False
+    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
+    :param allow_unicode (bool): allow unicode characters
+    :return (SlugifyResult): slug and transformation steps
+    """
+    steps: list[dict[str, str]] = []
+
+    def _record(step_name: str, action: str, before: str, after: str) -> None:
+        # only stages that actually changed the text are reported
+        if before != after:
+            steps.append({
+                'step': step_name,
+                'action': action,
+                'before': before,
+                'after': after,
+            })
+
+    current = text
+
+    # user-specific replacements, applied to the raw input
+    if replacements:
+        before = current
+        for old, new in replacements:
+            current = current.replace(old, new)
+        _record('replacements', 'replace', before, current)
+
+    # ensure text is unicode
+    if not isinstance(current, str):
+        before = current
+        current = str(current, 'utf-8', 'ignore')
+        _record('ensure_unicode', 'convert', before, current)
+
+    # replace QUOTE_PATTERN matches with the separator - pre-process
+    before = current
+    current = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, current)
+    _record('quotes', 'replace', before, current)
+
+    # normalize text, transliterating through unidecode unless unicode is allowed
+    before = current
+    if allow_unicode:
+        current = unicodedata.normalize('NFKC', current)
+    else:
+        current = unicodedata.normalize('NFKD', current)
+        current = unidecode.unidecode(current)
+    _record('normalize', 'transliterate', before, current)
+
+    # ensure text is still unicode
+    if not isinstance(current, str):
+        before = current
+        current = str(current, 'utf-8', 'ignore')
+        _record('ensure_unicode', 'convert', before, current)
+
+    # character entity reference
+    if entities:
+        before = current
+        current = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), current)
+        _record('entities', 'decode', before, current)
+
+    # decimal character reference; if chr()/int() rejects a value the text is
+    # left untouched, so nothing is recorded
+    if decimal:
+        before = current
+        try:
+            current = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), current)
+        except (ValueError, OverflowError):
+            pass
+        _record('decimal', 'decode', before, current)
+
+    # hexadecimal character reference, same best-effort handling as above
+    if hexadecimal:
+        before = current
+        try:
+            current = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), current)
+        except (ValueError, OverflowError):
+            pass
+        _record('hexadecimal', 'decode', before, current)
+
+    # re-normalize, the decoded references may have added new characters
+    before = current
+    if allow_unicode:
+        current = unicodedata.normalize('NFKC', current)
+    else:
+        current = unicodedata.normalize('NFKD', current)
+    _record('renormalize', 'normalize', before, current)
+
+    # make the text lowercase (optional)
+    if lowercase:
+        before = current
+        current = current.lower()
+        _record('lowercase', 'transform', before, current)
+
+    # drop QUOTE_PATTERN matches that survived - post-process
+    before = current
+    current = QUOTE_PATTERN.sub('', current)
+    _record('remove_quotes', 'delete', before, current)
+
+    # drop NUMBERS_PATTERN matches
+    before = current
+    current = NUMBERS_PATTERN.sub('', current)
+    _record('clean_numbers', 'clean', before, current)
+
+    # replace all other unwanted characters; a caller-supplied pattern wins
+    if allow_unicode:
+        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
+    else:
+        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
+
+    before = current
+    current = re.sub(pattern, DEFAULT_SEPARATOR, current)
+    _record('clean_chars', 'clean', before, current)
+
+    # replace DUPLICATE_DASH_PATTERN matches and trim separators from both ends
+    before = current
+    current = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, current).strip(DEFAULT_SEPARATOR)
+    _record('clean_duplicates', 'clean', before, current)
+
+    # remove stopwords
+    if stopwords:
+        before = current
+        # build the lookup once instead of scanning a list for every word
+        if lowercase:
+            excluded = {s.lower() for s in stopwords}
+        else:
+            excluded = set(stopwords)
+        words = [w for w in current.split(DEFAULT_SEPARATOR) if w not in excluded]
+        current = DEFAULT_SEPARATOR.join(words)
+        _record('stopwords', 'delete', before, current)
+
+    # finalize user-specific replacements on the cleaned text
+    if replacements:
+        before = current
+        for old, new in replacements:
+            current = current.replace(old, new)
+        _record('final_replacements', 'replace', before, current)
+
+    # smart truncate if requested
+    if max_length > 0:
+        before = current
+        current = smart_truncate(current, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
+        _record('truncate', 'truncate', before, current)
+
+    # swap in the caller's separator last, everything above works on the default
+    if separator != DEFAULT_SEPARATOR:
+        before = current
+        current = current.replace(DEFAULT_SEPARATOR, separator)
+        _record('separator', 'replace', before, current)
+
+    return SlugifyResult(slug=current, steps=steps)
diff --git a/test.py b/test.py
index fcec4b6..f223083 100644
--- a/test.py
+++ b/test.py
@@ -5,7 +5,7 @@ from contextlib import contextmanager
 
 from slugify import PRE_TRANSLATIONS
-from slugify import slugify
+from slugify import slugify, slugify_with_map
 from slugify import smart_truncate
 from slugify.__main__ import slugify_params, parse_args
 
 
@@ -653,5 +653,106 @@ def test_multivalued_options_with_text(self):
         self.assertEqual(params['stopwords'], ['the', 'in', 'a', 'hurry'])
 
 
+class TestSlugifyWithMap(unittest.TestCase):
+    """slugify_with_map must return slugify's slug and record the changing steps."""
+
+    def test_slug_compatibility(self):
+        test_cases = [
+            ("This is a test ---", {}),
+            ("影師嗎", {}),
+            ("C'est déjà l'été.", {}),
+            ("10 | 20 %", {'replacements': [['|', 'or'], ['%', 'percent']]}),
+            ("this has a stopword", {'stopwords': ['stopword']}),
+            ("foo &amp; bar", {}),
+            ("Foo A FOO B foo C", {'stopwords': ['foo']}),
+            ("jaja---lol-méméméoo--a", {'max_length': 15, 'word_boundary': True}),
+        ]
+        for text, kwargs in test_cases:
+            with self.subTest(text=text, kwargs=kwargs):
+                self.assertEqual(
+                    slugify_with_map(text, **kwargs).slug,
+                    slugify(text, **kwargs)
+                )
+
+    def test_slug_compatibility_unicode(self):
+        test_cases = [
+            ("影師嗎", {'allow_unicode': True}),
+            ("C'est déjà l'été.", {'allow_unicode': True}),
+            ("Компьютер", {'allow_unicode': True}),
+            ("this has a Öländ", {'allow_unicode': True, 'stopwords': ['Öländ']}),
+            ("foo &amp; bår", {'allow_unicode': True, 'entities': False}),
+        ]
+        for text, kwargs in test_cases:
+            with self.subTest(text=text, kwargs=kwargs):
+                self.assertEqual(
+                    slugify_with_map(text, **kwargs).slug,
+                    slugify(text, **kwargs)
+                )
+
+    def test_replacements_steps(self):
+        result = slugify_with_map('10 | 20 %', replacements=[['|', 'or'], ['%', 'percent']])
+        steps = [s['step'] for s in result.steps]
+        self.assertIn('replacements', steps)
+        replacement_step = next(s for s in result.steps if s['step'] == 'replacements')
+        self.assertEqual(replacement_step['action'], 'replace')
+        self.assertIn('|', replacement_step['before'])
+        self.assertIn('%', replacement_step['before'])
+        self.assertIn('or', replacement_step['after'])
+        self.assertIn('percent', replacement_step['after'])
+
+    def test_stopwords_steps(self):
+        result = slugify_with_map('the quick brown fox', stopwords=['the'])
+        steps = [s['step'] for s in result.steps]
+        self.assertIn('stopwords', steps)
+        stopword_step = next(s for s in result.steps if s['step'] == 'stopwords')
+        self.assertEqual(stopword_step['action'], 'delete')
+        self.assertIn('the', stopword_step['before'])
+        self.assertNotIn('the', stopword_step['after'])
+
+    def test_html_entities_steps(self):
+        result = slugify_with_map('foo &amp; bar')
+        steps = [s['step'] for s in result.steps]
+        self.assertIn('entities', steps)
+        entity_step = next(s for s in result.steps if s['step'] == 'entities')
+        self.assertEqual(entity_step['action'], 'decode')
+        self.assertIn('&amp;', entity_step['before'])
+        self.assertIn('&', entity_step['after'])
+
+    def test_allow_unicode_false(self):
+        result = slugify_with_map('影師嗎', allow_unicode=False)
+        steps = [s['step'] for s in result.steps]
+        self.assertIn('normalize', steps)
+        normalize_step = next(s for s in result.steps if s['step'] == 'normalize')
+        self.assertEqual(normalize_step['action'], 'transliterate')
+        self.assertIn('影師嗎', normalize_step['before'])
+        self.assertNotEqual(normalize_step['after'], normalize_step['before'])
+        self.assertEqual(result.slug, slugify('影師嗎', allow_unicode=False))
+
+    def test_allow_unicode_true(self):
+        result = slugify_with_map('Hello 影師嗎 WORLD', allow_unicode=True)
+        steps = [s['step'] for s in result.steps]
+        self.assertIn('lowercase', steps)
+        lowercase_step = next(s for s in result.steps if s['step'] == 'lowercase')
+        self.assertIn('WORLD', lowercase_step['before'])
+        self.assertIn('world', lowercase_step['after'])
+
+    def test_lowercase_step(self):
+        result = slugify_with_map('HELLO WORLD', lowercase=True)
+        steps = [s['step'] for s in result.steps]
+        self.assertIn('lowercase', steps)
+        lowercase_step = next(s for s in result.steps if s['step'] == 'lowercase')
+        self.assertEqual(lowercase_step['action'], 'transform')
+        self.assertIn('HELLO', lowercase_step['before'])
+        self.assertIn('hello', lowercase_step['after'])
+
+    def test_clean_chars_step(self):
+        result = slugify_with_map('This -- is a ## test ---')
+        steps = [s['step'] for s in result.steps]
+        self.assertIn('clean_chars', steps)
+        clean_step = next(s for s in result.steps if s['step'] == 'clean_chars')
+        self.assertEqual(clean_step['action'], 'clean')
+        self.assertIn('##', clean_step['before'])
+
+
 if __name__ == '__main__':  # pragma: nocover
     unittest.main()