Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 164 additions & 1 deletion slugify/slugify.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,19 @@
import unicodedata
from collections.abc import Callable, Iterable
from html.entities import name2codepoint
from typing import NamedTuple

try:
import unidecode
except ImportError:
import text_unidecode as unidecode # type: ignore[import-untyped, no-redef]

__all__ = ['slugify', 'smart_truncate']
__all__ = ['slugify', 'slugify_with_map', 'smart_truncate']


class SlugifyResult(NamedTuple):
    """Return value of ``slugify_with_map``.

    ``slug`` is the final slug text; the test suite asserts it equals the
    output of ``slugify`` for the same arguments.  ``steps`` is an ordered
    record of the pipeline stages that actually changed the text; each entry
    is a dict with the keys ``'step'``, ``'action'``, ``'before'`` and
    ``'after'``.
    """

    slug: str
    steps: list[dict[str, str]]


CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
Expand Down Expand Up @@ -195,3 +201,160 @@ def slugify(
text = text.replace(DEFAULT_SEPARATOR, separator)

return text


def slugify_with_map(
    text: str,
    entities: bool = True,
    decimal: bool = True,
    hexadecimal: bool = True,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = DEFAULT_SEPARATOR,
    save_order: bool = False,
    stopwords: Iterable[str] = (),
    regex_pattern: re.Pattern[str] | str | None = None,
    lowercase: bool = True,
    replacements: Iterable[Iterable[str]] = (),
    allow_unicode: bool = False,
) -> SlugifyResult:
    """
    Make a slug from the given text and return transformation steps.

    The pipeline mirrors ``slugify`` stage for stage; every stage that
    actually changes the text is recorded as a dict with the keys
    ``'step'``, ``'action'``, ``'before'`` and ``'after'``.

    :param text (str): initial text
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param separator (str): separator between words
    :param save_order (bool): when set, does not include shorter subsequent words even if they fit
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for disallowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :return (SlugifyResult): slug and transformation steps
    """
    steps: list[dict[str, str]] = []
    current = text

    def _apply(step_name: str, action: str, transform: Callable[[str], str]) -> None:
        # Run one pipeline stage; record it only when it changed the text.
        # If ``transform`` raises, ``current`` stays unchanged and nothing
        # is recorded (callers rely on this for the decimal/hex stages).
        nonlocal current
        before = current
        current = transform(current)
        if before != current:
            steps.append({
                'step': step_name,
                'action': action,
                'before': before,
                'after': current
            })

    def _substitute(t: str) -> str:
        # Apply the caller-supplied (old, new) replacement pairs in order.
        for old, new in replacements:
            t = t.replace(old, new)
        return t

    # user-specific replacements - pre-process
    if replacements:
        _apply('replacements', 'replace', _substitute)

    # ensure text is unicode (defensive; no-op when ``text`` is already str)
    if not isinstance(current, str):
        _apply('ensure_unicode', 'convert', lambda t: str(t, 'utf-8', 'ignore'))

    # replace quotes with dashes - pre-process
    _apply('quotes', 'replace', lambda t: QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, t))

    def _normalize(t: str) -> str:
        # NFKC when unicode output is allowed, otherwise NFKD + ASCII
        # transliteration via unidecode.
        if allow_unicode:
            return unicodedata.normalize('NFKC', t)
        return unidecode.unidecode(unicodedata.normalize('NFKD', t))

    _apply('normalize', 'transliterate', _normalize)

    # ensure text is still unicode after transliteration
    if not isinstance(current, str):
        _apply('ensure_unicode', 'convert', lambda t: str(t, 'utf-8', 'ignore'))

    # named character entity references (e.g. &amp;)
    if entities:
        _apply('entities', 'decode',
               lambda t: CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), t))

    # decimal character references; invalid codepoints are silently ignored
    if decimal:
        try:
            _apply('decimal', 'decode',
                   lambda t: DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), t))
        except Exception:
            pass

    # hexadecimal character references; invalid codepoints are silently ignored
    if hexadecimal:
        try:
            _apply('hexadecimal', 'decode',
                   lambda t: HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), t))
        except Exception:
            pass

    # re-normalize after entity decoding
    _apply('renormalize', 'normalize',
           lambda t: unicodedata.normalize('NFKC' if allow_unicode else 'NFKD', t))

    # make the text lowercase (optional)
    if lowercase:
        _apply('lowercase', 'transform', lambda t: t.lower())

    # remove generated quotes - post-process
    _apply('remove_quotes', 'delete', lambda t: QUOTE_PATTERN.sub('', t))

    # cleanup numbers
    _apply('clean_numbers', 'clean', lambda t: NUMBERS_PATTERN.sub('', t))

    # replace all other disallowed characters with the default separator
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
    _apply('clean_chars', 'clean', lambda t: re.sub(pattern, DEFAULT_SEPARATOR, t))

    # collapse duplicate separators and trim the ends
    _apply('clean_duplicates', 'clean',
           lambda t: DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, t).strip(DEFAULT_SEPARATOR))

    # remove stopwords (case-insensitively when lowercase is on)
    if stopwords:
        def _drop_stopwords(t: str) -> str:
            banned = [s.lower() for s in stopwords] if lowercase else stopwords
            kept = [w for w in t.split(DEFAULT_SEPARATOR) if w not in banned]
            return DEFAULT_SEPARATOR.join(kept)

        _apply('stopwords', 'delete', _drop_stopwords)

    # finalize user-specific replacements
    if replacements:
        _apply('final_replacements', 'replace', _substitute)

    # smart truncate if requested
    if max_length > 0:
        _apply('truncate', 'truncate',
               lambda t: smart_truncate(t, max_length, word_boundary, DEFAULT_SEPARATOR, save_order))

    # swap in the requested separator last, exactly as slugify does
    if separator != DEFAULT_SEPARATOR:
        _apply('separator', 'replace', lambda t: t.replace(DEFAULT_SEPARATOR, separator))

    return SlugifyResult(slug=current, steps=steps)
102 changes: 101 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from contextlib import contextmanager

from slugify import PRE_TRANSLATIONS
from slugify import slugify
from slugify import slugify, slugify_with_map
from slugify import smart_truncate
from slugify.__main__ import slugify_params, parse_args

Expand Down Expand Up @@ -653,5 +653,105 @@ def test_multivalued_options_with_text(self):
self.assertEqual(params['stopwords'], ['the', 'in', 'a', 'hurry'])


class TestSlugifyWithMap(unittest.TestCase):
    """slugify_with_map must produce slugs identical to slugify and record
    an accurate per-stage transformation map."""

    def _get_step(self, result, step_name):
        """Return the first recorded step named *step_name*, failing if absent."""
        self.assertIn(step_name, [s['step'] for s in result.steps])
        return next(s for s in result.steps if s['step'] == step_name)

    def test_slug_compatibility(self):
        test_cases = [
            ("This is a test ---", {}),
            ("影師嗎", {}),
            ("C'est déjà l'été.", {}),
            ("10 | 20 %", {'replacements': [['|', 'or'], ['%', 'percent']]}),
            ("this has a stopword", {'stopwords': ['stopword']}),
            ("foo & bar", {}),
            ("Foo A FOO B foo C", {'stopwords': ['foo']}),
            ("jaja---lol-méméméoo--a", {'max_length': 15, 'word_boundary': True}),
        ]
        for text, kwargs in test_cases:
            with self.subTest(text=text, kwargs=kwargs):
                self.assertEqual(
                    slugify_with_map(text, **kwargs).slug,
                    slugify(text, **kwargs)
                )

    def test_slug_compatibility_unicode(self):
        test_cases = [
            ("影師嗎", {'allow_unicode': True}),
            ("C'est déjà l'été.", {'allow_unicode': True}),
            ("Компьютер", {'allow_unicode': True}),
            ("this has a Öländ", {'allow_unicode': True, 'stopwords': ['Öländ']}),
            ("foo & bår", {'allow_unicode': True, 'entities': False}),
        ]
        for text, kwargs in test_cases:
            with self.subTest(text=text, kwargs=kwargs):
                self.assertEqual(
                    slugify_with_map(text, **kwargs).slug,
                    slugify(text, **kwargs)
                )

    def test_replacements_steps(self):
        result = slugify_with_map('10 | 20 %', replacements=[['|', 'or'], ['%', 'percent']])
        replacement_step = self._get_step(result, 'replacements')
        self.assertEqual(replacement_step['action'], 'replace')
        self.assertIn('|', replacement_step['before'])
        self.assertIn('%', replacement_step['before'])
        self.assertIn('or', replacement_step['after'])
        self.assertIn('percent', replacement_step['after'])

    def test_stopwords_steps(self):
        result = slugify_with_map('the quick brown fox', stopwords=['the'])
        stopword_step = self._get_step(result, 'stopwords')
        self.assertEqual(stopword_step['action'], 'delete')
        self.assertIn('the', stopword_step['before'])
        self.assertNotIn('the', stopword_step['after'])

    def test_html_entities_steps(self):
        # The input must contain a real named entity ('&amp;'); a bare '&'
        # never matches CHAR_ENTITY_PATTERN, so the entities stage would be
        # a no-op and would not be recorded at all.
        result = slugify_with_map('foo &amp; bar')
        entity_step = self._get_step(result, 'entities')
        self.assertEqual(entity_step['action'], 'decode')
        self.assertIn('&amp;', entity_step['before'])
        self.assertIn('&', entity_step['after'])

    def test_allow_unicode_false(self):
        result = slugify_with_map('影師嗎', allow_unicode=False)
        normalize_step = self._get_step(result, 'normalize')
        self.assertEqual(normalize_step['action'], 'transliterate')
        self.assertIn('影師嗎', normalize_step['before'])
        self.assertNotEqual(normalize_step['after'], normalize_step['before'])
        self.assertEqual(result.slug, slugify('影師嗎', allow_unicode=False))

    def test_allow_unicode_true(self):
        # Even with allow_unicode, ASCII letters are still lowercased.
        result = slugify_with_map('Hello 影師嗎 WORLD', allow_unicode=True)
        lowercase_step = self._get_step(result, 'lowercase')
        self.assertIn('WORLD', lowercase_step['before'])
        self.assertIn('world', lowercase_step['after'])

    def test_lowercase_step(self):
        result = slugify_with_map('HELLO WORLD', lowercase=True)
        lowercase_step = self._get_step(result, 'lowercase')
        self.assertEqual(lowercase_step['action'], 'transform')
        self.assertIn('HELLO', lowercase_step['before'])
        self.assertIn('hello', lowercase_step['after'])

    def test_clean_chars_step(self):
        result = slugify_with_map('This -- is a ## test ---')
        clean_step = self._get_step(result, 'clean_chars')
        self.assertEqual(clean_step['action'], 'clean')
        self.assertIn('##', clean_step['before'])


# Allow running this test module directly: `python test.py`.
if __name__ == '__main__':  # pragma: nocover
    unittest.main()