Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 175 additions & 22 deletions slugify/slugify.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,22 @@
import re
import unicodedata
from collections.abc import Iterable
from dataclasses import dataclass
from html.entities import name2codepoint
from typing import Any

try:
import unidecode
except ImportError:
import text_unidecode as unidecode # type: ignore[import-untyped, no-redef]

# Names exported by ``from <module> import *``; only the last assignment to
# ``__all__`` takes effect, so a single definition is kept.
__all__ = ['slugify', 'slugify_with_map', 'smart_truncate', 'SlugWithMap']


@dataclass
class SlugWithMap:
    """Pair a generated slug with the list of recorded transformation steps."""

    # The final slug text.
    slug: str
    # One dictionary per recorded step, in the order the steps were recorded.
    steps: list[dict[str, Any]]


# Matches a named HTML character reference such as ``&amp;``; group 1 is the
# entity name, taken from the keys of ``html.entities.name2codepoint``.
CHAR_ENTITY_PATTERN = re.compile(rf"&({'|'.join(name2codepoint)});")
Expand Down Expand Up @@ -72,7 +80,7 @@ def smart_truncate(
return truncated.strip(separator)


def slugify(
def _slugify_core(
    text: str,
    entities: bool = True,
    decimal: bool = True,
    hexadecimal: bool = True,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = DEFAULT_SEPARATOR,
    save_order: bool = False,
    stopwords: Iterable[str] = (),
    regex_pattern: re.Pattern[str] | str | None = None,
    lowercase: bool = True,
    replacements: Iterable[Iterable[str]] = (),
    allow_unicode: bool = False,
    track_steps: bool = False,
) -> tuple[str, list[dict[str, Any]]]:
    """
    Make a slug from the given text and optionally record each transformation.
    :param text (str): initial text
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): when set, does not include shorter subsequent words even if they fit
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for disallowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :param track_steps (bool): when set, append one dict per transformation to the returned list
    :return (tuple): the slug and the list of recorded steps (empty unless track_steps is set)
    """
    steps: list[dict[str, Any]] = []

    def _record(step_type: str, **kwargs: Any) -> None:
        # Unconditional event (still a no-op when tracking is disabled).
        if track_steps:
            steps.append({'type': step_type, **kwargs})

    def _record_change(step_type: str, before: Any, after: str, **details: Any) -> None:
        # Record a stage only when tracking is on AND the stage altered the text;
        # checking track_steps first avoids the string comparison otherwise.
        if track_steps and after != before:
            steps.append({'type': step_type, **details, 'before': before, 'after': after})

    # The rules are applied twice (before and after the cleanup stages), so
    # materialise them once: a one-shot iterator would otherwise be empty on
    # the second pass. A falsy value (e.g. None or ()) means "no rules".
    replacement_rules = [(old, new) for old, new in replacements] if replacements else []

    # user-specific replacements (pre)
    for old, new in replacement_rules:
        replaced = text.replace(old, new)
        _record_change('replace', text, replaced, old=old, new=new)
        text = replaced

    # ensure text is unicode
    if not isinstance(text, str):
        raw = text
        text = str(text, 'utf-8', 'ignore')
        _record('normalize', before=raw, after=text)

    # replace quotes with dashes - pre-process
    prev_text = text
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
    _record_change('replace', prev_text, text, pattern='quotes')

    # normalize text, convert to unicode if required
    prev_text = text
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        text = unicodedata.normalize('NFKD', text)
        text = unidecode.unidecode(text)
    _record_change('normalize', prev_text, text, mode='unicode')

    # ensure text is still in unicode
    if not isinstance(text, str):
        raw = text
        text = str(text, 'utf-8', 'ignore')
        _record('normalize', before=raw, after=text)

    # character entity reference
    if entities:
        prev_text = text
        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)
        _record_change('entity', prev_text, text, mode='named')

    # decimal character reference
    if decimal:
        # Best effort: if any reference cannot be converted, the text is left
        # untouched for this stage rather than failing the whole call.
        try:
            converted = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
        except Exception:
            pass
        else:
            _record_change('entity', text, converted, mode='decimal')
            text = converted

    # hexadecimal character reference
    if hexadecimal:
        # Same best-effort policy as the decimal stage above.
        try:
            converted = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
        except Exception:
            pass
        else:
            _record_change('entity', text, converted, mode='hexadecimal')
            text = converted

    # re normalize text
    prev_text = text
    text = unicodedata.normalize('NFKC' if allow_unicode else 'NFKD', text)
    _record_change('normalize', prev_text, text, mode='final')

    # make the text lowercase (optional)
    if lowercase:
        prev_text = text
        text = text.lower()
        _record_change('lowercase', prev_text, text)

    # remove generated quotes -- post-process
    prev_text = text
    text = QUOTE_PATTERN.sub('', text)
    _record_change('delete', prev_text, text, pattern='quotes')

    # cleanup numbers
    prev_text = text
    text = NUMBERS_PATTERN.sub('', text)
    _record_change('cleanup', prev_text, text, pattern='numbers')

    # replace all other unwanted characters
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

    prev_text = text
    text = re.sub(pattern, DEFAULT_SEPARATOR, text)
    _record_change('regex_cleanup', prev_text, text)

    # remove redundant
    prev_text = text
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)
    _record_change('deduplicate', prev_text, text, pattern='dashes')

    # remove stopwords
    if stopwords:
        prev_text = text
        # Materialise once: membership tests against a generator would consume
        # it, and a set gives constant-time lookups per word.
        if lowercase:
            stopword_set = frozenset(s.lower() for s in stopwords)
        else:
            stopword_set = frozenset(stopwords)
        kept: list[str] = []
        removed: list[str] = []
        for word in text.split(DEFAULT_SEPARATOR):
            (removed if word in stopword_set else kept).append(word)
        text = DEFAULT_SEPARATOR.join(kept)
        if removed:
            _record('stopwords', removed=removed, before=prev_text, after=text)

    # finalize user-specific replacements
    for old, new in replacement_rules:
        replaced = text.replace(old, new)
        _record_change('replace', text, replaced, old=old, new=new, phase='final')
        text = replaced

    # smart truncate if requested
    if max_length > 0:
        prev_text = text
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
        _record_change('truncate', prev_text, text)

    # swap in the caller's separator last, so every stage above can rely on
    # DEFAULT_SEPARATOR as the word delimiter
    if separator != DEFAULT_SEPARATOR:
        prev_text = text
        text = text.replace(DEFAULT_SEPARATOR, separator)
        _record_change('separator', prev_text, text, old=DEFAULT_SEPARATOR, new=separator)

    return text, steps


return text
def slugify(
    text: str,
    entities: bool = True,
    decimal: bool = True,
    hexadecimal: bool = True,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = DEFAULT_SEPARATOR,
    save_order: bool = False,
    stopwords: Iterable[str] = (),
    regex_pattern: re.Pattern[str] | str | None = None,
    lowercase: bool = True,
    replacements: Iterable[Iterable[str]] = (),
    allow_unicode: bool = False,
) -> str:
    """
    Build a slug from the given text.
    :param text (str): text to turn into a slug
    :param entities (bool): convert named html entities to unicode
    :param decimal (bool): convert html decimal references to unicode
    :param hexadecimal (bool): convert html hexadecimal references to unicode
    :param max_length (int): length of the output string
    :param word_boundary (bool): truncate to a complete word even if the result is shorter than max_length
    :param save_order (bool): when set, shorter subsequent words are not included even if they fit
    :param separator (str): separator placed between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for disallowed characters
    :param lowercase (bool): set to False to keep the text case sensitive
    :param replacements (iterable): replacement rules, e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :return (str): the slug
    """
    # The core returns (slug, steps); step tracking is off, so only the slug
    # (element 0) is of interest here.
    return _slugify_core(
        text=text,
        entities=entities,
        decimal=decimal,
        hexadecimal=hexadecimal,
        max_length=max_length,
        word_boundary=word_boundary,
        separator=separator,
        save_order=save_order,
        stopwords=stopwords,
        regex_pattern=regex_pattern,
        lowercase=lowercase,
        replacements=replacements,
        allow_unicode=allow_unicode,
        track_steps=False,
    )[0]


def slugify_with_map(
    text: str,
    entities: bool = True,
    decimal: bool = True,
    hexadecimal: bool = True,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = DEFAULT_SEPARATOR,
    save_order: bool = False,
    stopwords: Iterable[str] = (),
    regex_pattern: re.Pattern[str] | str | None = None,
    lowercase: bool = True,
    replacements: Iterable[Iterable[str]] = (),
    allow_unicode: bool = False,
) -> SlugWithMap:
    """
    Build a slug from the given text and report the transformation steps.
    :param text (str): text to turn into a slug
    :param entities (bool): convert named html entities to unicode
    :param decimal (bool): convert html decimal references to unicode
    :param hexadecimal (bool): convert html hexadecimal references to unicode
    :param max_length (int): length of the output string
    :param word_boundary (bool): truncate to a complete word even if the result is shorter than max_length
    :param save_order (bool): when set, shorter subsequent words are not included even if they fit
    :param separator (str): separator placed between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for disallowed characters
    :param lowercase (bool): set to False to keep the text case sensitive
    :param replacements (iterable): replacement rules, e.g. [['|', 'or'], ['%', 'percent']]
    :param allow_unicode (bool): allow unicode characters
    :return (SlugWithMap): dataclass holding slug (str) and steps (list)
    """
    # Same pipeline as slugify(), but with step tracking switched on.
    slug, recorded_steps = _slugify_core(
        text=text,
        entities=entities,
        decimal=decimal,
        hexadecimal=hexadecimal,
        max_length=max_length,
        word_boundary=word_boundary,
        separator=separator,
        save_order=save_order,
        stopwords=stopwords,
        regex_pattern=regex_pattern,
        lowercase=lowercase,
        replacements=replacements,
        allow_unicode=allow_unicode,
        track_steps=True,
    )
    return SlugWithMap(slug, recorded_steps)
Loading