Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion slugify/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
help="""Additional replacement rules e.g. "|->or", "%%->percent".""")
parser.add_argument("--allow-unicode", action='store_true', default=False,
help="Allow unicode characters")
parser.add_argument("--protected-words", nargs='+',
help="Words to always preserve in final slug")

args = parser.parse_args(argv[1:])

Expand Down Expand Up @@ -78,7 +80,8 @@ def slugify_params(args: argparse.Namespace) -> dict[str, Any]:
stopwords=args.stopwords,
lowercase=args.lowercase,
replacements=args.replacements,
allow_unicode=args.allow_unicode
allow_unicode=args.allow_unicode,
protected_words=args.protected_words,
)


Expand Down
84 changes: 79 additions & 5 deletions slugify/slugify.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def slugify(
lowercase: bool = True,
replacements: Iterable[Iterable[str]] = (),
allow_unicode: bool = False,
protected_words: Iterable[str] | None = None,
) -> str:
"""
Make a slug from the given text.
Expand All @@ -102,6 +103,7 @@ def slugify(
:param lowercase (bool): activate case sensitivity by setting it to False
:param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
:param allow_unicode (bool): allow unicode characters
:param protected_words (iterable): words to always preserve in final slug
:return (str):
"""

Expand Down Expand Up @@ -173,23 +175,95 @@ def slugify(
# remove redundant
text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

# Canonicalize each protected word through the same pipeline as the main
# text, so later membership checks compare like with like.
def _slugify_protected(w: str) -> str:
    """Slugify a single protected word using the enclosing call's settings.

    NOTE(review): relies on closure variables (entities, decimal,
    hexadecimal, regex_pattern, lowercase, replacements, allow_unicode)
    from the enclosing slugify() invocation — keep this argument list in
    sync with the outer signature.
    """
    return slugify(
        w,
        entities=entities,
        decimal=decimal,
        hexadecimal=hexadecimal,
        max_length=0,                  # never truncate the protected word itself
        separator=DEFAULT_SEPARATOR,
        stopwords=(),                  # protected words must survive stopword removal
        regex_pattern=regex_pattern,
        lowercase=lowercase,
        replacements=replacements,
        allow_unicode=allow_unicode,
        protected_words=None,          # prevent infinite recursion
    )

protected_words_set: set[str] = set()
protected_slugs: list[str] = []
if protected_words:
seen = set()
for w in protected_words:
slug = _slugify_protected(w)
if slug and slug not in seen:
seen.add(slug)
protected_slugs.append(slug)
protected_words_set = set(protected_slugs)

# remove stopwords
if stopwords:
words = text.split(DEFAULT_SEPARATOR)
if lowercase:
stopwords_lower = [s.lower() for s in stopwords]
words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
stopwords_lower = {s.lower() for s in stopwords}
words = [w for w in words if w not in stopwords_lower]
else:
words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
stopwords_set = set(stopwords)
words = [w for w in words if w not in stopwords_set]
if not words:
words = text.split(DEFAULT_SEPARATOR)[:1]
text = DEFAULT_SEPARATOR.join(words)

# ensure protected words are included (multi-token slugs match as substring)
if protected_slugs:
words = text.split(DEFAULT_SEPARATOR)
for protected in protected_slugs:
if DEFAULT_SEPARATOR in protected:
if protected not in text:
words.extend(protected.split(DEFAULT_SEPARATOR))
else:
if protected not in words:
words.append(protected)
text = DEFAULT_SEPARATOR.join(w for w in words if w)

# finalize user-specific replacements
if replacements:
for old, new in replacements:
text = text.replace(old, new)

# smart truncate if requested
# smart truncate if requested, with protected words priority
if max_length > 0:
text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
if protected_slugs:
protected_full_str = DEFAULT_SEPARATOR.join(protected_slugs)
if len(protected_full_str) >= max_length:
text = protected_full_str
else:
remaining = max_length - len(protected_full_str) - (1 if protected_slugs else 0)
if remaining > 0:
text_words = text.split(DEFAULT_SEPARATOR)
protected_words_set = set()
for slug in protected_slugs:
for w in slug.split(DEFAULT_SEPARATOR):
protected_words_set.add(w)
other_words = [w for w in text_words if w not in protected_words_set]
other_truncated = smart_truncate(
DEFAULT_SEPARATOR.join(other_words),
remaining,
word_boundary,
DEFAULT_SEPARATOR,
save_order
)
if other_truncated:
text = DEFAULT_SEPARATOR.join([other_truncated, protected_full_str]).strip(DEFAULT_SEPARATOR)
else:
text = protected_full_str
else:
text = protected_full_str
else:
text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
text = text.strip(DEFAULT_SEPARATOR)

if separator != DEFAULT_SEPARATOR:
text = text.replace(DEFAULT_SEPARATOR, separator)
Expand Down
76 changes: 75 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,79 @@ def test_emojis(self):
r = slugify(txt, allow_unicode=True, regex_pattern=r'[^🦄]+')
self.assertEqual(r, "🦄")

def test_max_length_basic_truncate(self):
    """Plain truncation cuts mid-word when no word boundary is requested."""
    source = 'hello world this is a long sentence'
    result = slugify(source, max_length=15)
    self.assertEqual(result, "hello-world-thi")
    self.assertLessEqual(len(result), 15)

def test_max_length_word_boundary_no_trailing_separator(self):
    """Word-boundary truncation drops the partial word and any trailing dash."""
    source = 'hello world foo bar baz'
    result = slugify(source, max_length=14, word_boundary=True)
    self.assertEqual(result, "hello-world")
    self.assertFalse(result.endswith('-'))
    self.assertLessEqual(len(result), 14)

def test_stopwords_normal_removal(self):
    """Stopwords are dropped from the slug while word order is preserved."""
    source = 'the quick brown fox jumps over a lazy dog'
    result = slugify(source, stopwords=["the", "a"])
    self.assertEqual(result, "quick-brown-fox-jumps-over-lazy-dog")
    self.assertNotIn("the-", result)
    self.assertNotIn("-a-", result)

def test_stopwords_avoid_empty_result(self):
    """When every word is a stopword, the first word is kept as a fallback."""
    source = 'the a an'
    result = slugify(source, stopwords=["the", "a", "an"])
    self.assertEqual(result, "the")
    self.assertNotEqual(result, "")

def test_protected_words_forced_presence(self):
    """A protected word is forced into the slug even when absent from the input."""
    result = slugify('hello world example text', protected_words=["python"])
    self.assertIn("python", result)

def test_protected_words_even_when_too_long(self):
    """Protected words survive even when they alone exceed max_length."""
    result = slugify('short text', max_length=5, protected_words=["python"])
    self.assertIn("python", result)
    self.assertEqual(result, "python")

def test_protected_words_beats_stopwords(self):
    """Protection wins when the same word is also listed as a stopword."""
    result = slugify(
        'the python code is great',
        stopwords=["the", "python", "is"],
        protected_words=["python"],
    )
    self.assertIn("python", result)
    self.assertNotIn("the-", result)
    self.assertNotIn("-is-", result)

def test_backward_compatibility_no_new_params(self):
    """Omitting the new parameters keeps the historical behaviour intact."""
    self.assertEqual(slugify("This is a test ---"), "this-is-a-test")
    self.assertEqual(slugify('jaja---lol-méméméoo--a', max_length=9), "jaja-lol")

def test_combination_all_params(self):
    """max_length, stopwords and protected_words cooperate in a single call."""
    result = slugify(
        'the python guide for beginners in 2025 is awesome',
        max_length=30,
        stopwords=["a", "the", "for", "in", "is"],
        protected_words=["python"],
    )
    self.assertIn("python", result)
    self.assertLessEqual(len(result), 30)
    self.assertNotIn("the-", result)
    self.assertNotIn("-for-", result)

def test_protected_words_multi_token_chinese(self):
    """A protected word that transliterates to several tokens stays intact."""
    result = slugify('影師嗎 教程', protected_words=['影師嗎'], allow_unicode=False)
    self.assertEqual(result, "ying-shi-ma-jiao-cheng")
    self.assertNotIn("YingShiMa", result)
    self.assertIn("ying-shi-ma", result)


class TestUtils(unittest.TestCase):

Expand Down Expand Up @@ -575,7 +648,8 @@ class TestCommandParams(unittest.TestCase):
'separator': '-',
'stopwords': None,
'lowercase': True,
'replacements': None
'replacements': None,
'protected_words': None,
}

def get_params_from_cli(self, *argv):
Expand Down