python · gaborbernat · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026
@@ -141,6 +141,14 @@ module_name
 
 * TODO
 
+html
+----
+
+* :func:`html.escape` and :func:`html.unescape` are now implemented in C, with
+  the pure-Python versions kept as a fallback. This also speeds up
+  :class:`html.parser.HTMLParser`, which relies on :func:`html.unescape`.
+  (Contributed by Bernát Gábor in :gh:`151024`.)
+
 
 
 Removed

diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h
diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h
@@ -735,6 +735,7 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(qualname)
         STRUCT_FOR_ID(query)
         STRUCT_FOR_ID(queuetype)
+        STRUCT_FOR_ID(quote)
         STRUCT_FOR_ID(quotetabs)
         STRUCT_FOR_ID(raw)
         STRUCT_FOR_ID(read)

diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h
diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h
@@ -130,3 +130,11 @@ def unescape(s):
     if '&' not in s:
         return s
     return _charref.sub(_replace_charref, s)
+
+
+try:
+    # Prefer the C accelerator; fall back to the pure-Python versions above on
+    # implementations that do not ship it (see PEP 399).
+    from _html import escape, unescape
+except ImportError:
+    pass
@@ -2,29 +2,81 @@
 Tests for the html module functions.
 """
 
-import html
 import unittest
+from test.support import import_helper
 
+py_html = import_helper.import_fresh_module('html', blocked=['_html'])
+c_html = import_helper.import_fresh_module('html', fresh=['_html'])
+
+
+class HtmlTestsMixin:
+    # Subclasses set ``html`` to the pure-Python or C-accelerated module.
+    html = None
 
-class HtmlTests(unittest.TestCase):
     def test_escape(self):
+        escape = self.html.escape
         self.assertEqual(
-            html.escape('\'<script>"&foo;"</script>\''),
+            escape('\'<script>"&foo;"</script>\''),
             '&#x27;&lt;script&gt;&quot;&amp;foo;&quot;&lt;/script&gt;&#x27;')
         self.assertEqual(
-            html.escape('\'<script>"&foo;"</script>\'', False),
+            escape('\'<script>"&foo;"</script>\'', False),
             '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')
 
+    def test_escape_quote_flag(self):
+        escape = self.html.escape
+        self.assertEqual(escape('"\'', quote=True), '&quot;&#x27;')
+        self.assertEqual(escape('"\''), '&quot;&#x27;')
+        self.assertEqual(escape('"\'', quote=False), '"\'')
+        self.assertEqual(escape('"\'', False), '"\'')
+
+    def test_escape_no_specials_returned_unchanged(self):
+        for s in ['', 'a', 'plain text', 'x' * 100, 'caf\xe9 r\xe9sum\xe9',
+                  '☃ snowman', '\U0001F600 emoji']:
+            self.assertEqual(self.html.escape(s), s)
+
+    def test_escape_specials_at_every_offset(self):
+        # Exercise the word-at-a-time (SWAR) scan boundaries and tail loop by
+        # placing each special at every offset of a run crossing 8-byte words.
+        escape = self.html.escape
+        specials = {'&': '&amp;', '<': '&lt;', '>': '&gt;',
+                    '"': '&quot;', "'": '&#x27;'}
+        for ch, rep in specials.items():
+            for pad in range(0, 20):
+                s = 'a' * pad + ch + 'b' * pad
+                self.assertEqual(escape(s), 'a' * pad + rep + 'b' * pad)
+
+    def test_escape_adjacent_specials(self):
+        self.assertEqual(self.html.escape('&<>"\'' * 5),
+                         '&amp;&lt;&gt;&quot;&#x27;' * 5)
+
+    def test_escape_multiple_kinds(self):
+        escape = self.html.escape
+        # 2-byte (UCS-2) and 4-byte (UCS-4) strings still escape ASCII specials.
+        self.assertEqual(escape('☃ <b> & </b>'),
+                         '☃ &lt;b&gt; &amp; &lt;/b&gt;')
+        self.assertEqual(escape('\U0001F600<&>"\''),
+                         '\U0001F600&lt;&amp;&gt;&quot;&#x27;')
+        # Latin-1 high bytes must not be matched by the byte-wise scan.
+        self.assertEqual(escape('\xe9\xff & \xe9'), '\xe9\xff &amp; \xe9')
+
+    def test_escape_str_subclass_returns_true_str(self):
+        class S(str):
+            pass
+        for s in ['no specials', 'a & b']:
+            result = self.html.escape(S(s))
+            self.assertEqual(result, self.html.escape(s))
+            self.assertIs(type(result), str)
+
     def test_unescape(self):
         numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
         errmsg = 'unescape(%r) should have returned %r'
         def check(text, expected):
-            self.assertEqual(html.unescape(text), expected,
+            self.assertEqual(self.html.unescape(text), expected,
                              msg=errmsg % (text, expected))
         def check_num(num, expected):
             for format in numeric_formats:
                 text = format % num
-                self.assertEqual(html.unescape(text), expected,
+                self.assertEqual(self.html.unescape(text), expected,
                                  msg=errmsg % (text, expected))
         # check text with no character references
         check('no character references', 'no character references')
@@ -42,25 +94,25 @@ def check_num(num, expected):
                    '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
                    '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
         for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
-                             ['A', 'a', '"', '&', '\u2603', '\U00101234']):
+                             ['A', 'a', '"', '&', '☃', '\U00101234']):
             for s in formats:
                 check(s % num, char)
                 for end in [' ', 'X']:
                     check((s+end) % num, char+end)
         # check invalid code points
         for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
             check_num(cp, '\uFFFD')
         # check more invalid code points
         for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
             check_num(cp, '')
         # check invalid numbers
-        for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
+        for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r€•\x9d'):
             check_num(num, ch)
         # check small numbers
         check_num(0, '\uFFFD')
         check_num(9, '\t')
         # check a big number
         check_num(1000000000000000000, '\uFFFD')
         # check that multiple trailing semicolons are handled correctly
         for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
             check(e, '";')
@@ -89,7 +141,7 @@ def check_num(num, expected):
         # longest valid name
         check('&CounterClockwiseContourIntegral;', '∳')
         # check a charref that maps to two unicode chars
         check('&acE;', '\u223E\u0333')
         check('&acE', '&acE')
         # see #12888
         check('&#123; ' * 1050, '{ ' * 1050)
@@ -98,6 +150,38 @@ def check_num(num, expected):
               'ÉricÉric&alphacentauriαcentauri')
         check('&co;', '&co;')
 
+    def test_unescape_multiple_kinds(self):
+        unescape = self.html.unescape
+        # references embedded in 2-byte and 4-byte strings
+        self.assertEqual(unescape('☃ &amp; &#62; &copy; x'),
+                         '☃ & > \xa9 x')
+        self.assertEqual(unescape('\U0001F600&amp;&#x41;&notin;'),
+                         '\U0001F600&A∉')
+
+    def test_unescape_long_text_with_sparse_refs(self):
+        # exercise the bulk substring copy between references
+        unescape = self.html.unescape
+        s = 'x' * 5000 + '&amp;' + 'y' * 5000
+        self.assertEqual(unescape(s), 'x' * 5000 + '&' + 'y' * 5000)
+        self.assertEqual(unescape('a' * 5000), 'a' * 5000)
+
+    def test_unescape_str_subclass(self):
+        class S(str):
+            pass
+        self.assertEqual(self.html.unescape(S('no refs')), 'no refs')
+        self.assertEqual(self.html.unescape(S('a &amp; b')), 'a & b')
+
+
+class PyHtmlTests(HtmlTestsMixin, unittest.TestCase):
+    html = py_html
+
+
+@unittest.skipUnless(
+    c_html is not None and getattr(c_html.escape, '__module__', None) == '_html',
+    'requires the _html C accelerator')
+class CHtmlTests(HtmlTestsMixin, unittest.TestCase):
+    html = c_html
+
 
 if __name__ == '__main__':
     unittest.main()
@@ -1848,6 +1848,13 @@ regen-limited-abi: all
 regen-unicodedata:
 	$(PYTHON_FOR_REGEN) $(srcdir)/Tools/unicode/makeunicodedata.py
 
+.PHONY: regen-html
+regen-html:
+	# Regenerate Modules/html_entities.h from the html module data
+	# using Tools/build/generate_html_entities.py
+	$(PYTHON_FOR_REGEN) $(srcdir)/Tools/build/generate_html_entities.py \
+		$(srcdir)/Modules/html_entities.h
+
 
 ############################################################################
 # Regenerate all generated files
@@ -1860,7 +1867,7 @@ regen-all: regen-cases regen-slots \
 	regen-test-levenshtein regen-global-objects
 	@echo
 	@echo "Note: make regen-stdlib-module-names, make regen-limited-abi, "
-	@echo "make regen-configure, make regen-sbom, and make regen-unicodedata should be run manually"
+	@echo "make regen-configure, make regen-sbom, make regen-html, and make regen-unicodedata should be run manually"
 
 ############################################################################
 # Special rules for object files
@@ -3422,6 +3429,7 @@ MODULE_DEPS_SHARED=@MODULE_DEPS_SHARED@
 MODULE__CURSES_DEPS=$(srcdir)/Include/py_curses.h
 MODULE__CURSES_PANEL_DEPS=$(srcdir)/Include/py_curses.h
 MODULE__DATETIME_DEPS=$(srcdir)/Include/datetime.h
+MODULE__HTML_DEPS=$(srcdir)/Modules/html_entities.h
 MODULE_CMATH_DEPS=$(srcdir)/Modules/_math.h
 MODULE_MATH_DEPS=$(srcdir)/Modules/_math.h
 MODULE_PYEXPAT_DEPS=@LIBEXPAT_INTERNAL@

diff --git a/Misc/NEWS.d/next/Library/2026-06-06-15-31-29.gh-issue-151024.pzZAnr.rst b/Misc/NEWS.d/next/Library/2026-06-06-15-31-29.gh-issue-151024.pzZAnr.rst
@@ -0,0 +1,3 @@
+Add a C accelerator for :func:`html.escape` and :func:`html.unescape`, with the
+pure-Python implementations kept as a fallback. This also speeds up
+:class:`html.parser.HTMLParser`, which uses :func:`html.unescape`.
@@ -35,6 +35,7 @@
 @MODULE__BISECT_TRUE@_bisect _bisectmodule.c
 @MODULE__CSV_TRUE@_csv _csv.c
 @MODULE__HEAPQ_TRUE@_heapq _heapqmodule.c
+@MODULE__HTML_TRUE@_html _htmlmodule.c
 @MODULE__JSON_TRUE@_json _json.c
 @MODULE__LSPROF_TRUE@_lsprof _lsprof.c rotatingtree.c
 @MODULE__MATH_INTEGER_TRUE@_math_integer mathintegermodule.c