Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Doc/whatsnew/3.16.rst
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,14 @@ module_name

* TODO

html
----

* :func:`html.escape` and :func:`html.unescape` are now implemented in C, with
the pure-Python versions kept as a fallback. This also speeds up
:class:`html.parser.HTMLParser`, which relies on :func:`html.unescape`.
(Contributed by Bernát Gábor in :gh:`151024`.)



Removed
Expand Down
1 change: 1 addition & 0 deletions Include/internal/pycore_global_objects_fini_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Include/internal/pycore_global_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(qualname)
STRUCT_FOR_ID(query)
STRUCT_FOR_ID(queuetype)
STRUCT_FOR_ID(quote)
STRUCT_FOR_ID(quotetabs)
STRUCT_FOR_ID(raw)
STRUCT_FOR_ID(read)
Expand Down
1 change: 1 addition & 0 deletions Include/internal/pycore_runtime_init_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Include/internal/pycore_unicodeobject_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions Lib/html/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,11 @@ def unescape(s):
if '&' not in s:
return s
return _charref.sub(_replace_charref, s)


try:
# Prefer the C accelerator; fall back to the pure-Python versions above on
# implementations that do not ship it (see PEP 399).
from _html import escape, unescape
except ImportError:
pass
108 changes: 96 additions & 12 deletions Lib/test/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,81 @@
Tests for the html module functions.
"""

import html
import unittest
from test.support import import_helper

py_html = import_helper.import_fresh_module('html', blocked=['_html'])
c_html = import_helper.import_fresh_module('html', fresh=['_html'])


class HtmlTestsMixin:
# Subclasses set ``html`` to the pure-Python or C-accelerated module.
html = None

class HtmlTests(unittest.TestCase):
def test_escape(self):
escape = self.html.escape
self.assertEqual(
html.escape('\'<script>"&foo;"</script>\''),
escape('\'<script>"&foo;"</script>\''),
'&#x27;&lt;script&gt;&quot;&amp;foo;&quot;&lt;/script&gt;&#x27;')
self.assertEqual(
html.escape('\'<script>"&foo;"</script>\'', False),
escape('\'<script>"&foo;"</script>\'', False),
'\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')

def test_escape_quote_flag(self):
escape = self.html.escape
self.assertEqual(escape('"\'', quote=True), '&quot;&#x27;')
self.assertEqual(escape('"\''), '&quot;&#x27;')
self.assertEqual(escape('"\'', quote=False), '"\'')
self.assertEqual(escape('"\'', False), '"\'')

def test_escape_no_specials_returned_unchanged(self):
for s in ['', 'a', 'plain text', 'x' * 100, 'caf\xe9 r\xe9sum\xe9',
'☃ snowman', '\U0001F600 emoji']:
self.assertEqual(self.html.escape(s), s)

def test_escape_specials_at_every_offset(self):
# Exercise the word-at-a-time (SWAR) scan boundaries and tail loop by
# placing each special at every offset of a run crossing 8-byte words.
escape = self.html.escape
specials = {'&': '&amp;', '<': '&lt;', '>': '&gt;',
'"': '&quot;', "'": '&#x27;'}
for ch, rep in specials.items():
for pad in range(0, 20):
s = 'a' * pad + ch + 'b' * pad
self.assertEqual(escape(s), 'a' * pad + rep + 'b' * pad)

def test_escape_adjacent_specials(self):
self.assertEqual(self.html.escape('&<>"\'' * 5),
'&amp;&lt;&gt;&quot;&#x27;' * 5)

def test_escape_multiple_kinds(self):
escape = self.html.escape
# 2-byte (UCS-2) and 4-byte (UCS-4) strings still escape ASCII specials.
self.assertEqual(escape('☃ <b> & </b>'),
'☃ &lt;b&gt; &amp; &lt;/b&gt;')
self.assertEqual(escape('\U0001F600<&>"\''),
'\U0001F600&lt;&amp;&gt;&quot;&#x27;')
# Latin-1 high bytes must not be matched by the byte-wise scan.
self.assertEqual(escape('\xe9\xff & \xe9'), '\xe9\xff &amp; \xe9')

def test_escape_str_subclass_returns_true_str(self):
class S(str):
pass
for s in ['no specials', 'a & b']:
result = self.html.escape(S(s))
self.assertEqual(result, self.html.escape(s))
self.assertIs(type(result), str)

def test_unescape(self):
numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
errmsg = 'unescape(%r) should have returned %r'
def check(text, expected):
self.assertEqual(html.unescape(text), expected,
self.assertEqual(self.html.unescape(text), expected,
msg=errmsg % (text, expected))
def check_num(num, expected):
for format in numeric_formats:
text = format % num
self.assertEqual(html.unescape(text), expected,
self.assertEqual(self.html.unescape(text), expected,
msg=errmsg % (text, expected))
# check text with no character references
check('no character references', 'no character references')
Expand All @@ -42,25 +94,25 @@ def check_num(num, expected):
'&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
'&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
['A', 'a', '"', '&', '\u2603', '\U00101234']):
['A', 'a', '"', '&', '\u2603', '\U00101234']):
for s in formats:
check(s % num, char)
for end in [' ', 'X']:
check((s+end) % num, char+end)
# check invalid code points
for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
check_num(cp, '\uFFFD')
check_num(cp, '\uFFFD')
# check more invalid code points
for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
check_num(cp, '')
# check invalid numbers
for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
check_num(num, ch)
# check small numbers
check_num(0, '\uFFFD')
check_num(0, '\uFFFD')
check_num(9, '\t')
# check a big number
check_num(1000000000000000000, '\uFFFD')
check_num(1000000000000000000, '\uFFFD')
# check that multiple trailing semicolons are handled correctly
for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
check(e, '";')
Expand Down Expand Up @@ -89,7 +141,7 @@ def check_num(num, expected):
# longest valid name
check('&CounterClockwiseContourIntegral;', '∳')
# check a charref that maps to two unicode chars
check('&acE;', '\u223E\u0333')
check('&acE;', '∾̳')
check('&acE', '&acE')
# see #12888
check('&#123; ' * 1050, '{ ' * 1050)
Expand All @@ -98,6 +150,38 @@ def check_num(num, expected):
'ÉricÉric&alphacentauriαcentauri')
check('&co;', '&co;')

def test_unescape_multiple_kinds(self):
unescape = self.html.unescape
# references embedded in 2-byte and 4-byte strings
self.assertEqual(unescape('☃ &amp; &#62; &copy; x'),
'☃ & > \xa9 x')
self.assertEqual(unescape('\U0001F600&amp;&#x41;&notin;'),
'\U0001F600&A∉')

def test_unescape_long_text_with_sparse_refs(self):
# exercise the bulk substring copy between references
unescape = self.html.unescape
s = 'x' * 5000 + '&amp;' + 'y' * 5000
self.assertEqual(unescape(s), 'x' * 5000 + '&' + 'y' * 5000)
self.assertEqual(unescape('a' * 5000), 'a' * 5000)

def test_unescape_str_subclass(self):
class S(str):
pass
self.assertEqual(self.html.unescape(S('no refs')), 'no refs')
self.assertEqual(self.html.unescape(S('a &amp; b')), 'a & b')


class PyHtmlTests(HtmlTestsMixin, unittest.TestCase):
html = py_html


@unittest.skipUnless(
c_html is not None and getattr(c_html.escape, '__module__', None) == '_html',
'requires the _html C accelerator')
class CHtmlTests(HtmlTestsMixin, unittest.TestCase):
html = c_html


if __name__ == '__main__':
unittest.main()
10 changes: 9 additions & 1 deletion Makefile.pre.in
Original file line number Diff line number Diff line change
Expand Up @@ -1848,6 +1848,13 @@ regen-limited-abi: all
regen-unicodedata:
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/unicode/makeunicodedata.py

.PHONY: regen-html
regen-html:
# Regenerate Modules/html_entities.h from the html module data
# using Tools/build/generate_html_entities.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/build/generate_html_entities.py \
$(srcdir)/Modules/html_entities.h


############################################################################
# Regenerate all generated files
Expand All @@ -1860,7 +1867,7 @@ regen-all: regen-cases regen-slots \
regen-test-levenshtein regen-global-objects
@echo
@echo "Note: make regen-stdlib-module-names, make regen-limited-abi, "
@echo "make regen-configure, make regen-sbom, and make regen-unicodedata should be run manually"
@echo "make regen-configure, make regen-sbom, make regen-html, and make regen-unicodedata should be run manually"

############################################################################
# Special rules for object files
Expand Down Expand Up @@ -3422,6 +3429,7 @@ MODULE_DEPS_SHARED=@MODULE_DEPS_SHARED@
MODULE__CURSES_DEPS=$(srcdir)/Include/py_curses.h
MODULE__CURSES_PANEL_DEPS=$(srcdir)/Include/py_curses.h
MODULE__DATETIME_DEPS=$(srcdir)/Include/datetime.h
MODULE__HTML_DEPS=$(srcdir)/Modules/html_entities.h
MODULE_CMATH_DEPS=$(srcdir)/Modules/_math.h
MODULE_MATH_DEPS=$(srcdir)/Modules/_math.h
MODULE_PYEXPAT_DEPS=@LIBEXPAT_INTERNAL@
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Add a C accelerator for :func:`html.escape` and :func:`html.unescape`, with the
pure-Python implementations kept as a fallback. This also speeds up
:class:`html.parser.HTMLParser`, which uses :func:`html.unescape`.
1 change: 1 addition & 0 deletions Modules/Setup.stdlib.in
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
@MODULE__BISECT_TRUE@_bisect _bisectmodule.c
@MODULE__CSV_TRUE@_csv _csv.c
@MODULE__HEAPQ_TRUE@_heapq _heapqmodule.c
@MODULE__HTML_TRUE@_html _htmlmodule.c
@MODULE__JSON_TRUE@_json _json.c
@MODULE__LSPROF_TRUE@_lsprof _lsprof.c rotatingtree.c
@MODULE__MATH_INTEGER_TRUE@_math_integer mathintegermodule.c
Expand Down
Loading
Loading