Skip to content

Commit 185f360

Browse files
authored
Update html module from Python 3.14.2 (#6855)
1 parent f8d4d99 commit 185f360

2 files changed

Lines changed: 246 additions & 134 deletions

File tree

Lib/html/parser.py

Lines changed: 37 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424

2525
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
2626
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
27+
incomplete_charref = re.compile('&#(?:[0-9]|[xX][0-9a-fA-F])')
2728
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
2829

2930
starttagopen = re.compile('<[a-zA-Z]')
@@ -127,17 +128,25 @@ class HTMLParser(_markupbase.ParserBase):
127128
argument.
128129
"""
129130

130-
CDATA_CONTENT_ELEMENTS = ("script", "style")
131+
# See the HTML5 specs section "13.4 Parsing HTML fragments".
132+
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
133+
# CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
134+
CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
131135
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
132136

133-
def __init__(self, *, convert_charrefs=True):
137+
def __init__(self, *, convert_charrefs=True, scripting=False):
134138
"""Initialize and reset this instance.
135139
136-
If convert_charrefs is True (the default), all character references
140+
If convert_charrefs is true (the default), all character references
137141
are automatically converted to the corresponding Unicode characters.
142+
143+
If *scripting* is false (the default), the content of the
144+
``noscript`` element is parsed normally; if it's true,
145+
it's returned as is without being parsed.
138146
"""
139147
super().__init__()
140148
self.convert_charrefs = convert_charrefs
149+
self.scripting = scripting
141150
self.reset()
142151

143152
def reset(self):
@@ -172,7 +181,9 @@ def get_starttag_text(self):
172181
def set_cdata_mode(self, elem, *, escapable=False):
173182
self.cdata_elem = elem.lower()
174183
self._escapable = escapable
175-
if escapable and not self.convert_charrefs:
184+
if self.cdata_elem == 'plaintext':
185+
self.interesting = re.compile(r'\z')
186+
elif escapable and not self.convert_charrefs:
176187
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
177188
re.IGNORECASE|re.ASCII)
178189
else:
@@ -294,10 +305,20 @@ def goahead(self, end):
294305
k = k - 1
295306
i = self.updatepos(i, k)
296307
continue
308+
match = incomplete_charref.match(rawdata, i)
309+
if match:
310+
if end:
311+
self.handle_charref(rawdata[i+2:])
312+
i = self.updatepos(i, n)
313+
break
314+
# incomplete
315+
break
316+
elif i + 3 < n: # larger than "&#x"
317+
# not the end of the buffer, and can't be confused
318+
# with some other construct
319+
self.handle_data("&#")
320+
i = self.updatepos(i, i + 2)
297321
else:
298-
if ";" in rawdata[i:]: # bail by consuming &#
299-
self.handle_data(rawdata[i:i+2])
300-
i = self.updatepos(i, i+2)
301322
break
302323
elif startswith('&', i):
303324
match = entityref.match(rawdata, i)
@@ -311,15 +332,13 @@ def goahead(self, end):
311332
continue
312333
match = incomplete.match(rawdata, i)
313334
if match:
314-
# match.group() will contain at least 2 chars
315-
if end and match.group() == rawdata[i:]:
316-
k = match.end()
317-
if k <= i:
318-
k = n
319-
i = self.updatepos(i, i + 1)
335+
if end:
336+
self.handle_entityref(rawdata[i+1:])
337+
i = self.updatepos(i, n)
338+
break
320339
# incomplete
321340
break
322-
elif (i + 1) < n:
341+
elif i + 1 < n:
323342
# not the end of the buffer, and can't be confused
324343
# with some other construct
325344
self.handle_data("&")
@@ -444,8 +463,10 @@ def parse_starttag(self, i):
444463
self.handle_startendtag(tag, attrs)
445464
else:
446465
self.handle_starttag(tag, attrs)
447-
if tag in self.CDATA_CONTENT_ELEMENTS:
448-
self.set_cdata_mode(tag)
466+
if (tag in self.CDATA_CONTENT_ELEMENTS or
467+
(self.scripting and tag == "noscript") or
468+
tag == "plaintext"):
469+
self.set_cdata_mode(tag, escapable=False)
449470
elif tag in self.RCDATA_CONTENT_ELEMENTS:
450471
self.set_cdata_mode(tag, escapable=True)
451472
return endpos

0 commit comments

Comments (0)