2424
# Regular expressions for recognising references and tag openings.

# "&name" followed by one character that is not an ASCII letter or digit;
# group 1 captures the name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# "&#" plus decimal digits, or "x"/"X" plus hex digits, followed by one
# character that is not a hex digit.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
# "&#" plus at least one digit (or "x"/"X" and one hex digit), with no
# requirement on what follows.
incomplete_charref = re.compile('&#(?:[0-9]|[xX][0-9a-fA-F])')
# Numeric or named reference with an optional trailing ";" or "=".
# NOTE(review): presumably applied to attribute values -- confirm at call site.
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')

# "<" immediately followed by an ASCII letter.
starttagopen = re.compile('<[a-zA-Z]')
@@ -127,17 +128,25 @@ class HTMLParser(_markupbase.ParserBase):
127128 argument.
128129 """
129130
# See the HTML5 specs section "13.4 Parsing HTML fragments".
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
# CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
CDATA_CONTENT_ELEMENTS = (
    "script", "style", "xmp", "iframe", "noembed", "noframes",
)
# Elements for which parse_starttag() calls set_cdata_mode() with
# escapable=True.
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
132136
def __init__(self, *, convert_charrefs=True, scripting=False):
    """Initialize and reset this instance.

    If convert_charrefs is true (the default), all character references
    are automatically converted to the corresponding Unicode characters.

    If *scripting* is false (the default), the content of the
    ``noscript`` element is parsed normally; if it's true,
    it's returned as is without being parsed.
    """
    super().__init__()
    # Both options are stored on the instance before reset() is called.
    self.convert_charrefs = convert_charrefs
    self.scripting = scripting
    self.reset()
142151
143152 def reset (self ):
@@ -172,7 +181,9 @@ def get_starttag_text(self):
172181 def set_cdata_mode (self , elem , * , escapable = False ):
173182 self .cdata_elem = elem .lower ()
174183 self ._escapable = escapable
175- if escapable and not self .convert_charrefs :
184+ if self .cdata_elem == 'plaintext' :
185+ self .interesting = re .compile (r'\z' )
186+ elif escapable and not self .convert_charrefs :
176187 self .interesting = re .compile (r'&|</%s(?=[\t\n\r\f />])' % self .cdata_elem ,
177188 re .IGNORECASE | re .ASCII )
178189 else :
@@ -294,10 +305,20 @@ def goahead(self, end):
294305 k = k - 1
295306 i = self .updatepos (i , k )
296307 continue
308+ match = incomplete_charref .match (rawdata , i )
309+ if match :
310+ if end :
311+ self .handle_charref (rawdata [i + 2 :])
312+ i = self .updatepos (i , n )
313+ break
314+ # incomplete
315+ break
316+ elif i + 3 < n : # larger than "&#x"
317+ # not the end of the buffer, and can't be confused
318+ # with some other construct
319+ self .handle_data ("&#" )
320+ i = self .updatepos (i , i + 2 )
297321 else :
298- if ";" in rawdata [i :]: # bail by consuming &#
299- self .handle_data (rawdata [i :i + 2 ])
300- i = self .updatepos (i , i + 2 )
301322 break
302323 elif startswith ('&' , i ):
303324 match = entityref .match (rawdata , i )
@@ -311,15 +332,13 @@ def goahead(self, end):
311332 continue
312333 match = incomplete .match (rawdata , i )
313334 if match :
314- # match.group() will contain at least 2 chars
315- if end and match .group () == rawdata [i :]:
316- k = match .end ()
317- if k <= i :
318- k = n
319- i = self .updatepos (i , i + 1 )
335+ if end :
336+ self .handle_entityref (rawdata [i + 1 :])
337+ i = self .updatepos (i , n )
338+ break
320339 # incomplete
321340 break
322- elif ( i + 1 ) < n :
341+ elif i + 1 < n :
323342 # not the end of the buffer, and can't be confused
324343 # with some other construct
325344 self .handle_data ("&" )
@@ -444,8 +463,10 @@ def parse_starttag(self, i):
444463 self .handle_startendtag (tag , attrs )
445464 else :
446465 self .handle_starttag (tag , attrs )
447- if tag in self .CDATA_CONTENT_ELEMENTS :
448- self .set_cdata_mode (tag )
466+ if (tag in self .CDATA_CONTENT_ELEMENTS or
467+ (self .scripting and tag == "noscript" ) or
468+ tag == "plaintext" ):
469+ self .set_cdata_mode (tag , escapable = False )
449470 elif tag in self .RCDATA_CONTENT_ELEMENTS :
450471 self .set_cdata_mode (tag , escapable = True )
451472 return endpos
0 commit comments