RustPython
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
Lines changed: 37 additions & 16 deletions
@@ -24,6 +24,7 @@
 
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+incomplete_charref = re.compile('&#(?:[0-9]|[xX][0-9a-fA-F])')
 attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
 
 starttagopen = re.compile('<[a-zA-Z]')
@@ -127,17 +128,25 @@ class HTMLParser(_markupbase.ParserBase):
     argument.
     """
 
-    CDATA_CONTENT_ELEMENTS = ("script", "style")
+    # See the HTML5 specs section "13.4 Parsing HTML fragments".
+    # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+    # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
+    CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
     RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
 
-    def __init__(self, *, convert_charrefs=True):
+    def __init__(self, *, convert_charrefs=True, scripting=False):
         """Initialize and reset this instance.
 
-        If convert_charrefs is True (the default), all character references
+        If convert_charrefs is true (the default), all character references
         are automatically converted to the corresponding Unicode characters.
+
+        If *scripting* is false (the default), the content of the
+        ``noscript`` element is parsed normally; if it's true,
+        it's returned as is without being parsed.
         """
         super().__init__()
         self.convert_charrefs = convert_charrefs
+        self.scripting = scripting
         self.reset()
 
     def reset(self):
@@ -172,7 +181,9 @@ def get_starttag_text(self):
     def set_cdata_mode(self, elem, *, escapable=False):
         self.cdata_elem = elem.lower()
         self._escapable = escapable
-        if escapable and not self.convert_charrefs:
+        if self.cdata_elem == 'plaintext':
+            self.interesting = re.compile(r'\z')
+        elif escapable and not self.convert_charrefs:
             self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
                                           re.IGNORECASE|re.ASCII)
         else:
@@ -294,10 +305,20 @@ def goahead(self, end):
                         k = k - 1
                     i = self.updatepos(i, k)
                     continue
+                match = incomplete_charref.match(rawdata, i)
+                if match:
+                    if end:
+                        self.handle_charref(rawdata[i+2:])
+                        i = self.updatepos(i, n)
+                        break
+                    # incomplete
+                    break
+                elif i + 3 < n:  # larger than "&#x"
+                    # not the end of the buffer, and can't be confused
+                    # with some other construct
+                    self.handle_data("&#")
+                    i = self.updatepos(i, i + 2)
                 else:
-                    if ";" in rawdata[i:]:  # bail by consuming &#
-                        self.handle_data(rawdata[i:i+2])
-                        i = self.updatepos(i, i+2)
                     break
             elif startswith('&', i):
                 match = entityref.match(rawdata, i)
@@ -311,15 +332,13 @@ def goahead(self, end):
                     continue
                 match = incomplete.match(rawdata, i)
                 if match:
-                    # match.group() will contain at least 2 chars
-                    if end and match.group() == rawdata[i:]:
-                        k = match.end()
-                        if k <= i:
-                            k = n
-                        i = self.updatepos(i, i + 1)
+                    if end:
+                        self.handle_entityref(rawdata[i+1:])
+                        i = self.updatepos(i, n)
+                        break
                     # incomplete
                     break
-                elif (i + 1) < n:
+                elif i + 1 < n:
                     # not the end of the buffer, and can't be confused
                     # with some other construct
                     self.handle_data("&")
@@ -444,8 +463,10 @@ def parse_starttag(self, i):
             self.handle_startendtag(tag, attrs)
         else:
             self.handle_starttag(tag, attrs)
-            if tag in self.CDATA_CONTENT_ELEMENTS:
-                self.set_cdata_mode(tag)
+            if (tag in self.CDATA_CONTENT_ELEMENTS or
+                (self.scripting and tag == "noscript") or
+                tag == "plaintext"):
+                self.set_cdata_mode(tag, escapable=False)
             elif tag in self.RCDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag, escapable=True)
         return endpos