5454import java .io .StringReader ;
5555import java .util .ArrayList ;
5656import java .util .Arrays ;
57+ import java .util .HashMap ;
58+ import java .util .HashSet ;
5759import java .util .List ;
60+ import java .util .Map ;
61+ import java .util .Set ;
5862
5963import javax .xml .parsers .SAXParserFactory ;
6064
6569import org .xml .sax .XMLReader ;
6670import org .xml .sax .ext .Attributes2 ;
6771import org .xml .sax .ext .DefaultHandler2 ;
72+ import org .xml .sax .ext .EntityResolver2 ;
6873import org .xml .sax .ext .Locator2 ;
6974import org .xml .sax .helpers .DefaultHandler ;
7075
@@ -712,12 +717,14 @@ int parseFile(VirtualFrame frame, PXMLParser self, Object file,
712717 return 1 ;
713718 }
714719
720+ @ TruffleBoundary
715721 private void doParseFile (PXMLParser self , Object file ) {
716722 if (self .isFinished ()) {
717723 throw raiseExpatError (this , ErrorMessages .PARSING_FINISHED , PXMLParser .XML_ERROR_FINISHED , 0 , 1 , 0 );
718724 }
719725 PythonFileInputStream stream = new PythonFileInputStream (file , self .getBufferSize ());
720- parseNow (self , false , new InputSource (stream ), stream ::getBytesRead );
726+ stream .prefetch ();
727+ parseNow (self , false , new InputSource (stream ), stream ::getBytesRead , detectXmlDecl (stream .getXmlDeclBytes ()));
721728 }
722729 }
723730
@@ -858,17 +865,24 @@ private static byte[] asByteArray(Object data) {
858865 @ TruffleBoundary
859866 private static void parseNow (PXMLParser parser , boolean swallowErrors ) {
860867 int byteLen = parser .getData ().length ;
861- parseNow (parser , swallowErrors , new InputSource (new ByteArrayInputStream (parser .getData ())), () -> byteLen );
868+ parseNow (parser , swallowErrors , new InputSource (new ByteArrayInputStream (parser .getData ())), () -> byteLen , detectXmlDecl ( parser . getData ()) );
862869 }
863870
864871 @ TruffleBoundary
865872 private static void parseNow (PXMLParser parser , boolean swallowErrors , InputSource source , ByteIndexSupplier byteIndexSupplier ) {
873+ parseNow (parser , swallowErrors , source , byteIndexSupplier , null );
874+ }
875+
876+ @ TruffleBoundary
877+ private static void parseNow (PXMLParser parser , boolean swallowErrors , InputSource source , ByteIndexSupplier byteIndexSupplier , XmlDeclInfo xmlDeclInfo ) {
866878 final class Handler extends DefaultHandler2 {
867879 int line = 1 ;
868880 int col ;
869881 int eventOrdinal ;
870882 Locator locator ;
871883 boolean keepCurrentPositionForNextCall ;
884+ final Map <String , ExternalEntityInfo > externalEntities = new HashMap <>();
885+ final Set <String > resolvedExternalEntities = new HashSet <>();
872886
873887 @ Override
874888 public void setDocumentLocator (Locator locator ) {
@@ -898,20 +912,28 @@ public void processingInstruction(String target, String data) {
898912
899913 @ Override
900914 public void startDocument () {
901- if (locator instanceof Locator2 locator2 ) {
915+ if (xmlDeclInfo != null ) {
916+ call (parser .getXmlDeclHandler (), toOptionalTs (xmlDeclInfo .version ), toOptionalTs (xmlDeclInfo .encoding ), xmlDeclInfo .standalone );
917+ } else if (locator instanceof Locator2 locator2 ) {
902918 call (parser .getXmlDeclHandler (), toOptionalTs (locator2 .getXMLVersion ()), toOptionalTs (locator2 .getEncoding ()), -1 );
903919 }
904920 }
905921
906922 @ Override
907923 public void startDTD (String name , String publicId , String systemId ) {
924+ if (xmlDeclInfo != null && xmlDeclInfo .standalone == 0 ) {
925+ call (parser .getNotStandaloneHandler ());
926+ }
908927 // We conservatively report an internal subset. This matches minidom builder
909928 // expectations and enables DTD callback wiring for entity/notation handling.
910- call (parser .getStartDoctypeDeclHandler (), toTs (name ), toTs (systemId ), toTs (publicId ), 1 );
929+ call (parser .getStartDoctypeDeclHandler (), toTs (name ), toTs (systemId ), toOptionalTs (publicId ), 1 );
911930 }
912931
913932 @ Override
914933 public void endDTD () {
934+ if (xmlDeclInfo != null && xmlDeclInfo .standalone == 0 ) {
935+ call (parser .getNotStandaloneHandler ());
936+ }
915937 call (parser .getEndDoctypeDeclHandler ());
916938 }
917939
@@ -925,6 +947,9 @@ public void internalEntityDecl(String name, String value) {
925947 @ Override
926948 public void externalEntityDecl (String name , String publicId , String systemId ) {
927949 boolean isParameterEntity = name != null && name .startsWith ("%" );
950+ if (!isParameterEntity ) {
951+ externalEntities .put (name , new ExternalEntityInfo (systemId , publicId ));
952+ }
928953 call (parser .getEntityDeclHandler (), toTs (name ), isParameterEntity ? 1 : 0 , PNone .NONE , parser .getBase () == null ? PNone .NONE : parser .getBase (),
929954 toOptionalTs (normalizeSystemId (systemId )),
930955 toOptionalTs (publicId ), PNone .NONE );
@@ -943,12 +968,21 @@ public void unparsedEntityDecl(String name, String publicId, String systemId, St
943968
944969 @ Override
945970 public void elementDecl (String name , String model ) {
946- call (parser .getElementDeclHandler (), toTs (name ), toTs (model ));
971+ call (parser .getElementDeclHandler (), toTs (name ), elementModel (model ));
947972 }
948973
949974 @ Override
950975 public void attributeDecl (String eName , String aName , String type , String mode , String value ) {
951- call (parser .getAttlistDeclHandler (), toTs (eName ), toTs (aName ), toTs (type ), toOptionalTs (mode ), toOptionalTs (value ));
976+ Object defaultValue = toOptionalTs (value );
977+ int required = 0 ;
978+ if ("#REQUIRED" .equals (mode )) {
979+ defaultValue = PNone .NONE ;
980+ required = 1 ;
981+ } else if ("#IMPLIED" .equals (mode )) {
982+ defaultValue = PNone .NONE ;
983+ required = 0 ;
984+ }
985+ call (parser .getAttlistDeclHandler (), toTs (eName ), toTs (aName ), toTs (type ), defaultValue , required );
952986 }
953987
954988 @ Override
@@ -1015,6 +1049,12 @@ public void endCDATA() {
10151049
10161050 @ Override
10171051 public void skippedEntity (String name ) {
1052+ ExternalEntityInfo externalEntity = externalEntities .get (name );
1053+ if (externalEntity != null && !resolvedExternalEntities .contains (name )) {
1054+ call (parser .getExternalEntityRefHandler (), PNone .NONE , parser .getBase () == null ? PNone .NONE : parser .getBase (),
1055+ toOptionalTs (normalizeSystemId (externalEntity .systemId )), toOptionalTs (externalEntity .publicId ));
1056+ return ;
1057+ }
10181058 call (parser .getSkippedEntityHandler (), toTs (name ), 0 );
10191059 if (locator != null ) {
10201060 int entityLen = name .length () + 2 ; // '&' + ';'
@@ -1105,6 +1145,14 @@ private String normalizeSystemId(String systemId) {
11051145 }
11061146 return systemId ;
11071147 }
1148+
1149+ private Object elementModel (String model ) {
1150+ if ("ANY" .equals (model )) {
1151+ return PFactory .createTuple (PythonLanguage .get (null ), new Object []{2 , 0 , PNone .NONE ,
1152+ PFactory .createTuple (PythonLanguage .get (null ), EMPTY_OBJECT_ARRAY )});
1153+ }
1154+ return toTs (model );
1155+ }
11081156 }
11091157
11101158 Handler handler = new Handler ();
@@ -1116,7 +1164,27 @@ private String normalizeSystemId(String systemId) {
11161164 reader .setFeature ("http://apache.org/xml/features/nonvalidating/load-external-dtd" , false );
11171165 } catch (Exception ignored ) {
11181166 }
// Every external entity resolves to an empty document. The named variant additionally
// records the entity as resolved and notifies the ExternalEntityRef handler.
reader.setEntityResolver(new EntityResolver2() {
    private InputSource emptySource() {
        return new InputSource(new StringReader(""));
    }

    @Override
    public InputSource getExternalSubset(String name, String baseURI) {
        // No external subset is supplied for documents that lack one.
        return null;
    }

    @Override
    public InputSource resolveEntity(String name, String publicId, String baseURI, String systemId) {
        if (!(name == null || name.isEmpty())) {
            handler.resolvedExternalEntities.add(name);
        }
        handler.call(parser.getExternalEntityRefHandler(),
                        PNone.NONE,
                        parser.getBase() == null ? PNone.NONE : parser.getBase(),
                        handler.toOptionalTs(handler.normalizeSystemId(systemId)),
                        handler.toOptionalTs(publicId));
        return emptySource();
    }

    @Override
    public InputSource resolveEntity(String publicId, String systemId) {
        return emptySource();
    }
});
11201188 reader .setContentHandler (handler );
11211189 reader .setProperty ("http://xml.org/sax/properties/lexical-handler" , handler );
11221190 reader .setProperty ("http://xml.org/sax/properties/declaration-handler" , handler );
@@ -1173,6 +1241,8 @@ private static final class PythonFileInputStream extends InputStream {
11731241 private int offset ;
11741242 private boolean eof ;
11751243 private int bytesRead ;
1244+ private byte [] xmlDeclBytes = EMPTY_BYTE_ARRAY ;
1245+ private boolean xmlDeclComplete ;
11761246
11771247 private PythonFileInputStream (Object file , int readSize ) {
11781248 this .file = file ;
@@ -1219,6 +1289,7 @@ private void fillBuffer() throws IOException {
12191289 buffer = chunk ;
12201290 offset = 0 ;
12211291 bytesRead += chunk .length ;
1292+ captureXmlDeclBytes (chunk );
12221293 } catch (PException e ) {
12231294 throw new IOException (e );
12241295 }
@@ -1227,6 +1298,91 @@ private void fillBuffer() throws IOException {
12271298 int getBytesRead () {
12281299 return bytesRead ;
12291300 }
1301+
1302+ byte [] getXmlDeclBytes () {
1303+ return xmlDeclBytes ;
1304+ }
1305+
1306+ void prefetch () {
1307+ if (buffer .length == 0 && !eof ) {
1308+ try {
1309+ fillBuffer ();
1310+ } catch (IOException e ) {
1311+ throw new RuntimeException (e );
1312+ }
1313+ }
1314+ }
1315+
1316+ private void captureXmlDeclBytes (byte [] chunk ) {
1317+ if (xmlDeclComplete ) {
1318+ return ;
1319+ }
1320+ int newLen = Math .min (1024 , xmlDeclBytes .length + chunk .length );
1321+ byte [] merged = Arrays .copyOf (xmlDeclBytes , newLen );
1322+ int canCopy = newLen - xmlDeclBytes .length ;
1323+ System .arraycopy (chunk , 0 , merged , xmlDeclBytes .length , canCopy );
1324+ xmlDeclBytes = merged ;
1325+ for (int i = 1 ; i < xmlDeclBytes .length ; i ++) {
1326+ if (xmlDeclBytes [i - 1 ] == '?' && xmlDeclBytes [i ] == '>' ) {
1327+ xmlDeclComplete = true ;
1328+ break ;
1329+ }
1330+ }
1331+ }
1332+ }
1333+
1334+ private record ExternalEntityInfo (String systemId , String publicId ) {
1335+ }
1336+
1337+ private record XmlDeclInfo (String version , String encoding , int standalone ) {
1338+ }
1339+
1340+ @ TruffleBoundary
1341+ private static XmlDeclInfo detectXmlDecl (byte [] data ) {
1342+ if (data .length == 0 || data [0 ] != '<' ) {
1343+ return null ;
1344+ }
1345+ int end = -1 ;
1346+ for (int i = 1 ; i < data .length ; i ++) {
1347+ if (data [i - 1 ] == '?' && data [i ] == '>' ) {
1348+ end = i + 1 ;
1349+ break ;
1350+ }
1351+ }
1352+ if (end == -1 ) {
1353+ return null ;
1354+ }
1355+ String decl = new String (data , 0 , end , java .nio .charset .StandardCharsets .ISO_8859_1 );
1356+ if (!decl .startsWith ("<?xml" )) {
1357+ return null ;
1358+ }
1359+ String version = extractXmlDeclAttr (decl , "version" );
1360+ String encoding = extractXmlDeclAttr (decl , "encoding" );
1361+ String standalone = extractXmlDeclAttr (decl , "standalone" );
1362+ int standaloneInt = "yes" .equalsIgnoreCase (standalone ) ? 1 : "no" .equalsIgnoreCase (standalone ) ? 0 : -1 ;
1363+ return new XmlDeclInfo (version , encoding , standaloneInt );
1364+ }
1365+
1366+ @ TruffleBoundary
1367+ private static String extractXmlDeclAttr (String decl , String attr ) {
1368+ String key = attr + "=" ;
1369+ int idx = decl .indexOf (key );
1370+ if (idx < 0 ) {
1371+ return null ;
1372+ }
1373+ int valueStart = idx + key .length ();
1374+ if (valueStart >= decl .length ()) {
1375+ return null ;
1376+ }
1377+ char quote = decl .charAt (valueStart );
1378+ if (quote != '\'' && quote != '"' ) {
1379+ return null ;
1380+ }
1381+ int end = decl .indexOf (quote , valueStart + 1 );
1382+ if (end < 0 ) {
1383+ return null ;
1384+ }
1385+ return decl .substring (valueStart + 1 , end );
12301386 }
12311387
12321388 private static PException raiseExpatError (Node raisingNode , TruffleString msg , int code , int byteIndex , int line , int column ) {
0 commit comments