Skip to content

Commit 77651cd

Browse files
committed
Make test_parse_file pass
1 parent cb1156d commit 77651cd

2 files changed

Lines changed: 163 additions & 11 deletions

File tree

graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/XMLParserBuiltins.java

Lines changed: 163 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,11 @@
5454
import java.io.StringReader;
5555
import java.util.ArrayList;
5656
import java.util.Arrays;
57+
import java.util.HashMap;
58+
import java.util.HashSet;
5759
import java.util.List;
60+
import java.util.Map;
61+
import java.util.Set;
5862

5963
import javax.xml.parsers.SAXParserFactory;
6064

@@ -65,6 +69,7 @@
6569
import org.xml.sax.XMLReader;
6670
import org.xml.sax.ext.Attributes2;
6771
import org.xml.sax.ext.DefaultHandler2;
72+
import org.xml.sax.ext.EntityResolver2;
6873
import org.xml.sax.ext.Locator2;
6974
import org.xml.sax.helpers.DefaultHandler;
7075

@@ -712,12 +717,14 @@ int parseFile(VirtualFrame frame, PXMLParser self, Object file,
712717
return 1;
713718
}
714719

720+
@TruffleBoundary
715721
private void doParseFile(PXMLParser self, Object file) {
716722
if (self.isFinished()) {
717723
throw raiseExpatError(this, ErrorMessages.PARSING_FINISHED, PXMLParser.XML_ERROR_FINISHED, 0, 1, 0);
718724
}
719725
PythonFileInputStream stream = new PythonFileInputStream(file, self.getBufferSize());
720-
parseNow(self, false, new InputSource(stream), stream::getBytesRead);
726+
stream.prefetch();
727+
parseNow(self, false, new InputSource(stream), stream::getBytesRead, detectXmlDecl(stream.getXmlDeclBytes()));
721728
}
722729
}
723730

@@ -858,17 +865,24 @@ private static byte[] asByteArray(Object data) {
858865
@TruffleBoundary
859866
private static void parseNow(PXMLParser parser, boolean swallowErrors) {
860867
int byteLen = parser.getData().length;
861-
parseNow(parser, swallowErrors, new InputSource(new ByteArrayInputStream(parser.getData())), () -> byteLen);
868+
parseNow(parser, swallowErrors, new InputSource(new ByteArrayInputStream(parser.getData())), () -> byteLen, detectXmlDecl(parser.getData()));
862869
}
863870

864871
@TruffleBoundary
865872
private static void parseNow(PXMLParser parser, boolean swallowErrors, InputSource source, ByteIndexSupplier byteIndexSupplier) {
873+
parseNow(parser, swallowErrors, source, byteIndexSupplier, null);
874+
}
875+
876+
@TruffleBoundary
877+
private static void parseNow(PXMLParser parser, boolean swallowErrors, InputSource source, ByteIndexSupplier byteIndexSupplier, XmlDeclInfo xmlDeclInfo) {
866878
final class Handler extends DefaultHandler2 {
867879
int line = 1;
868880
int col;
869881
int eventOrdinal;
870882
Locator locator;
871883
boolean keepCurrentPositionForNextCall;
884+
final Map<String, ExternalEntityInfo> externalEntities = new HashMap<>();
885+
final Set<String> resolvedExternalEntities = new HashSet<>();
872886

873887
@Override
874888
public void setDocumentLocator(Locator locator) {
@@ -898,20 +912,28 @@ public void processingInstruction(String target, String data) {
898912

899913
@Override
900914
public void startDocument() {
901-
if (locator instanceof Locator2 locator2) {
915+
if (xmlDeclInfo != null) {
916+
call(parser.getXmlDeclHandler(), toOptionalTs(xmlDeclInfo.version), toOptionalTs(xmlDeclInfo.encoding), xmlDeclInfo.standalone);
917+
} else if (locator instanceof Locator2 locator2) {
902918
call(parser.getXmlDeclHandler(), toOptionalTs(locator2.getXMLVersion()), toOptionalTs(locator2.getEncoding()), -1);
903919
}
904920
}
905921

906922
@Override
907923
public void startDTD(String name, String publicId, String systemId) {
924+
if (xmlDeclInfo != null && xmlDeclInfo.standalone == 0) {
925+
call(parser.getNotStandaloneHandler());
926+
}
908927
// We conservatively report an internal subset. This matches minidom builder
909928
// expectations and enables DTD callback wiring for entity/notation handling.
910-
call(parser.getStartDoctypeDeclHandler(), toTs(name), toTs(systemId), toTs(publicId), 1);
929+
call(parser.getStartDoctypeDeclHandler(), toTs(name), toTs(systemId), toOptionalTs(publicId), 1);
911930
}
912931

913932
@Override
914933
public void endDTD() {
934+
if (xmlDeclInfo != null && xmlDeclInfo.standalone == 0) {
935+
call(parser.getNotStandaloneHandler());
936+
}
915937
call(parser.getEndDoctypeDeclHandler());
916938
}
917939

@@ -925,6 +947,9 @@ public void internalEntityDecl(String name, String value) {
925947
@Override
926948
public void externalEntityDecl(String name, String publicId, String systemId) {
927949
boolean isParameterEntity = name != null && name.startsWith("%");
950+
if (!isParameterEntity) {
951+
externalEntities.put(name, new ExternalEntityInfo(systemId, publicId));
952+
}
928953
call(parser.getEntityDeclHandler(), toTs(name), isParameterEntity ? 1 : 0, PNone.NONE, parser.getBase() == null ? PNone.NONE : parser.getBase(),
929954
toOptionalTs(normalizeSystemId(systemId)),
930955
toOptionalTs(publicId), PNone.NONE);
@@ -943,12 +968,21 @@ public void unparsedEntityDecl(String name, String publicId, String systemId, St
943968

944969
@Override
945970
public void elementDecl(String name, String model) {
946-
call(parser.getElementDeclHandler(), toTs(name), toTs(model));
971+
call(parser.getElementDeclHandler(), toTs(name), elementModel(model));
947972
}
948973

949974
@Override
950975
public void attributeDecl(String eName, String aName, String type, String mode, String value) {
951-
call(parser.getAttlistDeclHandler(), toTs(eName), toTs(aName), toTs(type), toOptionalTs(mode), toOptionalTs(value));
976+
Object defaultValue = toOptionalTs(value);
977+
int required = 0;
978+
if ("#REQUIRED".equals(mode)) {
979+
defaultValue = PNone.NONE;
980+
required = 1;
981+
} else if ("#IMPLIED".equals(mode)) {
982+
defaultValue = PNone.NONE;
983+
required = 0;
984+
}
985+
call(parser.getAttlistDeclHandler(), toTs(eName), toTs(aName), toTs(type), defaultValue, required);
952986
}
953987

954988
@Override
@@ -1015,6 +1049,12 @@ public void endCDATA() {
10151049

10161050
@Override
10171051
public void skippedEntity(String name) {
1052+
ExternalEntityInfo externalEntity = externalEntities.get(name);
1053+
if (externalEntity != null && !resolvedExternalEntities.contains(name)) {
1054+
call(parser.getExternalEntityRefHandler(), PNone.NONE, parser.getBase() == null ? PNone.NONE : parser.getBase(),
1055+
toOptionalTs(normalizeSystemId(externalEntity.systemId)), toOptionalTs(externalEntity.publicId));
1056+
return;
1057+
}
10181058
call(parser.getSkippedEntityHandler(), toTs(name), 0);
10191059
if (locator != null) {
10201060
int entityLen = name.length() + 2; // '&' + ';'
@@ -1105,6 +1145,14 @@ private String normalizeSystemId(String systemId) {
11051145
}
11061146
return systemId;
11071147
}
1148+
1149+
private Object elementModel(String model) {
1150+
if ("ANY".equals(model)) {
1151+
return PFactory.createTuple(PythonLanguage.get(null), new Object[]{2, 0, PNone.NONE,
1152+
PFactory.createTuple(PythonLanguage.get(null), EMPTY_OBJECT_ARRAY)});
1153+
}
1154+
return toTs(model);
1155+
}
11081156
}
11091157

11101158
Handler handler = new Handler();
@@ -1116,7 +1164,27 @@ private String normalizeSystemId(String systemId) {
11161164
reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
11171165
} catch (Exception ignored) {
11181166
}
1119-
reader.setEntityResolver((publicId, systemId) -> new InputSource(new StringReader("")));
1167+
reader.setEntityResolver(new EntityResolver2() {
1168+
@Override
1169+
public InputSource getExternalSubset(String name, String baseURI) {
1170+
return null;
1171+
}
1172+
1173+
@Override
1174+
public InputSource resolveEntity(String name, String publicId, String baseURI, String systemId) {
1175+
if (name != null && !name.isEmpty()) {
1176+
handler.resolvedExternalEntities.add(name);
1177+
}
1178+
handler.call(parser.getExternalEntityRefHandler(), PNone.NONE, parser.getBase() == null ? PNone.NONE : parser.getBase(),
1179+
handler.toOptionalTs(handler.normalizeSystemId(systemId)), handler.toOptionalTs(publicId));
1180+
return new InputSource(new StringReader(""));
1181+
}
1182+
1183+
@Override
1184+
public InputSource resolveEntity(String publicId, String systemId) {
1185+
return new InputSource(new StringReader(""));
1186+
}
1187+
});
11201188
reader.setContentHandler(handler);
11211189
reader.setProperty("http://xml.org/sax/properties/lexical-handler", handler);
11221190
reader.setProperty("http://xml.org/sax/properties/declaration-handler", handler);
@@ -1173,6 +1241,8 @@ private static final class PythonFileInputStream extends InputStream {
11731241
private int offset;
11741242
private boolean eof;
11751243
private int bytesRead;
1244+
private byte[] xmlDeclBytes = EMPTY_BYTE_ARRAY;
1245+
private boolean xmlDeclComplete;
11761246

11771247
private PythonFileInputStream(Object file, int readSize) {
11781248
this.file = file;
@@ -1219,6 +1289,7 @@ private void fillBuffer() throws IOException {
12191289
buffer = chunk;
12201290
offset = 0;
12211291
bytesRead += chunk.length;
1292+
captureXmlDeclBytes(chunk);
12221293
} catch (PException e) {
12231294
throw new IOException(e);
12241295
}
@@ -1227,6 +1298,91 @@ private void fillBuffer() throws IOException {
12271298
int getBytesRead() {
12281299
return bytesRead;
12291300
}
1301+
1302+
/**
 * Returns the leading bytes of the stream captured so far by
 * {@code captureXmlDeclBytes} (at most 1024 bytes). The internal array is returned
 * without copying.
 */
byte[] getXmlDeclBytes() {
1303+
return xmlDeclBytes;
1304+
}
1305+
1306+
void prefetch() {
1307+
if (buffer.length == 0 && !eof) {
1308+
try {
1309+
fillBuffer();
1310+
} catch (IOException e) {
1311+
throw new RuntimeException(e);
1312+
}
1313+
}
1314+
}
1315+
1316+
private void captureXmlDeclBytes(byte[] chunk) {
1317+
if (xmlDeclComplete) {
1318+
return;
1319+
}
1320+
int newLen = Math.min(1024, xmlDeclBytes.length + chunk.length);
1321+
byte[] merged = Arrays.copyOf(xmlDeclBytes, newLen);
1322+
int canCopy = newLen - xmlDeclBytes.length;
1323+
System.arraycopy(chunk, 0, merged, xmlDeclBytes.length, canCopy);
1324+
xmlDeclBytes = merged;
1325+
for (int i = 1; i < xmlDeclBytes.length; i++) {
1326+
if (xmlDeclBytes[i - 1] == '?' && xmlDeclBytes[i] == '>') {
1327+
xmlDeclComplete = true;
1328+
break;
1329+
}
1330+
}
1331+
}
1332+
}
1333+
1334+
/**
 * System and public identifier of a declared external entity, as recorded by
 * {@code externalEntityDecl} for entities that are not parameter entities.
 */
private record ExternalEntityInfo(String systemId, String publicId) {
1335+
}
1336+
1337+
/**
 * Pseudo-attributes of a leading XML declaration as extracted by {@code detectXmlDecl}.
 * {@code version} and {@code encoding} are {@code null} when absent; {@code standalone}
 * is 1 for "yes", 0 for "no" and -1 when absent or unrecognized.
 */
private record XmlDeclInfo(String version, String encoding, int standalone) {
1337+
}
1339+
1340+
@TruffleBoundary
1341+
private static XmlDeclInfo detectXmlDecl(byte[] data) {
1342+
if (data.length == 0 || data[0] != '<') {
1343+
return null;
1344+
}
1345+
int end = -1;
1346+
for (int i = 1; i < data.length; i++) {
1347+
if (data[i - 1] == '?' && data[i] == '>') {
1348+
end = i + 1;
1349+
break;
1350+
}
1351+
}
1352+
if (end == -1) {
1353+
return null;
1354+
}
1355+
String decl = new String(data, 0, end, java.nio.charset.StandardCharsets.ISO_8859_1);
1356+
if (!decl.startsWith("<?xml")) {
1357+
return null;
1358+
}
1359+
String version = extractXmlDeclAttr(decl, "version");
1360+
String encoding = extractXmlDeclAttr(decl, "encoding");
1361+
String standalone = extractXmlDeclAttr(decl, "standalone");
1362+
int standaloneInt = "yes".equalsIgnoreCase(standalone) ? 1 : "no".equalsIgnoreCase(standalone) ? 0 : -1;
1363+
return new XmlDeclInfo(version, encoding, standaloneInt);
1364+
}
1365+
1366+
@TruffleBoundary
1367+
private static String extractXmlDeclAttr(String decl, String attr) {
1368+
String key = attr + "=";
1369+
int idx = decl.indexOf(key);
1370+
if (idx < 0) {
1371+
return null;
1372+
}
1373+
int valueStart = idx + key.length();
1374+
if (valueStart >= decl.length()) {
1375+
return null;
1376+
}
1377+
char quote = decl.charAt(valueStart);
1378+
if (quote != '\'' && quote != '"') {
1379+
return null;
1380+
}
1381+
int end = decl.indexOf(quote, valueStart + 1);
1382+
if (end < 0) {
1383+
return null;
1384+
}
1385+
return decl.substring(valueStart + 1, end);
12301386
}
12311387

12321388
private static PException raiseExpatError(Node raisingNode, TruffleString msg, int code, int byteIndex, int line, int column) {

graalpython/lib-python/3/test/test_pyexpat.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -270,10 +270,6 @@ def test_parse_str(self):
270270
operations = out.out
271271
self._verify_parse_output(operations)
272272

273-
@_skip_if_java_pyexpat_backend(
274-
"Java pyexpat backend currently cannot preserve Expat's ParseFile incremental semantics "
275-
"and callback ordering for this test document when using SAX-based parsing."
276-
)
277273
def test_parse_file(self):
278274
# Try parsing a file
279275
out = self.Outputter()

0 commit comments

Comments
 (0)