Skip to content

Commit 2013754

Browse files
committed
Add support for NTCIR topic format
Extracts the MathSearchPatterns from a topics file. Change-Id: I2bebc1ff37ef2cc61fe39047ffab904929e1ac86
1 parent 403f0fe commit 2013754

21 files changed

Lines changed: 13028 additions & 47 deletions

File tree

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package com.formulasearchengine.mathmlquerygenerator;
2+
3+
/**
4+
* Created by Moritz on 08.11.2014.
5+
*/
6+
public class NtcirPattern {
7+
private final String num;
8+
private final String formulaID;
9+
private final String xQueryExpression;
10+
11+
public NtcirPattern (String num, String formulaID, String xQueryExpression) {
12+
this.num = num;
13+
this.formulaID = formulaID;
14+
this.xQueryExpression = xQueryExpression;
15+
}
16+
17+
public String getNum () {
18+
return num;
19+
}
20+
21+
public String getFormulaID () {
22+
return formulaID;
23+
}
24+
25+
public String getxQueryExpression () {
26+
return xQueryExpression;
27+
}
28+
}
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
package com.formulasearchengine.mathmlquerygenerator;
2+
3+
import com.formulasearchengine.xmlhelper.NonWhitespaceNodeList;
4+
import org.w3c.dom.Document;
5+
import org.w3c.dom.Node;
6+
import org.w3c.dom.NodeList;
7+
import org.xml.sax.SAXException;
8+
9+
import javax.xml.namespace.NamespaceContext;
10+
import javax.xml.parsers.DocumentBuilder;
11+
import javax.xml.parsers.DocumentBuilderFactory;
12+
import javax.xml.parsers.ParserConfigurationException;
13+
import javax.xml.xpath.*;
14+
import java.io.File;
15+
import java.io.IOException;
16+
import java.util.ArrayList;
17+
import java.util.Iterator;
18+
import java.util.List;
19+
20+
import static com.formulasearchengine.xmlhelper.NonWhitespaceNodeList.getFirstChild;
21+
22+
/**
23+
* Created by Moritz on 08.11.2014.
24+
* <p/>
25+
* Reads the topic format specified in
26+
* http://ntcir-math.nii.ac.jp/wp-content/blogs.dir/13/files/2014/05/NTCIR11-Math-topics.pdf
27+
*/
28+
public class NtcirTopicReader {
29+
public static final String NS_NII = "http://ntcir-math.nii.ac.jp/";
30+
private final Document topics;
31+
private final List<NtcirPattern> patterns = new ArrayList<>();
32+
private final XQueryGenerator queryGenerator;
33+
34+
public NtcirTopicReader (Document topics) {
35+
this.topics = topics;
36+
queryGenerator = new XQueryGenerator( topics );
37+
}
38+
39+
public NtcirTopicReader (File topicFile) throws ParserConfigurationException, IOException, SAXException {
40+
DocumentBuilder documentBuilder = getDocumentBuilderFactory().newDocumentBuilder();
41+
topics = documentBuilder.parse( topicFile );
42+
//TODO: Find out how this code duplication can be avoided in Java.
43+
queryGenerator = new XQueryGenerator( topics );
44+
}
45+
46+
private static DocumentBuilderFactory getDocumentBuilderFactory () {
47+
final DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
48+
documentBuilderFactory.setNamespaceAware( true );
49+
return documentBuilderFactory;
50+
}
51+
52+
private static XPath namespaceAwareXpath (final String prefix, final String nsURI) {
53+
XPathFactory xPathfactory = XPathFactory.newInstance();
54+
XPath xpath = xPathfactory.newXPath();
55+
NamespaceContext ctx = new NamespaceContext() {
56+
@Override
57+
public String getNamespaceURI (String aPrefix) {
58+
if ( aPrefix.equals( prefix ) )
59+
return nsURI;
60+
else
61+
return null;
62+
}
63+
64+
@Override
65+
public Iterator getPrefixes (String val) {
66+
throw new UnsupportedOperationException();
67+
}
68+
69+
@Override
70+
public String getPrefix (String uri) {
71+
throw new UnsupportedOperationException();
72+
}
73+
};
74+
xpath.setNamespaceContext( ctx );
75+
return xpath;
76+
}
77+
78+
public void setFooter (String footer) {
79+
queryGenerator.setFooter( footer );
80+
}
81+
82+
public void setHeader (String header) {
83+
queryGenerator.setHeader( header );
84+
}
85+
86+
public List<NtcirPattern> extractPatterns () throws XPathExpressionException {
87+
XPath xpath = namespaceAwareXpath( "t", NS_NII );
88+
XPathExpression xNum = xpath.compile( "./t:num" );
89+
XPathExpression xFormula = xpath.compile( "./t:query/t:formula" );
90+
NonWhitespaceNodeList topicList = new NonWhitespaceNodeList(
91+
topics.getElementsByTagNameNS( NS_NII, "topic" ) );
92+
for ( Node node : topicList ) {
93+
String num = xNum.evaluate( node );
94+
NonWhitespaceNodeList formulae = new NonWhitespaceNodeList( (NodeList)
95+
xFormula.evaluate( node, XPathConstants.NODESET ) );
96+
for ( Node formula : formulae ) {
97+
String id = formula.getAttributes().getNamedItem( "id" ).getTextContent();
98+
queryGenerator.setMainElement( getFirstChild( getFirstChild( formula ) ) );
99+
patterns.add( new NtcirPattern( num, id, queryGenerator.toString() ) );
100+
}
101+
}
102+
return patterns;
103+
}
104+
}

src/main/java/com/formulasearchengine/mathmlquerygenerator/XQueryGenerator.java

Lines changed: 54 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -11,46 +11,27 @@
1111
import java.util.HashMap;
1212
import java.util.Map;
1313

14+
import static com.formulasearchengine.xmlhelper.NonWhitespaceNodeList.getFirstChild;
15+
1416
/**
1517
* Created by Moritz Schubotz on 9/3/14.
1618
* Translated from http://git.wikimedia.org/blob/mediawiki%2Fextensions%2FMathSearch.git/31a80ae48d1aaa50da9103cea2e45a8dc2204b39/XQueryGenerator.php
1719
*/
1820
@SuppressWarnings ("WeakerAccess")
1921
public class XQueryGenerator {
20-
private final Map<String, ArrayList<String>> qvar = new HashMap<>();
22+
private final Document xml;
23+
private Map<String, ArrayList<String>> qvar = new HashMap<>();
2124
private String relativeXPath = "";
2225
private String lengthConstraint = "";
2326
private String header = "declare default element namespace \"http://www.w3.org/1998/Math/MathML\";\n" +
2427
"for $m in db2-fn:xmlcolumn(\"math.math_mathml\") return\n";
2528
private String footer = "data($m/*[1]/@alttext)";
26-
27-
public String getFooter () {
28-
return footer;
29-
}
30-
31-
public void setFooter (String footer) {
32-
this.footer = footer;
33-
}
34-
35-
public String getHeader () {
36-
return header;
37-
}
38-
39-
public void setHeader (String header) {
40-
this.header = header;
41-
}
42-
43-
44-
private final Document xml;
29+
private Node mainElement = null;
4530

4631
public XQueryGenerator (Document xml) {
4732
this.xml = xml;
4833
}
4934

50-
private Node getMainElement () {
51-
return getMainElement( xml );
52-
}
53-
5435
public static Node getMainElement (Document xml) {
5536
// Try to get main mws:expr first
5637
NodeList expr = xml.getElementsByTagName( "mws:expr" );
@@ -66,11 +47,11 @@ public static Node getMainElement (Document xml) {
6647
return node;
6748
}
6849
}
69-
// if that fails too interprete content of first semantic element as content MathML
70-
expr = xml.getElementsByTagNameNS("*", "semantics");
71-
if ( expr.getLength() > 0 ) {
72-
return new NonWhitespaceNodeList( expr ).item( 0 );
73-
}
50+
// if that fails too, interpret the content of the first semantics element as content MathML
51+
expr = xml.getElementsByTagNameNS( "*", "semantics" );
52+
if ( expr.getLength() > 0 ) {
53+
return new NonWhitespaceNodeList( expr ).item( 0 );
54+
}
7455
// if that fails too, interpret the content of the root MathML element as content MathML
7556
expr = xml.getElementsByTagName( "math" );
7657
if ( expr.getLength() > 0 ) {
@@ -80,6 +61,39 @@ public static Node getMainElement (Document xml) {
8061
return null;
8162
}
8263

64+
public String getFooter () {
65+
return footer;
66+
}
67+
68+
public void setFooter (String footer) {
69+
this.footer = footer;
70+
}
71+
72+
public String getHeader () {
73+
return header;
74+
}
75+
76+
public void setHeader (String header) {
77+
this.header = header;
78+
}
79+
80+
81+
private Node getMainElement () {
82+
if ( mainElement == null ) {
83+
return getMainElement( xml );
84+
} else {
85+
return mainElement;
86+
}
87+
88+
}
89+
90+
public void setMainElement (Node mainElement) {
91+
this.mainElement = mainElement;
92+
qvar = new HashMap<>();
93+
relativeXPath = "";
94+
lengthConstraint = "";
95+
}
96+
8397
public String toString () {
8498
Node mainElement = getMainElement();
8599
if ( mainElement == null )
@@ -120,21 +134,20 @@ public String toString () {
120134

121135
public String getString (Node mainElement, String fixedConstraints, String qvarConstraintString) {
122136
String out = getHeader();
123-
out += "for $x in $m//*:" +
124-
(new NonWhitespaceNodeList( mainElement.getChildNodes() )).item( 0 ).getLocalName() + "\n" +
137+
out += "for $x in $m//*:" + getFirstChild( mainElement ).getLocalName() + "\n" +
125138
fixedConstraints + "\n";
126-
out += getConstraings( qvarConstraintString );
139+
out += getConstraings( qvarConstraintString );
127140
out +=
128-
"return" + "\n" + getFooter();
141+
"return" + "\n" + getFooter();
129142
return out;
130143
}
131144

132145
private String getConstraings (String qvarConstraintString) {
133-
String out = lengthConstraint +
146+
String out = lengthConstraint +
134147
(((qvarConstraintString.length() > 0) && (lengthConstraint.length() > 0)) ? " and " : "") +
135-
qvarConstraintString ;
136-
if (out.trim().length()>0){
137-
return "where" + "\n" + out +"\n";
148+
qvarConstraintString;
149+
if ( out.trim().length() > 0 ) {
150+
return "where" + "\n" + out + "\n";
138151
} else {
139152
return "";
140153
}
@@ -163,10 +176,10 @@ private String generateConstraint (Node node, boolean isRoot) {
163176
}
164177
} else {
165178
if ( child.getNodeType() == Node.ELEMENT_NODE ) {
166-
if(child.getLocalName().matches("annotation(-xml)?")){
167-
continue;
168-
}
169-
i++;
179+
if ( child.getLocalName().matches( "annotation(-xml)?" ) ) {
180+
continue;
181+
}
182+
i++;
170183
if ( hasText ) {
171184
out += " and ";
172185
}

src/main/java/com/formulasearchengine/xmlhelper/NonWhitespaceNodeList.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ public NonWhitespaceNodeList (NodeList list) {
2323
}
2424
}
2525

26+
public static Node getFirstChild(Node node){
27+
NonWhitespaceNodeList children = new NonWhitespaceNodeList( node.getChildNodes() );
28+
return children.item( 0 );
29+
}
2630
@Override
2731
public Node item(int index) {
2832
return nodes.get(index);
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
package com.formulasearchengine.mathmlquerygenerator;
2+
3+
import org.junit.Test;
4+
import org.xml.sax.SAXException;
5+
6+
import javax.xml.parsers.ParserConfigurationException;
7+
import javax.xml.xpath.XPathExpressionException;
8+
import java.io.File;
9+
import java.io.IOException;
10+
import java.net.URISyntaxException;
11+
import java.net.URL;
12+
import java.util.List;
13+
14+
import static com.formulasearchengine.mathmlquerygenerator.XQueryGeneratorTest.getFileContents;
15+
import static org.junit.Assert.assertEquals;
16+
17+
public class NtcirTopicReaderTest{
18+
public static final String BASEX_HEADER = "declare default element namespace \"http://www.w3.org/1998/Math/MathML\";\n" +
19+
"for $m in //*:expr return \n";
20+
public static final String BASEX_FOOTER = "<a href=\"http://demo.formulasearchengine.com/index.php?curid={$m/@url}\">result</a>\n";
21+
public static final String WIKIPEDIA_RESOURCE = "jp/ac/nii/Ntcir11MathWikipediaTopicsParticipants.xml";
22+
23+
24+
@Test
25+
public void testExtractPatterns () throws Exception {
26+
assertEquals( "Count in Wikipedia testfile incorrect" , 100, countFormulaeInTopics( WIKIPEDIA_RESOURCE ) );
27+
assertEquals( "Count in arXiv testfile incorrect", 55, countFormulaeInTopics( "jp/ac/nii/NTCIR-11-Math-test.xml" ) );
28+
}
29+
private int countFormulaeInTopics (String resourceName) throws URISyntaxException, IOException, SAXException, ParserConfigurationException, XPathExpressionException {
30+
final List<NtcirPattern> ntcirPatterns = getTopicReader( resourceName ).extractPatterns();;
31+
return ntcirPatterns.size();
32+
}
33+
34+
private NtcirTopicReader getTopicReader (String resourceName) throws ParserConfigurationException, IOException, SAXException, URISyntaxException, XPathExpressionException {
35+
URL resource = this.getClass().getClassLoader().getResource( resourceName );
36+
return new NtcirTopicReader( new File( resource.toURI() ) );
37+
}
38+
39+
@Test
40+
public void checkBaseX() throws Exception {
41+
final String referenceString = getFileContents( "jp/ac/nii/basexReferenceQueries.txt" );
42+
NtcirTopicReader tr = getTopicReader( WIKIPEDIA_RESOURCE );
43+
tr.setHeader( BASEX_HEADER );
44+
tr.setFooter( BASEX_FOOTER );
45+
StringBuilder sb = new StringBuilder();
46+
for ( NtcirPattern ntcirPattern : tr.extractPatterns() ) {
47+
sb.append( ntcirPattern.getxQueryExpression() );
48+
}
49+
assertEquals( referenceString, sb.toString());
50+
51+
}
52+
}

src/test/java/com/formulasearchengine/mathmlquerygenerator/XQueryGeneratorTest.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package com.formulasearchengine.mathmlquerygenerator;
22

3-
import com.formulasearchengine.mathmlquerygenerator.XQueryGenerator;
43
import junit.framework.TestCase;
54
import org.w3c.dom.Document;
65

@@ -14,8 +13,8 @@
1413
public class XQueryGeneratorTest extends TestCase {
1514

1615
@SuppressWarnings("SameParameterValue")
17-
String getFileContents(String fname) throws IOException {
18-
try (InputStream is = this.getClass().getClassLoader().getResourceAsStream(fname)) {
16+
static public String getFileContents (String fname) throws IOException {
17+
try (InputStream is = XQueryGeneratorTest.class.getClassLoader().getResourceAsStream(fname)) {
1918
final Scanner s = new Scanner(is, "UTF-8");
2019
//Stupid scanner tricks to read the entire file as one token
2120
s.useDelimiter("\\A");
@@ -67,11 +66,11 @@ private void runTestCollection(File dir) {
6766
}
6867

6968
public void testMwsConversion() {
70-
runTestCollection("de/tuberlin/dima/schubotz/mathmlquerygenerator/mws");
69+
runTestCollection( "com/formulasearchengine/mathmlquerygenerator/mws" );
7170
}
7271

7372
public void testCmmlConversion() {
74-
runTestCollection("de/tuberlin/dima/schubotz/mathmlquerygenerator/cmml");
73+
runTestCollection( "com/formulasearchengine/mathmlquerygenerator/cmml" );
7574
}
7675

7776
public void testHeaderAndFooter() throws Exception {
@@ -80,7 +79,7 @@ public void testHeaderAndFooter() throws Exception {
8079
"let $m := .";
8180
final String testFooter = "$x}\n" +
8281
"</result>";
83-
final String testInput = getFileContents("de/tuberlin/dima/schubotz/mathmlquerygenerator/cmml/q1.xml");
82+
final String testInput = getFileContents( "com/formulasearchengine/mathmlquerygenerator/cmml/q1.xml" );
8483
final String expectedOutput = "declare default element namespace \"http://www.w3.org/1998/Math/MathML\";\n" +
8584
"<result>{\n" +
8685
"let $m := .for $x in $m//*:ci\n" +

src/test/resources/de/tuberlin/dima/schubotz/mathmlquerygenerator/cmml/q1.xml renamed to src/test/resources/com/formulasearchengine/mathmlquerygenerator/cmml/q1.xml

File renamed without changes.

src/test/resources/de/tuberlin/dima/schubotz/mathmlquerygenerator/cmml/q1.xq renamed to src/test/resources/com/formulasearchengine/mathmlquerygenerator/cmml/q1.xq

File renamed without changes.

src/test/resources/de/tuberlin/dima/schubotz/mathmlquerygenerator/mws/q1.xml renamed to src/test/resources/com/formulasearchengine/mathmlquerygenerator/mws/q1.xml

File renamed without changes.

src/test/resources/de/tuberlin/dima/schubotz/mathmlquerygenerator/mws/q1.xq renamed to src/test/resources/com/formulasearchengine/mathmlquerygenerator/mws/q1.xq

File renamed without changes.

0 commit comments

Comments
 (0)