diff --git a/core/src/main/java/com/github/jsonldjava/utils/JsonUtils.java b/core/src/main/java/com/github/jsonldjava/utils/JsonUtils.java index f7e0581b..c24c8467 100644 --- a/core/src/main/java/com/github/jsonldjava/utils/JsonUtils.java +++ b/core/src/main/java/com/github/jsonldjava/utils/JsonUtils.java @@ -9,14 +9,28 @@ import java.io.StringWriter; import java.io.Writer; import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerationException; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.github.jsonldjava.core.DocumentLoader; +import com.github.jsonldjava.core.JsonLdApi; +import com.github.jsonldjava.core.JsonLdProcessor; + import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.BOMInputStream; +import org.apache.http.Header; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpUriRequest; @@ -28,17 +42,8 @@ import org.apache.http.impl.client.cache.BasicHttpCacheStorage; import org.apache.http.impl.client.cache.CacheConfig; import org.apache.http.impl.client.cache.CachingHttpClientBuilder; - -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerationException; -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.core.JsonParseException; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonToken; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.github.jsonldjava.core.DocumentLoader; -import com.github.jsonldjava.core.JsonLdApi; -import com.github.jsonldjava.core.JsonLdProcessor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Functions used to make loading, parsing, and serializing JSON easy using @@ -66,6 +71,8 @@ public class JsonUtils { private static final JsonFactory JSON_FACTORY = new JsonFactory(JSON_MAPPER); private static volatile CloseableHttpClient DEFAULT_HTTP_CLIENT; + // Avoid possible endless loop when following alternate locations + private static final int MAX_LINKS_FOLLOW = 20; static { // Disable default Jackson behaviour to close @@ -109,6 +116,10 @@ public static Object fromInputStream(InputStream input) throws IOException { } } return fromInputStream(bOMInputStream, charset); + } finally { + if (input != null) { + input.close(); + } } } @@ -335,40 +346,69 @@ public static Object fromURL(java.net.URL url, CloseableHttpClient httpClient) final String protocol = url.getProtocol(); // We can only use the Apache HTTPClient for HTTP/HTTPS, so use the // native java client for the others - CloseableHttpResponse response = null; - InputStream in = null; - try { - if (!protocol.equalsIgnoreCase("http") && !protocol.equalsIgnoreCase("https")) { - // Can't use the HTTP client for those! - // Fallback to Java's built-in JsonLdUrl handler. No need for - // Accept headers as it's likely to be file: or jar: - in = url.openStream(); - } else { - final HttpUriRequest request = new HttpGet(url.toExternalForm()); - // We prefer application/ld+json, but fallback to - // application/json - // or whatever is available - request.addHeader("Accept", ACCEPT_HEADER); - - response = httpClient.execute(request); - final int status = response.getStatusLine().getStatusCode(); - if (status != 200 && status != 203) { - throw new IOException("Can't retrieve " + url + ", status code: " + status); - } - in = response.getEntity().getContent(); + if (!protocol.equalsIgnoreCase("http") && !protocol.equalsIgnoreCase("https")) { + // Can't use the HTTP client for those! + // Fallback to Java's built-in JsonLdUrl handler. No need for + // Accept headers as it's likely to be file: or jar: + return fromInputStream(url.openStream()); + } else { + return fromJsonLdViaHttpUri(url, httpClient, 0); + } + } + + private static Object fromJsonLdViaHttpUri(final URL url, final CloseableHttpClient httpClient, int linksFollowed) + throws IOException { + final HttpUriRequest request = new HttpGet(url.toExternalForm()); + // We prefer application/ld+json, but fallback to application/json + // or whatever is available + request.addHeader("Accept", ACCEPT_HEADER); + try (CloseableHttpResponse response = httpClient.execute(request)) { + final int status = response.getStatusLine().getStatusCode(); + if (status != 200 && status != 203) { + throw new IOException("Can't retrieve " + url + ", status code: " + status); } - return fromInputStream(in); - } finally { - try { - if (in != null) { - in.close(); + // follow alternate document location + // https://www.w3.org/TR/json-ld11/#alternate-document-location + URL alternateLink = alternateLink(url, response); + if (alternateLink != null) { + linksFollowed++; + if (linksFollowed > MAX_LINKS_FOLLOW) { + throw new IOException("Too many alternate links followed. This may indicate a cycle. Aborting."); } - } finally { - if (response != null) { - response.close(); + return fromJsonLdViaHttpUri(alternateLink, httpClient, linksFollowed); + } + return fromInputStream(response.getEntity().getContent()); + } + } + + private static URL alternateLink(URL url, CloseableHttpResponse response) + throws MalformedURLException { + if (response.getEntity().getContentType() != null + && !response.getEntity().getContentType().getValue().equals("application/ld+json")) { + for (Header header : response.getAllHeaders()) { + if (header.getName().equalsIgnoreCase("link")) { + String alternateLink = ""; + boolean relAlternate = false; + boolean jsonld = false; + for (String value : header.getValue().split(";")) { + value=value.trim(); + if (value.startsWith("<") && value.endsWith(">")) { + alternateLink = value.substring(1, value.length() - 1); + } + if (value.startsWith("type=\"application/ld+json\"")) { + jsonld = true; + } + if (value.startsWith("rel=\"alternate\"")) { + relAlternate = true; + } + } + if (jsonld && relAlternate && !alternateLink.isEmpty()) { + return new URL(url.getProtocol() + "://" + url.getAuthority() + alternateLink); + } } } } + return null; } /** @@ -384,7 +424,7 @@ public static Object fromURL(java.net.URL url, CloseableHttpClient httpClient) * @throws IOException * If there was an IO error during parsing. */ - public static Object fromURLJavaNet(java.net.URL url) throws JsonParseException, IOException { + public static Object fromURLJavaNet(URL url) throws JsonParseException, IOException { final HttpURLConnection urlConn = (HttpURLConnection) url.openConnection(); urlConn.addRequestProperty("Accept", ACCEPT_HEADER); diff --git a/core/src/test/java/com/github/jsonldjava/core/MinimalSchemaOrgRegressionTest.java b/core/src/test/java/com/github/jsonldjava/core/MinimalSchemaOrgRegressionTest.java index f4c1b88d..4694f897 100644 --- a/core/src/test/java/com/github/jsonldjava/core/MinimalSchemaOrgRegressionTest.java +++ b/core/src/test/java/com/github/jsonldjava/core/MinimalSchemaOrgRegressionTest.java @@ -1,20 +1,13 @@ package com.github.jsonldjava.core; import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; -import java.io.IOException; -import java.io.InputStream; -import java.io.StringWriter; -import java.net.HttpURLConnection; import java.net.URL; -import java.nio.charset.StandardCharsets; -import org.apache.commons.io.IOUtils; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.client.methods.HttpUriRequest; +import com.github.jsonldjava.utils.JarCacheStorage; +import com.github.jsonldjava.utils.JsonUtils; + import org.apache.http.client.protocol.RequestAcceptEncoding; import org.apache.http.client.protocol.ResponseContentEncoding; import org.apache.http.impl.client.CloseableHttpClient; @@ -22,98 +15,47 @@ import org.apache.http.impl.client.cache.BasicHttpCacheStorage; import org.apache.http.impl.client.cache.CacheConfig; import org.apache.http.impl.client.cache.CachingHttpClientBuilder; -import org.junit.Ignore; import org.junit.Test; -import com.github.jsonldjava.utils.JarCacheStorage; - public class MinimalSchemaOrgRegressionTest { - private static final String ACCEPT_HEADER = "application/ld+json, application/json;q=0.9, application/javascript;q=0.5, text/javascript;q=0.5, text/plain;q=0.2, */*;q=0.1"; - - @Ignore("Java API does not have any way of redirecting automatically from HTTP to HTTPS, which breaks schema.org usage with it") + /** + * Tests getting JSON from schema.org with the HTTP Accept header set to + * {@value com.github.jsonldjava.utils.JsonUtils#ACCEPT_HEADER}? . + */ @Test - public void testHttpURLConnection() throws Exception { + public void testApacheHttpClient() throws Exception { final URL url = new URL("http://schema.org/"); - final boolean followRedirectsSetting = HttpURLConnection.getFollowRedirects(); - try { - HttpURLConnection.setFollowRedirects(true); - final HttpURLConnection urlConn = (HttpURLConnection) url.openConnection(); - urlConn.setInstanceFollowRedirects(true); - urlConn.addRequestProperty("Accept", ACCEPT_HEADER); - - final InputStream directStream = urlConn.getInputStream(); - verifyInputStream(directStream); - } finally { - HttpURLConnection.setFollowRedirects(followRedirectsSetting); - } + // Common CacheConfig for both the JarCacheStorage and the underlying + // BasicHttpCacheStorage + final CacheConfig cacheConfig = CacheConfig.custom().setMaxCacheEntries(1000) + .setMaxObjectSize(1024 * 128).build(); + + final CloseableHttpClient httpClient = CachingHttpClientBuilder.create() + // allow caching + .setCacheConfig(cacheConfig) + // Wrap the local JarCacheStorage around a BasicHttpCacheStorage + .setHttpCacheStorage(new JarCacheStorage(null, cacheConfig, + new BasicHttpCacheStorage(cacheConfig))) + // Support compressed data + // http://hc.apache.org/httpcomponents-client-ga/tutorial/html/httpagent.html#d5e1238 + .addInterceptorFirst(new RequestAcceptEncoding()) + .addInterceptorFirst(new ResponseContentEncoding()) + .setRedirectStrategy(DefaultRedirectStrategy.INSTANCE) + // use system defaults for proxy etc. + .useSystemProperties().build(); + + Object content = JsonUtils.fromURL(url, httpClient); + checkBasicConditions(content.toString()); } - private void verifyInputStream(InputStream directStream) throws IOException { - assertNotNull("InputStream was null", directStream); - final StringWriter output = new StringWriter(); - try { - IOUtils.copy(directStream, output, StandardCharsets.UTF_8); - } finally { - directStream.close(); - output.flush(); - } - final String outputString = output.toString(); - // System.out.println(outputString); + private void checkBasicConditions(final String outputString) { // Test for some basic conditions without including the JSON/JSON-LD // parsing code here - // assertTrue(outputString, outputString.endsWith("}")); + assertTrue(outputString, outputString.endsWith("}")); assertFalse("Output string should not be empty: " + outputString.length(), outputString.isEmpty()); assertTrue("Unexpected length: " + outputString.length(), outputString.length() > 100000); } - - @Test - public void testApacheHttpClient() throws Exception { - final URL url = new URL("http://schema.org/"); - // Common CacheConfig for both the JarCacheStorage and the underlying - // BasicHttpCacheStorage - final CacheConfig cacheConfig = CacheConfig.custom().setMaxCacheEntries(1000) - .setMaxObjectSize(1024 * 128).build(); - - final CloseableHttpClient httpClient = CachingHttpClientBuilder.create() - // allow caching - .setCacheConfig(cacheConfig) - // Wrap the local JarCacheStorage around a BasicHttpCacheStorage - .setHttpCacheStorage(new JarCacheStorage(null, cacheConfig, - new BasicHttpCacheStorage(cacheConfig))) - // Support compressed data - // http://hc.apache.org/httpcomponents-client-ga/tutorial/html/httpagent.html#d5e1238 - .addInterceptorFirst(new RequestAcceptEncoding()) - .addInterceptorFirst(new ResponseContentEncoding()) - .setRedirectStrategy(DefaultRedirectStrategy.INSTANCE) - // use system defaults for proxy etc. - .useSystemProperties().build(); - - try { - final HttpUriRequest request = new HttpGet(url.toExternalForm()); - // We prefer application/ld+json, but fallback to application/json - // or whatever is available - request.addHeader("Accept", ACCEPT_HEADER); - - final CloseableHttpResponse response = httpClient.execute(request); - try { - final int status = response.getStatusLine().getStatusCode(); - if (status != 200 && status != 203) { - throw new IOException("Can't retrieve " + url + ", status code: " + status); - } - final InputStream content = response.getEntity().getContent(); - verifyInputStream(content); - } finally { - if (response != null) { - response.close(); - } - } - } finally { - if (httpClient != null) { - httpClient.close(); - } - } - } - + }