biojava
diff --git a/‎biojava-core/src/main/java/org/biojava/nbio/core/util/InputStreamProvider.java‎
Lines changed: 2 additions & 2 deletions b/‎biojava-core/src/main/java/org/biojava/nbio/core/util/InputStreamProvider.java‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎biojava-structure/src/main/java/org/biojava/nbio/structure/FileIdentifier.java‎
Lines changed: 65 additions & 0 deletions b/‎biojava-structure/src/main/java/org/biojava/nbio/structure/FileIdentifier.java‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎biojava-structure/src/main/java/org/biojava/nbio/structure/ResidueRange.java‎
Lines changed: 4 additions & 0 deletions b/‎biojava-structure/src/main/java/org/biojava/nbio/structure/ResidueRange.java‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎biojava-structure/src/main/java/org/biojava/nbio/structure/StructureIO.java‎
Lines changed: 3 additions & 2 deletions b/‎biojava-structure/src/main/java/org/biojava/nbio/structure/StructureIO.java‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎biojava-structure/src/main/java/org/biojava/nbio/structure/URIIdentifier.java‎
Lines changed: 0 additions & 52 deletions b/‎biojava-structure/src/main/java/org/biojava/nbio/structure/URIIdentifier.java‎
Lines changed: 0 additions & 52 deletions
diff --git a/‎biojava-structure/src/main/java/org/biojava/nbio/structure/URLIdentifier.java‎
Lines changed: 217 additions & 0 deletions b/‎biojava-structure/src/main/java/org/biojava/nbio/structure/URLIdentifier.java‎
Lines changed: 217 additions & 0 deletions
@@ -130,9 +130,9 @@ public InputStream getInputStream(URL u)
          return openCompressedURL(u);
       } else if (magic == GZIP_MAGIC ) {
          return openGZIPURL(u); 
-      } else if ( u.toString().endsWith(".gz")) {
+      } else if ( u.getPath().endsWith(".gz")) {
          return openGZIPURL(u);
-      } else if ( u.toString().endsWith(".Z")) {
+      } else if ( u.getPath().endsWith(".Z")) {
          // unix compressed 
          return openCompressedURL(u);
 
 
@@ -0,0 +1,65 @@
+package org.biojava.nbio.structure;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.biojava.nbio.structure.align.util.AtomCache;
+
+/**
+ * A {@link StructureIdentifier} that refers to a structure file by its path.
+ * 
+ * The identifier string is the file's path. The canonical form covers the
+ * whole structure, with a PDB ID guessed from the file name when possible.
+ */
+public class FileIdentifier implements StructureIdentifier {
+	// A PDB-ID-like token (one digit followed by three alphanumeric characters)
+	// at the start of the name, followed by '.' or '_'. Group 1 captures the ID.
+	private static final Pattern PDB_REGEX = Pattern.compile("([0-9][a-z0-9]{3})[._].*",Pattern.CASE_INSENSITIVE);
+
+	private final File file;
+
+	/**
+	 * @param file The file this identifier refers to
+	 */
+	public FileIdentifier(File file) {
+		this.file = file;
+	}
+
+	/**
+	 * @param name Path of the file this identifier refers to
+	 */
+	public FileIdentifier(String name) {
+		this(new File(name));
+	}
+	
+	/**
+	 * @return The file this identifier refers to
+	 */
+	public File getFile() {
+		return file;
+	}
+
+	/**
+	 * @return The file's path, as given by {@link File#toString()}
+	 */
+	@Override
+	public String getIdentifier() {
+		return file.toString();
+	}
+
+	/**
+	 * Not implemented: the file is never read and the cache is not used.
+	 * 
+	 * NOTE(review): callers receive null until loading is implemented here.
+	 * 
+	 * @return always null
+	 */
+	@Override
+	public Structure loadStructure(AtomCache cache) throws StructureException,
+			IOException {
+		return null;
+	}
+
+	/**
+	 * Represents the full substructure.
+	 * 
+	 * Attempts to guess the PDB ID from the filename, but may give up and set
+	 * it to null.
+	 * 
+	 * @return A SubstructureIdentifier with an empty list of ranges
+	 */
+	@Override
+	public SubstructureIdentifier toCanonical() throws StructureException {
+		return new SubstructureIdentifier(guessPDBID(file.getName()), new ArrayList<ResidueRange>());
+	}
+
+	/**
+	 * Recognizes PDB IDs that occur at the beginning of name, followed by
+	 * '.' or '_'.
+	 * @param name Input filename (without directories)
+	 * @return The 4-character id-like prefix, or null if none is found
+	 */
+	private static String guessPDBID(String name) {
+		Matcher match = PDB_REGEX.matcher(name);
+		if(match.matches()) {
+			// Group 1 is the ID alone; group 0 would be the entire file name
+			return match.group(1);
+		} else {
+			// Give up if doesn't match
+			return null;
+		}
+	}
+
+	/**
+	 * Returns the complete input structure
+	 * 
+	 * @param input The structure to reduce
+	 * @return input itself, unmodified
+	 */
+	@Override
+	public Structure reduce(Structure input) throws StructureException {
+		return input;
+	}
+
+}
@@ -190,6 +190,10 @@ public int hashCode() {
 
 	@Override
 	public String toString() {
+		if( start == null && end == null) {
+			// Neither bound is set: this indicates the full chain, so only
+			// the chain is printed (no "_start-end" suffix)
+			return chain;
+		}
 		return chain + "_" + start + "-" + end;
 	}
 
 
@@ -240,10 +240,11 @@ public List<String> getExtensions() {
 	 * @param filename
 	 * @return
 	 */
-	public StructureFiletype guessFiletype(String filename) {
+	public static StructureFiletype guessFiletype(String filename) {
+		String lower = filename.toLowerCase();
 		for(StructureFiletype type : StructureFiletype.values()) {
 			for(String ext : type.getExtensions()) {
-				if(filename.endsWith(ext)) {
+				if(lower.endsWith(ext.toLowerCase())) {
 					return type;
 				}
 			}
 
@@ -0,0 +1,217 @@
+package org.biojava.nbio.structure;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.biojava.nbio.core.util.InputStreamProvider;
+import org.biojava.nbio.structure.StructureIO.StructureFiletype;
+import org.biojava.nbio.structure.align.util.AtomCache;
+import org.biojava.nbio.structure.io.PDBFileReader;
+import org.biojava.nbio.structure.io.mmcif.MMcifParser;
+import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer;
+import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Represents a structure loaded from a URL (including a file URL)
+ * 
+ * A few custom query parameters are supported. Parameter names are matched
+ * case-insensitively:
+ * 
+ * <ul>
+ * <li><tt>format=[pdb|cif]</tt> Specify the file format (will otherwise be
+ *     guessed from the extension)
+ * <li><tt>pdbId=[String]</tt> Specify the PDB ID (also guessed from the filename)
+ * <li><tt>chainID=[String]</tt> A single chain from the structure
+ * <li><tt>residues=[String]</tt> Residue ranges, in a form understood by
+ *     {@link SubstructureIdentifier}
+ * </ul>
+ * @author Spencer Bliven
+ *
+ */
+public class URLIdentifier implements StructureIdentifier {
+	private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class);
+
+	// Used for guessing the PDB ID from the filename
+	private static final Pattern PDBID_REGEX = Pattern.compile("^([0-9][a-z0-9]{3})([._-]|\\s).*",Pattern.CASE_INSENSITIVE);
+
+	/** URL parameter specifying the file format (PDB or CIF) */
+	public static final String FORMAT_PARAM = "format";
+	/** URL parameter specifying the PDB ID */
+	public static final String PDBID_PARAM = "pdbid";
+	/** URL parameter specifying a single chain to include; overridden by residues */
+	public static final String CHAINID_PARAM = "chainid";
+	/** URL parameter specifying residue ranges to include, e.g. <tt>residues=A:1-70</tt>
+	 * @see SubstructureIdentifier
+	 */
+	public static final String RESIDUES_PARAM = "residues";
+	
+	private final URL url;
+
+	/**
+	 * @param url Location of the structure, optionally with query parameters
+	 */
+	public URLIdentifier(URL url) {
+		this.url = url;
+	}
+	
+	/**
+	 * @param url Location of the structure, optionally with query parameters
+	 * @throws MalformedURLException If the string cannot be parsed as a URL
+	 */
+	public URLIdentifier(String url) throws MalformedURLException {
+		this(new URL(url));
+	}
+
+	/**
+	 * @return The URL this identifier was created with
+	 */
+	public URL getURL() {
+		return url;
+	}
+
+	/**
+	 * @return The string form of the URL, including any query parameters
+	 */
+	@Override
+	public String getIdentifier() {
+		return url.toString();
+	}
+
+	/**
+	 * Builds the canonical identifier from the query parameters.
+	 * 
+	 * The PDB ID comes from the <tt>pdbid</tt> parameter, or is otherwise
+	 * guessed from the last segment of the URL path (and may be null). The
+	 * ranges come from <tt>residues</tt> if given, otherwise from
+	 * <tt>chainid</tt> (as a full chain), and are otherwise empty. Parameters
+	 * given without a value are treated as absent.
+	 * 
+	 * @return A SubstructureIdentifier for the selected part of the structure
+	 *  (an empty range list means all residues)
+	 */
+	@Override
+	public SubstructureIdentifier toCanonical() {
+		String pdbId = null;
+		List<ResidueRange> ranges = Collections.emptyList();
+		try {
+			Map<String, String> params = parseQuery(url);
+			pdbId = params.get(PDBID_PARAM);
+			// A parameter without a value maps to null; skip it rather than
+			// handing null to the range parser or the ResidueRange constructor
+			String residues = params.get(RESIDUES_PARAM);
+			String chainId = params.get(CHAINID_PARAM);
+			if(residues != null) {
+				ranges = ResidueRange.parseMultiple(residues);
+			} else if(chainId != null) {
+				ranges = Arrays.asList(new ResidueRange(chainId,(ResidueNumber)null,(ResidueNumber)null));
+			}
+		} catch (UnsupportedEncodingException e) {
+			logger.error("Unable to decode URL "+url,e);
+		}
+		if(pdbId == null) {
+			String path = url.getPath();
+			pdbId = guessPDBID(path.substring(path.lastIndexOf("/")+1));
+		}
+		return new SubstructureIdentifier(pdbId, ranges);
+	}
+
+	/**
+	 * Reduces the input using the canonical form of this identifier.
+	 * 
+	 * @param input The structure to reduce
+	 * @return The result of {@link SubstructureIdentifier#reduce(Structure)}
+	 *  on {@link #toCanonical()}
+	 */
+	@Override
+	public Structure reduce(Structure input) throws StructureException {
+		return toCanonical().reduce(input);
+	}
+
+	/**
+	 * Load the structure from the URL.
+	 * 
+	 * The format is taken from the <tt>format</tt> parameter if it is
+	 * recognized, and otherwise guessed from the URL path. Anything that is
+	 * not recognized as CIF is parsed as PDB.
+	 * 
+	 * @param cache Supplies the parsing parameters and, for PDB files, the
+	 *  path, fetch behavior and obsolete behavior
+	 * @return The structure parsed from the URL
+	 * @throws IOException If the URL cannot be read or parsed
+	 */
+	@Override
+	public Structure loadStructure(AtomCache cache) throws StructureException,
+			IOException {
+		StructureFiletype format = StructureFiletype.UNKNOWN;
+		
+		// Use user-specified format
+		try {
+			Map<String, String> params = parseQuery(url);
+			if(params.containsKey(FORMAT_PARAM)) {
+				String formatStr = params.get(FORMAT_PARAM);
+				format = StructureIO.guessFiletype("."+formatStr);
+			}
+		} catch (UnsupportedEncodingException e) {
+			logger.error("Unable to decode URL "+url,e);
+		}
+		
+		// Guess format from extension
+		if(format == StructureFiletype.UNKNOWN) {
+			format = StructureIO.guessFiletype(url.getPath());
+		}
+		
+		switch(format) {
+		case CIF:
+			return loadCif(cache);
+		case PDB:
+		default:
+			return loadPdb(cache);
+		}
+	}
+
+	/**
+	 * Parses the URL contents as mmCIF.
+	 * 
+	 * Read errors are propagated instead of being printed and ignored, and
+	 * the stream is always closed.
+	 */
+	private Structure loadCif(AtomCache cache) throws StructureException,
+			IOException {
+		SimpleMMcifConsumer consumer = new SimpleMMcifConsumer();
+		consumer.setFileParsingParameters(cache.getFileParsingParams());
+
+		MMcifParser parser = new SimpleMMcifParser();
+		parser.addMMcifConsumer(consumer);
+
+		// Open the stream last so nothing can fail between opening and the try block
+		InputStream inStream = new InputStreamProvider().getInputStream(url);
+		try {
+			parser.parse(new BufferedReader(new InputStreamReader(inStream)));
+		} finally {
+			inStream.close();
+		}
+
+		// now get the protein structure.
+		return consumer.getStructure();
+	}
+
+	/**
+	 * Parses the URL contents as a PDB file, using a reader configured from the cache.
+	 */
+	private Structure loadPdb(AtomCache cache) throws StructureException,
+			IOException {
+		PDBFileReader reader = new PDBFileReader(cache.getPath());
+		reader.setFetchBehavior(cache.getFetchBehavior());
+		reader.setObsoleteBehavior(cache.getObsoleteBehavior());
+		reader.setFileParsingParameters(cache.getFileParsingParams());
+		return reader.getStructure(url);
+	}
+	
+
+	/**
+	 * Recognizes PDB IDs that occur at the beginning of name followed by some
+	 * delimiter.
+	 * @param name Input filename
+	 * @return A 4-character id-like string, or null if none is found
+	 */
+	private static String guessPDBID(String name) {
+		Matcher match = PDBID_REGEX.matcher(name);
+		if(match.matches()) {
+			return match.group(1);
+		} else {
+			// Give up if doesn't match
+			return null;
+		}
+	}
+
+	/**
+	 * Parses URL parameters into a map. Keys are stored lower-case.
+	 * 
+	 * A parameter without a value (no '=' or nothing after it) is stored
+	 * with a null value.
+	 * 
+	 * @param url URL whose query string should be parsed
+	 * @return Map from lower-case parameter name to decoded value, in the
+	 *  order the parameters appear; empty if there is no query
+	 * @throws UnsupportedEncodingException If UTF-8 decoding is unavailable
+	 */
+	private static Map<String,String> parseQuery(URL url) throws UnsupportedEncodingException {
+		Map<String,String> params = new LinkedHashMap<String, String>();
+		String query = url.getQuery();
+		if( query == null || query.isEmpty()) {
+			// empty query
+			return params;
+		}
+		for(String pair: query.split("&")) {
+			int i = pair.indexOf("=");
+			String key = pair;
+			if(i > 0) {
+				key = URLDecoder.decode(pair.substring(0, i), "UTF-8");
+			}
+			String value = null;
+			if(i > 0 && pair.length() > i+1) {
+				value = URLDecoder.decode(pair.substring(i+1), "UTF-8");
+			}
+			// Locale.ROOT keeps "pdbId"/"chainID" mapping to "pdbid"/"chainid"
+			// regardless of the default locale (e.g. Turkish dotless i).
+			// note that this uses the last instance if a parameter is specified multiple times
+			params.put(key.toLowerCase(Locale.ROOT), value);
+		}
+		return params;
+	}
+}
Original file line number	Diff line number	Diff line change
`@@ -190,6 +190,10 @@ public int hashCode() {`
`190`	`190`
`191`	`191`	`@Override`
`192`	`192`	`public String toString() {`
	`193`	`+ if( start == null && end == null) {`
	`194`	`+ // Indicates the full chain`
	`195`	`+ return chain;`
	`196`	`+ }`
`193`	`197`	`return chain + "_" + start + "-" + end;`
`194`	`198`	`}`
`195`	`199`
Original file line number	Diff line number	Diff line change
`@@ -240,10 +240,11 @@ public List<String> getExtensions() {`
`240`	`240`	`* @param filename`
`241`	`241`	`* @return`
`242`	`242`	`*/`
`243`		`- public StructureFiletype guessFiletype(String filename) {`
	`243`	`+ public static StructureFiletype guessFiletype(String filename) {`
	`244`	`+ String lower = filename.toLowerCase();`
`244`	`245`	`for(StructureFiletype type : StructureFiletype.values()) {`
`245`	`246`	`for(String ext : type.getExtensions()) {`
`246`		`- if(filename.endsWith(ext)) {`
	`247`	`+ if(lower.endsWith(ext.toLowerCase())) {`
`247`	`248`	`return type;`
`248`	`249`	`}`
`249`	`250`	`}`