|
| 1 | +package org.biojava.nbio.structure; |
| 2 | + |
| 3 | +import java.io.BufferedReader; |
| 4 | +import java.io.IOException; |
| 5 | +import java.io.InputStream; |
| 6 | +import java.io.InputStreamReader; |
| 7 | +import java.io.UnsupportedEncodingException; |
| 8 | +import java.net.MalformedURLException; |
| 9 | +import java.net.URL; |
| 10 | +import java.net.URLDecoder; |
| 11 | +import java.util.Arrays; |
| 12 | +import java.util.Collections; |
| 13 | +import java.util.LinkedHashMap; |
| 14 | +import java.util.List; |
| 15 | +import java.util.Map; |
| 16 | +import java.util.regex.Matcher; |
| 17 | +import java.util.regex.Pattern; |
| 18 | + |
| 19 | +import org.biojava.nbio.core.util.InputStreamProvider; |
| 20 | +import org.biojava.nbio.structure.StructureIO.StructureFiletype; |
| 21 | +import org.biojava.nbio.structure.align.util.AtomCache; |
| 22 | +import org.biojava.nbio.structure.io.PDBFileReader; |
| 23 | +import org.biojava.nbio.structure.io.mmcif.MMcifParser; |
| 24 | +import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer; |
| 25 | +import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser; |
| 26 | +import org.slf4j.Logger; |
| 27 | +import org.slf4j.LoggerFactory; |
| 28 | + |
| 29 | +/** |
| 30 | + * Represents a structure loaded from a URL (including a file URL) |
| 31 | + * |
| 32 | + * A few custom query parameters are supported: |
| 33 | + * |
| 34 | + * <ul> |
| 35 | + * <li><tt>format=[pdb|cif]</tt> Specify the file format (will otherwise be |
| 36 | + * guessed from the extension) |
| 37 | + * <li><tt>pdbId=[String]</tt> Specify the PDB ID (also guessed from the filename) |
| 38 | + * <li><tt>chainID=[String]</tt> A single chain from the structure |
| 39 | + * <li><tt>residues=[String]</tt> Residue ranges, in a form understood by |
| 40 | + * {@link SubstructureIdentifier} |
| 41 | + * </ul> |
| 42 | + * @author Spencer Bliven |
| 43 | + * |
| 44 | + */ |
| 45 | +public class URLIdentifier implements StructureIdentifier { |
| 46 | + private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class); |
| 47 | + |
| 48 | + // Used for guessing the PDB ID from the filename |
| 49 | + private static final Pattern PDBID_REGEX = Pattern.compile("^([0-9][a-z0-9]{3})([._-]|\\s).*",Pattern.CASE_INSENSITIVE); |
| 50 | + |
| 51 | + /** URL parameter specifying the file format (PDB or CIF) */ |
| 52 | + public static final String FORMAT_PARAM = "format"; |
| 53 | + /** URL parameter specifying the PDB ID */ |
| 54 | + public static final String PDBID_PARAM = "pdbid"; |
| 55 | + /** URL parameter specifying a single chain to include; overridden by residues */ |
| 56 | + public static final String CHAINID_PARAM = "chainid"; |
| 57 | + /** URL parameter specifying residue ranges to include, e.g. <tt>residues=A:1-70</tt> |
| 58 | + * @see SubstructureIdentifier |
| 59 | + */ |
| 60 | + public static final String RESIDUES_PARAM = "residues"; |
| 61 | + |
| 62 | + final private URL url; |
| 63 | + public URLIdentifier(URL url) { |
| 64 | + this.url = url; |
| 65 | + } |
| 66 | + |
| 67 | + public URLIdentifier(String url) throws MalformedURLException { |
| 68 | + this(new URL(url)); |
| 69 | + } |
| 70 | + |
| 71 | + public URL getURL() { |
| 72 | + return url; |
| 73 | + } |
| 74 | + @Override |
| 75 | + public String getIdentifier() { |
| 76 | + return url.toString(); |
| 77 | + } |
| 78 | + |
| 79 | + /** |
| 80 | + * @return A SubstructureIdentifier without ranges (e.g. including all residues) |
| 81 | + */ |
| 82 | + @Override |
| 83 | + public SubstructureIdentifier toCanonical() { |
| 84 | + String pdbId = null; |
| 85 | + List<ResidueRange> ranges = Collections.emptyList(); |
| 86 | + try { |
| 87 | + Map<String, String> params = parseQuery(url); |
| 88 | + if(params.containsKey(PDBID_PARAM)) { |
| 89 | + pdbId = params.get(PDBID_PARAM); |
| 90 | + } |
| 91 | + if(params.containsKey(RESIDUES_PARAM)) { |
| 92 | + ranges = ResidueRange.parseMultiple(params.get(RESIDUES_PARAM)); |
| 93 | + } else if(params.containsKey(CHAINID_PARAM)) { |
| 94 | + ranges = Arrays.asList(new ResidueRange(params.get(CHAINID_PARAM),(ResidueNumber)null,(ResidueNumber)null)); |
| 95 | + } |
| 96 | + } catch (UnsupportedEncodingException e) { |
| 97 | + logger.error("Unable to decode URL "+url,e); |
| 98 | + } |
| 99 | + if(pdbId == null) { |
| 100 | + String path = url.getPath(); |
| 101 | + pdbId = guessPDBID(path.substring(path.lastIndexOf("/")+1)); |
| 102 | + } |
| 103 | + return new SubstructureIdentifier(pdbId, ranges); |
| 104 | + } |
| 105 | + |
| 106 | + @Override |
| 107 | + public Structure reduce(Structure input) throws StructureException { |
| 108 | + return toCanonical().reduce(input); |
| 109 | + } |
| 110 | + /** |
| 111 | + * Load the structure from the URL |
| 112 | + * @return null |
| 113 | + */ |
| 114 | + @Override |
| 115 | + public Structure loadStructure(AtomCache cache) throws StructureException, |
| 116 | + IOException { |
| 117 | + StructureFiletype format = StructureFiletype.UNKNOWN; |
| 118 | + |
| 119 | + // Use user-specified format |
| 120 | + try { |
| 121 | + Map<String, String> params = parseQuery(url); |
| 122 | + if(params.containsKey(FORMAT_PARAM)) { |
| 123 | + String formatStr = params.get(FORMAT_PARAM); |
| 124 | + format = StructureIO.guessFiletype("."+formatStr); |
| 125 | + } |
| 126 | + } catch (UnsupportedEncodingException e) { |
| 127 | + logger.error("Unable to decode URL "+url,e); |
| 128 | + } |
| 129 | + |
| 130 | + // Guess format from extension |
| 131 | + if(format == StructureFiletype.UNKNOWN) { |
| 132 | + format = StructureIO.guessFiletype(url.getPath()); |
| 133 | + } |
| 134 | + |
| 135 | + switch(format) { |
| 136 | + case CIF: |
| 137 | + // need to do mmcif parsing! |
| 138 | + |
| 139 | + InputStreamProvider prov = new InputStreamProvider(); |
| 140 | + InputStream inStream = prov.getInputStream(url); |
| 141 | + |
| 142 | + MMcifParser parser = new SimpleMMcifParser(); |
| 143 | + |
| 144 | + SimpleMMcifConsumer consumer = new SimpleMMcifConsumer(); |
| 145 | + consumer.setFileParsingParameters(cache.getFileParsingParams()); |
| 146 | + |
| 147 | + |
| 148 | + parser.addMMcifConsumer(consumer); |
| 149 | + |
| 150 | + try { |
| 151 | + parser.parse(new BufferedReader(new InputStreamReader(inStream))); |
| 152 | + } catch (IOException e){ |
| 153 | + e.printStackTrace(); |
| 154 | + } |
| 155 | + |
| 156 | + // now get the protein structure. |
| 157 | + return consumer.getStructure(); |
| 158 | + default: |
| 159 | + case PDB: |
| 160 | + // pdb file based parsing |
| 161 | + |
| 162 | + PDBFileReader reader = new PDBFileReader(cache.getPath()); |
| 163 | + reader.setFetchBehavior(cache.getFetchBehavior()); |
| 164 | + reader.setObsoleteBehavior(cache.getObsoleteBehavior()); |
| 165 | + reader.setFileParsingParameters(cache.getFileParsingParams()); |
| 166 | + return reader.getStructure(url); |
| 167 | + } |
| 168 | + } |
| 169 | + |
| 170 | + |
| 171 | + /** |
| 172 | + * Recognizes PDB IDs that occur at the beginning of name followed by some |
| 173 | + * delimiter. |
| 174 | + * @param name Input filename |
| 175 | + * @return A 4-character id-like string, or null if none is found |
| 176 | + */ |
| 177 | + private static String guessPDBID(String name) { |
| 178 | + Matcher match = PDBID_REGEX.matcher(name); |
| 179 | + if(match.matches()) { |
| 180 | + return match.group(1); |
| 181 | + } else { |
| 182 | + // Give up if doesn't match |
| 183 | + return null; |
| 184 | + } |
| 185 | + } |
| 186 | + |
| 187 | + /** |
| 188 | + * Parses URL parameters into a map. Keys are stored lower-case. |
| 189 | + * |
| 190 | + * @param url |
| 191 | + * @return |
| 192 | + * @throws UnsupportedEncodingException |
| 193 | + */ |
| 194 | + private static Map<String,String> parseQuery(URL url) throws UnsupportedEncodingException { |
| 195 | + Map<String,String> params = new LinkedHashMap<String, String>(); |
| 196 | + String query = url.getQuery(); |
| 197 | + if( query == null || query.isEmpty()) { |
| 198 | + // empty query |
| 199 | + return params; |
| 200 | + } |
| 201 | + String[] pairs = url.getQuery().split("&"); |
| 202 | + for(String pair: pairs) { |
| 203 | + int i = pair.indexOf("="); |
| 204 | + String key = pair; |
| 205 | + if(i > 0) { |
| 206 | + key = URLDecoder.decode(pair.substring(0, i), "UTF-8"); |
| 207 | + } |
| 208 | + String value = null; |
| 209 | + if(i > 0 && pair.length() > i+1) { |
| 210 | + value = URLDecoder.decode(pair.substring(i+1), "UTF-8"); |
| 211 | + } |
| 212 | + // note that this uses the last instance if a parameter is specified multiple times |
| 213 | + params.put(key.toLowerCase(), value); |
| 214 | + } |
| 215 | + return params; |
| 216 | + } |
| 217 | +} |
0 commit comments