Skip to content

Commit de3e073

Browse files
committed
Adding URLIdentifier for files and URLs
1 parent 10da1c2 commit de3e073

File tree

9 files changed

+383
-123
lines changed

9 files changed

+383
-123
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/util/InputStreamProvider.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,9 +130,9 @@ public InputStream getInputStream(URL u)
130130
return openCompressedURL(u);
131131
} else if (magic == GZIP_MAGIC ) {
132132
return openGZIPURL(u);
133-
} else if ( u.toString().endsWith(".gz")) {
133+
} else if ( u.getPath().endsWith(".gz")) {
134134
return openGZIPURL(u);
135-
} else if ( u.toString().endsWith(".Z")) {
135+
} else if ( u.getPath().endsWith(".Z")) {
136136
// unix compressed
137137
return openCompressedURL(u);
138138

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package org.biojava.nbio.structure;
2+
3+
import java.io.File;
4+
import java.io.IOException;
5+
import java.util.ArrayList;
6+
import java.util.regex.Matcher;
7+
import java.util.regex.Pattern;
8+
9+
import org.biojava.nbio.structure.align.util.AtomCache;
10+
11+
public class FileIdentifier implements StructureIdentifier {
12+
private static final Pattern PDB_REGEX = Pattern.compile("([0-9][a-z]{3})[._].*",Pattern.CASE_INSENSITIVE);
13+
14+
private final File file;
15+
public FileIdentifier(File file) {
16+
this.file = file;
17+
}
18+
public FileIdentifier(String name) {
19+
this(new File(name));
20+
}
21+
22+
public File getFile() {
23+
return file;
24+
}
25+
26+
@Override
27+
public String getIdentifier() {
28+
return file.toString();
29+
}
30+
31+
@Override
32+
public Structure loadStructure(AtomCache cache) throws StructureException,
33+
IOException {
34+
return null;
35+
}
36+
37+
/**
38+
* Represents the full substructure.
39+
*
40+
* Attempts to guess the PDB ID from the filename, but may give up and set
41+
* it to null.
42+
*/
43+
@Override
44+
public SubstructureIdentifier toCanonical() throws StructureException {
45+
return new SubstructureIdentifier(guessPDBID(file.getName()), new ArrayList<ResidueRange>());
46+
}
47+
48+
private static String guessPDBID(String name) {
49+
Matcher match = PDB_REGEX.matcher(name);
50+
if(match.matches()) {
51+
return match.group(0);
52+
} else {
53+
// Give up if doesn't match
54+
return null;
55+
}
56+
}
57+
/**
58+
* Returns the complete input structure
59+
*/
60+
@Override
61+
public Structure reduce(Structure input) throws StructureException {
62+
return input;
63+
}
64+
65+
}

biojava-structure/src/main/java/org/biojava/nbio/structure/ResidueRange.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,10 @@ public int hashCode() {
190190

191191
@Override
192192
public String toString() {
193+
if( start == null && end == null) {
194+
// Indicates the full chain
195+
return chain;
196+
}
193197
return chain + "_" + start + "-" + end;
194198
}
195199

biojava-structure/src/main/java/org/biojava/nbio/structure/StructureIO.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,10 +240,11 @@ public List<String> getExtensions() {
240240
* @param filename
241241
* @return
242242
*/
243-
public StructureFiletype guessFiletype(String filename) {
243+
public static StructureFiletype guessFiletype(String filename) {
244+
String lower = filename.toLowerCase();
244245
for(StructureFiletype type : StructureFiletype.values()) {
245246
for(String ext : type.getExtensions()) {
246-
if(filename.endsWith(ext)) {
247+
if(lower.endsWith(ext.toLowerCase())) {
247248
return type;
248249
}
249250
}

biojava-structure/src/main/java/org/biojava/nbio/structure/URIIdentifier.java

Lines changed: 0 additions & 52 deletions
This file was deleted.
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
package org.biojava.nbio.structure;
2+
3+
import java.io.BufferedReader;
4+
import java.io.IOException;
5+
import java.io.InputStream;
6+
import java.io.InputStreamReader;
7+
import java.io.UnsupportedEncodingException;
8+
import java.net.MalformedURLException;
9+
import java.net.URL;
10+
import java.net.URLDecoder;
11+
import java.util.Arrays;
12+
import java.util.Collections;
13+
import java.util.LinkedHashMap;
14+
import java.util.List;
15+
import java.util.Map;
16+
import java.util.regex.Matcher;
17+
import java.util.regex.Pattern;
18+
19+
import org.biojava.nbio.core.util.InputStreamProvider;
20+
import org.biojava.nbio.structure.StructureIO.StructureFiletype;
21+
import org.biojava.nbio.structure.align.util.AtomCache;
22+
import org.biojava.nbio.structure.io.PDBFileReader;
23+
import org.biojava.nbio.structure.io.mmcif.MMcifParser;
24+
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer;
25+
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser;
26+
import org.slf4j.Logger;
27+
import org.slf4j.LoggerFactory;
28+
29+
/**
30+
* Represents a structure loaded from a URL (including a file URL)
31+
*
32+
* A few custom query parameters are supported:
33+
*
34+
* <ul>
35+
* <li><tt>format=[pdb|cif]</tt> Specify the file format (will otherwise be
36+
* guessed from the extension)
37+
* <li><tt>pdbId=[String]</tt> Specify the PDB ID (also guessed from the filename)
38+
* <li><tt>chainID=[String]</tt> A single chain from the structure
39+
* <li><tt>residues=[String]</tt> Residue ranges, in a form understood by
40+
* {@link SubstructureIdentifier}
41+
* </ul>
42+
* @author Spencer Bliven
43+
*
44+
*/
45+
public class URLIdentifier implements StructureIdentifier {
46+
private static final Logger logger = LoggerFactory.getLogger(URLIdentifier.class);
47+
48+
// Used for guessing the PDB ID from the filename
49+
private static final Pattern PDBID_REGEX = Pattern.compile("^([0-9][a-z0-9]{3})([._-]|\\s).*",Pattern.CASE_INSENSITIVE);
50+
51+
/** URL parameter specifying the file format (PDB or CIF) */
52+
public static final String FORMAT_PARAM = "format";
53+
/** URL parameter specifying the PDB ID */
54+
public static final String PDBID_PARAM = "pdbid";
55+
/** URL parameter specifying a single chain to include; overridden by residues */
56+
public static final String CHAINID_PARAM = "chainid";
57+
/** URL parameter specifying residue ranges to include, e.g. <tt>residues=A:1-70</tt>
58+
* @see SubstructureIdentifier
59+
*/
60+
public static final String RESIDUES_PARAM = "residues";
61+
62+
final private URL url;
63+
public URLIdentifier(URL url) {
64+
this.url = url;
65+
}
66+
67+
public URLIdentifier(String url) throws MalformedURLException {
68+
this(new URL(url));
69+
}
70+
71+
public URL getURL() {
72+
return url;
73+
}
74+
@Override
75+
public String getIdentifier() {
76+
return url.toString();
77+
}
78+
79+
/**
80+
* @return A SubstructureIdentifier without ranges (e.g. including all residues)
81+
*/
82+
@Override
83+
public SubstructureIdentifier toCanonical() {
84+
String pdbId = null;
85+
List<ResidueRange> ranges = Collections.emptyList();
86+
try {
87+
Map<String, String> params = parseQuery(url);
88+
if(params.containsKey(PDBID_PARAM)) {
89+
pdbId = params.get(PDBID_PARAM);
90+
}
91+
if(params.containsKey(RESIDUES_PARAM)) {
92+
ranges = ResidueRange.parseMultiple(params.get(RESIDUES_PARAM));
93+
} else if(params.containsKey(CHAINID_PARAM)) {
94+
ranges = Arrays.asList(new ResidueRange(params.get(CHAINID_PARAM),(ResidueNumber)null,(ResidueNumber)null));
95+
}
96+
} catch (UnsupportedEncodingException e) {
97+
logger.error("Unable to decode URL "+url,e);
98+
}
99+
if(pdbId == null) {
100+
String path = url.getPath();
101+
pdbId = guessPDBID(path.substring(path.lastIndexOf("/")+1));
102+
}
103+
return new SubstructureIdentifier(pdbId, ranges);
104+
}
105+
106+
@Override
107+
public Structure reduce(Structure input) throws StructureException {
108+
return toCanonical().reduce(input);
109+
}
110+
/**
111+
* Load the structure from the URL
112+
* @return null
113+
*/
114+
@Override
115+
public Structure loadStructure(AtomCache cache) throws StructureException,
116+
IOException {
117+
StructureFiletype format = StructureFiletype.UNKNOWN;
118+
119+
// Use user-specified format
120+
try {
121+
Map<String, String> params = parseQuery(url);
122+
if(params.containsKey(FORMAT_PARAM)) {
123+
String formatStr = params.get(FORMAT_PARAM);
124+
format = StructureIO.guessFiletype("."+formatStr);
125+
}
126+
} catch (UnsupportedEncodingException e) {
127+
logger.error("Unable to decode URL "+url,e);
128+
}
129+
130+
// Guess format from extension
131+
if(format == StructureFiletype.UNKNOWN) {
132+
format = StructureIO.guessFiletype(url.getPath());
133+
}
134+
135+
switch(format) {
136+
case CIF:
137+
// need to do mmcif parsing!
138+
139+
InputStreamProvider prov = new InputStreamProvider();
140+
InputStream inStream = prov.getInputStream(url);
141+
142+
MMcifParser parser = new SimpleMMcifParser();
143+
144+
SimpleMMcifConsumer consumer = new SimpleMMcifConsumer();
145+
consumer.setFileParsingParameters(cache.getFileParsingParams());
146+
147+
148+
parser.addMMcifConsumer(consumer);
149+
150+
try {
151+
parser.parse(new BufferedReader(new InputStreamReader(inStream)));
152+
} catch (IOException e){
153+
e.printStackTrace();
154+
}
155+
156+
// now get the protein structure.
157+
return consumer.getStructure();
158+
default:
159+
case PDB:
160+
// pdb file based parsing
161+
162+
PDBFileReader reader = new PDBFileReader(cache.getPath());
163+
reader.setFetchBehavior(cache.getFetchBehavior());
164+
reader.setObsoleteBehavior(cache.getObsoleteBehavior());
165+
reader.setFileParsingParameters(cache.getFileParsingParams());
166+
return reader.getStructure(url);
167+
}
168+
}
169+
170+
171+
/**
172+
* Recognizes PDB IDs that occur at the beginning of name followed by some
173+
* delimiter.
174+
* @param name Input filename
175+
* @return A 4-character id-like string, or null if none is found
176+
*/
177+
private static String guessPDBID(String name) {
178+
Matcher match = PDBID_REGEX.matcher(name);
179+
if(match.matches()) {
180+
return match.group(1);
181+
} else {
182+
// Give up if doesn't match
183+
return null;
184+
}
185+
}
186+
187+
/**
188+
* Parses URL parameters into a map. Keys are stored lower-case.
189+
*
190+
* @param url
191+
* @return
192+
* @throws UnsupportedEncodingException
193+
*/
194+
private static Map<String,String> parseQuery(URL url) throws UnsupportedEncodingException {
195+
Map<String,String> params = new LinkedHashMap<String, String>();
196+
String query = url.getQuery();
197+
if( query == null || query.isEmpty()) {
198+
// empty query
199+
return params;
200+
}
201+
String[] pairs = url.getQuery().split("&");
202+
for(String pair: pairs) {
203+
int i = pair.indexOf("=");
204+
String key = pair;
205+
if(i > 0) {
206+
key = URLDecoder.decode(pair.substring(0, i), "UTF-8");
207+
}
208+
String value = null;
209+
if(i > 0 && pair.length() > i+1) {
210+
value = URLDecoder.decode(pair.substring(i+1), "UTF-8");
211+
}
212+
// note that this uses the last instance if a parameter is specified multiple times
213+
params.put(key.toLowerCase(), value);
214+
}
215+
return params;
216+
}
217+
}

0 commit comments

Comments
 (0)