Skip to content

Commit 07a2e46

Browse files
committed
ECOD improvements (Fix #384)
- Add support for parsing ecod versions develop101+, which added a field (format version 1.4) - Made EcodDomain implement StructureIdentifier - Add ECOD support to StructureName, allowing identifiers to be used in AtomCache and most other contexts - Update unit tests to check the latest version
1 parent 70202d0 commit 07a2e46

File tree

6 files changed

+239
-41
lines changed

6 files changed

+239
-41
lines changed

biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/ecod/EcodInstallationTest.java

Lines changed: 95 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@
3939
import java.util.regex.Pattern;
4040

4141
import org.biojava.nbio.core.util.ConcurrencyTools;
42+
import org.biojava.nbio.structure.ResidueNumber;
43+
import org.biojava.nbio.structure.ResidueRange;
44+
import org.biojava.nbio.structure.StructureException;
45+
import org.biojava.nbio.structure.align.util.AtomCache;
4246
import org.biojava.nbio.structure.ecod.EcodDatabase;
4347
import org.biojava.nbio.structure.ecod.EcodDomain;
4448
import org.biojava.nbio.structure.ecod.EcodFactory;
@@ -56,7 +60,13 @@
5660
public class EcodInstallationTest {
5761

5862
private static final Logger logger = LoggerFactory.getLogger(EcodInstallationTest.class);
59-
private static final String VERSION = "develop78";
63+
private static final String VERSION = "develop124"; // Should be updated periodically
64+
65+
// Info about known versions, for testing
66+
private static final int DEVELOP_FIRST_VERSION = 45;
67+
private static final int DEVELOP_LATEST_VERSTION = 124; // Should be updated periodically
68+
//versions known to be unreleased
69+
private static final List<Integer> DEVELOP_VERSIONS_BLACKLIST = Arrays.asList( 85, 107, 113 );
6070

6171
static {
6272
//System.setProperty("Log4jContextSelector", "org.apache.logging.log4j.core.async.AsyncLoggerContextSelector");
@@ -85,8 +95,21 @@ public void testAllDomains() throws IOException {
8595
EcodDatabase ecod = EcodFactory.getEcodDatabase(VERSION);
8696

8797
List<EcodDomain> domains = ecod.getAllDomains();
88-
expected = 423779; //version77
89-
expected = 423869; //version78
98+
// Taken from the official ecod stats file
99+
switch(VERSION) {
100+
case "develop77":
101+
expected = 423825; //version77
102+
break;
103+
case "develop78":
104+
expected = 423869; //version78
105+
break;
106+
case "develop124":
107+
expected = 468680; //version124
108+
break;
109+
default:
110+
fail("Unrecognized version "+VERSION);
111+
return;
112+
}
90113
assertEquals("Wrong number of domains",expected,domains.size());
91114
}
92115

@@ -128,11 +151,12 @@ public void testParsing() throws IOException {
128151
20669l, "e1lyw.1", false,
129152
// Integer xGroup, Integer hGroup, Integer tGroup, Integer fGroup, String pdbId,
130153
1,1,1,2,"1lyw",
131-
// String chainId, String range, String architectureName,
132-
".", "A:3-97,B:106-346", "beta barrels",
154+
// String chainId, String range, String seqId, String architectureName,
155+
".", "A:3-97,B:106-346", "A:3-97,B:1-241", "beta barrels",
133156
// String xGroupName, String hGroupName, String tGroupName,
157+
"cradle loop barrel", "RIFT-related",
158+
"NO_T_NAME",// should be "acid protease" except for bug in develop124
134159
// String fGroupName, Long assemblyId, Set<String> ligands
135-
"cradle loop barrel", "RIFT-related", "acid protease",
136160
"EF00710",//"UNK_F_TYPE",
137161
20669l, Collections.singleton("EPE")
138162
);
@@ -235,23 +259,9 @@ public void testVersion() throws IOException {
235259
//@Ignore // Very slow parsing test
236260
@Test
237261
public void testAllVersions() throws IOException {
238-
// Fetch latest version
239-
EcodDatabase latest = EcodFactory.getEcodDatabase("latest");
240-
String latestVersionStr = latest.getVersion();
241-
int latestVersion = 0;
242-
Matcher match = Pattern.compile("develop([0-9]+)",Pattern.CASE_INSENSITIVE).matcher(latestVersionStr);
243-
if(match.matches())
244-
latestVersion = Integer.parseInt(match.group(1));
245-
latest = null;
246-
247262
// List all versions
248-
int firstVersion = 45;
249-
int lastVersion = Math.max(78,latestVersion);
250-
List<String> versions = new ArrayList<String>();
251-
versions.add("latest");
252-
for(int version = firstVersion; version<= lastVersion;version++) {
253-
versions.add("develop"+version);
254-
}
263+
List<String> versions = getKnownEcodVersions();
264+
versions.add(EcodFactory.DEFAULT_VERSION);
255265

256266
// Parse all versions
257267
for(String version : versions) {
@@ -274,4 +284,67 @@ public void testAllVersions() throws IOException {
274284
}
275285
}
276286
}
287+
288+
@Test
289+
public void testGetStructure() throws IOException, StructureException {
290+
AtomCache cache = new AtomCache();
291+
292+
// Save ECOD version, since AtomCache uses the global default
293+
String prevECOD = EcodFactory.getEcodDatabase().getVersion();
294+
EcodFactory.setEcodDatabase("develop124");
295+
EcodDatabase ecod = EcodFactory.getEcodDatabase();
296+
297+
String name;
298+
EcodDomain id;
299+
List<ResidueRange> ranges;
300+
301+
// Test some cases where Chain and domain number are ambiguous
302+
name = "e1wz2B14";
303+
id = ecod.getDomainsById(name);
304+
assertEquals(name, id.getIdentifier());
305+
ranges = id.getResidueRanges();
306+
assertEquals(1,ranges.size());
307+
assertEquals(new ResidueRange("B", new ResidueNumber("B", 200,null), new ResidueNumber("B",445,null)),ranges.get(0));
308+
cache.getStructure(name);
309+
310+
name = "e3j9zS13";
311+
id = ecod.getDomainsById(name);
312+
assertEquals(name, id.getIdentifier());
313+
ranges = id.getResidueRanges();
314+
assertEquals(1,ranges.size());
315+
assertEquals(new ResidueRange("S1", new ResidueNumber("S1", 288,null), new ResidueNumber("S1",410,null)),ranges.get(0));
316+
cache.getStructure(name);
317+
318+
319+
// Restore previous ECOD database
320+
EcodFactory.setEcodDatabase(prevECOD);
321+
}
322+
323+
/**
324+
* Get a list of all develop versions, generated based on the DEVELOP_*
325+
* static variables.
326+
* @return A list of all development versions: "develop45","develop46",...
327+
*/
328+
public static List<String> getKnownEcodVersions() {
329+
// Parse version from latest.
330+
int latestVersion = DEVELOP_LATEST_VERSTION;
331+
try {
332+
EcodDatabase latest = EcodFactory.getEcodDatabase(EcodFactory.DEFAULT_VERSION);
333+
String latestVersionStr;
334+
latestVersionStr = latest.getVersion();
335+
Matcher match = Pattern.compile("develop([0-9]+)",Pattern.CASE_INSENSITIVE).matcher(latestVersionStr);
336+
if(match.matches())
337+
latestVersion = Integer.parseInt(match.group(1));
338+
latest = null;
339+
} catch (IOException e) {}
340+
latestVersion = Math.max(latestVersion, DEVELOP_LATEST_VERSTION);
341+
342+
List<String> versions = new ArrayList<>(latestVersion-DEVELOP_FIRST_VERSION+2);
343+
for(int version=DEVELOP_FIRST_VERSION;version<=latestVersion;version++) {
344+
if( !DEVELOP_VERSIONS_BLACKLIST.contains(version) ) {
345+
versions.add("develop"+version);
346+
}
347+
}
348+
return versions;
349+
}
277350
}

biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/ecod/EcodParseTest.java

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@
5252
* error messages to print.
5353
*
5454
* Filtering log4j messages to the 'error' level will filter all but the most
55-
* grevious errors.
55+
* grievous errors.
56+
*
57+
* Faster unit tests go in {@link EcodInstallationTest}.
5658
*
5759
* @author blivens
5860
*
@@ -61,14 +63,22 @@ public class EcodParseTest {
6163
private static final Logger logger = LoggerFactory.getLogger(EcodParseTest.class);
6264

6365
public static void main(String[] args) throws IOException {
64-
String ecodVersion = "develop83";
66+
String ecodVersion = "develop124";
67+
// String ecodVersion = "latest";
68+
69+
int errors = testVersion(ecodVersion);
70+
logger.info("Done. {} errors.",errors);
71+
72+
}
73+
74+
private static int testVersion(String ecodVersion) throws IOException {
6575
EcodDatabase ecod = EcodFactory.getEcodDatabase(ecodVersion);
6676
AtomCache cache = new AtomCache();
6777
cache.setObsoleteBehavior(ObsoleteBehavior.FETCH_OBSOLETE);
6878
List<EcodDomain> domains = ecod.getAllDomains();
6979
// domains = Arrays.asList(ecod.getDomainsById("e1yfbB2"));
7080
// domains = Arrays.asList(ecod.getDomainsById("e1w50A2"));
71-
domains = Arrays.asList(ecod.getDomainsById("e2ftlE1"));
81+
// domains = Arrays.asList(ecod.getDomainsById("e2ftlE1"));
7282
int errors = 0;
7383
for(EcodDomain d : domains) {
7484
Atom[] ca1;
@@ -184,6 +194,6 @@ public static void main(String[] args) throws IOException {
184194
//All test passed
185195
logger.info("OK "+d.getDomainId());
186196
}
187-
logger.info("Done. {} errors.",errors);
197+
return errors;
188198
}
189199
}

biojava-structure/src/main/java/org/biojava/nbio/structure/URLIdentifier.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ public Structure loadStructure(AtomCache cache) throws StructureException,
174174
* @param name Input filename
175175
* @return A 4-character id-like string, or null if none is found
176176
*/
177-
private static String guessPDBID(String name) {
177+
public static String guessPDBID(String name) {
178178
Matcher match = PDBID_REGEX.matcher(name);
179179
if(match.matches()) {
180180
return match.group(1);

biojava-structure/src/main/java/org/biojava/nbio/structure/align/client/StructureName.java

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
import org.biojava.nbio.structure.domain.PDPDomain;
4747
import org.biojava.nbio.structure.domain.PDPProvider;
4848
import org.biojava.nbio.structure.domain.RemotePDPProvider;
49+
import org.biojava.nbio.structure.ecod.EcodFactory;
4950
import org.biojava.nbio.structure.io.util.FileDownloadUtils;
5051
import org.biojava.nbio.structure.scop.ScopDatabase;
5152
import org.biojava.nbio.structure.scop.ScopDomain;
@@ -77,7 +78,10 @@ public class StructureName implements Comparable<StructureName>, Serializable, S
7778
protected String chainId;
7879

7980
private static final Pattern cathPattern = Pattern.compile("^([0-9][a-z0-9]{3})(\\w)([0-9]{2})$",Pattern.CASE_INSENSITIVE);
80-
private static final Pattern scopPattern = Pattern.compile("^d([0-9][a-z0-9]{3})(\\w|\\.)(\\w)$",Pattern.CASE_INSENSITIVE);
81+
// ds046__ is a special case with no PDB entry
82+
private static final Pattern scopPattern = Pattern.compile("^d([0-9][a-z0-9]{3}|s046)(\\w|\\.)(\\w)$",Pattern.CASE_INSENSITIVE);
83+
// ECOD chains and domains can't be automatically distinguished. Ex: e3j9zS13 is chain 'S1', e1wz2B14 is chain 'B'
84+
private static final Pattern ecodPattern = Pattern.compile("^e([0-9][a-zA-Z0-9]{3})(?:\\w|\\.)\\w+$",Pattern.CASE_INSENSITIVE);
8185

8286
private enum Source {
8387
PDB,
@@ -86,6 +90,7 @@ private enum Source {
8690
CATH,
8791
URL,
8892
FILE,
93+
ECOD,
8994
};
9095

9196
private Source mySource = null;
@@ -145,11 +150,20 @@ private void init(){
145150
chainId = matcher.group(2);
146151
return;
147152
}
153+
// ECOD
154+
matcher = ecodPattern.matcher(name);
155+
if ( matcher.matches() ){
156+
mySource = Source.ECOD;
157+
pdbId = matcher.group(1);
158+
chainId = null;
159+
return;
160+
}
148161
// URL
149162
try {
150-
new URL(name);
163+
URL url = new URL(name);
151164
mySource = Source.URL;
152-
pdbId = null;
165+
String path = url.getPath();
166+
pdbId = URLIdentifier.guessPDBID( path.substring(path.lastIndexOf('/')+1) );
153167
chainId = null;
154168
return;
155169
} catch(MalformedURLException e) {}
@@ -269,6 +283,10 @@ public boolean isURL() {
269283
public boolean isFile() {
270284
return mySource == Source.FILE;
271285
}
286+
287+
public boolean isEcodDomain() {
288+
return mySource == Source.ECOD;
289+
}
272290

273291
/**
274292
*
@@ -284,6 +302,13 @@ private StructureIdentifier realize() throws StructureException {
284302
case CATH:
285303
realized = CathFactory.getCathDatabase().getDescriptionByCathId(getIdentifier());
286304
break;
305+
case ECOD:
306+
try {
307+
realized = EcodFactory.getEcodDatabase().getDomainsById(name);
308+
} catch (IOException e) {
309+
throw new StructureException("Unable to get ECOD domain "+name,e);
310+
}
311+
break;
287312
case SCOP:
288313
// Fuzzy matching of the domain name to the current default factory
289314
realized = guessScopDomain(getIdentifier(),ScopFactory.getSCOP());

biojava-structure/src/main/java/org/biojava/nbio/structure/ecod/EcodDomain.java

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,18 @@ public class EcodDomain implements Serializable, Cloneable, StructureIdentifier
3838
Column 1: ECOD uid - internal domain unique identifier
3939
Column 2: ECOD domain id - domain identifier
4040
Column 3: ECOD representative status - manual (curated) or automated nonrep
41-
Column 4: ECOD hierachy identifier - [X-group].[H-group].{T-group].[F-group]
41+
Column 4: ECOD hierarchy identifier - [X-group].[H-group].[T-group].[F-group]
4242
Column 5: PDB identifier
4343
Column 6: Chain identifier (note: case-sensitive)
4444
Column 7: PDB residue number range
45-
Column 8: Architecture name
46-
Column 9: X-group name
47-
Column 10: H-group name
48-
Column 11: T-group name
49-
Column 12: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
50-
Column 13: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
51-
Column 14: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
45+
Column 8: seq_id number range (based on internal PDB indices)
46+
Column 9: Architecture name
47+
Column 10: X-group name
48+
Column 11: H-group name
49+
Column 12: T-group name
50+
Column 13: F-group name (F_UNCLASSIFIED denotes that domain has not been assigned to an F-group)
51+
Column 14: Domain assembly status (if domain is member of assembly, partners' ecod domain ids listed)
52+
Column 15: Comma-separated value list of non-polymer entities within 4 A of at least one residue of domain
5253
5354
5455
001502751 e4s1gA1 1.1.1 4s1g A A:68-251 beta barrels cradle loop barrel RIFT-related acid protease F_UNCLASSIFIED NOT_DOMAIN_ASSEMBLY NO_LIGANDS_4A
@@ -69,6 +70,7 @@ public class EcodDomain implements Serializable, Cloneable, StructureIdentifier
6970
private String pdbId;
7071
private String chainId;
7172
private String range;
73+
private String seqIdRange;
7274
private String architectureName;
7375
private String xGroupName;
7476
private String hGroupName;
@@ -85,6 +87,17 @@ public EcodDomain(Long uid, String domainId, Boolean manual,
8587
String chainId, String range, String architectureName,
8688
String xGroupName, String hGroupName, String tGroupName,
8789
String fGroupName, Long assemblyId, Set<String> ligands) {
90+
this(uid, domainId, manual,
91+
xGroup, hGroup, tGroup, fGroup, pdbId,
92+
chainId, range, null, architectureName,
93+
xGroupName, hGroupName, tGroupName,
94+
fGroupName, assemblyId, ligands);
95+
}
96+
public EcodDomain(Long uid, String domainId, Boolean manual,
97+
Integer xGroup, Integer hGroup, Integer tGroup, Integer fGroup, String pdbId,
98+
String chainId, String range, String seqId, String architectureName,
99+
String xGroupName, String hGroupName, String tGroupName,
100+
String fGroupName, Long assemblyId, Set<String> ligands) {
88101
this.uid = uid;
89102
this.domainId = domainId;
90103
this.manual = manual;
@@ -95,6 +108,7 @@ public EcodDomain(Long uid, String domainId, Boolean manual,
95108
this.pdbId = pdbId;
96109
this.chainId = chainId;
97110
this.range = range;
111+
this.seqIdRange = seqId;
98112
this.architectureName = architectureName;
99113
this.xGroupName = xGroupName;
100114
this.hGroupName = hGroupName;
@@ -117,6 +131,7 @@ public EcodDomain(EcodDomain o) {
117131
this.pdbId = o.pdbId;
118132
this.chainId = o.chainId;
119133
this.range = o.range;
134+
this.seqIdRange = o.seqIdRange;
120135
this.architectureName = o.architectureName;
121136
this.xGroupName = o.xGroupName;
122137
this.hGroupName = o.hGroupName;
@@ -126,6 +141,8 @@ public EcodDomain(EcodDomain o) {
126141
this.ligands = new HashSet<String>(o.ligands);
127142
}
128143

144+
145+
129146
@Override
130147
protected Object clone() throws CloneNotSupportedException {
131148
return new EcodDomain(this);
@@ -185,12 +202,30 @@ public String getChainId() {
185202
public void setChainId(String chainId) {
186203
this.chainId = chainId;
187204
}
205+
/**
206+
* Get the range of this domain, in PDB residue numbers (mmCif's
207+
* _pdbx_poly_seq_scheme.pdb_seq_num and pdb_ins_code).
208+
* @return The chain and residue range, e.g. "A:1-100"
209+
*/
188210
public String getRange() {
189211
return range;
190212
}
191213
public void setRange(String range) {
192214
this.range = range;
193215
}
216+
/**
217+
* Get the range of this domain, in 1-based residue indices (mmCif's
218+
* _pdbx_poly_seq_scheme.seq_id)
219+
*
220+
* Note that {@link #getRange()} is used when constructing the domain.
221+
* @return The chain and residue range, e.g. "A:1-100"
222+
*/
223+
public String getSeqIdRange() {
224+
return seqIdRange;
225+
}
226+
public void setSeqIdRange(String seqIdRange) {
227+
this.seqIdRange = seqIdRange;
228+
}
194229
public String getArchitectureName() {
195230
return architectureName;
196231
}

0 commit comments

Comments
 (0)