Skip to content

Commit c84e6ec

Browse files
committed
Merge pull request #1 from josemduarte/entity_features
More refactoring and fixes on entities
2 parents 0693061 + 9a98bd8 commit c84e6ec

9 files changed

Lines changed: 156 additions & 64 deletions

File tree

biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/io/TestLongPdbVsMmCifParsing.java

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ public void testVeryLongPdbVsMmCif() throws IOException, StructureException {
124124

125125
@Test
126126
public void testSingle() throws IOException, StructureException {
127-
testAll(Arrays.asList("1bcr"));
127+
testAll(Arrays.asList("3hbx"));
128128
}
129129

130130
@After
@@ -207,19 +207,36 @@ private void testStructureMethods(Structure sPdb, Structure sCif) {
207207
// TODO journal article not parsed in mmCIF parser
208208
//assertEquals("failed hasJournalArticle",sPdb.hasJournalArticle(),sCif.hasJournalArticle());
209209

210-
// compounds: there's quite some inconsistencies here between pdb and cif:
210+
// entity type should always be present
211+
for (EntityInfo e: sPdb.getEntityInfos()) {
212+
assertNotNull(e.getType());
213+
}
214+
215+
for (EntityInfo e: sCif.getEntityInfos()) {
216+
assertNotNull(e.getType());
217+
}
218+
219+
// entities: there are quite a few inconsistencies here between pdb and cif:
211220
// sugar polymers are not in pdb at all: we avoid them
212-
boolean canCompareCompoundsSize = true;
213-
for (EntityInfo compound: sCif.getEntityInfos()) {
214-
if (compound.getDescription()==null || compound.getDescription().contains("SUGAR")) {
215-
canCompareCompoundsSize = false;
216-
break;
217-
}
221+
boolean canCompareEntityCounts = true;
222+
for (EntityInfo e:sCif.getEntityInfos()) {
223+
if (e.getDescription().contains("SUGAR")) canCompareEntityCounts = false;
218224
}
225+
if (canCompareEntityCounts) {
226+
int entCountCif = 0;
227+
for (EntityInfo e: sCif.getEntityInfos()) {
228+
if (e.getType() == EntityType.POLYMER)
229+
entCountCif++;
219230

220-
if (canCompareCompoundsSize)
221-
assertEquals("failed number of Compounds pdb vs cif", sPdb.getEntityInfos().size(), sCif.getEntityInfos().size());
231+
}
232+
int entCountPdb = 0;
233+
for (EntityInfo e:sPdb.getEntityInfos()) {
234+
if (e.getType() == EntityType.POLYMER)
235+
entCountPdb++;
236+
}
222237

238+
assertEquals("failed number of non-sugar polymeric Entities pdb vs cif", entCountPdb, entCountCif);
239+
}
223240

224241
// ss bonds
225242
// 4ab9 contains an error in ssbond in pdb file (misses 1 ssbond)

biojava-structure/src/main/java/org/biojava/nbio/structure/EntityInfo.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,7 @@ public class EntityInfo implements Serializable {
5656
private final static Logger logger = LoggerFactory.getLogger(EntityInfo.class);
5757

5858

59-
//TODO we should consider having the data here as it is in mmCIF dictionary - JD 2014-12-11
60-
// Especially useful would be to have the polymer/non-polymer/water classification present in mmCIF
61-
// We could drop a lot of the stuff here that is PDB-file related (actually many PDB files don't contain many of these fields)
59+
// TODO We could drop a lot of the stuff here that is PDB-file related (actually many PDB files don't contain many of these fields) - JD 2016-03-25
6260
// The only really essential part of a EntityInfo is the member chains and the entity_id/mol_id
6361
// See also issue https://github.com/biojava/biojava/issues/219
6462

@@ -149,6 +147,8 @@ public EntityInfo (EntityInfo c) {
149147
this.chains2pdbResNums2ResSerials = new HashMap<String, Map<ResidueNumber,Integer>>();
150148

151149
this.molId = c.molId;
150+
151+
this.type = c.type;
152152

153153
this.refChainId = c.refChainId;
154154

biojava-structure/src/main/java/org/biojava/nbio/structure/Structure.java

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -548,13 +548,21 @@ public Chain getChainByPDB(String chainId, int modelnr)
548548
public List<DBRef> getDBRefs();
549549

550550
/**
551-
* Request a particular compound by its molId (entity_id in mmCIF dictionary)
551+
* Request a particular entity by its entity id (mol id in legacy PDB format)
552552
*
553-
* @param molId
553+
* @param entityId
554554
* @return a compound
555+
* @deprecated use {@link #getEntityById(int)} instead
555556
*/
556-
public EntityInfo getCompoundById(int molId);
557+
public EntityInfo getCompoundById(int entityId);
557558

559+
/**
560+
* Request a particular entity by its entity id (mol id in legacy PDB format)
561+
*
562+
* @param entityId
563+
* @return an entity
564+
*/
565+
public EntityInfo getEntityById(int entityId);
558566

559567
/**
560568
* Return the header information for this PDB file

biojava-structure/src/main/java/org/biojava/nbio/structure/StructureImpl.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
import java.util.ListIterator;
3030
import java.util.Map;
3131

32-
import org.biojava.nbio.structure.io.CompoundFinder;
32+
import org.biojava.nbio.structure.io.EntityFinder;
3333
import org.biojava.nbio.structure.io.FileConvert;
3434
import org.slf4j.Logger;
3535
import org.slf4j.LoggerFactory;
@@ -682,8 +682,8 @@ public List<EntityInfo> getEntityInfos() {
682682
// but if the file is incomplete, it won't have the Compounds information and we try
683683
// to guess it from the existing seqres/atom sequences
684684
if (compounds==null || compounds.isEmpty()) {
685-
CompoundFinder cf = new CompoundFinder(this);
686-
this.compounds = cf.findCompounds();
685+
EntityFinder cf = new EntityFinder(this);
686+
this.compounds = cf.findEntities();
687687

688688
// now we need to set references in chains:
689689
for (EntityInfo compound:compounds) {
@@ -698,8 +698,14 @@ public List<EntityInfo> getEntityInfos() {
698698
/** {@inheritDoc} */
699699
@Override
700700
public EntityInfo getCompoundById(int molId) {
701+
return getEntityById(molId);
702+
}
703+
704+
/** {@inheritDoc} */
705+
@Override
706+
public EntityInfo getEntityById(int entityId) {
701707
for (EntityInfo mol : this.compounds){
702-
if (mol.getMolId()==molId){
708+
if (mol.getMolId()==entityId){
703709
return mol;
704710
}
705711
}

biojava-structure/src/main/java/org/biojava/nbio/structure/io/CompoundFinder.java renamed to biojava-structure/src/main/java/org/biojava/nbio/structure/io/EntityFinder.java

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -49,20 +49,20 @@
4949
import java.util.TreeSet;
5050

5151
/**
52-
* Heuristical finding of Compounds (called Entities in mmCIF dictionary)
53-
* in a given Structure. Compounds are the groups of sequence identical NCS-related polymer chains
52+
* Heuristic finding of Entities (called Compounds in legacy PDB format)
53+
* in a given Structure. Entities are the groups of sequence identical NCS-related polymer chains
5454
* in the Structure.
5555
*
56-
* This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB files where
56+
* This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB/mmCIF files where
5757
* possibly no SEQRES is given.
5858
*
59-
* @author duarte_j
59+
* @author Jose Duarte
6060
*/
61-
public class CompoundFinder {
61+
public class EntityFinder {
6262

6363
private Structure s;
6464

65-
private static final Logger logger = LoggerFactory.getLogger(CompoundFinder.class);
65+
private static final Logger logger = LoggerFactory.getLogger(EntityFinder.class);
6666

6767
/**
6868
* Above this ratio of mismatching residue types for same residue numbers we
@@ -82,16 +82,16 @@ public class CompoundFinder {
8282
public static final double GAP_COVERAGE_THRESHOLD = 0.3;
8383

8484

85-
public CompoundFinder(Structure s) {
85+
public EntityFinder(Structure s) {
8686
this.s = s;
8787
}
8888

8989
/**
90-
* Utility method that employs some heuristics to find the Compounds
90+
* Utility method that employs some heuristics to find the {@link EntityInfo}s
9191
* for this Structure in case the information is missing in PDB/mmCIF file
9292
* @return
9393
*/
94-
public List<EntityInfo> findCompounds() {
94+
public List<EntityInfo> findEntities() {
9595

9696
TreeMap<String,EntityInfo> chainIds2entities = findCompoundsFromAlignment();
9797

@@ -150,6 +150,11 @@ public int compare(EntityInfo o1, EntityInfo o2) {
150150
EntityInfo comp = new EntityInfo();
151151
comp.addChain(c);
152152
comp.setMolId(molId);
153+
if (StructureTools.isChainWaterOnly(c)) {
154+
comp.setType(EntityType.WATER);
155+
} else {
156+
comp.setType(EntityType.NONPOLYMER);
157+
}
153158
logger.warn("Chain {} is purely non-polymeric, will assign a new Compound (entity) to it (entity id {})", c.getChainID(), molId);
154159
molId++;
155160

@@ -295,6 +300,7 @@ private TreeMap<String,EntityInfo> findCompoundsFromAlignment() {
295300
ent.addChain(c1);
296301
ent.addChain(c2);
297302
ent.setMolId(molId++);
303+
ent.setType(EntityType.POLYMER);
298304
chainIds2compounds.put(c1.getChainID(), ent);
299305
chainIds2compounds.put(c2.getChainID(), ent);
300306

biojava-structure/src/main/java/org/biojava/nbio/structure/io/PDBFileParser.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
import org.biojava.nbio.structure.Chain;
5656
import org.biojava.nbio.structure.ChainImpl;
5757
import org.biojava.nbio.structure.EntityInfo;
58+
import org.biojava.nbio.structure.EntityType;
5859
import org.biojava.nbio.structure.DBRef;
5960
import org.biojava.nbio.structure.Element;
6061
import org.biojava.nbio.structure.Group;
@@ -1015,6 +1016,9 @@ private void compndValueSetter(String field, String value) {
10151016
current_compound = new EntityInfo();
10161017

10171018
current_compound.setMolId(i);
1019+
1020+
// we will set polymer for all defined compounds in PDB file (non-polymer compounds are not defined in header) - JD 2016-03-25
1021+
current_compound.setType(EntityType.POLYMER);
10181022

10191023
prevMolId = i;
10201024
}
@@ -3002,7 +3006,8 @@ private void setSecElement(List<Map<String,String>> secList, String assignment,
30023006
}
30033007

30043008

3005-
/** After the parsing of a PDB file the {@link Chain} and {@link EntityInfo}
3009+
/**
3010+
* After the parsing of a PDB file the {@link Chain} and {@link EntityInfo}
30063011
* objects need to be linked to each other.
30073012
*
30083013
* @param s the structure
@@ -3078,8 +3083,14 @@ public void linkChains2Compound(Structure s){
30783083
compound.setMolId(findMaxCompoundId(compounds)+1);
30793084
c.setEntityInfo(compound);
30803085
compounds.add(compound);
3086+
3087+
if (StructureTools.isChainWaterOnly(c)) {
3088+
compound.setType(EntityType.WATER);
3089+
} else {
3090+
compound.setType(EntityType.NONPOLYMER);
3091+
}
30813092

3082-
logger.warn("No compound (entity) found in file for chain {}. Creating new compound {} for it.", c.getChainID(), compound.getMolId());
3093+
logger.warn("No compound (entity) found in file for chain {}. Creating new entity {} for it.", c.getChainID(), compound.getMolId());
30833094
}
30843095
}
30853096
}

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifConsumer.java

Lines changed: 42 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -804,7 +804,7 @@ public void documentEnd() {
804804
// compounds (entities)
805805
// In addCompounds above we created the compounds if they were present in the file
806806
// Now we need to make sure that they are linked to chains and also that if they are not present in the file we need to add them now
807-
linkCompounds();
807+
linkEntities();
808808

809809
if (!params.isHeaderOnly()) {
810810

@@ -922,10 +922,10 @@ public void documentEnd() {
922922
}
923923

924924
/**
925-
* Here we link compounds (entities) to chains.
926-
* Also if compounds are not present in file, this initialises the compounds with some heuristics, see {@link CompoundFinder}
925+
* Here we link entities to chains.
926+
* Also if entities are not present in file, this initialises the entities with some heuristics, see {@link EntityFinder}
927927
*/
928-
private void linkCompounds() {
928+
private void linkEntities() {
929929

930930

931931
for (int i =0; i< structure.nrModels() ; i++){
@@ -945,7 +945,7 @@ private void linkCompounds() {
945945
}
946946
int eId = Integer.parseInt(entityId);
947947

948-
// Compounds are not added for non-polymeric entities, if a chain is non-polymeric its compound won't be found.
948+
// Entities are not added for non-polymeric entities; if a chain is non-polymeric, its entity won't be found.
949949
// TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer
950950
// asyms (chains). Either create a unique StructureImpl or modify existing for a better representation of the
951951
// mmCIF internal data structures but is compatible with Structure interface.
@@ -954,39 +954,44 @@ private void linkCompounds() {
954954
// - 3o6j: asym_id K, chainId Z, entity_id 6 : a single water molecule
955955
// - 1dz9: asym_id K, chainId K, entity_id 6 : a potassium ion alone
956956

957-
EntityInfo compound = structure.getCompoundById(eId);
958-
if (compound==null) {
957+
EntityInfo e = structure.getEntityById(eId);
958+
if (e==null) {
959959
// Supports the case where the only chain members were from non-polymeric entity that is missing.
960960
// Solved by creating a new Compound(entity) to which this chain will belong.
961-
logger.warn("Could not find a compound for entity_id {}, for chain id {}, creating a new compound.",
961+
logger.warn("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.",
962962
eId, chain.getChainID());
963-
compound = new EntityInfo();
964-
compound.setMolId(eId);
965-
compound.addChain(chain);
966-
chain.setEntityInfo(compound);
967-
structure.addEntityInfo(compound);
963+
e = new EntityInfo();
964+
e.setMolId(eId);
965+
e.addChain(chain);
966+
if (StructureTools.isChainWaterOnly(chain)) {
967+
e.setType(EntityType.WATER);
968+
} else {
969+
e.setType(EntityType.NONPOLYMER);
970+
}
971+
chain.setEntityInfo(e);
972+
structure.addEntityInfo(e);
968973
} else {
969-
logger.debug("Adding chain with chain id {} (asym id {}) to compound with entity_id {}",
974+
logger.debug("Adding chain with chain id {} (asym id {}) to Entity with entity_id {}",
970975
chain.getChainID(), chain.getInternalChainID(), eId);
971-
compound.addChain(chain);
972-
chain.setEntityInfo(compound);
976+
e.addChain(chain);
977+
chain.setEntityInfo(e);
973978
}
974979

975980
}
976981

977982
}
978983

979-
// to make sure we have Compounds linked to chains, we call getCompounds() which will lazily initialise the
980-
// compounds using heuristics (see CompoundFinder) in the case that they were not explicitly present in the file
981-
List<EntityInfo> compounds = structure.getEntityInfos();
984+
// to make sure we have Entities linked to chains, we call getEntityInfos() which will lazily initialise the
985+
// entities using heuristics (see EntityFinder) in the case that they were not explicitly present in the file
986+
List<EntityInfo> entities = structure.getEntityInfos();
982987

983-
// final sanity check: it can happen that from the annotated compounds some are not linked to any chains
988+
// final sanity check: it can happen that from the annotated entities some are not linked to any chains
984989
// e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds)
985-
// we simply log it, this can sign some other problems if the compounds are used down the line
986-
for (EntityInfo compound:compounds) {
987-
if (compound.getChains().isEmpty()) {
988-
logger.info("Compound {} '{}' has no chains associated to it",
989-
compound.getId()==null?"with no entity id":compound.getId(), compound.getDescription());
990+
// we simply log it; this can signal other problems if the entities are used down the line
991+
for (EntityInfo e:entities) {
992+
if (e.getChains().isEmpty()) {
993+
logger.info("Entity {} '{}' has no chains associated to it",
994+
e.getId()==null?"with no entity id":e.getId(), e.getDescription());
990995
}
991996
}
992997

@@ -1111,25 +1116,30 @@ private void addCompounds(StructAsym asym) {
11111116
try {
11121117
eId = Integer.parseInt(asym.getEntity_id());
11131118
} catch (NumberFormatException e) {
1114-
logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Compound",asym.getEntity_id());
1119+
logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity",asym.getEntity_id());
11151120
}
11161121
Entity e = getEntity(eId);
11171122

11181123
// for some mmCIF files like 1yrm all 3 of _entity_src_gen, _entity_src_nat and _pdbx_entity_src_syn are missing
11191124
// we need to fill the Compounds in some other way:
11201125

1121-
EntityInfo c = structure.getCompoundById(eId);
1126+
EntityInfo c = structure.getEntityById(eId);
11221127

11231128
if (c==null) {
11241129
c = new EntityInfo();
11251130
c.setMolId(eId);
11261131
// we only add the compound if a polymeric one (to match what the PDB parser does)
11271132
if (e!=null) {
11281133
c.setDescription(e.getPdbx_description());
1129-
c.setType(EntityType.entityTypeFromString(e.getType()));
1130-
addAnicilliaryEntityData(asym, eId, e, c);
1134+
EntityType eType = EntityType.entityTypeFromString(e.getType());
1135+
if (eType!=null) {
1136+
c.setType(eType);
1137+
} else {
1138+
logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", e.getType(), eId);
1139+
}
1140+
addAncilliaryEntityData(asym, eId, e, c);
11311141
structure.addEntityInfo(c);
1132-
logger.debug("Adding Compound with entity id {} from _entity, with name: {}",eId, c.getDescription());
1142+
logger.debug("Adding Entity with entity id {} from _entity, with name: {}",eId, c.getDescription());
11331143
}
11341144
}
11351145
}
@@ -1142,7 +1152,7 @@ private void addCompounds(StructAsym asym) {
11421152
* @param entity
11431153
* @param entityInfo
11441154
*/
1145-
private void addAnicilliaryEntityData(StructAsym asym, int entityId, Entity entity, EntityInfo entityInfo) {
1155+
private void addAncilliaryEntityData(StructAsym asym, int entityId, Entity entity, EntityInfo entityInfo) {
11461156
// Loop through each of the entity types and add the corresponding data
11471157
// We're assuming if data is duplicated between sources it is consistent
11481158
// This is a potentially huge assumption...
@@ -1629,7 +1639,7 @@ public void newEntitySrcSyn(EntitySrcSyn entitySrcSyn){
16291639

16301640
/**
16311641
* The EntityPolySeq object provide the amino acid sequence objects for the Entities.
1632-
* Later on the entities are mapped to the BioJava Chain and Compound objects.
1642+
* Later on the entities are mapped to the BioJava {@link Chain} and {@link EntityInfo} objects.
16331643
* @param epolseq the EntityPolySeq record for one amino acid
16341644
*/
16351645
@Override

0 commit comments

Comments
 (0)