Skip to content

Commit 5c39b20

Browse files
committed
Now computing mmSize (the macromolecular size of a biological assembly) from the number of operators and chains.
The oligomeric size annotation in the file is now ignored (it is often inconsistent and sometimes not even present).
1 parent 38edbb7 commit 5c39b20

File tree

7 files changed

+77
-95
lines changed

7 files changed

+77
-95
lines changed

biojava-integrationtest/src/test/java/org/biojava/nbio/structure/test/io/TestLongPdbVsMmCifParsing.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ public void testVeryLongPdbVsMmCif() throws IOException, StructureException {
126126

127127
@Test
128128
public void testSingle() throws IOException, StructureException {
129-
testAll(Arrays.asList("4a10"));
129+
testAll(Arrays.asList("4kro"));
130130
}
131131

132132
@After
@@ -394,7 +394,7 @@ private void testHeader(Structure sPdb, Structure sCif) {
394394
// there's an inconsistency in 4amh pdb vs mmCIF in mmSize
395395
if (sPdb.getPDBCode().equalsIgnoreCase("4amh")) continue;
396396

397-
assertEquals("Macromolecular size of assemblies doesn't coincide",
397+
assertEquals("Macromolecular size of assembly "+id+" doesn't coincide",
398398
batPdb.get(id).getMacromolecularSize(), batCif.get(id).getMacromolecularSize());
399399
}
400400
}

biojava-structure/src/main/java/demo/DemoMMCIFReader.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ public static void main(String[] args){
4545

4646
demo.loadSimple();
4747

48-
//demo.loadFromDirectAccess();
48+
demo.loadFromDirectAccess();
4949

5050
}
5151

@@ -84,6 +84,8 @@ public void loadFromDirectAccess(){
8484

8585
try {
8686
Structure s = pdbreader.getStructureById(pdbId);
87+
88+
System.out.println("Getting chain H of 1A4W");
8789

8890
List<Chain> hs = s.getNonPolyChainsByPDB("H");
8991

@@ -97,7 +99,7 @@ public void loadFromDirectAccess(){
9799
}
98100

99101
System.out.println("Accessing QWE directly: ");
100-
Group qwe = h.getGroupByPDB(new ResidueNumber("H",373,null));
102+
Group qwe = s.getNonPolyChainsByPDB("H").get(2).getGroupByPDB(new ResidueNumber("H",373,null));
101103

102104
System.out.println(qwe.getChemComp());
103105

biojava-structure/src/main/java/org/biojava/nbio/structure/io/PDBBioAssemblyParser.java

Lines changed: 31 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
import org.biojava.nbio.structure.jama.Matrix;
2424
import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
2525
import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
26-
import org.slf4j.Logger;
27-
import org.slf4j.LoggerFactory;
26+
//import org.slf4j.Logger;
27+
//import org.slf4j.LoggerFactory;
2828

2929
import java.util.*;
3030

@@ -38,15 +38,14 @@
3838
*/
3939
public class PDBBioAssemblyParser {
4040

41-
private static final Logger logger = LoggerFactory.getLogger(PDBBioAssemblyParser.class);
41+
//private static final Logger logger = LoggerFactory.getLogger(PDBBioAssemblyParser.class);
4242

4343
private Integer currentBioMolecule = null;
4444
private List<String> currentChainIDs = new ArrayList<String>();
4545
private Matrix currentMatrix = null;
4646
private double[] shift = null;
4747
private Map<Integer,BioAssemblyInfo> transformationMap = new HashMap<Integer, BioAssemblyInfo>();
4848
private int modelNumber = 1;
49-
private int currentMmSize;
5049

5150
private List<BiologicalAssemblyTransformation> transformations;
5251

@@ -62,13 +61,17 @@ public void pdb_REMARK_350_Handler(String line) {
6261
initialize();
6362
currentBioMolecule = Integer.parseInt(line.substring(24).trim());
6463

65-
} else if ( line.matches("REMARK 350 \\w+ DETERMINED BIOLOGICAL UNIT:.*" ) ||
66-
line.matches("REMARK 350 \\w+ DETERMINED QUATERNARY STRUCTURE:.*" )) {
64+
}
65+
// not parsing anymore the size (from biojava 5.0), thus this is not needed anymore
66+
// eventually if needed this could be used to
67+
// infer if bioassembly is author or software determined
68+
//else if ( line.matches("REMARK 350 \\w+ DETERMINED BIOLOGICAL UNIT:.*" ) ||
69+
// line.matches("REMARK 350 \\w+ DETERMINED QUATERNARY STRUCTURE:.*" )) {
6770
// text can be :
6871
// author determined biological unit
6972
// software determined quaternary structure
70-
currentMmSize = getMmSize(line);
71-
} else if ( line.startsWith("REMARK 350 APPLY THE FOLLOWING TO CHAINS:")) {
73+
//}
74+
else if ( line.startsWith("REMARK 350 APPLY THE FOLLOWING TO CHAINS:")) {
7275
currentChainIDs.clear();
7376
addToCurrentChainList(line);
7477

@@ -80,10 +83,10 @@ public void pdb_REMARK_350_Handler(String line) {
8083
addToCurrentChainList(line);
8184

8285
} else if ( line.startsWith("REMARK 350 BIOMT")) {
83-
if (readMatrix(line)) {
84-
saveMatrix();
85-
modelNumber++;
86-
}
86+
if (readMatrix(line)) {
87+
saveMatrix();
88+
modelNumber++;
89+
}
8790
}
8891
}
8992

@@ -139,79 +142,18 @@ private void saveMatrix() {
139142
if (!transformationMap.containsKey(currentBioMolecule)) {
140143
BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
141144
bioAssembly.setId(currentBioMolecule);
142-
if (currentMmSize==0) {
143-
logger.warn("No macromolecular size could be parsed for biological assembly {}",currentBioMolecule);
144-
}
145-
bioAssembly.setMacromolecularSize(currentMmSize);
146145
bioAssembly.setTransforms(transformations);
147146
transformationMap.put(currentBioMolecule,bioAssembly);
148147
}
149148
}
150149

151-
private int getMmSize(String line) {
152-
int index = line.indexOf(':');
153-
String mmString = line.substring(index+1,line.length()-1).trim().toLowerCase();
154-
return getSizefromString(mmString);
155-
}
156-
157-
private static int getSizefromString(String oligomer){
158-
int size=0;
159-
160-
oligomer = oligomer.toLowerCase();
161-
162-
if (oligomer.equals("monomeric")) {
163-
size = 1;
164-
} else if (oligomer.equals("dimeric")) {
165-
size = 2;
166-
} else if (oligomer.equals("trimeric")) {
167-
size = 3;
168-
} else if (oligomer.equals("tetrameric")) {
169-
size = 4;
170-
} else if (oligomer.equals("pentameric")) {
171-
size = 5;
172-
} else if (oligomer.equals("hexameric")) {
173-
size = 6;
174-
} else if (oligomer.equals("heptameric")) {
175-
size = 7;
176-
} else if (oligomer.equals("octameric")) {
177-
size = 8;
178-
} else if (oligomer.equals("nonameric")) {
179-
size = 9;
180-
} else if (oligomer.equals("decameric")) {
181-
size = 10;
182-
} else if (oligomer.equals("undecameric")) {
183-
size = 11;
184-
} else if (oligomer.equals("dodecameric")) {
185-
size = 12;
186-
} else if (oligomer.equals("tridecameric")) {
187-
size = 13;
188-
} else if (oligomer.equals("tetradecameric")) {
189-
size = 14;
190-
} else if (oligomer.equals("pentadecameric")) {
191-
size = 15;
192-
} else if (oligomer.equals("hexadecameric")) {
193-
size = 16;
194-
} else if (oligomer.equals("heptadecameric")) {
195-
size = 17;
196-
} else if (oligomer.equals("octadecameric")) {
197-
size = 18;
198-
} else if (oligomer.equals("nonadecameric")) {
199-
size = 19;
200-
} else if (oligomer.equals("eicosameric")) {
201-
size = 20;
202-
} else if( oligomer.matches("(\\d+).*")) {
203-
size = Integer.parseInt((oligomer.replaceAll("(\\d+).*", "$1")));
204-
}
205-
return size;
206-
}
207-
208150
/**
209151
* Parses list of chain ids (A, B, C, etc.)
210152
*/
211153
private void addToCurrentChainList(String line) {
212154
int index = line.indexOf(":");
213155
String chainList = line.substring(index+1).trim();
214-
// split by spaces or commas
156+
// split by spaces or commas
215157
String[] chainIds = chainList.split("[ ,]+");
216158
currentChainIDs.addAll(Arrays.asList(chainIds));
217159
}
@@ -222,6 +164,20 @@ private void initialize() {
222164
currentBioMolecule = null;
223165
shift = new double[3];
224166
modelNumber = 1;
225-
currentMmSize = 0;
167+
}
168+
169+
/**
170+
* Set the macromolecularSize fields of the parsed bioassemblies.
171+
* This can only be called after the full PDB file has been read so that
172+
* all the info for all bioassemblies has been gathered.
173+
* Note that an explicit method to set the field is necessary here because
174+
* in PDB files the transformations contain only the author chain ids, corresponding
175+
* to polymeric chains, whilst in mmCIF files the transformations
176+
* contain all asym ids of both polymers and non-polymers.
177+
*/
178+
public void setMacromolecularSizes() {
179+
for (BioAssemblyInfo bioAssembly : transformationMap.values()) {
180+
bioAssembly.setMacromolecularSize(bioAssembly.getTransforms().size());
181+
}
226182
}
227183
}

biojava-structure/src/main/java/org/biojava/nbio/structure/io/PDBFileParser.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2726,6 +2726,7 @@ private void triggerEndFileChecks(){
27262726
}
27272727

27282728
if ( bioAssemblyParser != null){
2729+
bioAssemblyParser.setMacromolecularSizes();
27292730
pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap());
27302731
}
27312732

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifConsumer.java

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -804,30 +804,35 @@ public void documentEnd() {
804804
// these are the transformations that need to be applied to our model
805805
List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(psa, psags, structOpers);
806806

807-
int mmSize = 0;
808807
int bioAssemblyId = -1;
809808
try {
810809
bioAssemblyId = Integer.parseInt(psa.getId());
811810
} catch (NumberFormatException e) {
812811
logger.info("Could not parse a numerical bio assembly id from '{}'",psa.getId());
813812
}
814-
try {
815-
mmSize = Integer.parseInt(psa.getOligomeric_count());
816-
} catch (NumberFormatException e) {
817-
if (bioAssemblyId!=-1)
818-
// if we have a numerical id, then it's unusual to have no oligomeric size: we warn about it
819-
logger.warn("Could not parse oligomeric count from '{}' for biological assembly id {}",
820-
psa.getOligomeric_count(),psa.getId());
821-
else
822-
// no numerical id (PAU,XAU in virus entries), it's normal to have no oligomeric size
823-
logger.info("Could not parse oligomeric count from '{}' for biological assembly id {}",
824-
psa.getOligomeric_count(),psa.getId());
825-
}
826813

827814
// if bioassembly id is not numerical we throw it away
828815
// this happens usually for viral capsid entries, like 1ei7
829816
// see issue #230 in github
830817
if (bioAssemblyId!=-1) {
818+
int mmSize = 0;
819+
// note that the transforms contain asym ids of both polymers and non-polymers
820+
// For the mmsize, we are only interested in the polymers
821+
for (BiologicalAssemblyTransformation transf:transformations) {
822+
Chain c = structure.getChain(transf.getChainId());
823+
if (c==null) {
824+
logger.warn("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId());
825+
continue;
826+
}
827+
if (c.getEntityType() == EntityType.POLYMER &&
828+
// for entries like 4kro, sugars are annotated as polymers but we
829+
// don't want them in the macromolecularSize count
830+
!c.getEntityInfo().getDescription().contains("SUGAR") ) {
831+
832+
mmSize++;
833+
}
834+
}
835+
831836
BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
832837
bioAssembly.setId(bioAssemblyId);
833838
bioAssembly.setMacromolecularSize(mmSize);

biojava-structure/src/main/java/org/biojava/nbio/structure/quaternary/BioAssemblyInfo.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ public void setTransforms(List<BiologicalAssemblyTransformation> transforms) {
7575

7676
/**
7777
* Returns the macromolecular size of this biological assembly, i.e.
78-
* the number of polymeric chains (protein or nucleotide chains) in the biological
79-
* assembly.
78+
* the number of polymeric chains (protein or nucleotide chains, not sugars)
79+
* in the biological assembly.
8080
* @return
8181
*/
8282
public int getMacromolecularSize() {

biojava-structure/src/test/java/org/biojava/nbio/structure/io/TestQuaternaryStructureProviders.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,19 +50,37 @@ public void test3FAD() throws IOException, StructureException{
5050
@Test
5151
public void test5LDH() throws IOException, StructureException{
5252
testID("5LDH",1, 4);
53-
testID("5LDH",2, 0); // bioassembly 2 has 2 as mmsize, but at the moment the parsed value from mmcif files says 0
53+
54+
// the pdb file of 5ldh contains only 1 bioassembly, whilst the mmcif contains 2,
55+
// thus we can't test here the comparison between the 2
56+
//testID("5LDH",2, 2);
5457

5558
// in 5ldh there's also PAU and XAU but those are ignored, see github issue #230
5659

5760
boolean gotException = false;
5861
try {
62+
AtomCache cache = new AtomCache();
63+
cache.setUseMmCif(true);
64+
StructureIO.setAtomCache(cache);
5965
StructureIO.getBiologicalAssembly("5LDH",3);
6066
} catch (StructureException e) {
6167
gotException = true;
6268
}
6369

6470
assertTrue("Bioassembly 3 for PDB id 5LDH should fail with a StructureException!", gotException);
6571

72+
// bioassembly 2 does exist in mmcif file, let's check that
73+
gotException = false;
74+
try {
75+
AtomCache cache = new AtomCache();
76+
cache.setUseMmCif(true);
77+
StructureIO.setAtomCache(cache);
78+
StructureIO.getBiologicalAssembly("5LDH",2);
79+
} catch (StructureException e) {
80+
gotException = true;
81+
}
82+
assertTrue("Bioassembly 2 for PDB id 5LDH should not fail with a StructureException!", !gotException);
83+
6684
}
6785

6886
@Test

0 commit comments

Comments
 (0)