Skip to content

Commit 36a8fea

Browse files
committed
Better parsing of COMPND lines in PDB files, biojava#305
1 parent b99e061 commit 36a8fea

File tree

4 files changed

+75
-103
lines changed

4 files changed

+75
-103
lines changed

biojava-structure/src/main/java/org/biojava/nbio/structure/io/CompoundFinder.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
import java.util.TreeMap;
4646

4747
/**
48-
* Heuristically finding of Compounds (called Entities in mmCIF dictionary)
48+
* Heuristical finding of Compounds (called Entities in mmCIF dictionary)
4949
* in a given Structure. Compounds are the groups of sequence identical NCS-related polymer chains
5050
* in the Structure.
5151
*

biojava-structure/src/main/java/org/biojava/nbio/structure/io/PDBFileParser.java

Lines changed: 42 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -153,17 +153,18 @@ public class PDBFileParser {
153153
private Map<String, Site> siteMap = new LinkedHashMap<String, Site>();
154154
private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<String, List<ResidueNumber>>();
155155

156-
private Matrix4d currentNcsOp = null;
156+
private Matrix4d currentNcsOp;
157157
private List<Matrix4d> ncsOperators;
158158

159159
// for storing LINK until we have all the atoms parsed
160160
private List<LinkRecord> linkRecords;
161161

162162
// for parsing COMPOUND and SOURCE Header lines
163-
private int molTypeCounter = 1;
164-
//private int continuationNo;
163+
private int prevMolId;
164+
private String previousContinuationField;
165165
private String continuationField;
166-
private String continuationString = "";
166+
private String continuationString;
167+
167168
private DateFormat dateFormat;
168169

169170
// for rfree parsing
@@ -205,9 +206,6 @@ public class PDBFileParser {
205206

206207

207208

208-
209-
private String previousContinuationField = "";
210-
211209
/** Secondary structure assigned by the PDB author/
212210
*
213211
*/
@@ -264,7 +262,7 @@ public PDBFileParser() {
264262
helixList = new ArrayList<Map<String,String>>();
265263
strandList = new ArrayList<Map<String,String>>();
266264
turnList = new ArrayList<Map<String,String>>();
267-
current_compound = new Compound();
265+
current_compound = null;
268266
dbrefs = new ArrayList<DBRef>();
269267
siteMap = null;
270268
dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US);
@@ -739,23 +737,7 @@ private void pdb_REVDAT_Handler(String line) {
739737
*/
740738
private void pdb_SEQRES_Handler(String line) {
741739

742-
// System.out.println("PDBFileParser.pdb_SEQRES_Handler: BEGIN");
743-
// System.out.println(line);
744-
745-
//TODO: treat the following residues as amino acids?
746740
/*
747-
MSE Selenomethionine
748-
CSE Selenocysteine
749-
PTR Phosphotyrosine
750-
SEP Phosphoserine
751-
TPO Phosphothreonine
752-
HYP 4-hydroxyproline
753-
5HP Pyroglutamic acid; 5-hydroxyproline
754-
PCA Pyroglutamic Acid
755-
LYZ 5-hydroxylysine
756-
GLX Glu or Gln
757-
ASX Asp or Asn
758-
GLA gamma-carboxy-glutamic acid
759741
1 2 3 4 5 6 7
760742
1234567890123456789012345678901234567890123456789012345678901234567890
761743
SEQRES 1 A 376 LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR
@@ -916,10 +898,6 @@ private void pdb_JRNL_Handler(String line) {
916898
*/
917899
private void pdb_COMPND_Handler(String line) {
918900

919-
String continuationNr = line.substring(9, 10).trim();
920-
921-
logger.debug("current continuationNo is "
922-
+ continuationNr);
923901
logger.debug("previousContinuationField is "
924902
+ previousContinuationField);
925903
logger.debug("current continuationField is "
@@ -941,41 +919,29 @@ private void pdb_COMPND_Handler(String line) {
941919
line = line.substring(0, 72);
942920
}
943921

944-
//String beginningOfLine = line.substring(0, 10);
945-
//line = line.replace(beginningOfLine, "");
946922
line = line.substring(10, line.length());
947923

948-
949-
logger.debug("LINE: >" + line + "<");
950-
951-
String[] fieldList = line.split("\\s+");
924+
925+
String[] fieldList = line.trim().split("\\s+");
952926
int fl = fieldList.length;
953-
if ((fl >0 ) && (!fieldList[0].equals(""))
954-
&& compndFieldValues.contains(fieldList[0])) {
955-
// System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'");
927+
if ((fl >0 ) && compndFieldValues.contains(fieldList[0])) {
928+
956929
continuationField = fieldList[0];
957930
if (previousContinuationField.equals("")) {
958931
previousContinuationField = continuationField;
959932
}
960-
961-
} else if ((fl >1 ) && compndFieldValues.contains(fieldList[1])) {
962-
// System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'");
963-
continuationField = fieldList[1];
964-
if (previousContinuationField.equals("")) {
965-
previousContinuationField = continuationField;
933+
934+
} else if (fl>0) {
935+
// the ':' character indicates the end of a field name and should be invalid as part of the first data token
936+
// e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check
937+
if (fieldList[0].contains(":") ) {
938+
logger.info("COMPND line does not follow the PDB 3.0 format. Note that COMPND parsing is not supported any longer in format 2.3 or earlier");
939+
return;
966940
}
967941

968942
} else {
969-
if (continuationNr.equals("")) {
970-
971-
logger.debug("looks like an old PDB file");
972-
973-
continuationField = "MOLECULE:";
974-
if (previousContinuationField.equals("")) {
975-
previousContinuationField = continuationField;
976-
}
977-
}
978-
943+
944+
// the line will be added as data to the previous field
979945
}
980946

981947
line = line.replace(continuationField, "").trim();
@@ -1025,41 +991,46 @@ private void pdb_COMPND_Handler(String line) {
1025991
// System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header.");
1026992
compndValueSetter(continuationField, continuationString);
1027993
continuationString = "";
1028-
compounds.add(current_compound);
994+
if (current_compound!=null) compounds.add(current_compound);
1029995
}
1030996
}
1031997

1032-
/** set the value in the currrent molId object
1033-
*
998+
/**
999+
* Set the value in the current molId object
10341000
* @param field
10351001
* @param value
10361002
*/
10371003
private void compndValueSetter(String field, String value) {
10381004

10391005
value = value.trim().replace(";", "");
10401006
if (field.equals("MOL_ID:")) {
1041-
1042-
//TODO: find out why an extra mol or chain gets added and why 1H1J, 1J1H ATOM records are missing, but not 1H1H....
1043-
1044-
logger.debug("molTypeCounter " + molTypeCounter + " "
1045-
+ value);
1007+
10461008
int i = -1;
10471009
try {
10481010
i = Integer.valueOf(value);
10491011
} catch (NumberFormatException e){
1050-
logger.warn(e.getMessage() + " while trying to parse COMPND MOL_ID line.");
1012+
logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value);
10511013
}
1052-
if (molTypeCounter != i) {
1053-
molTypeCounter++;
1014+
if (i>0 && prevMolId!=i) {
1015+
1016+
if (current_compound!=null) compounds.add(current_compound);
10541017

1055-
compounds.add(current_compound);
1056-
current_compound = null;
1018+
logger.debug("Initialising new Compound with mol_id {}", i);
1019+
10571020
current_compound = new Compound();
1058-
1021+
1022+
current_compound.setMolId(i);
1023+
1024+
prevMolId = i;
10591025
}
10601026

1061-
current_compound.setMolId(i);
10621027
}
1028+
1029+
// if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return
1030+
if (current_compound==null) {
1031+
return;
1032+
}
1033+
10631034
if (field.equals("MOLECULE:")) {
10641035
current_compound.setMolName(value);
10651036

@@ -2665,14 +2636,15 @@ public Structure parsePDBFile(BufferedReader buf)
26652636
current_group = null ;
26662637
pdbHeader = new PDBHeader();
26672638
connects = new ArrayList<Map<String,Integer>>();
2639+
previousContinuationField = "";
26682640
continuationField = "";
26692641
continuationString = "";
2670-
current_compound = new Compound();
2642+
current_compound = null;
26712643
sourceLines.clear();
26722644
compndLines.clear();
26732645
isLastCompndLine = false;
26742646
isLastSourceLine = false;
2675-
molTypeCounter = 1;
2647+
prevMolId = -1;
26762648
compounds.clear();
26772649
helixList.clear();
26782650
strandList.clear();

biojava-structure/src/test/java/org/biojava/nbio/structure/PdbFileFormat30Test.java

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -23,30 +23,30 @@
2323

2424
package org.biojava.nbio.structure;
2525

26-
import junit.framework.TestCase;
2726
import org.biojava.nbio.structure.io.FileParsingParameters;
2827
import org.biojava.nbio.structure.io.PDBFileParser;
2928

3029
import java.io.IOException;
3130
import java.io.InputStream;
3231
import java.util.List;
3332

33+
import org.junit.Test;
3434

35-
public class PdbFileFormat30Test extends TestCase {
35+
import static org.junit.Assert.*;
3636

37-
3837

39-
@Override
40-
protected void setUp(){
38+
public class PdbFileFormat30Test {
39+
40+
4141

42-
}
4342

44-
/** there is a file format change in v3.0 of the PDB file format
43+
/**
44+
* There is a file format change in v3.0 of the PDB file format
4545
* this test makes sure that the atom name changes are being processed correctly
46-
*
47-
*
46+
* @throws IOException
4847
*/
49-
public void testRead30File(){
48+
@Test
49+
public void testRead30File() throws IOException{
5050
Structure s = getStructure("/388d_v30.pdb");
5151
int nrNuc = getNrNucleotides(s);
5252

@@ -69,7 +69,8 @@ public void testRead30File(){
6969

7070
}
7171

72-
public void testRead23File(){
72+
@Test
73+
public void testRead23File() throws IOException{
7374

7475
Structure s = getStructure("/388d_v23.pdb");
7576
int nrNuc = getNrNucleotides(s);
@@ -89,7 +90,7 @@ public void testRead23File(){
8990
assertTrue(mol.getMolName().startsWith("DNA"));
9091
}
9192

92-
private Structure getStructure(String fileName){
93+
private Structure getStructure(String fileName) throws IOException{
9394

9495
InputStream inStream = this.getClass().getResourceAsStream(fileName);
9596
assertNotNull(inStream);
@@ -99,11 +100,9 @@ private Structure getStructure(String fileName){
99100
params.setAlignSeqRes(false);
100101
pdbpars.setFileParsingParameters(params);
101102
Structure structure = null;
102-
try {
103-
structure = pdbpars.parsePDBFile(inStream) ;
104-
} catch (IOException e) {
105-
e.printStackTrace();
106-
}
103+
104+
structure = pdbpars.parsePDBFile(inStream) ;
105+
107106
return structure;
108107
}
109108

@@ -123,26 +122,28 @@ private int getNrNucleotides(Structure s){
123122
return nr;
124123
}
125124

126-
/**
127-
* Checks that the legacy file check is working and that that non-legacy
128-
* files have the correct number of chains when the line length is over
129-
* 72 characters.
130-
*/
131-
public void testIsLegacyFormat_pdb_COMPND_handler(){
125+
/**
126+
* Checks that the legacy file check is working and that non-legacy
127+
* files have the correct number of chains when the line length is over
128+
* 72 characters.
129+
* @throws IOException
130+
*/
131+
@Test
132+
public void testIsLegacyFormat_pdb_COMPND_handler() throws IOException{
132133

133134
Structure s = getStructure("/3mk3.pdb");
134135

135136
List<Compound> compounds= s.getCompounds();
136137
assertTrue(compounds.size() == 1);
137138
Compound mol = compounds.get(0);
138139
assertTrue(mol.getMolName().equals("6,7-DIMETHYL-8-RIBITYLLUMAZINE SYNTHASE"));
139-
assertEquals(60, mol.getChainIds().size());
140-
assertEquals(60, mol.getChains().size());
141-
assertTrue(mol.getChainIds().contains("S"));
142-
assertTrue(mol.getChainIds().contains("T"));
143-
assertTrue(mol.getChainIds().contains("U"));
144-
assertTrue(mol.getChainIds().contains("g"));
145-
assertTrue(mol.getChainIds().contains("h"));
146-
assertTrue(mol.getChainIds().contains("i"));
140+
assertEquals(60, mol.getChainIds().size());
141+
assertEquals(60, mol.getChains().size());
142+
assertTrue(mol.getChainIds().contains("S"));
143+
assertTrue(mol.getChainIds().contains("T"));
144+
assertTrue(mol.getChainIds().contains("U"));
145+
assertTrue(mol.getChainIds().contains("g"));
146+
assertTrue(mol.getChainIds().contains("h"));
147+
assertTrue(mol.getChainIds().contains("i"));
147148
}
148149
}

biojava-structure/src/test/java/org/biojava/nbio/structure/TestCalc.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import javax.vecmath.Matrix4d;
66
import javax.vecmath.Point3d;
77

8-
import org.biojava.nbio.structure.align.util.RotationAxis;
98
import org.biojava.nbio.structure.jama.Matrix;
109
import org.junit.Test;
1110

0 commit comments

Comments
 (0)