Skip to content

Commit 121aa1d

Browse files
author
Matt Larson
committed
Issue 330: fixes for PDB Parser,
* Handle short CONECT and LINK lines. * Restore bond building from LINK records. * Add an exception handler around the line handlers for more robust parsing when the file format is violated.
1 parent 4551d74 commit 121aa1d

File tree

2 files changed

+211
-55
lines changed

2 files changed

+211
-55
lines changed

biojava-structure/src/main/java/org/biojava/nbio/structure/io/PDBFileParser.java

Lines changed: 136 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,10 @@
5252
import org.biojava.nbio.structure.Author;
5353
import org.biojava.nbio.structure.Chain;
5454
import org.biojava.nbio.structure.ChainImpl;
55-
import org.biojava.nbio.structure.EntityInfo;
56-
import org.biojava.nbio.structure.EntityType;
5755
import org.biojava.nbio.structure.DBRef;
5856
import org.biojava.nbio.structure.Element;
57+
import org.biojava.nbio.structure.EntityInfo;
58+
import org.biojava.nbio.structure.EntityType;
5959
import org.biojava.nbio.structure.Group;
6060
import org.biojava.nbio.structure.GroupIterator;
6161
import org.biojava.nbio.structure.HetatomImpl;
@@ -71,6 +71,7 @@
7171
import org.biojava.nbio.structure.StructureTools;
7272
import org.biojava.nbio.structure.io.mmcif.ChemCompGroupFactory;
7373
import org.biojava.nbio.structure.io.mmcif.model.ChemCompAtom;
74+
import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord;
7475
import org.biojava.nbio.structure.secstruc.SecStrucInfo;
7576
import org.biojava.nbio.structure.secstruc.SecStrucType;
7677
import org.biojava.nbio.structure.xtal.CrystalCell;
@@ -186,6 +187,9 @@ public class PDBFileParser {
186187
private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<String, List<ResidueNumber>>();
187188

188189
private List<SSBondImpl> ssbonds = new ArrayList<>();
190+
191+
// LINK records are stored here until all the atoms have been parsed
192+
private List<LinkRecord> linkRecords;
189193

190194
private Matrix4d currentNcsOp;
191195
private List<Matrix4d> ncsOperators;
@@ -288,6 +292,8 @@ public PDBFileParser() {
288292
// set the correct max values for parsing...
289293
loadMaxAtoms = params.getMaxAtoms();
290294
atomCAThreshold = params.getAtomCaThreshold();
295+
296+
linkRecords = new ArrayList<LinkRecord>();
291297

292298
blankChainIdsPresent = false;
293299

@@ -1984,6 +1990,8 @@ private void switchCAOnly(){
19841990

19851991
/** Saves repeating a few lines ... */
19861992
private Integer conect_helper (String line,int start,int end) {
1993+
if (line.length() < end) return null;
1994+
19871995
String sbond = line.substring(start,end).trim();
19881996
int bond = -1 ;
19891997
Integer b = null ;
@@ -2236,8 +2244,73 @@ private void pdb_SSBOND_Handler(String line){
22362244
}
22372245

22382246

2247+
/**
2248+
* Parses one LINK record and queues it in {@code linkRecords}; header-only parsing and lines shorter than 56 characters queue nothing, and iCode2, sym1 and sym2 are stored as {@code null} when the line ends before them. LINK records take the format of:
2249+
*
2250+
* <pre>
2251+
* COLUMNS DATA TYPE FIELD DEFINITION
2252+
* --------------------------------------------------------------------------------
2253+
* 1 - 6 Record name "LINK "
2254+
* 13 - 16 Atom name1 Atom name.
2255+
* 17 Character altLoc1 Alternate location indicator.
2256+
* 18 - 20 Residue name resName1 Residue name.
2257+
* 22 Character chainID1 Chain identifier.
2258+
* 23 - 26 Integer resSeq1 Residue sequence number.
2259+
* 27 AChar iCode1 Insertion code.
2260+
* 43 - 46 Atom name2 Atom name.
2261+
* 47 Character altLoc2 Alternate location indicator.
2262+
* 48 - 50 Residue name resName2 Residue name.
2263+
* 52 Character chainID2 Chain identifier.
2264+
* 53 - 56 Integer resSeq2 Residue sequence number.
2265+
* 57 AChar iCode2 Insertion code.
2266+
* 60 - 65 SymOP sym1 Symmetry operator for 1st atom.
2267+
* 67 - 72 SymOP sym2 Symmetry operator for 2nd atom.
2268+
* </pre>
2269+
*
2270+
* (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK)
2271+
*
2272+
* @param line the LINK record line to parse.
2273+
*/
2274+
private void pdb_LINK_Handler(String line) {
2275+
2276+
if (params.isHeaderOnly()) return;
2277+
2278+
// Columns 1-56 (up to and including resSeq2) are the minimum required; every field after them is optional.
2279+
if (line.length()<56) {
2280+
logger.info("LINK line has length under 56. Ignoring it.");
2281+
return;
2282+
}
2283+
2284+
int len = line.length();
2285+
2286+
String name1 = line.substring(12, 16).trim();
2287+
String altLoc1 = line.substring(16, 17).trim();
2288+
String resName1 = line.substring(17, 20).trim();
2289+
String chainID1 = line.substring(21, 22).trim();
2290+
String resSeq1 = line.substring(22, 26).trim();
2291+
String iCode1 = line.substring(26, 27).trim();
2292+
2293+
String name2 = line.substring(42, 46).trim();
2294+
String altLoc2 = line.substring(46, 47).trim();
2295+
String resName2 = line.substring(47, 50).trim();
2296+
String chainID2 = line.substring(51, 52).trim();
2297+
String resSeq2 = line.substring(52, 56).trim();
2298+
String iCode2 = null; // Column 57 may be missing if trailing blanks were trimmed from the line; iCode2 then stays null.
2299+
if (len > 56) iCode2 = line.substring(56, 57).trim();
2300+
2301+
String sym1 = null; // optional: only read when the line reaches column 65
2302+
if (len > 64) sym1 = line.substring(59, 65).trim();
2303+
String sym2 = null; // optional: only read when the line reaches column 72
2304+
if (len > 71) sym2 = line.substring(66, 72).trim();
2305+
2306+
linkRecords.add(new LinkRecord( // queued only; formBonds() later passes each record to BondMaker.formLinkRecordBond
2307+
name1, altLoc1, resName1, chainID1, resSeq1, iCode1,
2308+
name2, altLoc2, resName2, chainID2, resSeq2, iCode2,
2309+
sym1, sym2));
2310+
}
2311+
22392312
/**
2240-
* Handler for the SITE records.
2313+
* Handler for the SITE records. <br>
22412314
*
22422315
* <pre>
22432316
*
@@ -2520,6 +2593,7 @@ public Structure parsePDBFile(BufferedReader buf)
25202593
lengthCheck = -1;
25212594
atomCount = 0;
25222595
atomOverflow = false;
2596+
linkRecords = new ArrayList<LinkRecord>();
25232597
siteToResidueMap.clear();
25242598

25252599
blankChainIdsPresent = false;
@@ -2552,58 +2626,58 @@ public Structure parsePDBFile(BufferedReader buf)
25522626
recordName = line.trim();
25532627
else
25542628
recordName = line.substring (0, 6).trim ();
2555-
2556-
if (recordName.equals("ATOM"))
2557-
pdb_ATOM_Handler(line);
2558-
else if (recordName.equals("SEQRES"))
2559-
pdb_SEQRES_Handler(line);
2560-
else if (recordName.equals("HETATM"))
2561-
pdb_ATOM_Handler(line);
2562-
else if (recordName.equals("MODEL"))
2563-
pdb_MODEL_Handler(line);
2564-
else if (recordName.equals("TER"))
2565-
pdb_TER_Handler();
2566-
else if (recordName.equals("HEADER"))
2567-
pdb_HEADER_Handler(line);
2568-
else if (recordName.equals("AUTHOR"))
2569-
pdb_AUTHOR_Handler(line);
2570-
else if (recordName.equals("TITLE"))
2571-
pdb_TITLE_Handler(line);
2572-
else if (recordName.equals("SOURCE"))
2573-
sourceLines.add(line); //pdb_SOURCE_Handler
2574-
else if (recordName.equals("COMPND"))
2575-
compndLines.add(line); //pdb_COMPND_Handler
2576-
else if (recordName.equals("JRNL"))
2577-
pdb_JRNL_Handler(line);
2578-
else if (recordName.equals("EXPDTA"))
2579-
pdb_EXPDTA_Handler(line);
2580-
else if (recordName.equals("CRYST1"))
2581-
pdb_CRYST1_Handler(line);
2582-
else if (recordName.startsWith("MTRIX"))
2583-
pdb_MTRIXn_Handler(line);
2584-
else if (recordName.equals("REMARK"))
2585-
pdb_REMARK_Handler(line);
2586-
else if (recordName.equals("CONECT"))
2587-
pdb_CONECT_Handler(line);
2588-
else if (recordName.equals("REVDAT"))
2589-
pdb_REVDAT_Handler(line);
2590-
else if (recordName.equals("DBREF"))
2591-
pdb_DBREF_Handler(line);
2592-
else if (recordName.equals("SITE"))
2593-
pdb_SITE_Handler(line);
2594-
else if (recordName.equals("SSBOND"))
2595-
pdb_SSBOND_Handler(line);
2596-
else if ( params.isParseSecStruc()) {
2597-
if ( recordName.equals("HELIX") ) pdb_HELIX_Handler ( line ) ;
2598-
else if (recordName.equals("SHEET")) pdb_SHEET_Handler(line ) ;
2599-
else if (recordName.equals("TURN")) pdb_TURN_Handler( line ) ;
2600-
}
2601-
else {
2602-
// this line type is not supported, yet.
2603-
// we ignore it
2604-
}
2605-
2606-
2629+
2630+
try {
2631+
if (recordName.equals("ATOM"))
2632+
pdb_ATOM_Handler(line);
2633+
else if (recordName.equals("SEQRES"))
2634+
pdb_SEQRES_Handler(line);
2635+
else if (recordName.equals("HETATM"))
2636+
pdb_ATOM_Handler(line);
2637+
else if (recordName.equals("MODEL"))
2638+
pdb_MODEL_Handler(line);
2639+
else if (recordName.equals("TER"))
2640+
pdb_TER_Handler();
2641+
else if (recordName.equals("HEADER"))
2642+
pdb_HEADER_Handler(line);
2643+
else if (recordName.equals("AUTHOR"))
2644+
pdb_AUTHOR_Handler(line);
2645+
else if (recordName.equals("TITLE"))
2646+
pdb_TITLE_Handler(line);
2647+
else if (recordName.equals("SOURCE"))
2648+
sourceLines.add(line); //pdb_SOURCE_Handler
2649+
else if (recordName.equals("COMPND"))
2650+
compndLines.add(line); //pdb_COMPND_Handler
2651+
else if (recordName.equals("JRNL"))
2652+
pdb_JRNL_Handler(line);
2653+
else if (recordName.equals("EXPDTA"))
2654+
pdb_EXPDTA_Handler(line);
2655+
else if (recordName.equals("CRYST1"))
2656+
pdb_CRYST1_Handler(line);
2657+
else if (recordName.startsWith("MTRIX"))
2658+
pdb_MTRIXn_Handler(line);
2659+
else if (recordName.equals("REMARK"))
2660+
pdb_REMARK_Handler(line);
2661+
else if (recordName.equals("CONECT"))
2662+
pdb_CONECT_Handler(line);
2663+
else if (recordName.equals("REVDAT"))
2664+
pdb_REVDAT_Handler(line);
2665+
else if (recordName.equals("DBREF"))
2666+
pdb_DBREF_Handler(line);
2667+
else if (recordName.equals("SITE"))
2668+
pdb_SITE_Handler(line);
2669+
else if (recordName.equals("SSBOND"))
2670+
pdb_SSBOND_Handler(line);
2671+
else if (recordName.equals("LINK"))
2672+
pdb_LINK_Handler(line);
2673+
else if ( params.isParseSecStruc()) {
2674+
if ( recordName.equals("HELIX") ) pdb_HELIX_Handler ( line ) ;
2675+
else if (recordName.equals("SHEET")) pdb_SHEET_Handler(line ) ;
2676+
else if (recordName.equals("TURN")) pdb_TURN_Handler( line ) ;
2677+
}
2678+
} catch (StringIndexOutOfBoundsException | NullPointerException ex) {
2679+
logger.info("Unable to parse [" + line + "]");
2680+
}
26072681
}
26082682

26092683
makeCompounds(compndLines, sourceLines);
@@ -2681,6 +2755,13 @@ private void makeCompounds(List<String> compoundList,
26812755
private void formBonds() {
26822756

26832757
BondMaker maker = new BondMaker(structure, params);
2758+
2759+
// LINK records should be preserved: they are the way that
2760+
// inter-residue bonds are created for ligands such as trisaccharides and unusual polymers.
2761+
// The analogy in mmCIF is the _struct_conn record.
2762+
for (LinkRecord linkRecord : linkRecords) {
2763+
maker.formLinkRecordBond(linkRecord);
2764+
}
26842765

26852766
maker.formDisulfideBonds(ssbonds);
26862767

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
package org.biojava.nbio.structure.io;
2+
3+
import static org.junit.Assert.assertEquals;
4+
import static org.junit.Assert.assertNotNull;
5+
6+
import java.io.ByteArrayInputStream;
7+
import java.io.IOException;
8+
import java.io.InputStream;
9+
10+
import org.biojava.nbio.structure.Atom;
11+
import org.biojava.nbio.structure.Chain;
12+
import org.biojava.nbio.structure.Group;
13+
import org.biojava.nbio.structure.Structure;
14+
import org.junit.Test;
15+
16+
/**
17+
* Tests that {@link PDBFileParser} can parse input containing CONECT and LINK lines that are shorter than the full fixed-width record.
18+
* @since Nov 30, 2016
19+
* @author larsonm
20+
*/
21+
public class TestShortLines {
22+
23+
@Test
24+
public void testConect() throws IOException {
25+
PDBFileParser pdbPars = new PDBFileParser();
26+
FileParsingParameters params = pdbPars.getFileParsingParameters();
27+
params.setCreateAtomBonds(true);
28+
29+
// CONECTS will be deprecated, but will we create bonds?
30+
// Like the LINK records, should BioJava create BondImpl when params.setCreateAtomBonds(true)?
31+
32+
StringBuilder sb = new StringBuilder();
33+
sb.append("HETATM 2398 P FAD A 500 8.398 46.448 73.490 1.00 13.51 P \n");
34+
sb.append("HETATM 2399 PA FAD A 500 6.089 45.580 75.235 1.00 15.88 P \n");
35+
sb.append("HETATM 2400 O1P FAD A 500 7.908 47.684 72.869 1.00 4.19 O \n");
36+
sb.append("CONECT 2400 2398\n");
37+
String shortLine = sb.toString();
38+
Structure s;
39+
// Parse the input that ends with the short CONECT line.
40+
try(InputStream is = new ByteArrayInputStream(shortLine.getBytes())) {
41+
s = pdbPars.parsePDBFile(is);
42+
}
43+
44+
// After 4.2, CONECTS are deprecated, but there is not yet an implementation
45+
// describing how CONECTS will be replaced - will Bonds be created?
46+
// assertEquals(1, s.getConnections().size());
47+
assertNotNull(s); // for now this only checks that parsing returned a structure
48+
}
49+
50+
@Test
51+
public void testLINK() throws IOException {
52+
Structure s;
53+
PDBFileParser pdbPars = new PDBFileParser();
54+
FileParsingParameters params = pdbPars.getFileParsingParameters();
55+
params.setCreateAtomBonds(true);
56+
57+
StringBuilder sb = new StringBuilder();
58+
sb.append("ATOM 2412 C21 2EG A 7 0.888 44.973 72.238 1.00 29.17 C \n");
59+
sb.append("ATOM 2413 C22 2EG B 19 0.888 44.973 72.238 1.00 29.17 C \n");
60+
//sb.append("LINK C21 2EG A 7 C22 2EG B 19 1555 1555 1.56 ");
61+
sb.append("LINK C21 2EG A 7 C22 2EG B 19\n");
62+
String shortLine = sb.toString();
63+
64+
// Parse the input that ends with the short LINK line.
65+
try(InputStream is = new ByteArrayInputStream(shortLine.getBytes())) {
66+
s = pdbPars.parsePDBFile(is);
67+
}
68+
69+
// The first atom of the first group in the first chain should carry exactly one bond.
70+
Chain c = s.getChainByIndex(0, 0);
71+
Group g = c.getAtomGroups().get(0);
72+
Atom a = g.getAtom(0);
73+
assertEquals(1, a.getBonds().size());
74+
}
75+
}

0 commit comments

Comments
 (0)