Skip to content

Commit 75b69fd

Browse files
committed
Fix #932 NullPointerException when parsing 3FDJ COMPND records
The problem occurred because a colon (:) in the first data token of a COMPND line was not handled properly.
1 parent 7b76f8f commit 75b69fd

File tree

4 files changed

+67
-13
lines changed

4 files changed

+67
-13
lines changed

biojava-structure/src/main/java/org/biojava/nbio/structure/io/PDBFileParser.java

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -931,26 +931,26 @@ private void pdb_COMPND_Handler(String line) {
931931

932932
String[] fieldList = line.trim().split("\\s+");
933933
int fl = fieldList.length;
934-
if ((fl >0 ) && compndFieldValues.contains(fieldList[0])) {
935-
936-
continuationField = fieldList[0];
937-
if (previousContinuationField.equals("")) {
938-
previousContinuationField = continuationField;
939-
}
940-
941-
} else if (fl>0) {
942-
// the ':' character indicates the end of a field name and should be invalid as part the first data token
943-
// e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check
944-
if (fieldList[0].contains(":") ) {
934+
if (fl > 0) {
935+
String field0 = fieldList[0];
936+
if (compndFieldValues.contains(field0)) {
937+
continuationField = field0;
938+
if (previousContinuationField.equals("")) {
939+
previousContinuationField = continuationField;
940+
}
941+
} else if (field0.endsWith(";") && compndFieldValues.contains(field0.substring(0, field0.length()-1)) ) {
942+
// the ':' character indicates the end of a field name and should be invalid as part of the first data token
943+
// e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check
944+
// UPDATE: There is no harm in having a ':' in the first data token, e.g. 3fdj contains a ':'.
945+
// The intended case occurs only if the token is a key followed by a colon and a semicolon without spaces, e.g. "COMPND 2 MOLECULE:;"
945946
logger.info("COMPND line does not follow the PDB 3.0 format. Note that COMPND parsing is not supported any longer in format 2.3 or earlier");
946947
return;
947948
}
948-
949949
} else {
950-
951950
// the line will be added as data to the previous field
952951
}
953952

953+
954954
line = line.replace(continuationField, "").trim();
955955

956956
StringTokenizer compndTokens = new StringTokenizer(line);

biojava-structure/src/test/java/org/biojava/nbio/structure/SourceCompoundTest.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
import org.junit.Assert;
2727
import org.junit.Test;
2828

29+
import static org.junit.Assert.assertEquals;
30+
import static org.junit.Assert.assertNotNull;
31+
2932
import java.io.IOException;
3033
import java.io.InputStream;
3134
import java.util.List;
@@ -48,6 +51,15 @@ private Structure getStructure(String fileName){
4851
return structure;
4952
}
5053

54+
@Test
55+
public void testCompoundColonInFirstToken() {
56+
Structure s1 = getStructure("/1hhbCMPND+SRC.ent");
57+
assertNotNull(s1);
58+
assertEquals(2, s1.getEntityInfos().size());
59+
Structure s2 = getStructure("/3fdjCMPND+SRC.ent");
60+
assertNotNull(s2);
61+
assertEquals(1, s2.getEntityInfos().size());
62+
}
5163

5264
@Test
5365
public void testCompoundSourceStructure(){
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
HEADER OXYGEN TRANSPORT 01-APR-75 1HHB
2+
OBSLTE 18-JUL-84 1HHB 2HHB 3HHB 4HHB
3+
TITLE THREE-DIMENSIONAL FOURIER SYNTHESIS OF HUMAN
4+
TITLE 2 DEOXYHEMOGLOBIN AT 2.5 ANGSTROMS RESOLUTION, $I.X-RAY
5+
TITLE 3 ANALYSIS
6+
COMPND MOL_ID: 1;
7+
COMPND 2 MOLECULE:;
8+
COMPND 3 CHAIN: A;
9+
COMPND 4 ENGINEERED: YES;
10+
COMPND 5 MOL_ID: 2;
11+
COMPND 6 MOLECULE:;
12+
COMPND 7 CHAIN: B;
13+
COMPND 8 ENGINEERED: YES
14+
SOURCE MOL_ID: 1;
15+
SOURCE 2 MOL_ID: 2
16+
KEYWDS OXYGEN TRANSPORT
17+
EXPDTA X-RAY DIFFRACTION
18+
AUTHOR G.FERMI,M.F.PERUTZ
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
HEADER STRUCTURAL GENOMICS, UNKNOWN FUNCTION 25-NOV-08 3FDJ
2+
TITLE THE STRUCTURE OF A DEGV FAMILY PROTEIN FROM EUBACTERIUM ELIGENS.
3+
COMPND MOL_ID: 1;
4+
COMPND 2 MOLECULE: DEGV FAMILY PROTEIN;
5+
COMPND 3 CHAIN: A;
6+
COMPND 4 ENGINEERED: YES;
7+
COMPND 5 OTHER_DETAILS: ASSEMBLY.20070618:0-0:0-0
8+
COMPND 6 (ZVNW5VJFOAACMF8NLOPRFTMWH60:01:1-275) GENE
9+
SOURCE MOL_ID: 1;
10+
SOURCE 2 ORGANISM_SCIENTIFIC: EUBACTERIUM ELIGENS;
11+
SOURCE 3 ORGANISM_TAXID: 39485;
12+
SOURCE 4 GENE: ASSEMBLY.20070618:0-0:0-0 (ZVNW5VJFOAACMF8NLOPRFTMWH60:01:1-
13+
SOURCE 5 275);
14+
SOURCE 6 EXPRESSION_SYSTEM: ESCHERICHIA COLI;
15+
SOURCE 7 EXPRESSION_SYSTEM_TAXID: 562;
16+
SOURCE 8 EXPRESSION_SYSTEM_STRAIN: BL21(DE3);
17+
SOURCE 9 EXPRESSION_SYSTEM_VECTOR_TYPE: PLASMID;
18+
SOURCE 10 EXPRESSION_SYSTEM_PLASMID: PMCSG19
19+
KEYWDS DEGV, GUT MICROBIOME, STRUCTURAL GENOMICS, PSI-2, PROTEIN STRUCTURE
20+
KEYWDS 2 INITIATIVE, MIDWEST CENTER FOR STRUCTURAL GENOMICS, MCSG, UNKNOWN
21+
KEYWDS 3 FUNCTION
22+
EXPDTA X-RAY DIFFRACTION
23+
AUTHOR M.E.CUFF,R.HENDRICKS,L.FREEMAN,A.JOACHIMIAK,MIDWEST CENTER FOR
24+
AUTHOR 2 STRUCTURAL GENOMICS (MCSG)

0 commit comments

Comments
 (0)