Skip to content

Commit 043eb40

Browse files
josemduartesbliven
andauthored
More nullpdbid tests (biojava#1056)
* Fix CE-Symm bug Fix NullPointer if the structure identifier was missing * Test current handling of identifiers in structures Currently `Structure` has 4 identifiers which may or may not be set depending on the source, file format, and parsing method. These should be revisited and harmonized in the future, but for now I just document them in this test. Summary of results: | Format | With ID | Parse Method | PdbId | Identifier | Name | StructureIdentifier | |--------|---------|-----------------|-------|------------|------|---------------------| | cif | No | fromInputStream | null | "" | "" | null | | pdb | No | fromInputStream | null | "" | "" | null | | cif | No | fromUrl | null | "" | "" | null | | cif | No | StructureIO | null | file: | 5PTI | file: | | pdb | No | StructureIO | null | file: | "" | file: | | cif | Yes | fromInputStream | 5PTI | "" | "" | null | | pdb | Yes | fromInputStream | 5PTI | "" | "" | null | | cif | Yes | fromUrl | 5PTI | "" | "" | null | | cif | Yes | StructureIO | 5PTI | file: | 5PTI | file: | | pdb | Yes | StructureIO | 5PTI | file: | 5PTI | file: | - `getPdbId` reflects the ID parsed from the file - Only StructureIO is setting the StructureIdentifier (by design, but indicates StructureIO should be used wherever possible) - StructureIO does not set the name properly for PDB files. This should be fixed. * Test NullPointerException when loading Alphafold models (biojava#1019) The core URLIdentifier bug was fixed in biojava#1020, but this improves the test. The parser changes are mostly for logging, and are there to indicate that null PdbId is an expected situation rather than an error. * Truncate file --------- Co-authored-by: Spencer Bliven <spencer.bliven@gmail.com>
1 parent 5218e59 commit 043eb40

File tree

8 files changed

+348
-23
lines changed

8 files changed

+348
-23
lines changed
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
package org.biojava.nbio.structure.test.io;
2+
3+
import static org.junit.Assume.assumeNotNull;
4+
import static org.junit.jupiter.api.Assertions.assertEquals;
5+
import static org.junit.jupiter.api.Assertions.assertNull;
6+
7+
import java.io.BufferedReader;
8+
import java.io.IOException;
9+
import java.io.InputStream;
10+
import java.io.InputStreamReader;
11+
import java.net.URL;
12+
13+
import org.biojava.nbio.structure.Structure;
14+
import org.biojava.nbio.structure.StructureException;
15+
import org.biojava.nbio.structure.StructureIO;
16+
import org.biojava.nbio.structure.align.client.StructureName;
17+
import org.biojava.nbio.structure.io.PDBFileParser;
18+
import org.biojava.nbio.structure.io.cif.CifStructureConverter;
19+
import org.junit.jupiter.api.Test;
20+
21+
/**
22+
* Test that various identifiers in Structure are parsed consistently across
23+
* file types.
24+
*
25+
* These tests are not intended to be prescriptive, but merely to document the
26+
* current state and force changes to be committed to git.
27+
*/
28+
public class TestIdentifiersInStructure {
29+
@Test
30+
public void testCifIdentifiers() throws IOException, StructureException {
31+
String noid = "/5pti_noid.cif";
32+
String yesid = "/5pti.cif";
33+
34+
// No ID, from file
35+
URL url = this.getClass().getResource(noid);
36+
String file = url.getPath();
37+
Structure s = CifStructureConverter.fromURL(url);
38+
assertNull(s.getPdbId());
39+
assertEquals("", s.getIdentifier());
40+
assertEquals("", s.getName());
41+
assertNull(s.getStructureIdentifier());
42+
43+
// No ID, from StructureIO
44+
s = StructureIO.getStructure(file);
45+
assertNull(s.getPdbId());
46+
assertEquals(file, s.getIdentifier());
47+
assertEquals("5PTI", s.getName());
48+
StructureName sid = (StructureName) s.getStructureIdentifier();
49+
assertEquals(file, sid.getIdentifier());
50+
51+
// No id, from stream
52+
InputStream stream = this.getClass().getResourceAsStream(noid);
53+
s = CifStructureConverter.fromInputStream(stream);
54+
assertNull(s.getPdbId());
55+
assertEquals("", s.getIdentifier());
56+
assertEquals("", s.getName());
57+
assertNull(s.getStructureIdentifier());
58+
59+
// With ID, from file
60+
url = this.getClass().getResource(yesid);
61+
file = url.getPath();
62+
s = CifStructureConverter.fromURL(url);
63+
assertEquals("5PTI", s.getPdbId().getId());
64+
assertEquals("", s.getIdentifier());
65+
assertEquals("", s.getName());
66+
assertNull(s.getStructureIdentifier());
67+
68+
// With ID, from StructureIO
69+
s = StructureIO.getStructure(file);
70+
assertEquals("5PTI", s.getPdbId().getId());
71+
assertEquals(file, s.getIdentifier());
72+
assertEquals("5PTI", s.getName());
73+
sid = (StructureName) s.getStructureIdentifier();
74+
assertEquals(file, sid.getIdentifier());
75+
76+
// With id, from stream
77+
stream = this.getClass().getResourceAsStream(yesid);
78+
s = CifStructureConverter.fromInputStream(stream);
79+
assertEquals("5PTI", s.getPdbId().getId());
80+
assertEquals("", s.getIdentifier());
81+
assertEquals("", s.getName());
82+
assertNull(s.getStructureIdentifier());
83+
}
84+
85+
@Test
86+
public void testPDBIdentifiers() throws IOException, StructureException {
87+
String yesid = "/5pti.pdb";
88+
String noid = "/noid.pdb";
89+
90+
assumeNotNull(this.getClass().getResource(yesid));
91+
assumeNotNull(this.getClass().getResource(noid));
92+
93+
PDBFileParser parser = new PDBFileParser();
94+
95+
// No id, from StructureIO
96+
String file = this.getClass().getResource(noid).getPath();
97+
// Structure s = parser.parsePDBFile(file);
98+
Structure s = StructureIO.getStructure(file);
99+
assertNull(s.getPdbId());
100+
assertEquals(file, s.getIdentifier());
101+
assertEquals("", s.getName()); // differs from CIF behavior
102+
StructureName sid = (StructureName) s.getStructureIdentifier();
103+
assertEquals(file, sid.getIdentifier());
104+
105+
// No id, from stream
106+
InputStream stream = this.getClass().getResourceAsStream(noid);
107+
s = parser.parsePDBFile(new BufferedReader(new InputStreamReader(stream)));
108+
assertNull(s.getPdbId());
109+
assertEquals("", s.getIdentifier());
110+
assertEquals("", s.getName());
111+
assertNull(s.getStructureIdentifier());
112+
113+
// With id, from StructureIO
114+
file = this.getClass().getResource(yesid).getPath();
115+
s = StructureIO.getStructure(file);
116+
assertEquals("5PTI", s.getPdbId().toString());
117+
assertEquals(file, s.getIdentifier());
118+
assertEquals("5PTI", s.getName());
119+
sid = (StructureName) s.getStructureIdentifier();
120+
assertEquals(file, sid.getIdentifier());
121+
122+
123+
// With id, from stream
124+
stream = this.getClass().getResourceAsStream(yesid);
125+
s = parser.parsePDBFile(new BufferedReader(new InputStreamReader(stream)));
126+
assertEquals("5PTI", s.getPdbId().toString());
127+
assertEquals("", s.getIdentifier());
128+
assertEquals("", s.getName());
129+
assertNull(s.getStructureIdentifier());
130+
}
131+
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
data_
2+
#
3+
loop_
4+
_entity.id
5+
_entity.type
6+
_entity.src_method
7+
_entity.pdbx_description
8+
_entity.formula_weight
9+
_entity.pdbx_number_of_molecules
10+
_entity.details
11+
1 polymer man 'TRYPSIN INHIBITOR' 6527.598 1 ?
12+
#
13+
loop_
14+
_struct_asym.id
15+
_struct_asym.pdbx_blank_PDB_chainid_flag
16+
_struct_asym.pdbx_modified
17+
_struct_asym.entity_id
18+
_struct_asym.details
19+
A N N 1 ?
20+
#
21+
loop_
22+
_atom_site.group_PDB
23+
_atom_site.id
24+
_atom_site.type_symbol
25+
_atom_site.label_atom_id
26+
_atom_site.label_alt_id
27+
_atom_site.label_comp_id
28+
_atom_site.label_asym_id
29+
_atom_site.label_entity_id
30+
_atom_site.label_seq_id
31+
_atom_site.pdbx_PDB_ins_code
32+
_atom_site.Cartn_x
33+
_atom_site.Cartn_y
34+
_atom_site.Cartn_z
35+
_atom_site.occupancy
36+
_atom_site.B_iso_or_equiv
37+
_atom_site.Cartn_x_esd
38+
_atom_site.Cartn_y_esd
39+
_atom_site.Cartn_z_esd
40+
_atom_site.occupancy_esd
41+
_atom_site.B_iso_or_equiv_esd
42+
_atom_site.pdbx_formal_charge
43+
_atom_site.auth_seq_id
44+
_atom_site.auth_comp_id
45+
_atom_site.auth_asym_id
46+
_atom_site.auth_atom_id
47+
_atom_site.pdbx_PDB_model_num
48+
ATOM 1 N N . ARG A 1 1 ? 32.231 15.281 -13.143 1.00 28.28 ? ? ? ? ? ? 1 ARG A N 1
49+
ATOM 2 C CA . ARG A 1 1 ? 32.184 14.697 -11.772 1.00 27.90 ? ? ? ? ? ? 1 ARG A CA 1
50+
ATOM 3 C C . ARG A 1 1 ? 33.438 13.890 -11.387 1.00 24.90 ? ? ? ? ? ? 1 ARG A C 1
51+
ATOM 4 O O . ARG A 1 1 ? 34.102 13.070 -12.066 1.00 24.44 ? ? ? ? ? ? 1 ARG A O 1
52+
ATOM 5 C CB . ARG A 1 1 ? 30.797 14.065 -11.625 1.00 27.88 ? ? ? ? ? ? 1 ARG A CB 1
53+
ATOM 6 C CG . ARG A 1 1 ? 30.976 12.589 -11.819 1.00 29.61 ? ? ? ? ? ? 1 ARG A CG 1
54+
ATOM 7 C CD . ARG A 1 1 ? 29.608 12.016 -11.694 1.00 31.91 ? ? ? ? ? ? 1 ARG A CD 1
55+
ATOM 8 N NE . ARG A 1 1 ? 28.942 12.335 -12.945 1.00 33.51 ? ? ? ? ? ? 1 ARG A NE 1
56+
ATOM 9 C CZ . ARG A 1 1 ? 27.670 12.696 -13.050 1.00 34.29 ? ? ? ? ? ? 1 ARG A CZ 1
57+
ATOM 10 N NH1 . ARG A 1 1 ? 26.901 12.777 -11.999 1.00 34.48 ? ? ? ? ? ? 1 ARG A NH1 1
58+
ATOM 11 N NH2 . ARG A 1 1 ? 27.161 12.963 -14.255 1.00 35.44 ? ? ? ? ? ? 1 ARG A NH2 1
59+
ATOM 12 D D1 . ARG A 1 1 ? 32.983 14.824 -13.703 1.00 27.71 ? ? ? ? ? ? 1 ARG A D1 1
60+
ATOM 13 D D2 . ARG A 1 1 ? 31.275 15.112 -13.535 1.00 28.50 ? ? ? ? ? ? 1 ARG A D2 1
61+
ATOM 14 D D3 . ARG A 1 1 ? 32.174 16.346 -13.050 1.00 28.23 ? ? ? ? ? ? 1 ARG A D3 1
62+
ATOM 15 H HA . ARG A 1 1 ? 32.192 15.563 -11.115 1.00 26.97 ? ? ? ? ? ? 1 ARG A HA 1
63+
ATOM 16 H HB2 . ARG A 1 1 ? 30.392 14.428 -10.697 1.00 28.71 ? ? ? ? ? ? 1 ARG A HB2 1
64+
ATOM 17 H HB3 . ARG A 1 1 ? 30.182 14.438 -12.437 1.00 28.97 ? ? ? ? ? ? 1 ARG A HB3 1
65+
ATOM 18 H HG2 . ARG A 1 1 ? 31.369 12.359 -12.800 1.00 29.44 ? ? ? ? ? ? 1 ARG A HG2 1
66+
ATOM 19 H HG3 . ARG A 1 1 ? 31.591 12.143 -11.105 1.00 29.52 ? ? ? ? ? ? 1 ARG A HG3 1
67+
ATOM 20 H HD2 . ARG A 1 1 ? 29.656 10.965 -11.482 1.00 31.41 ? ? ? ? ? ? 1 ARG A HD2 1
68+
ATOM 21 H HD3 . ARG A 1 1 ? 29.218 12.381 -10.794 1.00 31.29 ? ? ? ? ? ? 1 ARG A HD3 1
69+
ATOM 22 D DE . ARG A 1 1 ? 29.457 12.292 -13.843 1.00 34.62 ? ? ? ? ? ? 1 ARG A DE 1
70+
ATOM 23 D DH11 . ARG A 1 1 ? 25.930 13.038 -12.118 1.00 34.80 ? ? ? ? ? ? 1 ARG A DH11 1
71+
ATOM 24 D DH12 . ARG A 1 1 ? 27.324 12.549 -11.100 1.00 34.71 ? ? ? ? ? ? 1 ARG A DH12 1
72+
ATOM 25 D DH21 . ARG A 1 1 ? 27.786 12.886 -15.076 1.00 33.93 ? ? ? ? ? ? 1 ARG A DH21 1
73+
ATOM 26 D DH22 . ARG A 1 1 ? 26.154 13.214 -14.269 1.00 34.53 ? ? ? ? ? ? 1 ARG A DH22 1
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
HEADER 01-JUL-21
2+
ATOM 1 N MET A 1 19.682 12.062 34.184 1.00 39.14 N
3+
ATOM 2 CA MET A 1 20.443 10.838 34.522 1.00 39.14 C
4+
ATOM 3 C MET A 1 20.073 9.731 33.538 1.00 39.14 C
5+
ATOM 4 CB MET A 1 20.138 10.405 35.966 1.00 39.14 C
6+
ATOM 5 O MET A 1 19.030 9.110 33.696 1.00 39.14 O
7+
ATOM 6 CG MET A 1 20.829 11.294 37.004 1.00 39.14 C
8+
ATOM 7 SD MET A 1 20.292 10.920 38.687 1.00 39.14 S
9+
ATOM 8 CE MET A 1 21.522 11.848 39.645 1.00 39.14 C

biojava-structure/src/main/java/org/biojava/nbio/structure/io/PDBFileParser.java

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -371,13 +371,16 @@ private void pdb_HEADER_Handler(String line) {
371371

372372
logger.debug("Parsing entry " + pdbId);
373373

374-
375374
PdbId pdbIdToSet;
376-
try {
377-
pdbIdToSet = new PdbId(pdbCode);
378-
} catch (IllegalArgumentException e) {
379-
logger.info("Malformed (or null) PDB ID {}. setting PdbId to null", pdbCode);
375+
if(pdbCode.isBlank()) {
380376
pdbIdToSet = null;
377+
} else {
378+
try {
379+
pdbIdToSet = new PdbId(pdbCode);
380+
} catch (IllegalArgumentException e) {
381+
logger.warn("Malformed PDB ID {}. setting PdbId to null", pdbCode);
382+
pdbIdToSet = null;
383+
}
381384
}
382385
structure.setPdbId(pdbIdToSet);
383386
pdbHeader.setPdbId(pdbIdToSet);
@@ -2719,7 +2722,7 @@ else if ( params.isParseSecStruc()) {
27192722
}
27202723

27212724
makeCompounds(compndLines, sourceLines);
2722-
2725+
27232726
handlePDBKeywords(keywordsLines);
27242727

27252728
triggerEndFileChecks();
@@ -2786,18 +2789,18 @@ private void makeCompounds(List<String> compoundList,
27862789
}
27872790

27882791
/**Parse KEYWODS record of the PDB file.<br>
2789-
* A keyword may be split over two lines. whether a keyword ends by the end
2790-
* of a line or it is aplit over two lines, a <code>space</code> is added
2791-
* between the 2 lines's contents, unless the first line ends in
2792+
* A keyword may be split over two lines. whether a keyword ends by the end
2793+
* of a line or it is aplit over two lines, a <code>space</code> is added
2794+
* between the 2 lines's contents, unless the first line ends in
27922795
* a '-' character.
27932796
* <pre>
27942797
* Record Format
2795-
* COLUMNS DATA TYPE FIELD DEFINITION
2798+
* COLUMNS DATA TYPE FIELD DEFINITION
27962799
* ---------------------------------------------------------------------------------
2797-
* 1 - 6 Record name "KEYWDS"
2800+
* 1 - 6 Record name "KEYWDS"
27982801
* 9 - 10 Continuation continuation Allows concatenation of records if necessary.
27992802
* 11 - 79 List keywds Comma-separated list of keywords relevant
2800-
* to the entry.
2803+
* to the entry.
28012804
* Example
28022805
* 1 2 3 4 5 6 7 8
28032806
* 12345678901234567890123456789012345678901234567890123456789012345678901234567890
@@ -2830,7 +2833,7 @@ private void handlePDBKeywords(List<String> lines) {
28302833
}
28312834
pdbHeader.setKeywords(lst);
28322835
}
2833-
2836+
28342837
/**
28352838
* Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide
28362839
* bonds), peptide bonds, and intra-residue bonds.

biojava-structure/src/main/java/org/biojava/nbio/structure/io/cif/CifStructureConsumerImpl.java

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -645,11 +645,11 @@ public void consumeDatabasePDBRevRecord(DatabasePDBRevRecord databasePDBrevRecor
645645
revRecords.add(new org.biojava.nbio.structure.DatabasePDBRevRecord(databasePDBrevRecord, i));
646646
}
647647
}
648-
648+
649649
@Override
650650
public void consumeEm3dReconstruction(Em3dReconstruction em3dReconstruction) {
651651
this.em3dReconstruction = em3dReconstruction;
652-
652+
653653
for (int rowIndex = 0; rowIndex < em3dReconstruction.getRowCount(); rowIndex++) { //can it have more than 1 value?
654654
final FloatColumn resolution = em3dReconstruction.getResolution();
655655
if (ValueKind.PRESENT.equals(resolution.getValueKind(rowIndex)))
@@ -895,12 +895,16 @@ public void consumeStruct(Struct struct) {
895895
if (struct.isDefined() && struct.getEntryId().isDefined()) {
896896
PdbId pdbId;
897897
String pdbCode = struct.getEntryId().get(0);
898-
try {
899-
pdbId = new PdbId(pdbCode);
900-
} catch (IllegalArgumentException e) {
901-
logger.info("Malformed (or null) PDB ID {}. setting PdbId to null", pdbCode);
902-
pdbId = null;
903-
}
898+
if(pdbCode.isBlank()){
899+
pdbId = null;
900+
} else {
901+
try {
902+
pdbId = new PdbId(pdbCode);
903+
} catch (IllegalArgumentException e) {
904+
logger.warn("Malformed PDB ID {}. setting PdbId to null", pdbCode);
905+
pdbId = null;
906+
}
907+
}
904908
pdbHeader.setPdbId(pdbId);
905909
structure.setPdbId(pdbId);
906910
}

biojava-structure/src/main/java/org/biojava/nbio/structure/symmetry/internal/CeSymm.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,8 +242,10 @@ protected static CeSymmResult align(Atom[] atoms, CESymmParameters params)
242242
optimalAFP = selfAlignments.get(0);
243243
StructureIdentifier id = atoms[0].getGroup().getChain().getStructure()
244244
.getStructureIdentifier();
245-
optimalAFP.setName1(id.getIdentifier());
246-
optimalAFP.setName2(id.getIdentifier());
245+
if(id != null) {
246+
optimalAFP.setName1(id.getIdentifier());
247+
optimalAFP.setName2(id.getIdentifier());
248+
}
247249

248250
// Store the optimal self-alignment
249251
result.setSelfAlignment(optimalAFP);

biojava-structure/src/test/java/org/biojava/nbio/structure/symmetry/internal/TestCeSymm.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,18 @@
2121
package org.biojava.nbio.structure.symmetry.internal;
2222

2323
import static org.junit.Assert.*;
24+
import static org.junit.Assume.assumeNotNull;
2425

2526
import java.io.IOException;
27+
import java.io.InputStream;
28+
import java.net.URL;
2629

2730
import org.biojava.nbio.structure.Atom;
2831
import org.biojava.nbio.structure.Structure;
2932
import org.biojava.nbio.structure.StructureException;
33+
import org.biojava.nbio.structure.StructureIO;
3034
import org.biojava.nbio.structure.StructureTools;
35+
import org.biojava.nbio.structure.io.PDBFileParser;
3136
import org.biojava.nbio.structure.symmetry.internal.CeSymm;
3237
import org.junit.Test;
3338

@@ -58,4 +63,16 @@ public void testEasyCases() throws IOException, StructureException {
5863
assertEquals(result.getNumRepeats(), orders[i]);
5964
}
6065
}
66+
67+
@Test
68+
public void testAlphafold() throws IOException, StructureException {
69+
URL url = this.getClass().getResource("/AF-A0A0R4IYF1-F1-model_v2.pdb");
70+
assumeNotNull(url);
71+
String file = url.getPath();
72+
Structure s = StructureIO.getStructure(file);
73+
assertNull(s.getPdbId());
74+
Atom[] atoms = StructureTools.getRepresentativeAtomArray(s);
75+
CeSymmResult result = CeSymm.analyze(atoms);
76+
assertNotNull(result);
77+
}
6178
}

0 commit comments

Comments
 (0)