Skip to content

Commit 9cd1949

Browse files
committed
Towards solving #234. The parser now handles Phenix files correctly,
but processSingleLine() still has issues with corner-case quoting.
1 parent 08a45fc commit 9cd1949

File tree

3 files changed

+107
-12
lines changed

3 files changed

+107
-12
lines changed

biojava-structure/src/main/java/org/biojava/nbio/structure/io/mmcif/SimpleMMcifParser.java

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,23 @@ public class SimpleMMcifParser implements MMcifParser {
7777
public static final String LOOP_END = "#";
7878
public static final String LOOP_START = "loop_";
7979
public static final String FIELD_LINE = "_";
80+
81+
// the following are the 3 valid quoting characters in CIF
82+
/**
83+
* Quoting character '
84+
*/
85+
private static final char S1 = '\'';
86+
87+
/**
88+
* Quoting character "
89+
*/
90+
private static final char S2 = '\"';
91+
92+
/**
93+
* Quoting character ; (multi-line quoting)
94+
*/
8095
public static final String STRING_LIMIT = ";";
81-
82-
private static final char s1 = '\'';
83-
private static final char s2 = '\"';
96+
8497

8598
private List<MMcifConsumer> consumers ;
8699

@@ -170,7 +183,7 @@ public void parse(BufferedReader buf)
170183

171184
if ( inLoop){
172185

173-
if (line.startsWith(LOOP_END)){
186+
if (line.startsWith(LOOP_END) || line.isEmpty()){
174187
// reset all data
175188
inLoop = false;
176189
lineData.clear();
@@ -183,7 +196,7 @@ public void parse(BufferedReader buf)
183196

184197
}
185198

186-
if ( line.startsWith(FIELD_LINE)){
199+
if ( line.matches("\\s*"+FIELD_LINE+"\\w+.*")) {// startsWith(FIELD_LINE)){
187200
// found another field.
188201
String txt = line.trim();
189202
//System.out.println("line: " + txt);
@@ -232,7 +245,7 @@ public void parse(BufferedReader buf)
232245
lineData.clear();
233246
logger.debug("Detected LOOP_START: '{}'. Toggling to inLoop=true", LOOP_START);
234247
continue;
235-
} else if (line.startsWith(LOOP_END)){
248+
} else if (line.startsWith(LOOP_END) || line.isEmpty()){
236249
inLoop = false;
237250
if ( category != null)
238251
endLineChecks(category, loopFields, lineData, loopWarnings);
@@ -284,7 +297,7 @@ public void parse(BufferedReader buf)
284297
}
285298

286299
private List<String> processSingleLine(String line){
287-
//System.out.println("SS processSingleLine " + line);
300+
288301
List<String> data = new ArrayList<String>();
289302

290303
if ( line.trim().length() == 0){
@@ -300,14 +313,18 @@ private List<String> processSingleLine(String line){
300313
boolean inS2 = false;
301314
String word = "";
302315

303-
//System.out.println(line);
304316
for (int i=0; i< line.length(); i++ ){
305-
//System.out.println(word);
317+
306318
Character c = line.charAt(i);
307319

308320
Character nextC = null;
309321
if (i < line.length() - 1)
310322
nextC = line.charAt(i+1);
323+
324+
//Character lastC = null;
325+
//if (i>0)
326+
// lastC = line.charAt(i-1);
327+
311328
if (c == ' ') {
312329

313330
if ( ! inString){
@@ -319,7 +336,7 @@ private List<String> processSingleLine(String line){
319336
word += c;
320337
}
321338

322-
} else if (c == s1 ) {
339+
} else if (c == S1 ) {
323340

324341
if ( inString){
325342

@@ -352,7 +369,7 @@ private List<String> processSingleLine(String line){
352369
inString = true;
353370
inS1 = true;
354371
}
355-
} else if ( c == s2 ){
372+
} else if ( c == S2 ){
356373
if ( inString){
357374

358375
boolean wordEnd = false;
@@ -395,7 +412,8 @@ private List<String> processSingleLine(String line){
395412

396413
}
397414

398-
/** get the content of a cif entry
415+
/**
416+
* Get the content of a cif entry
399417
*
400418
* @param line
401419
* @param buf

biojava-structure/src/test/java/org/biojava/nbio/structure/io/TestDifficultMmCIFFiles.java

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,17 @@
2222

2323
import org.biojava.nbio.structure.*;
2424
import org.biojava.nbio.structure.align.util.AtomCache;
25+
import org.biojava.nbio.structure.io.mmcif.MMcifParser;
26+
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifConsumer;
27+
import org.biojava.nbio.structure.io.mmcif.SimpleMMcifParser;
2528
import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
2629
import org.junit.Test;
2730

31+
import java.io.BufferedReader;
2832
import java.io.File;
2933
import java.io.IOException;
34+
import java.io.InputStream;
35+
import java.io.InputStreamReader;
3036
import java.net.URISyntaxException;
3137
import java.net.URL;
3238
import java.util.List;
@@ -157,4 +163,35 @@ public void test4letterChains() throws IOException, StructureException, URISynta
157163
assertNotNull(chain2);
158164
assertEquals(chain2, chain);
159165
}
166+
167+
/**
168+
* This is to test the issue discussed here:
169+
* http://www.globalphasing.com/startools/
170+
* Essentially single quote characters (') are valid not only for quoting, but also as parts of
171+
* data values as long as some rules of the STAR format are followed.
172+
* For instance, Phenix produces mmCIF files with non-quoted strings containing single quote characters.
173+
* @throws IOException
174+
*/
175+
//@Test
176+
public void testQuotingCornerCase () throws IOException {
177+
InputStream inStream = this.getClass().getResourceAsStream("/org/biojava/nbio/structure/io/difficult_mmcif_quoting.cif");
178+
MMcifParser parser = new SimpleMMcifParser();
179+
180+
SimpleMMcifConsumer consumer = new SimpleMMcifConsumer();
181+
182+
FileParsingParameters fileParsingParams = new FileParsingParameters();
183+
fileParsingParams.setAlignSeqRes(true);
184+
185+
consumer.setFileParsingParameters(fileParsingParams);
186+
187+
parser.addMMcifConsumer(consumer);
188+
189+
parser.parse(new BufferedReader(new InputStreamReader(inStream)));
190+
191+
Structure s = consumer.getStructure();
192+
193+
assertNotNull(s);
194+
195+
196+
}
160197
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
data_4LUP_subset_no_quotes
2+
loop_
3+
_atom_site.group_PDB
4+
_atom_site.id
5+
_atom_site.type_symbol
6+
_atom_site.label_atom_id
7+
_atom_site.label_alt_id
8+
_atom_site.label_comp_id
9+
_atom_site.label_asym_id
10+
_atom_site.label_entity_id
11+
_atom_site.label_seq_id
12+
_atom_site.pdbx_PDB_ins_code
13+
_atom_site.Cartn_x
14+
_atom_site.Cartn_y
15+
_atom_site.Cartn_z
16+
_atom_site.occupancy
17+
_atom_site.B_iso_or_equiv
18+
_atom_site.Cartn_x_esd
19+
_atom_site.Cartn_y_esd
20+
_atom_site.Cartn_z_esd
21+
_atom_site.occupancy_esd
22+
_atom_site.B_iso_or_equiv_esd
23+
_atom_site.pdbx_formal_charge
24+
_atom_site.auth_seq_id
25+
_atom_site.auth_comp_id
26+
_atom_site.auth_asym_id
27+
_atom_site.auth_atom_id
28+
_atom_site.pdbx_PDB_model_num
29+
ATOM 1727 P P . DT C 2 1 ? 7.887 18.595 87.913 0.00 20.67 ? ? ? ? ? ? 107 DT B P 1
30+
ATOM 1728 O OP1 . DT C 2 1 ? 7.732 19.982 88.407 0.00 20.42 ? ? ? ? ? ? 107 DT B OP1 1
31+
ATOM 1729 O OP2 . DT C 2 1 ? 7.210 17.480 88.610 0.00 20.42 ? ? ? ? ? ? 107 DT B OP2 1
32+
ATOM 1730 O O5' . DT C 2 1 ? 7.464 18.547 86.371 0.00 21.57 ? ? ? ? ? ? 107 DT B O5' 1
33+
ATOM 1731 C C5' . DT C 2 1 ? 6.642 19.572 85.828 0.00 23.49 ? ? ? ? ? ? 107 DT B C5' 1
34+
ATOM 1732 C C4' . DT C 2 1 ? 6.943 19.784 84.356 0.00 27.33 ? ? ? ? ? ? 107 DT B C4' 1
35+
ATOM 1733 O O4' . DT C 2 1 ? 8.382 19.831 84.159 0.00 28.38 ? ? ? ? ? ? 107 DT B O4' 1
36+
ATOM 1734 C C3' . DT C 2 1 ? 6.438 18.687 83.425 1.00 31.54 ? ? ? ? ? ? 107 DT B C3' 1
37+
ATOM 1735 O O3' . DT C 2 1 ? 6.115 19.248 82.157 1.00 34.97 ? ? ? ? ? ? 107 DT B O3' 1
38+
ATOM 1736 C C2' . DT C 2 1 ? 7.641 17.758 83.333 1.00 31.47 ? ? ? ? ? ? 107 DT B C2' 1
39+
ATOM 1737 C C1' . DT C 2 1 ? 8.790 18.756 83.333 1.00 31.55 ? ? ? ? ? ? 107 DT B C1' 1
40+
#

0 commit comments

Comments
 (0)