Skip to content

Commit 5e52308

Browse files
committed
Fix for #663 and some code cleanup.
1 parent da87f0b commit 5e52308

File tree

2 files changed

+117
-43
lines changed

2 files changed

+117
-43
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java

Lines changed: 42 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
4141
import org.biojava.nbio.core.sequence.template.AbstractSequence;
4242
import org.biojava.nbio.core.sequence.template.Compound;
43+
import org.slf4j.Logger;
44+
import org.slf4j.LoggerFactory;
4345

4446
import java.io.*;
4547
import java.util.ArrayList;
@@ -55,20 +57,28 @@ public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {
5557

5658
private SequenceCreatorInterface<C> sequenceCreator;
5759
private GenbankSequenceParser<S,C> genbankParser;
58-
private InputStream inputStream;
60+
private BufferedReader bufferedReader;
61+
private boolean closed;
62+
private final Logger logger = LoggerFactory.getLogger(this.getClass());
63+
64+
public boolean isClosed() {
65+
return closed;
66+
}
5967

6068
/**
6169
* If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
6270
* local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
6371
* an inputstream is forced to read all the data so you don't gain anything.
64-
* @param br
72+
* @param is
6573
* @param headerParser
6674
* @param sequenceCreator
6775
*/
68-
public GenbankReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser, SequenceCreatorInterface<C> sequenceCreator) {
76+
public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser,
77+
final SequenceCreatorInterface<C> sequenceCreator) {
6978
this.sequenceCreator = sequenceCreator;
70-
this.inputStream = is;
71-
genbankParser = new GenbankSequenceParser<S,C>();
79+
bufferedReader = new BufferedReader(new InputStreamReader(is));
80+
genbankParser = new GenbankSequenceParser<>();
81+
closed = false;
7282
}
7383

7484
/**
@@ -85,14 +95,14 @@ public GenbankReader(InputStream is, SequenceHeaderParserInterface<S,C> headerPa
8595
* method denies read access to the file.
8696
*/
8797
public GenbankReader(
88-
File file,
89-
SequenceHeaderParserInterface<S,C> headerParser,
90-
SequenceCreatorInterface<C> sequenceCreator
98+
final File file,
99+
final SequenceHeaderParserInterface<S,C> headerParser,
100+
final SequenceCreatorInterface<C> sequenceCreator
91101
) throws FileNotFoundException {
92102

93-
inputStream = new FileInputStream(file);
103+
this.bufferedReader = new BufferedReader(new FileReader(file));
94104
this.sequenceCreator = sequenceCreator;
95-
genbankParser = new GenbankSequenceParser<S,C>();
105+
genbankParser = new GenbankSequenceParser<>();
96106
}
97107

98108
/**
@@ -108,8 +118,7 @@ public GenbankReader(
108118
* @throws CompoundNotFoundException
109119
*/
110120
public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException {
111-
LinkedHashMap<String,S> sequences = process(-1);
112-
return sequences;
121+
return process(-1);
113122
}
114123

115124
/**
@@ -122,7 +131,7 @@ public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundExc
122131
* time before the first result is available.<br>
123132
* <b>N.B.</b>
124133
* <ul>
125-
* <li>This method ca't be called after calling its NO-ARGUMENT twin.</li>
134+
* <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
126135
* <li>remember to close the underlying resource when you are done.</li>
127136
* </ul>
128137
* @see #process()
@@ -134,17 +143,17 @@ public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundExc
134143
* @throws IOException
135144
* @throws CompoundNotFoundException
136145
*/
137-
public LinkedHashMap<String,S> process(int max) throws IOException, CompoundNotFoundException {
138-
LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
146+
public LinkedHashMap<String,S> process(final int max) throws IOException, CompoundNotFoundException {
147+
LinkedHashMap<String,S> sequences = new LinkedHashMap<>();
139148
@SuppressWarnings("unchecked")
140149
int i=0;
141-
BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
142150
while(true) {
143151
if(max>0 && i>=max) break;
144152
i++;
145-
String seqString = genbankParser.getSequence(br, 0);
153+
String seqString = genbankParser.getSequence(bufferedReader, 0);
146154
//reached end of file?
147155
if(seqString==null) break;
156+
@SuppressWarnings("unchecked")
148157
S sequence = (S) sequenceCreator.getSequence(seqString, 0);
149158
genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);
150159

@@ -165,32 +174,41 @@ public LinkedHashMap<String,S> process(int max) throws IOException, CompoundNotF
165174

166175
sequences.put(sequence.getAccession().getID(), sequence);
167176
}
168-
br.close();
169-
close();
177+
178+
if (max < 0) {
179+
close();
180+
}
181+
170182
return sequences;
171183
}
172184

173-
public void close() throws IOException {
174-
inputStream.close();
185+
public void close() {
186+
try {
187+
bufferedReader.close();
188+
this.closed = true;
189+
} catch (IOException e) {
190+
logger.error("Couldn't close the reader. {}", e.getMessage());
191+
this.closed = false;
192+
}
175193
}
176194

177195
public static void main(String[] args) throws Exception {
178196
String proteinFile = "src/test/resources/BondFeature.gb";
179197
FileInputStream is = new FileInputStream(proteinFile);
180198

181-
GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<ProteinSequence, AminoAcidCompound>(is, new GenericGenbankHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
199+
GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
182200
LinkedHashMap<String,ProteinSequence> proteinSequences = proteinReader.process();
183201
System.out.println(proteinSequences);
184202

185203
String inputFile = "src/test/resources/NM_000266.gb";
186204
is = new FileInputStream(inputFile);
187-
GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<DNASequence, NucleotideCompound>(is, new GenericGenbankHeaderParser<DNASequence,NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
205+
GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
188206
LinkedHashMap<String,DNASequence> dnaSequences = dnaReader.process();
189207
System.out.println(dnaSequences);
190208

191209
String crazyFile = "src/test/resources/CraftedFeature.gb";
192210
is = new FileInputStream(crazyFile);
193-
GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<DNASequence, NucleotideCompound>(is, new GenericGenbankHeaderParser<DNASequence,NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
211+
GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
194212
LinkedHashMap<String,DNASequence> crazyAnnotatedSequences = crazyReader.process();
195213

196214
is.close();

biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankReaderTest.java

Lines changed: 75 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,15 @@
2020
*/
2121
package org.biojava.nbio.core.sequence.io;
2222

23-
import static org.junit.Assert.assertNotNull;
24-
23+
import java.io.IOException;
2524
import java.io.InputStream;
25+
import java.lang.reflect.Field;
2626
import java.util.ArrayList;
2727
import java.util.LinkedHashMap;
2828
import java.util.List;
2929
import java.util.Map;
3030

31+
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
3132
import org.biojava.nbio.core.sequence.DNASequence;
3233
import org.biojava.nbio.core.sequence.ProteinSequence;
3334
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
@@ -46,10 +47,13 @@
4647
import org.slf4j.Logger;
4748
import org.slf4j.LoggerFactory;
4849

50+
import static org.junit.Assert.*;
51+
4952
/**
5053
*
5154
* @author Scooter Willis <willishf at gmail dot com>
5255
* @author Jacek Grzebyta
56+
* @author Philippe Soares
5357
*/
5458
public class GenbankReaderTest {
5559

@@ -84,31 +88,83 @@ public void testProcess() throws Exception {
8488
InputStream inStream = this.getClass().getResourceAsStream("/BondFeature.gb");
8589
assertNotNull(inStream);
8690

87-
GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProtein
88-
= new GenbankReader<ProteinSequence, AminoAcidCompound>(
91+
GenbankReader<ProteinSequence, AminoAcidCompound> genbankProtein
92+
= new GenbankReader<>(
8993
inStream,
90-
new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
94+
new GenericGenbankHeaderParser<>(),
9195
new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())
9296
);
93-
@SuppressWarnings("unused")
94-
LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankProtein.process();
95-
inStream.close();
97+
98+
LinkedHashMap<String, ProteinSequence> proteinSequences = genbankProtein.process();
99+
100+
assertNotNull(proteinSequences);
101+
assertEquals(1, proteinSequences.size());
102+
103+
ProteinSequence proteinSequence = proteinSequences.get("NP_000257");
104+
assertNotNull(proteinSequences.get("NP_000257"));
105+
assertEquals("NP_000257", proteinSequence.getAccession().getID());
106+
assertEquals("4557789", proteinSequence.getAccession().getIdentifier());
107+
assertEquals("GENBANK", proteinSequence.getAccession().getDataSource().name());
108+
assertEquals(1, proteinSequence.getAccession().getVersion().intValue());
109+
assertTrue(genbankProtein.isClosed());
96110

97111
logger.info("process DNA");
98112
inStream = this.getClass().getResourceAsStream("/NM_000266.gb");
99113
assertNotNull(inStream);
100114

101-
GenbankReader<DNASequence, NucleotideCompound> GenbankDNA
102-
= new GenbankReader<DNASequence, NucleotideCompound>(
115+
GenbankReader<DNASequence, NucleotideCompound> genbankDNA
116+
= new GenbankReader<>(
103117
inStream,
104-
new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(),
118+
new GenericGenbankHeaderParser<>(),
105119
new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())
106120
);
107-
@SuppressWarnings("unused")
108-
LinkedHashMap<String, DNASequence> dnaSequences = GenbankDNA.process();
109-
inStream.close();
121+
LinkedHashMap<String, DNASequence> dnaSequences = genbankDNA.process();
122+
123+
assertNotNull(dnaSequences);
124+
assertEquals(1, dnaSequences.size());
125+
126+
DNASequence dnaSequence = dnaSequences.get("NM_000266");
127+
assertNotNull(dnaSequences.get("NM_000266"));
128+
assertEquals("NM_000266", dnaSequence.getAccession().getID());
129+
assertEquals("223671892", dnaSequence.getAccession().getIdentifier());
130+
assertEquals("GENBANK", dnaSequence.getAccession().getDataSource().name());
131+
assertEquals(3, dnaSequence.getAccession().getVersion().intValue());
132+
assertTrue(genbankDNA.isClosed());
110133
}
111134

135+
/**
136+
* Test the process method with a number of sequences to be read at each call.
137+
* The underlying {@link InputStream} should remain open until the last call.
138+
*/
139+
@Test
140+
public void testPartialProcess() throws IOException, CompoundNotFoundException, NoSuchFieldException {
141+
InputStream inStream = this.getClass().getResourceAsStream("/two-dnaseqs.gb");
142+
143+
GenbankReader<DNASequence, NucleotideCompound> genbankDNA
144+
= new GenbankReader<>(
145+
inStream,
146+
new GenericGenbankHeaderParser<>(),
147+
new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())
148+
);
149+
150+
// First call to process(1) returns the first sequence
151+
LinkedHashMap<String, DNASequence> dnaSequences = genbankDNA.process(1);
152+
153+
assertNotNull(dnaSequences);
154+
assertEquals(1, dnaSequences.size());
155+
assertNotNull(dnaSequences.get("vPetite"));
156+
157+
// Second call to process(1) returns the second sequence
158+
dnaSequences = genbankDNA.process(1);
159+
assertNotNull(dnaSequences);
160+
assertEquals(1, dnaSequences.size());
161+
assertNotNull(dnaSequences.get("sbFDR"));
162+
163+
assertFalse(genbankDNA.isClosed());
164+
genbankDNA.close();
165+
assertTrue(genbankDNA.isClosed());
166+
167+
}
112168

113169
@Test
114170
public void CDStest() throws Exception {
@@ -118,9 +174,9 @@ public void CDStest() throws Exception {
118174
assertNotNull(inStream);
119175

120176
GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProtein
121-
= new GenbankReader<ProteinSequence, AminoAcidCompound>(
177+
= new GenbankReader<>(
122178
inStream,
123-
new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
179+
new GenericGenbankHeaderParser<>(),
124180
new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())
125181
);
126182
LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankProtein.process();
@@ -130,7 +186,7 @@ public void CDStest() throws Exception {
130186
Assert.assertTrue(proteinSequences.size() == 1);
131187
logger.debug("protein sequences: {}", proteinSequences);
132188

133-
ProteinSequence protein = new ArrayList<ProteinSequence>(proteinSequences.values()).get(0);
189+
ProteinSequence protein = new ArrayList<>(proteinSequences.values()).get(0);
134190

135191
FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound> cdsFeature = protein.getFeaturesByType("CDS").get(0);
136192
String codedBy = cdsFeature.getQualifiers().get("coded_by").get(0).getValue();
@@ -139,8 +195,8 @@ public void CDStest() throws Exception {
139195

140196
Assert.assertNotNull(codedBy);
141197
Assert.assertTrue(!codedBy.isEmpty());
142-
Assert.assertEquals(codedBy, "NM_000266.2:503..904");
143-
Assert.assertEquals(5, dbrefs.size());
198+
assertEquals(codedBy, "NM_000266.2:503..904");
199+
assertEquals(5, dbrefs.size());
144200

145201
}
146202

0 commit comments

Comments
 (0)