Skip to content

Commit 8fd2685

Browse files
authored
Merge pull request biojava#666 from psoares/master
Fix for biojava#663
2 parents be50915 + 5e4eb1d commit 8fd2685

File tree

2 files changed

+116
-43
lines changed

2 files changed

+116
-43
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/GenbankReader.java

Lines changed: 42 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
4141
import org.biojava.nbio.core.sequence.template.AbstractSequence;
4242
import org.biojava.nbio.core.sequence.template.Compound;
43+
import org.slf4j.Logger;
44+
import org.slf4j.LoggerFactory;
4345

4446
import java.io.*;
4547
import java.util.ArrayList;
@@ -55,20 +57,28 @@ public class GenbankReader<S extends AbstractSequence<C>, C extends Compound> {
5557

5658
private SequenceCreatorInterface<C> sequenceCreator;
5759
private GenbankSequenceParser<S,C> genbankParser;
58-
private InputStream inputStream;
60+
private BufferedReader bufferedReader;
61+
private boolean closed;
62+
private final Logger logger = LoggerFactory.getLogger(this.getClass());
63+
64+
public boolean isClosed() {
65+
return closed;
66+
}
5967

6068
/**
6169
* If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
6270
* local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in
6371
* an inputstream is forced to read all the data so you don't gain anything.
64-
* @param br
72+
* @param is
6573
* @param headerParser
6674
* @param sequenceCreator
6775
*/
68-
public GenbankReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser, SequenceCreatorInterface<C> sequenceCreator) {
76+
public GenbankReader(final InputStream is, final SequenceHeaderParserInterface<S,C> headerParser,
77+
final SequenceCreatorInterface<C> sequenceCreator) {
6978
this.sequenceCreator = sequenceCreator;
70-
this.inputStream = is;
71-
genbankParser = new GenbankSequenceParser<S,C>();
79+
bufferedReader = new BufferedReader(new InputStreamReader(is));
80+
genbankParser = new GenbankSequenceParser<>();
81+
closed = false;
7282
}
7383

7484
/**
@@ -85,14 +95,14 @@ public GenbankReader(InputStream is, SequenceHeaderParserInterface<S,C> headerPa
8595
* method denies read access to the file.
8696
*/
8797
public GenbankReader(
88-
File file,
89-
SequenceHeaderParserInterface<S,C> headerParser,
90-
SequenceCreatorInterface<C> sequenceCreator
98+
final File file,
99+
final SequenceHeaderParserInterface<S,C> headerParser,
100+
final SequenceCreatorInterface<C> sequenceCreator
91101
) throws FileNotFoundException {
92102

93-
inputStream = new FileInputStream(file);
103+
this.bufferedReader = new BufferedReader(new FileReader(file));
94104
this.sequenceCreator = sequenceCreator;
95-
genbankParser = new GenbankSequenceParser<S,C>();
105+
genbankParser = new GenbankSequenceParser<>();
96106
}
97107

98108
/**
@@ -108,8 +118,7 @@ public GenbankReader(
108118
* @throws CompoundNotFoundException
109119
*/
110120
public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundException {
111-
LinkedHashMap<String,S> sequences = process(-1);
112-
return sequences;
121+
return process(-1);
113122
}
114123

115124
/**
@@ -122,7 +131,7 @@ public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundExc
122131
* time before the first result is available.<br>
123132
* <b>N.B.</b>
124133
* <ul>
125-
* <li>This method ca't be called after calling its NO-ARGUMENT twin.</li>
134+
* <li>This method can't be called after calling its NO-ARGUMENT twin.</li>
126135
* <li>remember to close the underlying resource when you are done.</li>
127136
* </ul>
128137
* @see #process()
@@ -134,17 +143,17 @@ public LinkedHashMap<String,S> process() throws IOException, CompoundNotFoundExc
134143
* @throws IOException
135144
* @throws CompoundNotFoundException
136145
*/
137-
public LinkedHashMap<String,S> process(int max) throws IOException, CompoundNotFoundException {
138-
LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
146+
public LinkedHashMap<String,S> process(final int max) throws IOException, CompoundNotFoundException {
147+
LinkedHashMap<String,S> sequences = new LinkedHashMap<>();
139148
@SuppressWarnings("unchecked")
140149
int i=0;
141-
BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
142150
while(true) {
143151
if(max>0 && i>=max) break;
144152
i++;
145-
String seqString = genbankParser.getSequence(br, 0);
153+
String seqString = genbankParser.getSequence(bufferedReader, 0);
146154
//reached end of file?
147155
if(seqString==null) break;
156+
@SuppressWarnings("unchecked")
148157
S sequence = (S) sequenceCreator.getSequence(seqString, 0);
149158
genbankParser.getSequenceHeaderParser().parseHeader(genbankParser.getHeader(), sequence);
150159

@@ -165,32 +174,41 @@ public LinkedHashMap<String,S> process(int max) throws IOException, CompoundNotF
165174

166175
sequences.put(sequence.getAccession().getID(), sequence);
167176
}
168-
br.close();
169-
close();
177+
178+
if (max < 0) {
179+
close();
180+
}
181+
170182
return sequences;
171183
}
172184

173-
public void close() throws IOException {
174-
inputStream.close();
185+
public void close() {
186+
try {
187+
bufferedReader.close();
188+
this.closed = true;
189+
} catch (IOException e) {
190+
logger.error("Couldn't close the reader. {}", e.getMessage());
191+
this.closed = false;
192+
}
175193
}
176194

177195
public static void main(String[] args) throws Exception {
178196
String proteinFile = "src/test/resources/BondFeature.gb";
179197
FileInputStream is = new FileInputStream(proteinFile);
180198

181-
GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<ProteinSequence, AminoAcidCompound>(is, new GenericGenbankHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
199+
GenbankReader<ProteinSequence, AminoAcidCompound> proteinReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
182200
LinkedHashMap<String,ProteinSequence> proteinSequences = proteinReader.process();
183201
System.out.println(proteinSequences);
184202

185203
String inputFile = "src/test/resources/NM_000266.gb";
186204
is = new FileInputStream(inputFile);
187-
GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<DNASequence, NucleotideCompound>(is, new GenericGenbankHeaderParser<DNASequence,NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
205+
GenbankReader<DNASequence, NucleotideCompound> dnaReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
188206
LinkedHashMap<String,DNASequence> dnaSequences = dnaReader.process();
189207
System.out.println(dnaSequences);
190208

191209
String crazyFile = "src/test/resources/CraftedFeature.gb";
192210
is = new FileInputStream(crazyFile);
193-
GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<DNASequence, NucleotideCompound>(is, new GenericGenbankHeaderParser<DNASequence,NucleotideCompound>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
211+
GenbankReader<DNASequence, NucleotideCompound> crazyReader = new GenbankReader<>(is, new GenericGenbankHeaderParser<>(), new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
194212
LinkedHashMap<String,DNASequence> crazyAnnotatedSequences = crazyReader.process();
195213

196214
is.close();

biojava-core/src/test/java/org/biojava/nbio/core/sequence/io/GenbankReaderTest.java

Lines changed: 74 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@
2020
*/
2121
package org.biojava.nbio.core.sequence.io;
2222

23-
import static org.junit.Assert.assertNotNull;
24-
23+
import java.io.IOException;
2524
import java.io.InputStream;
2625
import java.util.ArrayList;
2726
import java.util.LinkedHashMap;
2827
import java.util.List;
2928
import java.util.Map;
3029

30+
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
3131
import org.biojava.nbio.core.sequence.DNASequence;
3232
import org.biojava.nbio.core.sequence.ProteinSequence;
3333
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
@@ -46,10 +46,13 @@
4646
import org.slf4j.Logger;
4747
import org.slf4j.LoggerFactory;
4848

49+
import static org.junit.Assert.*;
50+
4951
/**
5052
*
5153
* @author Scooter Willis <willishf at gmail dot com>
5254
* @author Jacek Grzebyta
55+
* @author Philippe Soares
5356
*/
5457
public class GenbankReaderTest {
5558

@@ -84,31 +87,83 @@ public void testProcess() throws Exception {
8487
InputStream inStream = this.getClass().getResourceAsStream("/BondFeature.gb");
8588
assertNotNull(inStream);
8689

87-
GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProtein
88-
= new GenbankReader<ProteinSequence, AminoAcidCompound>(
90+
GenbankReader<ProteinSequence, AminoAcidCompound> genbankProtein
91+
= new GenbankReader<>(
8992
inStream,
90-
new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
93+
new GenericGenbankHeaderParser<>(),
9194
new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())
9295
);
93-
@SuppressWarnings("unused")
94-
LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankProtein.process();
95-
inStream.close();
96+
97+
LinkedHashMap<String, ProteinSequence> proteinSequences = genbankProtein.process();
98+
99+
assertNotNull(proteinSequences);
100+
assertEquals(1, proteinSequences.size());
101+
102+
ProteinSequence proteinSequence = proteinSequences.get("NP_000257");
103+
assertNotNull(proteinSequences.get("NP_000257"));
104+
assertEquals("NP_000257", proteinSequence.getAccession().getID());
105+
assertEquals("4557789", proteinSequence.getAccession().getIdentifier());
106+
assertEquals("GENBANK", proteinSequence.getAccession().getDataSource().name());
107+
assertEquals(1, proteinSequence.getAccession().getVersion().intValue());
108+
assertTrue(genbankProtein.isClosed());
96109

97110
logger.info("process DNA");
98111
inStream = this.getClass().getResourceAsStream("/NM_000266.gb");
99112
assertNotNull(inStream);
100113

101-
GenbankReader<DNASequence, NucleotideCompound> GenbankDNA
102-
= new GenbankReader<DNASequence, NucleotideCompound>(
114+
GenbankReader<DNASequence, NucleotideCompound> genbankDNA
115+
= new GenbankReader<>(
103116
inStream,
104-
new GenericGenbankHeaderParser<DNASequence, NucleotideCompound>(),
117+
new GenericGenbankHeaderParser<>(),
105118
new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())
106119
);
107-
@SuppressWarnings("unused")
108-
LinkedHashMap<String, DNASequence> dnaSequences = GenbankDNA.process();
109-
inStream.close();
120+
LinkedHashMap<String, DNASequence> dnaSequences = genbankDNA.process();
121+
122+
assertNotNull(dnaSequences);
123+
assertEquals(1, dnaSequences.size());
124+
125+
DNASequence dnaSequence = dnaSequences.get("NM_000266");
126+
assertNotNull(dnaSequences.get("NM_000266"));
127+
assertEquals("NM_000266", dnaSequence.getAccession().getID());
128+
assertEquals("223671892", dnaSequence.getAccession().getIdentifier());
129+
assertEquals("GENBANK", dnaSequence.getAccession().getDataSource().name());
130+
assertEquals(3, dnaSequence.getAccession().getVersion().intValue());
131+
assertTrue(genbankDNA.isClosed());
110132
}
111133

134+
/**
135+
* Test the process method with a number of sequences to be read at each call.
136+
* The underlying {@link InputStream} should remain open until the last call.
137+
*/
138+
@Test
139+
public void testPartialProcess() throws IOException, CompoundNotFoundException, NoSuchFieldException {
140+
InputStream inStream = this.getClass().getResourceAsStream("/two-dnaseqs.gb");
141+
142+
GenbankReader<DNASequence, NucleotideCompound> genbankDNA
143+
= new GenbankReader<>(
144+
inStream,
145+
new GenericGenbankHeaderParser<>(),
146+
new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())
147+
);
148+
149+
// First call to process(1) returns the first sequence
150+
LinkedHashMap<String, DNASequence> dnaSequences = genbankDNA.process(1);
151+
152+
assertNotNull(dnaSequences);
153+
assertEquals(1, dnaSequences.size());
154+
assertNotNull(dnaSequences.get("vPetite"));
155+
156+
// Second call to process(1) returns the second sequence
157+
dnaSequences = genbankDNA.process(1);
158+
assertNotNull(dnaSequences);
159+
assertEquals(1, dnaSequences.size());
160+
assertNotNull(dnaSequences.get("sbFDR"));
161+
162+
assertFalse(genbankDNA.isClosed());
163+
genbankDNA.close();
164+
assertTrue(genbankDNA.isClosed());
165+
166+
}
112167

113168
@Test
114169
public void CDStest() throws Exception {
@@ -118,9 +173,9 @@ public void CDStest() throws Exception {
118173
assertNotNull(inStream);
119174

120175
GenbankReader<ProteinSequence, AminoAcidCompound> GenbankProtein
121-
= new GenbankReader<ProteinSequence, AminoAcidCompound>(
176+
= new GenbankReader<>(
122177
inStream,
123-
new GenericGenbankHeaderParser<ProteinSequence, AminoAcidCompound>(),
178+
new GenericGenbankHeaderParser<>(),
124179
new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())
125180
);
126181
LinkedHashMap<String, ProteinSequence> proteinSequences = GenbankProtein.process();
@@ -130,7 +185,7 @@ public void CDStest() throws Exception {
130185
Assert.assertTrue(proteinSequences.size() == 1);
131186
logger.debug("protein sequences: {}", proteinSequences);
132187

133-
ProteinSequence protein = new ArrayList<ProteinSequence>(proteinSequences.values()).get(0);
188+
ProteinSequence protein = new ArrayList<>(proteinSequences.values()).get(0);
134189

135190
FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound> cdsFeature = protein.getFeaturesByType("CDS").get(0);
136191
String codedBy = cdsFeature.getQualifiers().get("coded_by").get(0).getValue();
@@ -139,8 +194,8 @@ public void CDStest() throws Exception {
139194

140195
Assert.assertNotNull(codedBy);
141196
Assert.assertTrue(!codedBy.isEmpty());
142-
Assert.assertEquals(codedBy, "NM_000266.2:503..904");
143-
Assert.assertEquals(5, dbrefs.size());
197+
assertEquals(codedBy, "NM_000266.2:503..904");
198+
assertEquals(5, dbrefs.size());
144199

145200
}
146201

0 commit comments

Comments
 (0)