Skip to content

Commit 2bef803

Browse files
committed
Fix biojava#282; add a JUnit test for verification
1 parent d3f3cd6 commit 2bef803

File tree

3 files changed

+207
-97
lines changed

3 files changed

+207
-97
lines changed

biojava-core/src/main/java/org/biojava/nbio/core/sequence/io/FastaReader.java

Lines changed: 77 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
*/
4444
public class FastaReader<S extends Sequence<?>, C extends Compound> {
4545

46-
private final static Logger logger = LoggerFactory.getLogger(FastaReader.class);
46+
private final static Logger logger = LoggerFactory.getLogger(FastaReader.class);
4747

4848
SequenceCreatorInterface<C> sequenceCreator;
4949
SequenceHeaderParserInterface<S,C> headerParser;
@@ -54,17 +54,17 @@ public class FastaReader<S extends Sequence<?>, C extends Compound> {
5454
long sequenceIndex = 0;
5555
String line = "";
5656
String header= "";
57-
57+
5858
/**
5959
* If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about
6060
* local file offsets for quick reads. An InputStream does not give you the name of the stream to access quickly via file seek. A seek in
6161
* an InputStream is forced to read all the data, so you don't gain anything.
62-
* @param br
62+
* @param is inputStream
6363
* @param headerParser
6464
* @param sequenceCreator
6565
*/
6666
public FastaReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser,
67-
SequenceCreatorInterface<C> sequenceCreator) {
67+
SequenceCreatorInterface<C> sequenceCreator) {
6868
this.headerParser = headerParser;
6969
isr = new InputStreamReader(is);
7070
this.br = new BufferedReaderBytesRead(isr);
@@ -85,7 +85,7 @@ public FastaReader(InputStream is, SequenceHeaderParserInterface<S,C> headerPars
8585
* method denies read access to the file.
8686
*/
8787
public FastaReader(File file, SequenceHeaderParserInterface<S,C> headerParser,
88-
SequenceCreatorInterface<C> sequenceCreator) throws FileNotFoundException {
88+
SequenceCreatorInterface<C> sequenceCreator) throws FileNotFoundException {
8989
this.headerParser = headerParser;
9090
fi = new FileInputStream(file);
9191
isr = new InputStreamReader(fi);
@@ -105,9 +105,10 @@ public FastaReader(File file, SequenceHeaderParserInterface<S,C> headerParser,
105105
* @throws IOException if an error occurs reading the input file
106106
*/
107107
public LinkedHashMap<String,S> process() throws IOException {
108-
LinkedHashMap<String,S> sequences = process(-1);
109-
close();
110-
return sequences;
108+
LinkedHashMap<String,S> sequences = process(-1);
109+
close();
110+
111+
return sequences;
111112
}
112113

113114
/**
@@ -130,45 +131,47 @@ public LinkedHashMap<String,S> process() throws IOException {
130131
* present, starting current fileIndex onwards.
131132
* @throws IOException if an error occurs reading the input file
132133
*/
133-
public LinkedHashMap<String,S> process(int max) throws IOException {
134-
LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
134+
public LinkedHashMap<String,S> process(int max) throws IOException {
135+
135136

136137
String line = "";
137138
if(this.line != null && this.line.length() > 0){
138-
line=this.line;
139+
line=this.line;
139140
}
140141
String header = "";
141142
if(this.header != null && this.header.length() > 0){
142-
header=this.header;
143+
header=this.header;
143144
}
144-
145+
145146
StringBuilder sb = new StringBuilder();
146147
int processedSequences=0;
147148
boolean keepGoing = true;
148149

150+
151+
LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>();
152+
149153
do {
150154
line = line.trim(); // nice to have but probably not needed
151155
if (line.length() != 0) {
152156
if (line.startsWith(">")) {//start of new fasta record
153-
if (sb.length() > 0) {//i.e. if there is already a sequence before
154-
// logger.debug("Sequence index=" + sequenceIndex);
155-
156-
try {
157-
@SuppressWarnings("unchecked")
158-
S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
157+
158+
if (sb.length() > 0) {
159+
//i.e. if there is already a sequence before
160+
//logger.info("Sequence index=" + sequenceIndex);
161+
162+
try {
163+
@SuppressWarnings("unchecked")
164+
S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex);
159165
headerParser.parseHeader(header, sequence);
160166
sequences.put(sequence.getAccession().getID(),sequence);
161167
processedSequences++;
162168

163-
} catch (CompoundNotFoundException e) {
164-
logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
165-
header, e.getMessage());
166-
}
167-
// if (maxSequenceLength < sb.length()) {
168-
// maxSequenceLength = sb.length();
169-
// }
170-
// sb = new StringBuilder(maxSequenceLength);
171-
sb.setLength(0); //this is faster, better memory utilization (same buffer)
169+
} catch (CompoundNotFoundException e) {
170+
logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
171+
header, e.getMessage());
172+
}
173+
174+
sb.setLength(0); //this is faster than allocating new buffers, better memory utilization (same buffer)
172175
}
173176
header = line.substring(1);
174177
} else if (line.startsWith(";")) {
@@ -181,68 +184,85 @@ public LinkedHashMap<String,S> process(int max) throws IOException {
181184
}
182185
}
183186
fileIndex = br.getBytesRead();
187+
184188
line = br.readLine();
185-
if (line == null) {//i.e. EOF
189+
190+
if (line == null) {
191+
192+
193+
// Fix for #282
194+
if ( sequences.size() == 0 && max != -1) {
195+
return null;
196+
}
197+
198+
//i.e. EOF
186199
String seq = sb.toString();
187200
if ( seq.length() == 0) {
188201
logger.warn("Can't parse sequence {}. Got sequence of length 0!", sequenceIndex);
189202
logger.warn("header: {}", header);
190203
}
191-
// logger.debug("Sequence index=" + sequenceIndex + " " + fileIndex );
204+
//logger.info("Sequence index=" + sequenceIndex + " " + fileIndex );
192205
try {
193-
@SuppressWarnings("unchecked")
194-
S sequence = (S)sequenceCreator.getSequence(seq, sequenceIndex);
195-
headerParser.parseHeader(header, sequence);
196-
sequences.put(sequence.getAccession().getID(),sequence);
197-
processedSequences++;
206+
@SuppressWarnings("unchecked")
207+
S sequence = (S)sequenceCreator.getSequence(seq, sequenceIndex);
208+
headerParser.parseHeader(header, sequence);
209+
sequences.put(sequence.getAccession().getID(),sequence);
210+
processedSequences++;
198211
} catch (CompoundNotFoundException e) {
199-
logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
200-
header, e.getMessage());
201-
}
212+
logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored",
213+
header, e.getMessage());
214+
}
202215
keepGoing = false;
203216
}
204-
if (max > -1 && processedSequences>=max) {
205-
keepGoing=false;
206-
}
217+
if (max > -1 && processedSequences>=max) {
218+
keepGoing=false;
219+
}
220+
if ( this.line == null)
221+
keepGoing = false;
207222
} while (keepGoing);
223+
208224
this.line = line;
209225
this.header= header;
226+
210227
return sequences;
211228
}
212229

213-
public void close() throws IOException {
214-
br.close();
230+
public void close() throws IOException {
231+
br.close();
215232
isr.close();
216233
//If stream was created from File object then we need to close it
217234
if (fi != null) {
218235
fi.close();
219236
}
220237
this.line=this.header = null;
221-
}
238+
}
222239

223240
public static void main(String[] args) {
224241
try {
225-
String inputFile = "src/test/resources/PF00104_small.fasta";
226-
FileInputStream is = new FileInputStream(inputFile);
242+
String inputFile = "/PF00104_small.fasta";
243+
InputStream is = FastaReader.class.getResourceAsStream(inputFile);
244+
227245

246+
if ( is == null)
247+
System.err.println("Could not get input file " + inputFile);
228248
FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(is, new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
229249
LinkedHashMap<String,ProteinSequence> proteinSequences = fastaReader.process();
230250
is.close();
231251

232252

233-
logger.info("Protein Sequences: {}", proteinSequences);
253+
//logger.info("Protein Sequences: {}", proteinSequences);
234254

235255
File file = new File(inputFile);
236-
FastaReader<ProteinSequence,AminoAcidCompound> fastaProxyReader =
237-
new FastaReader<ProteinSequence,AminoAcidCompound>(
238-
file,
239-
new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(),
240-
new FileProxyProteinSequenceCreator(
241-
file,
242-
AminoAcidCompoundSet.getAminoAcidCompoundSet(),
243-
new FastaSequenceParser()
244-
)
245-
);
256+
FastaReader<ProteinSequence,AminoAcidCompound> fastaProxyReader =
257+
new FastaReader<ProteinSequence,AminoAcidCompound>(
258+
file,
259+
new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(),
260+
new FileProxyProteinSequenceCreator(
261+
file,
262+
AminoAcidCompoundSet.getAminoAcidCompoundSet(),
263+
new FastaSequenceParser()
264+
)
265+
);
246266
LinkedHashMap<String,ProteinSequence> proteinProxySequences = fastaProxyReader.process();
247267

248268
for(String key : proteinProxySequences.keySet()){

0 commit comments

Comments
 (0)